/* Copyright 2016 Google Inc. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ #ifndef RELEVANT_SCRIPT_FEATURE_H_ #define RELEVANT_SCRIPT_FEATURE_H_ #include "feature_extractor.h" #include "cld_3/protos/sentence.pb.h" #include "sentence_features.h" #include "task_context.h" #include "workspace.h" namespace chrome_lang_id { // Given a sentence, generates one FloatFeatureValue for each "relevant" Unicode // script (see below): each such feature indicates the script and the ratio of // UTF8 characters in that script, in the given sentence. // // What is a relevant script? Recognizing all 100+ Unicode scripts would // require too much code size and runtime. Instead, we focus only on a few // scripts that communicate a lot of language information: e.g., the use of // Hiragana characters almost always indicates Japanese, so Hiragana is a // "relevant" script for us. The Latin script is used by dozens of languages, so // Latin is not relevant in this context. class RelevantScriptFeature : public WholeSentenceFeature { … }; } // namespace chrome_lang_id #endif // RELEVANT_SCRIPT_FEATURE_H_