nnet_language_identifier.cc | Explore in Territory

/* Copyright 2016 Google Inc. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "nnet_language_identifier.h"

#include <math.h>

#include <algorithm>
#include <limits>
#include <string>

#include "base.h"
#include "embedding_network.h"
#include "registry.h"
#include "relevant_script_feature.h"
#include "script_span/generated_ulscript.h"
#include "script_span/getonescriptspan.h"
#include "script_span/text_processing.h"
#include "cld_3/protos/sentence.pb.h"
#include "sentence_features.h"
#include "task_context.h"
#include "workspace.h"

namespace chrome_lang_id {
namespace {

// Struct for accumulating stats for a language as text subsequences of the same
// script are processed.
//
// File-local helper type: it is declared inside the anonymous namespace, so it
// is not visible outside this translation unit.
struct LangChunksStats { … };

// Compares two pairs based on their values.
//
// Both arguments are (string, float) pairs. Per the function name, the
// comparison is on the float (`second`) member with larger values ordered
// first, which makes this usable as an ordering predicate.
// NOTE(review): behavior for equal `second` values should be confirmed; a
// predicate handed to std::sort must return false for equivalent elements.
bool OrderBySecondDescending(const std::pair<string, float> &x,
                             const std::pair<string, float> &y) { … }

// Returns "true" if the language prediction is reliable based on the
// probability, and "false" otherwise.
//
//   language:    the predicted language the probability belongs to.
//   probability: the probability assigned to that prediction.
//
// NOTE(review): presumably compares `probability` against
// NNetLanguageIdentifier::kReliabilityThreshold, with
// kReliabilityHrBsThreshold applied to specific languages -- confirm.
bool ResultIsReliable(const string &language, float probability) { … }

// Finds the number of interchange-valid bytes to process.
//
// Takes the input `text` by const reference and returns the byte count as an
// int.
// NOTE(review): presumably the count is capped (see
// NNetLanguageIdentifier::kMaxNumInputBytesToConsider) and measured from the
// start of `text` -- confirm.
int FindNumValidBytesToProcess(const string &text) { … }
}  // namespace

// Out-of-class definitions of the NNetLanguageIdentifier static constants.
// The names indicate byte-count limits (kMin/kMax...BytesToConsider), a
// snippet count (kNumSnippets), the label used for an unidentified language
// (kUnknown), and probability thresholds used for reliability decisions.
// NOTE(review): exact meanings are presumably documented next to the
// declarations in nnet_language_identifier.h -- confirm there before changing
// any value.
const int NNetLanguageIdentifier::kMinNumBytesToConsider = …;
const int NNetLanguageIdentifier::kMaxNumBytesToConsider = …;
const int NNetLanguageIdentifier::kMaxNumInputBytesToConsider = …;
const int NNetLanguageIdentifier::kNumSnippets = …;
const char NNetLanguageIdentifier::kUnknown[] = …;
const float NNetLanguageIdentifier::kReliabilityThreshold = …;
const float NNetLanguageIdentifier::kReliabilityHrBsThreshold = …;

// Returns the argument prefix string for the language-id embedding feature
// extractor. The result is returned by value.
// NOTE(review): presumably this prefix namespaces the extractor's parameter
// names when they are looked up in a TaskContext -- confirm against the base
// class declaration.
const string LanguageIdEmbeddingFeatureExtractor::ArgPrefix() const { … }

// Default constructor.
// NOTE(review): presumably equivalent to constructing with
// (kMinNumBytesToConsider, kMaxNumBytesToConsider) -- confirm against the
// initializer list.
NNetLanguageIdentifier::NNetLanguageIdentifier()
    : … { … }

// File-local (static) factory returning a WholeSentenceFeature through a raw
// pointer.
// NOTE(review): "cbog" presumably stands for the continuous-bag-of-ngrams
// feature; ownership of the returned pointer is not expressed in the type --
// confirm who deletes it.
static WholeSentenceFeature *cbog_factory() { … }

// File-local (static) factory returning a WholeSentenceFeature through a raw
// pointer.
// NOTE(review): "rsf" presumably refers to the relevant-script feature (see
// relevant_script_feature.h); ownership of the returned pointer is not
// expressed in the type -- confirm who deletes it.
static WholeSentenceFeature *rsf_factory() { … }

// File-local (static) factory returning a WholeSentenceFeature through a raw
// pointer.
// NOTE(review): "sf" presumably refers to the script feature; ownership of
// the returned pointer is not expressed in the type -- confirm who deletes it.
static WholeSentenceFeature *sf_factory() { … }

// Constructs an identifier with caller-supplied byte limits.
//
//   min_num_bytes: lower byte limit.
//   max_num_bytes: upper byte limit.
//
// NOTE(review): presumably these bound how many bytes of the input are used
// for a prediction, and min_num_bytes <= max_num_bytes is expected -- confirm
// whether this is validated.
NNetLanguageIdentifier::NNetLanguageIdentifier(int min_num_bytes,
                                               int max_num_bytes)
    : … { … }

// Destructor, defined out of line.
NNetLanguageIdentifier::~NNetLanguageIdentifier() { … }

// Setup step that receives the task configuration through `context`.
// NOTE(review): presumably forwards to the feature extractor's setup and must
// run before Init() -- confirm the required call order.
void NNetLanguageIdentifier::Setup(TaskContext *context) { … }

// Initialization step that receives the task configuration through `context`.
// NOTE(review): presumably initializes the feature extractor and is expected
// to run after Setup() -- confirm the required call order.
void NNetLanguageIdentifier::Init(TaskContext *context) { … }

// Produces feature vectors for `sentence`, writing them through the
// `features` output pointer. Declared const, so it does not modify the
// identifier's non-mutable state.
// NOTE(review): `sentence` is taken by non-const pointer; confirm whether it
// is modified during extraction.
void NNetLanguageIdentifier::GetFeatures(
    Sentence *sentence, std::vector<FeatureVector> *features) const { … }

// Returns the language name corresponding to the given id.
//
// NOTE(review): handling of an out-of-range `language_id` is not visible here
// -- confirm whether callers must validate the id first.
string NNetLanguageIdentifier::GetLanguageName(int language_id) const { … }

// Identifies the language of `text` and returns the prediction as a Result.
// Not a const member function.
// NOTE(review): presumably trims `text` to its interchange-valid prefix and
// reports kUnknown when too little text remains -- confirm.
NNetLanguageIdentifier::Result NNetLanguageIdentifier::FindLanguage(
    const string &text) { … }

// Identifies the language of `text` and returns the prediction as a Result.
// As the name states, the caller is expected to pass text that is already
// valid UTF-8.
// NOTE(review): behavior on invalid UTF-8 input is not visible here --
// confirm before calling with unchecked data.
NNetLanguageIdentifier::Result NNetLanguageIdentifier::FindLanguageOfValidUTF8(
    const string &text) { … }

// Returns a vector of Results for the most frequent languages found in
// `text`; `num_langs` is the requested number of languages.
// NOTE(review): presumably the text is split into same-script spans, each
// span is classified, and the results are ranked by share of bytes. Confirm
// the ordering, and whether the vector is padded when fewer than `num_langs`
// languages are detected.
std::vector<NNetLanguageIdentifier::Result>
NNetLanguageIdentifier::FindTopNMostFreqLangs(const string &text,
                                              int num_langs) { … }

// Returns, as a new string, the text selected from the given CLD2 script
// span.
// NOTE(review): presumably delegates to SelectTextGivenBeginAndSize() using
// the span's text pointer and byte length -- confirm.
string NNetLanguageIdentifier::SelectTextGivenScriptSpan(
    const CLD2::LangSpan &script_span) { … }

// Returns, as a new string, text selected from the buffer that starts at
// `text_begin` and spans `text_size` bytes.
// NOTE(review): presumably returns the whole buffer when it is short and
// otherwise samples kNumSnippets pieces bounded by kMaxNumBytesToConsider --
// confirm. `text_begin` must point to at least `text_size` readable bytes.
string NNetLanguageIdentifier::SelectTextGivenBeginAndSize(
    const char *text_begin, int text_size) { … }

}  // namespace chrome_lang_id
chromium/third_party/cld_3/src/src/nnet_language_identifier.cc