chromium/chromeos/ash/components/local_search_service/content_extraction_utils.h

// Copyright 2020 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef CHROMEOS_ASH_COMPONENTS_LOCAL_SEARCH_SERVICE_CONTENT_EXTRACTION_UTILS_H_
#define CHROMEOS_ASH_COMPONENTS_LOCAL_SEARCH_SERVICE_CONTENT_EXTRACTION_UTILS_H_

#include <string>

#include "chromeos/ash/components/local_search_service/shared_structs.h"

namespace ash::local_search_service {

// Given a list of tokens, returns a list of tokens where each token has a
// unique content.
std::vector<Token> ConsolidateToken(const std::vector<Token>& tokens);

// Given a text document, returns a list of Tokens.
// Locale should be obtained using this function:
// base::i18n::GetConfiguredLocale(). The format of locale will be
// language-country@variant. Country and variant are optional.
std::vector<Token> ExtractContent(const std::string& content_id,
                                  const std::u16string& text,
                                  double weight,
                                  const std::string& locale);

// Checks if the locale is non Latin locales.
// Locale should be obtained using this function:
// base::i18n::GetConfiguredLocale(). The format of locale will be
// language-country@variant. Country and variant are optional.
bool IsNonLatinLocale(const std::string& locale);

// Checks if a word is a stopword given a locale. Locale will be in the
// following format: language-country@variant (country and variant are
// optional).
bool IsStopword(const std::u16string& word, const std::string& locale);

// Returns a normalized version of a string16: removes diacritics, convert to
// lower-case and possibly remove hyphen from the text (set to true by default).
std::u16string Normalizer(const std::u16string& word,
                          bool remove_hyphen = true);

}  // namespace ash::local_search_service

#endif  // CHROMEOS_ASH_COMPONENTS_LOCAL_SEARCH_SERVICE_CONTENT_EXTRACTION_UTILS_H_