// chromium/components/language_detection/core/ngram_hash_ops_utils.h

// Copyright 2021 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef COMPONENTS_LANGUAGE_DETECTION_CORE_NGRAM_HASH_OPS_UTILS_H_
#define COMPONENTS_LANGUAGE_DETECTION_CORE_NGRAM_HASH_OPS_UTILS_H_

#include <cstddef>
#include <string>
#include <utility>
#include <vector>

namespace language_detection {

// Holds the result of `Tokenize()`: the processed input string together with
// the location of every token found in it. The original declaration was an
// empty class, contradicting the documented contract of `Tokenize()`, which
// says this type stores both the processed string and each token's index and
// size — the members are restored here.
struct TokenizedOutput {
  // The processed input string (with any transformations, such as the
  // replacement of non-alphanumeric tokens, already applied).
  std::string str;

  // The (start index, size) of each token within `str`, in token order.
  std::vector<std::pair<size_t, size_t>> tokens;
};

// Tokenizes the first `len` bytes of `input_str` on Unicode token boundaries,
// producing at most `max_tokens` tokens.
//
// If `exclude_nonalphaspace_tokens` is true, non-alphanumeric tokens are not
// emitted as-is; each is replaced with the replacement token (" ").
//
// Returns a `TokenizedOutput` holding both the processed input string and the
// index and size of each token within that string.
//
// NOTE(review): `input_str` is presumably UTF-8 encoded and `len` a byte
// count — confirm against the implementation.
TokenizedOutput Tokenize(const char* input_str,
                         size_t len,
                         size_t max_tokens,
                         bool exclude_nonalphaspace_tokens);

// Converts the given Unicode string `input_str`, of length `len`, to
// lowercase.
//
// The lowercased result is written into `*output_str`; `output_str` must be
// non-null.
//
// NOTE(review): `len` is presumably a byte count over UTF-8 input — confirm
// against the implementation.
void LowercaseUnicodeStr(const char* input_str,
                         int len,
                         std::string* output_str);

}  // namespace language_detection

#endif  // COMPONENTS_LANGUAGE_DETECTION_CORE_NGRAM_HASH_OPS_UTILS_H_