
// Copyright 2021 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.


#include <string>
#include <utility>
#include <vector>

namespace language_detection {

// Result type returned by Tokenize().
// NOTE(review): this definition declares no members, although the comment on
// Tokenize() describes its result as carrying the processed input string and
// the position/size of each token. Confirm where that data is meant to live.
class TokenizedOutput {};

// Tokenizes the given input string on Unicode token boundaries, producing at
// most `max_tokens` tokens.
//
// Parameters:
//   input_str: the text to tokenize.
//   len: the length of `input_str` (presumably in bytes; confirm against the
//       implementation).
//   max_tokens: upper bound on the number of tokens produced.
//   exclude_nonalphaspace_tokens: if enabled, the tokenization ignores
//       non-alphanumeric tokens and replaces them with a replacement token
//       (" ").
//       NOTE(review): the parameter name says "nonalphaspace" while this
//       description says "non-alphanumeric"; verify which set is excluded.
//
// Returns a `TokenizedOutput`, which is described as holding both the
// processed input string and the index and size of each token within that
// string.
TokenizedOutput Tokenize(const char* input_str,
                         size_t len,
                         size_t max_tokens,
                         bool exclude_nonalphaspace_tokens);

// Converts the given Unicode string (`input_str`) with the specified length
// (`len`) to lowercase and stores the result in `output_str`.
//
// Parameters:
//   input_str: the text to lowercase.
//   len: the length of `input_str`. Note that this is an `int`, unlike the
//       `size_t` length accepted by Tokenize().
//   output_str: receives the lowercased string. Presumably must be non-null
//       (TODO confirm against the implementation).
//       NOTE(review): whether existing contents of `output_str` are
//       overwritten or appended to is not specified here.
void LowercaseUnicodeStr(const char* input_str,
                         int len,
                         std::string* output_str);

}  // namespace language_detection