chromium/chrome/browser/ash/input_method/text_utils.cc

// Copyright 2021 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "chrome/browser/ash/input_method/text_utils.h"

// TODO(crbug/1223597) The rules to detect sentence ends are not perfect, and
// we may want to use regex to improve readability.
namespace ash {
namespace input_method {
namespace {

// Maximum distance the sentence end searches move away from the position they
// start at; also the maximum span of a sentence that is returned.
constexpr int kMaxSearchRange = 200;

// Upper bound on the length of a special period word such as "Mssrs.".
constexpr int kSpecialWordMaxLength = 6;

// Offset from a sentence end to the start of the sentence that follows it.
// A value of 1 would be enough for the grammar check model, but 2 is the
// better choice: under the current rules a sentence end is always followed by
// a space, '\n' or '\r', and that separator should be skipped.
constexpr int kGapBetweenSentenceEndAndNextStart = 2;

// Returns true if |c| is punctuation that can close a sentence: a full stop,
// question mark or exclamation mark (in its ASCII, ideographic,
// halfwidth/fullwidth or small-form variant) or a horizontal ellipsis.
//
// The non-ASCII marks are written as \u escapes so that the width variants
// stay distinct from their ASCII look-alikes no matter how this file is
// re-encoded or normalized.
bool IsSentenceEndCharacter(char16_t c) {
  switch (c) {
    case u'.':
    case u'?':
    case u'!':
    case u'\u3002':  // IDEOGRAPHIC FULL STOP
    case u'\uFF61':  // HALFWIDTH IDEOGRAPHIC FULL STOP
    case u'\uFF0E':  // FULLWIDTH FULL STOP
    case u'\uFE52':  // SMALL FULL STOP
    case u'\uFF1F':  // FULLWIDTH QUESTION MARK
    case u'\uFE56':  // SMALL QUESTION MARK
    case u'\uFF01':  // FULLWIDTH EXCLAMATION MARK
    case u'\uFE57':  // SMALL EXCLAMATION MARK
    case u'\u2026':  // HORIZONTAL ELLIPSIS
      return true;
    default:
      return false;
  }
}

// Returns true if the word of |text| whose last character is at |pos| is one
// of a fixed set of abbreviations that end in a period (e.g. "Mr.", "e.g."),
// so that the period is not mistaken for a sentence end.
//
// The word begins right after the closest preceding space or '(', or at the
// beginning of |text| when there is no such delimiter. Words longer than
// kSpecialWordMaxLength never match. Returns false if |pos| is outside |text|.
bool EndsInSpecialPeriodWord(const std::u16string& text, uint32_t pos) {
  if (pos >= text.size()) {
    return false;
  }

  static constexpr const char16_t* kSpecialPeriodWords[] = {
      u"c.f.",  u"cf.",  u"e.g.", u"eg.",   u"i.e.",   u"ie.",
      u"Mmes.", u"Mr.",  u"Mrs.", u"Ms.",   u"Mses.",  u"Mssrs.",
      u"Prof.", u"n.b.", u"nb."};

  const size_t max_length = static_cast<size_t>(kSpecialWordMaxLength);
  // One past the last character of the word.
  const size_t word_end = static_cast<size_t>(pos) + 1;

  // Walk backwards until a delimiter or the start of |text| is found.
  // |word_start| always points at the first character known to be in the word,
  // so reaching index 0 is a normal way to stop rather than an error.
  size_t word_start = word_end;
  while (word_start > 0 && text[word_start - 1] != u' ' &&
         text[word_start - 1] != u'(') {
    if (word_end - word_start == max_length) {
      // One more character would make the word longer than any special word.
      return false;
    }
    --word_start;
  }

  // Compare in place; no temporary copy of the word is needed.
  const size_t word_length = word_end - word_start;
  for (const char16_t* special_word : kSpecialPeriodWords) {
    if (text.compare(word_start, word_length, special_word) == 0) {
      return true;
    }
  }
  return false;
}

// Returns true if |c| is a character that closes a bracketed or quoted
// section: a closing bracket, an apostrophe, or one of several double-quote
// like characters.
//
// The fullwidth quotation mark is written as a \u escape so that it stays
// distinct from the ASCII '"' no matter how this file is re-encoded or
// normalized.
bool IsSentenceEndSectionCharacter(char16_t c) {
  static constexpr char16_t kSectionClosers[] = {
      u')', u']', u'}', u'\'', u'"', u'ʺ', u'˝',
      u'ˮ', u'″', u'”', u'»',
      u'\uFF02',  // FULLWIDTH QUOTATION MARK
  };
  for (char16_t closer : kSectionClosers) {
    if (c == closer) {
      return true;
    }
  }
  return false;
}

// Returns true if |c| can serve as the eyes of an emoticon (':' or ';').
bool IsEmoticonEyes(char16_t c) {
  switch (c) {
    case u':':
    case u';':
      return true;
    default:
      return false;
  }
}

// Returns true if |c| can serve as the nose of an emoticon
// ('-', '^', '{' or '*').
bool IsEmoticonNose(char16_t c) {
  switch (c) {
    case u'-':
    case u'^':
    case u'{':
    case u'*':
      return true;
    default:
      return false;
  }
}

// Returns true if |c| can serve as the mouth of an emoticon
// (')', '(', '\', '|' or '/').
bool IsEmoticonMouth(char16_t c) {
  switch (c) {
    case u')':
    case u'(':
    case u'\\':
    case u'|':
    case u'/':
      return true;
    default:
      return false;
  }
}

// Returns true if the characters of |text| that end at |pos| (inclusive) form
// an emoticon: either eyes followed by a mouth, e.g. ":)", or eyes, nose and
// mouth, e.g. ";-(".
bool EndsInEmoticon(const std::u16string& text, uint32_t pos) {
  // Two character form: eyes + mouth.
  const bool eyes_then_mouth = pos >= 1 && IsEmoticonEyes(text[pos - 1]) &&
                               IsEmoticonMouth(text[pos]);
  if (eyes_then_mouth) {
    return true;
  }

  // Three character form: eyes + nose + mouth.
  return pos >= 2 && IsEmoticonEyes(text[pos - 2]) &&
         IsEmoticonNose(text[pos - 1]) && IsEmoticonMouth(text[pos]);
}

// Returns true if the character at |pos| closes a sentence in |text|.
//
// Any character that is directly followed by '\n' or '\r' counts as a sentence
// end. Otherwise |pos| must be at least 2, the character must be followed by a
// space or be the last character of |text|, and one of these must hold:
//   * text[pos] is a sentence end character that does not finish a special
//     period word such as "Mr.";
//   * text[pos] is a closing section character (bracket or quote) directly
//     preceded by a sentence end character;
//   * the text up to and including |pos| ends in an emoticon.
//
// Returns false if |pos| is outside |text|, which includes every |pos| when
// |text| is empty.
bool IsSentenceEnd(const std::u16string& text, uint32_t pos) {
  const size_t length = text.size();
  if (pos >= length) {
    return false;
  }

  // Index of the following character. Testing |next < length| instead of
  // |pos < length - 1| keeps the comparison free of unsigned wrap-around.
  const size_t next = static_cast<size_t>(pos) + 1;
  const bool has_next = next < length;

  // A line break ends the sentence regardless of what precedes it.
  if (has_next && (text[next] == u'\n' || text[next] == u'\r')) {
    return true;
  }

  // The character after the sentence end must be a space or the end of the
  // text.
  if (pos < 2 || (has_next && text[next] != u' ')) {
    return false;
  }

  if (IsSentenceEndCharacter(text[pos]) &&
      !EndsInSpecialPeriodWord(text, pos)) {
    return true;
  }

  if (IsSentenceEndCharacter(text[pos - 1]) &&
      IsSentenceEndSectionCharacter(text[pos])) {
    return true;
  }

  return EndsInEmoticon(text, pos);
}

}  // namespace

// Creates a sentence whose members are default-initialized.
Sentence::Sentence() = default;

// Creates a sentence that stores copies of |original_range| and |text|.
Sentence::Sentence(const gfx::Range& original_range, const std::u16string& text)
    : original_range(original_range), text(text) {}

// Member-wise copy.
Sentence::Sentence(const Sentence&) = default;

Sentence::~Sentence() = default;

// Two sentences are equal when both their ranges and their texts are equal.
bool Sentence::operator==(const Sentence& other) const {
  if (!(original_range == other.original_range)) {
    return false;
  }
  return text == other.text;
}

// Exact negation of operator==.
bool Sentence::operator!=(const Sentence& other) const {
  return !operator==(other);
}

// Searches backwards from the character just before |pos| and returns the
// index of the closest sentence end, or kUndefined if there is none.
//
// Index 0 is never examined and the search stops once it is more than
// kMaxSearchRange characters away from |pos|. kUndefined is also returned
// when |pos| is 0 or lies beyond the end of |text|.
uint32_t FindLastSentenceEnd(const std::u16string& text, uint32_t pos) {
  const bool pos_is_usable = pos != 0 && pos <= text.size();
  if (!pos_is_usable) {
    return kUndefined;
  }

  size_t candidate = pos - 1;
  while (candidate > 0 && pos - candidate <= kMaxSearchRange) {
    if (IsSentenceEnd(text, candidate)) {
      return candidate;
    }
    --candidate;
  }
  return kUndefined;
}

// Searches forwards, starting with the character at |pos| itself, and returns
// the index of the closest sentence end, or kUndefined if there is none.
//
// The search stops at the end of |text| or once it is more than
// kMaxSearchRange characters away from |pos|. kUndefined is also returned
// when |pos| is not a valid index into |text|.
uint32_t FindNextSentenceEnd(const std::u16string& text, uint32_t pos) {
  const size_t text_length = text.size();
  if (pos >= text_length) {
    return kUndefined;
  }

  size_t candidate = pos;
  while (candidate < text_length && candidate - pos <= kMaxSearchRange) {
    if (IsSentenceEnd(text, candidate)) {
      return candidate;
    }
    ++candidate;
  }
  return kUndefined;
}

// Returns the most recent sentence that is already closed by a sentence end
// located before |pos|, together with the range it covers in |text|.
//
// An empty Sentence is returned when |pos| is beyond the end of |text|, when
// no sentence end is found before |pos|, or when the resulting span is not
// longer than one character or exceeds kMaxSearchRange.
Sentence FindLastSentence(const std::u16string& text, uint32_t pos) {
  if (pos > text.size()) {
    return Sentence();
  }

  // When |pos| is at the end of the text or on a line break, step back one
  // character before searching.
  if (pos > 0) {
    const bool on_boundary =
        pos == text.size() || text[pos] == u'\n' || text[pos] == u'\r';
    if (on_boundary) {
      --pos;
    }
  }

  const uint32_t sentence_end = FindLastSentenceEnd(text, pos);
  if (sentence_end == kUndefined) {
    return Sentence();
  }

  // The sentence begins after the previous sentence end, or at the start of
  // the text if there is no previous one.
  const uint32_t previous_end = FindLastSentenceEnd(text, sentence_end);
  const uint32_t sentence_start =
      previous_end == kUndefined
          ? 0
          : previous_end + kGapBetweenSentenceEndAndNextStart;

  const bool has_no_span = sentence_start >= sentence_end;
  if (has_no_span || sentence_end - sentence_start > kMaxSearchRange) {
    return Sentence();
  }

  const uint32_t sentence_length = sentence_end - sentence_start + 1;
  return Sentence(gfx::Range(sentence_start, sentence_end + 1),
                  text.substr(sentence_start, sentence_length));
}

// Returns the sentence surrounding |pos|, together with the range it covers in
// |text|. The sentence starts after the closest sentence end before |pos| (or
// at the start of the text) and runs up to the closest sentence end at or
// after |pos| (or to the last character of the text).
//
// An empty Sentence is returned when |pos| is beyond the end of |text|, or
// when the resulting span is not longer than one character or exceeds
// kMaxSearchRange.
Sentence FindCurrentSentence(const std::u16string& text, uint32_t pos) {
  if (pos > text.size()) {
    return Sentence();
  }

  // When |pos| is at the end of the text or on a line break, step back one
  // character before searching.
  if (pos > 0) {
    const bool on_boundary =
        pos == text.size() || text[pos] == u'\n' || text[pos] == u'\r';
    if (on_boundary) {
      --pos;
    }
  }

  const uint32_t previous_end = FindLastSentenceEnd(text, pos);
  const uint32_t sentence_start =
      previous_end == kUndefined
          ? 0
          : previous_end + kGapBetweenSentenceEndAndNextStart;

  uint32_t sentence_end = FindNextSentenceEnd(text, pos);
  if (sentence_end == kUndefined) {
    // No sentence end ahead: the sentence runs to the last character.
    sentence_end = text.length() - 1;
  }

  const bool has_no_span = sentence_start >= sentence_end;
  if (has_no_span || sentence_end - sentence_start > kMaxSearchRange) {
    return Sentence();
  }

  const uint32_t sentence_length = sentence_end - sentence_start + 1;
  return Sentence(gfx::Range(sentence_start, sentence_end + 1),
                  text.substr(sentence_start, sentence_length));
}

}  // namespace input_method
}  // namespace ash