# chromium/third_party/tflite_support/src/tensorflow_lite_support/python/test/task/text/text_searcher_test.py

# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for text_searcher."""

import enum

from absl.testing import parameterized

import tensorflow as tf
from tensorflow_lite_support.python.task.core import base_options as base_options_module
from tensorflow_lite_support.python.task.processor.proto import embedding_options_pb2
from tensorflow_lite_support.python.task.processor.proto import search_options_pb2
from tensorflow_lite_support.python.task.processor.proto import search_result_pb2
from tensorflow_lite_support.python.task.text import text_searcher
from tensorflow_lite_support.python.test import test_util

# Short private aliases for the option and result types used by the tests.
_BaseOptions = base_options_module.BaseOptions
_EmbeddingOptions = embedding_options_pb2.EmbeddingOptions
_SearchOptions = search_options_pb2.SearchOptions
_SearchResult = search_result_pb2.SearchResult
_NearestNeighbor = search_result_pb2.NearestNeighbor
# Aliases for the task under test and its options container.
_TextSearcher = text_searcher.TextSearcher
_TextSearcherOptions = text_searcher.TextSearcherOptions

# Test data file names for the regex-based embedder, searcher and index.
_REGEX_EMBEDDER_MODEL = 'regex_one_embedding_with_metadata.tflite'
_REGEX_SEARCHER_MODEL = 'regex_searcher.tflite'
_REGEX_INDEX = 'regex_index.ldb'
# Neighbors expected by `test_search` for the regex models, which runs with
# L2 normalization enabled. Each pair is (metadata bytes, distance).
_EXPECTED_REGEX_SEARCH_RESULT = _SearchResult(nearest_neighbors=[
    _NearestNeighbor(metadata=bytearray(sentence), distance=distance)
    for sentence, distance in (
        (b'The weather was excellent.', 0.0),
        (b'The sun was shining on that day.', 5.7e-5),
        (b'The cat is chasing after the mouse.', 8.9e-5),
        (b'It was a sunny day.', 0.000113),
        (b'He was very happy with his newly bought car.', 0.000119),
    )
])

# Neighbors expected by `test_search_with_default_options`, where the searcher
# is created from files without explicit embedding options. Each pair is
# (metadata bytes, distance).
_EXPECTED_REGEX_DEFAULT_OPTIONS_SEARCH_RESULT = _SearchResult(
    nearest_neighbors=[
        _NearestNeighbor(metadata=bytearray(sentence), distance=distance)
        for sentence, distance in (
            (b'The weather was excellent.', 0.889665),
            (b'The sun was shining on that day.', 0.889668),
            (b'The cat is chasing after the mouse.', 0.88967),
            (b'It was a sunny day.', 0.889671),
            (b'He was very happy with his newly bought car.', 0.889672),
        )
    ])

# Test data file names for the MobileBERT embedder, searcher and index.
_BERT_EMBEDDER_MODEL = 'mobilebert_embedding_with_metadata.tflite'
_BERT_SEARCHER_MODEL = 'mobilebert_searcher.tflite'
_BERT_INDEX = 'mobilebert_index.ldb'
# Neighbors expected by `test_search` for the MobileBERT models. Each pair is
# (metadata bytes, distance).
_EXPECTED_BERT_SEARCH_RESULT = _SearchResult(nearest_neighbors=[
    _NearestNeighbor(metadata=bytearray(sentence), distance=distance)
    for sentence, distance in (
        (b'The weather was excellent.', 0.0),
        (b'It was a sunny day.', 0.115369),
        (b'The sun was shining on that day.', 0.230017),
        (b'He was very happy with his newly bought car.', 0.324563),
        (b'The cat is chasing after the mouse.', 0.966928),
    )
])

# Test data file names for the Universal Sentence Encoder embedder, searcher
# and index.
_USE_EMBEDDER_MODEL = 'universal_sentence_encoder_qa_with_metadata.tflite'
_USE_SEARCHER_MODEL = 'universal_sentence_encoder_searcher.tflite'
_USE_INDEX = 'universal_sentence_encoder_index.ldb'
# Neighbors expected by `test_search` for the Universal Sentence Encoder
# models. Each pair is (metadata bytes, distance).
_EXPECTED_USE_SEARCH_RESULT = _SearchResult(nearest_neighbors=[
    _NearestNeighbor(metadata=bytearray(sentence), distance=distance)
    for sentence, distance in (
        (b'The weather was excellent.', 0.0),
        (b'It was a sunny day.', 0.146359),
        (b'The sun was shining on that day.', 0.152225),
        (b'The cat is chasing after the mouse.', 0.359965),
        (b'He was very happy with his newly bought car.', 0.366927),
    )
])

# Value passed as `SearchOptions.max_results` in `test_max_results_option`,
# which then checks that no more than this many neighbors are returned.
_MAX_RESULTS = 2


class ModelFileType(enum.Enum):
  """How a test hands the model to `BaseOptions`: raw bytes or file path."""
  FILE_CONTENT = enum.auto()
  FILE_NAME = enum.auto()


class IndexFileType(enum.Enum):
  """How a test hands the index to `SearchOptions`: not at all, bytes or path."""
  NONE = enum.auto()
  FILE_CONTENT = enum.auto()
  FILE_NAME = enum.auto()


class TextSearcherTest(parameterized.TestCase, tf.test.TestCase):
  """Tests for creating a `TextSearcher` and running text searches with it.

  Unless a test says otherwise, the regex embedder model, the regex searcher
  model and the regex index resolved in `setUp` are used.
  """

  def setUp(self):
    """Resolves the paths of the regex embedder, searcher and index files."""
    super().setUp()
    self.embedder_model_path = test_util.get_test_data_path(
        _REGEX_EMBEDDER_MODEL)
    self.searcher_model_path = test_util.get_test_data_path(
        _REGEX_SEARCHER_MODEL)
    self.index_path = test_util.get_test_data_path(_REGEX_INDEX)

  def test_create_from_file_succeeds_with_valid_embedder_and_index_paths(self):
    """`create_from_file` accepts an embedder model plus an index file."""
    # Creates with default options and valid model and index files.
    searcher = _TextSearcher.create_from_file(self.embedder_model_path,
                                              self.index_path)
    self.assertIsInstance(searcher, _TextSearcher)

  def test_create_from_file_succeeds_with_valid_searcher_path(self):
    """`create_from_file` accepts a searcher model without an index file."""
    # Creates with default options from the searcher model alone; no separate
    # index file is passed.
    searcher = _TextSearcher.create_from_file(self.searcher_model_path)
    self.assertIsInstance(searcher, _TextSearcher)

  def test_create_from_options_succeeds_with_valid_embedder_and_index_paths(
      self):
    """`create_from_options` accepts embedder and index file names."""
    options = _TextSearcherOptions(
        base_options=_BaseOptions(file_name=self.embedder_model_path),
        search_options=_SearchOptions(index_file_name=self.index_path))
    searcher = _TextSearcher.create_from_options(options)
    self.assertIsInstance(searcher, _TextSearcher)

  def test_create_from_options_succeeds_with_valid_searcher_path(self):
    """`create_from_options` accepts a searcher file name and no index."""
    options = _TextSearcherOptions(
        base_options=_BaseOptions(file_name=self.searcher_model_path),
        search_options=_SearchOptions())
    searcher = _TextSearcher.create_from_options(options)
    self.assertIsInstance(searcher, _TextSearcher)

  def test_create_from_options_succeeds_with_valid_embedder_content(self):
    """`create_from_options` accepts the embedder model as raw bytes."""
    # Read the model first so the file is closed before the searcher is built.
    with open(self.embedder_model_path, 'rb') as f:
      model_content = f.read()
    options = _TextSearcherOptions(
        base_options=_BaseOptions(file_content=model_content),
        search_options=_SearchOptions(index_file_name=self.index_path))
    searcher = _TextSearcher.create_from_options(options)
    self.assertIsInstance(searcher, _TextSearcher)

  def test_create_from_options_succeeds_with_valid_searcher_content(self):
    """`create_from_options` accepts the searcher model as raw bytes."""
    with open(self.searcher_model_path, 'rb') as f:
      model_content = f.read()
    # No index is supplied, matching the other searcher-model tests in this
    # class; passing an external index here would not exercise the
    # searcher-only creation path this test is named after.
    options = _TextSearcherOptions(
        base_options=_BaseOptions(file_content=model_content),
        search_options=_SearchOptions())
    searcher = _TextSearcher.create_from_options(options)
    self.assertIsInstance(searcher, _TextSearcher)

  def test_create_from_options_succeeds_with_valid_index_content(self):
    """`create_from_options` accepts the index as raw bytes."""
    # Read the index first so the file is closed before the searcher is built.
    with open(self.index_path, 'rb') as f:
      index_content = f.read()
    options = _TextSearcherOptions(
        base_options=_BaseOptions(file_name=self.embedder_model_path),
        search_options=_SearchOptions(index_file_content=index_content))
    searcher = _TextSearcher.create_from_options(options)
    self.assertIsInstance(searcher, _TextSearcher)

  def test_create_from_options_fails_with_invalid_index_path(self):
    """Creation fails when an embedder model is given without any index."""
    # No index is configured at all: only the embedder model is provided.
    with self.assertRaisesRegex(
        ValueError,
        r'Unable to find index file: SearchOptions.index_file is not set and '
        r'no AssociatedFile with type SCANN_INDEX_FILE could be found in the '
        r'output tensor metadata.'):
      options = _TextSearcherOptions(
          base_options=_BaseOptions(file_name=self.embedder_model_path))
      _TextSearcher.create_from_options(options)

  def test_create_from_options_fails_with_invalid_model_path(self):
    """Creation fails when the model file name is empty."""
    # Invalid empty model path.
    with self.assertRaisesRegex(
        ValueError,
        r"ExternalFile must specify at least one of 'file_content', "
        r"'file_name' or 'file_descriptor_meta'."):
      options = _TextSearcherOptions(
          base_options=_BaseOptions(file_name=''),
          search_options=_SearchOptions(index_file_name=self.index_path))
      _TextSearcher.create_from_options(options)

  def test_create_from_options_fails_with_invalid_quantization(self):
    """Creation fails when `EmbeddingOptions.quantize` is set to True."""
    # Invalid quantization option.
    with self.assertRaisesRegex(
        ValueError,
        r'Setting EmbeddingOptions.quantize = true is not allowed in '
        r'searchers.'):
      options = _TextSearcherOptions(
          base_options=_BaseOptions(file_name=self.embedder_model_path),
          embedding_options=_EmbeddingOptions(quantize=True),
          search_options=_SearchOptions(index_file_name=self.index_path))
      _TextSearcher.create_from_options(options)

  def test_create_from_options_fails_with_invalid_max_results(self):
    """Creation fails when `SearchOptions.max_results` is negative."""
    # Invalid max results option.
    with self.assertRaisesRegex(
        ValueError, r'SearchOptions.max_results must be > 0, found -1.'):
      options = _TextSearcherOptions(
          base_options=_BaseOptions(file_name=self.embedder_model_path),
          search_options=_SearchOptions(
              index_file_name=self.index_path, max_results=-1))
      _TextSearcher.create_from_options(options)

  def test_search_with_default_options(self):
    """Searching with a searcher built by `create_from_file` defaults."""
    # Create searcher.
    searcher = _TextSearcher.create_from_file(self.embedder_model_path,
                                              self.index_path)

    # Perform text search.
    text_search_result = searcher.search('The weather was excellent.')

    # `assertProtoEquals` takes the expected message first.
    self.assertProtoEquals(
        _EXPECTED_REGEX_DEFAULT_OPTIONS_SEARCH_RESULT.to_pb2(),
        text_search_result.to_pb2())

  @parameterized.parameters(
      (_REGEX_EMBEDDER_MODEL, _REGEX_INDEX, ModelFileType.FILE_NAME,
       IndexFileType.FILE_NAME, _EXPECTED_REGEX_SEARCH_RESULT),
      (_REGEX_EMBEDDER_MODEL, _REGEX_INDEX, ModelFileType.FILE_CONTENT,
       IndexFileType.FILE_NAME, _EXPECTED_REGEX_SEARCH_RESULT),
      (_REGEX_EMBEDDER_MODEL, _REGEX_INDEX, ModelFileType.FILE_NAME,
       IndexFileType.FILE_CONTENT, _EXPECTED_REGEX_SEARCH_RESULT),
      (_REGEX_EMBEDDER_MODEL, _REGEX_INDEX, ModelFileType.FILE_CONTENT,
       IndexFileType.FILE_CONTENT, _EXPECTED_REGEX_SEARCH_RESULT),
      (_REGEX_SEARCHER_MODEL, None, ModelFileType.FILE_NAME, IndexFileType.NONE,
       _EXPECTED_REGEX_SEARCH_RESULT),
      (_REGEX_SEARCHER_MODEL, None, ModelFileType.FILE_CONTENT,
       IndexFileType.NONE, _EXPECTED_REGEX_SEARCH_RESULT),
      (_BERT_EMBEDDER_MODEL, _BERT_INDEX, ModelFileType.FILE_NAME,
       IndexFileType.FILE_NAME, _EXPECTED_BERT_SEARCH_RESULT),
      (_BERT_EMBEDDER_MODEL, _BERT_INDEX, ModelFileType.FILE_CONTENT,
       IndexFileType.FILE_NAME, _EXPECTED_BERT_SEARCH_RESULT),
      (_BERT_EMBEDDER_MODEL, _BERT_INDEX, ModelFileType.FILE_NAME,
       IndexFileType.FILE_CONTENT, _EXPECTED_BERT_SEARCH_RESULT),
      (_BERT_EMBEDDER_MODEL, _BERT_INDEX, ModelFileType.FILE_CONTENT,
       IndexFileType.FILE_CONTENT, _EXPECTED_BERT_SEARCH_RESULT),
      (_BERT_SEARCHER_MODEL, None, ModelFileType.FILE_NAME, IndexFileType.NONE,
       _EXPECTED_BERT_SEARCH_RESULT),
      (_BERT_SEARCHER_MODEL, None, ModelFileType.FILE_CONTENT,
       IndexFileType.NONE, _EXPECTED_BERT_SEARCH_RESULT),
      (_USE_EMBEDDER_MODEL, _USE_INDEX, ModelFileType.FILE_NAME,
       IndexFileType.FILE_NAME, _EXPECTED_USE_SEARCH_RESULT),
      (_USE_EMBEDDER_MODEL, _USE_INDEX, ModelFileType.FILE_CONTENT,
       IndexFileType.FILE_NAME, _EXPECTED_USE_SEARCH_RESULT),
      (_USE_EMBEDDER_MODEL, _USE_INDEX, ModelFileType.FILE_NAME,
       IndexFileType.FILE_CONTENT, _EXPECTED_USE_SEARCH_RESULT),
      (_USE_EMBEDDER_MODEL, _USE_INDEX, ModelFileType.FILE_CONTENT,
       IndexFileType.FILE_CONTENT, _EXPECTED_USE_SEARCH_RESULT),
      (_USE_SEARCHER_MODEL, None, ModelFileType.FILE_NAME, IndexFileType.NONE,
       _EXPECTED_USE_SEARCH_RESULT),
      (_USE_SEARCHER_MODEL, None, ModelFileType.FILE_CONTENT,
       IndexFileType.NONE, _EXPECTED_USE_SEARCH_RESULT),
  )
  def test_search(self, model_name, index_name, model_file_type,
                  index_file_type, expected_search_result):
    """Searches with each model/index combination and checks the neighbors.

    Args:
      model_name: Test data file name of the embedder or searcher model.
      index_name: Test data file name of the index, or None when
        `index_file_type` is `IndexFileType.NONE`.
      model_file_type: `ModelFileType` selecting whether the model is passed
        by file name or as file content.
      index_file_type: `IndexFileType` selecting whether the index is omitted,
        passed by file name, or passed as file content.
      expected_search_result: `_SearchResult` the search must return.

    Raises:
      ValueError: If `model_file_type` or `index_file_type` is not a known
        enum member.
    """
    # Create BaseOptions.
    model_path = test_util.get_test_data_path(model_name)
    if model_file_type is ModelFileType.FILE_NAME:
      base_options = _BaseOptions(file_name=model_path)
    elif model_file_type is ModelFileType.FILE_CONTENT:
      with open(model_path, 'rb') as f:
        model_content = f.read()
      base_options = _BaseOptions(file_content=model_content)
    else:
      # Should never happen
      raise ValueError('model_file_type is invalid.')

    # Create SearchOptions.
    if index_file_type is IndexFileType.NONE:
      search_options = _SearchOptions()
    else:
      index_path = test_util.get_test_data_path(index_name)
      if index_file_type is IndexFileType.FILE_NAME:
        search_options = _SearchOptions(index_file_name=index_path)
      elif index_file_type is IndexFileType.FILE_CONTENT:
        with open(index_path, 'rb') as f:
          index_content = f.read()
        search_options = _SearchOptions(index_file_content=index_content)
      else:
        # Should never happen
        raise ValueError('index_file_type is invalid.')

    # Create searcher. Keyword arguments mirror the other tests in this class.
    options = _TextSearcherOptions(
        base_options=base_options,
        embedding_options=_EmbeddingOptions(l2_normalize=True, quantize=False),
        search_options=search_options)
    searcher = _TextSearcher.create_from_options(options)

    # Perform text search.
    text_search_result = searcher.search('The weather was excellent.')

    # Comparing results; `assertProtoEquals` takes the expected message first.
    self.assertProtoEquals(expected_search_result.to_pb2(),
                           text_search_result.to_pb2())

    # Get user info and compare values.
    self.assertEqual(searcher.get_user_info(), 'userinfo')

  def test_max_results_option(self):
    """`SearchOptions.max_results` caps the number of returned neighbors."""
    # Create searcher.
    base_options = _BaseOptions(file_name=self.embedder_model_path)
    search_options = _SearchOptions(
        index_file_name=self.index_path, max_results=_MAX_RESULTS)
    options = _TextSearcherOptions(
        base_options=base_options,
        embedding_options=_EmbeddingOptions(l2_normalize=True),
        search_options=search_options)
    searcher = _TextSearcher.create_from_options(options)

    # Perform text search.
    text_search_result = searcher.search('The weather was excellent.')
    nearest_neighbors = text_search_result.nearest_neighbors

    self.assertLessEqual(
        len(nearest_neighbors), _MAX_RESULTS, 'Too many results returned.')


# Run the tests through TensorFlow's test runner when executed as a script.
if __name__ == '__main__':
  tf.test.main()