bisect_failures.py | Explore in Territory

#!/usr/bin/env python3
# Copyright 2024 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Bisects a failure in the captured sites framework to find culprit CL.

  First, this script will retrieve test results from the given build number and
  the prior build number.

  It will then find sites that changed from passing to failing across those
  builds, and pick a site to use for bisection.

  Finally, it will then launch a local git bisect, using the 'Release' version
  (to match how the bots operate) and running tests in the background.'

  In order to run the bisect, no pending git changes can exist in the local
  checkout, and checkout_chromium_autofill_test_dependencies should be set to
  true in your .gclient file.

  This script requires Read permissions of ResultDB and Buildbucket RPCs of
  internal builders, so it is intended only for Googlers at this time.

  Common tasks:
  Bisect an autofill bot failure:
    1) First, find the first failing build you are interested in for the
       linux-autofill-captured-sites-rel bot.
    2) Note the build number {build_num}. The script will use the first site
       which failed in that run and not the previous to bisect with.
    3) Run:
      `tools/captured_sites/bisect_failures.py autofill {build_num}`

  Bisect an autofill bot failure using a specific site. Note: script does not
  verify that given site overrides are valid recorded site names.
    1) First, find the first failing build you are interested in for the
       linux-autofill-captured-sites-rel bot.
    2) Note the build number {build_num} and site {site_name}.
    3) Run:
      `tools/captured_sites/bisect_failures.py autofill {build_num}
        --site_name {site_name}`

  Bisect an password bot failure:
    1) First, find the first failing build you are interested in for the
       linux-password-manager-captured-sites-rel bot.
    2) Note the build number {build_num}.
    3) Run:
      `tools/captured_sites/bisect_failures.py password {build_num}`
"""

import os
import sys
import argparse

import captured_sites_commands

_TOOLS_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..')


def _JoinPath(*path_parts):
  return os.path.abspath(os.path.join(*path_parts))


def _InsertPath(path):
  assert os.path.isdir(path), 'Not a valid path: %s' % path
  if path not in sys.path:
    # Some call sites that use Telemetry assume that sys.path[0] is the
    # directory containing the script, so we add these extra paths to right
    # after sys.path[0].
    sys.path.insert(1, path)


def _AddDirToPythonPath(*path_parts):
  path = _JoinPath(*path_parts)
  _InsertPath(path)


_AddDirToPythonPath(_TOOLS_DIR, 'bisect')
import bisect_gtests

_AddDirToPythonPath(_TOOLS_DIR, 'perf')
from core.services import buildbucket_service
from core.services import resultdb_service


def _FindSiteRegressions(bad_buildbucket_id, good_buildbucket_id):
  """Retrieve failed sites from 2 builds, and return the new failures.

    Args:
        bad_buildbucket_id: A build with new failures.
        good_buildbucket_id: A baseline build.

    Returns:
        The sites which fail in the bad build but not in the good build.
    """
  bad_failed_sites = _GetTerminalSiteFailures(bad_buildbucket_id)
  good_failed_sites = _GetTerminalSiteFailures(good_buildbucket_id)
  site_regressions = [k for k in bad_failed_sites if k not in good_failed_sites]
  return site_regressions


# These are convenience shorthand names for command line usage.
_BOT_SHORT_LONG_MAPPING = {
    'autofill': 'linux-autofill-captured-sites-rel',
    'password': 'linux-password-manager-captured-sites-rel',
}


def _GetBotName(short_name):
  """Maps convenient short names to full bot names.

    Args:
        short_name: Either "autofill" or "password"

    Returns:
        The full bot name to be used in build retrievals.

    Raises:
        ValueError: If an invalid short_name is given.
    """
  if short_name not in _BOT_SHORT_LONG_MAPPING:
    raise ValueError(f'Unrecognized short bot name: "{short_name}".'
                     ' Only "autofill" or "password" is known.')
  return _BOT_SHORT_LONG_MAPPING[short_name]


def _GetTerminalSiteFailures(buildbucket_id):
  """Retrieves non-passing (CRASH/TIMEOUT/FAIL) test failures from a build.

    Args:
        buildbucket_id: A build with which to interrogate failures.

    Returns:
        The sites which do not pass in the build.
    """
  all_results = resultdb_service.GetQueryTestResult(buildbucket_id)
  if 'testResults' not in all_results:
    return set()

  site_statuses = {}
  for result in all_results['testResults']:
    test_id = result['testId']
    siteName = test_id[test_id.rfind('All.') + 4:]
    site_statuses[siteName] = result['status']
  site_final_failures = [k for k, v in site_statuses.items() if v != 'PASS']
  return set(site_final_failures)


def _ParseCommandLine(args):
  """Parses command line options from given args.

    Args:
        args: argument which to parse.

    Returns:
        An object containing parsed argument values.

    Raises:
        SystemExit: if invalid or unrecognized arguments were given.
    """
  parser = argparse.ArgumentParser(
      formatter_class=argparse.RawTextHelpFormatter)
  parser.usage = __doc__
  parser.add_argument(
      'bot_name',
      choices=list(_BOT_SHORT_LONG_MAPPING.keys()),
      help='Choose which linux captured sites builder flavor to bisect.')
  parser.add_argument('build_number',
                      type=int,
                      help='The failing build number to bisect')
  parser.add_argument(
      '-s',
      '--site_name',
      type=str,
      help=(
          'An explicit site to use for bisecting (versus picking a failing one'
          ' from build result).'))
  parser.add_argument(
      '-p',
      '--print_only',
      action='store_true',
      help='Only print the commands that would be used to bisect.')
  options = parser.parse_args(args)
  return options


def _RetrieveBuildbucketInfo(builder_name, number):
  """From a given builder name and number, retrieve relevant hash and id.

    Args:
        builder_name: The full builder name to query.
        number: The number of the builder to gather infro from.

    Returns:
        A tuple of the builds (hash, id).

    Raises:
        BuildRequestError: if improper permissions or query format is given.
  """
  build = buildbucket_service.GetBuild('chrome', 'ci', builder_name, number)
  build_bucket_hash = build['input']['gitilesCommit']['id']
  build_bucket_id = build['id']
  return build_bucket_hash, build_bucket_id


def _TranslateSiteName(site_name):
  """Translates complex site_name into array of necessary parts depending on
       if it is a password or autofill type name.

    Args:
        site_name: For autofill, basic site_name, but for Password, a site_name
                   with password scenario prefix.
    Returns:
        A list containing the parsed pieces describing the scenario & site_name.
  """
  password_prefixes = [
      'sign_up_fill', 'sign_up_pass', 'sign_in_pass', 'capture_update_pass'
  ]
  for password_prefix in password_prefixes:
    if site_name.startswith(password_prefix + '_'):
      return [password_prefix, site_name[len(password_prefix + '_'):]]
  return [site_name]


def DoBisect(bad_hash, good_hash, site_name, print_only=False):
  """Takes the comparison hashes and a site_name and peforms a local bisect
     to find the culprit CL which first causes the test for this site to fail.

    Args:
        bad_hash: The hash of a failing build.
        good_hash: The hash of the last known prior good build.
        site_name: A list of distinguishing site parts. For autofill, simply
                   the [site_name], but for password, this would be
                   [scenario_dir, site_name]. Consider _TranslateSiteName to
                   build this properly.
        print_only: A boolean that if True, only print the pieces that would go
                    into a potential bisect process, instead of actually
                     initiating one.
  """
  site_name = _TranslateSiteName(site_name)
  print('Translated to:', site_name)

  build_command = captured_sites_commands.initiate_command('build')
  build_command.build(['-r'])
  build_command_text = build_command.print()

  run_command = captured_sites_commands.initiate_command('run')
  run_command.build(['-r', '-b'] + site_name)
  run_command_text = run_command.print()

  print(f'Will bisect from {good_hash} to {bad_hash}.')
  print(f'Will build using:\n{build_command_text}')
  print(f'Will run using:\n{run_command_text}')
  if print_only:
    print('print_only is set, exiting before bisect begins.')
    return
  bisect_gtests.StartBisect(good_hash, bad_hash, build_command_text,
                            run_command_text)


def GetBuildInfo(bot_name, build_number, site_name=None):
  """Given a bot_name and build_number, returns the info necessary to perform a
     bisect of any novel changes between that build and the previous one.

    Args:
        bot_name: An acceptable short bot_name. See _GetBotName for more detail.
        build_number: The 'bad' build number from which to find novel failures.
        site_name: An optional site_name to force usage of. This likely will
                   want to be from new site failures in the given build, but is
                   not enforced so.
    Returns:
        bad_hash: Buildbucket hash from the given 'bad' build number.
        good_hash: Buildbucket hash from the prior 'good' build.
        site_name: Either the given optional override, or a single novel failure
                   that occurred between the given build_number and the prior.
  """
  full_bot_name = _GetBotName(bot_name)
  first_bad_number = build_number
  last_good_number = build_number - 1

  try:
    bad_hash, bad_id = _RetrieveBuildbucketInfo(full_bot_name, first_bad_number)
    good_hash, good_id = _RetrieveBuildbucketInfo(full_bot_name,
                                                  last_good_number)
  except Exception as e:
    print('Unable to retrieve build bucket info for builds to compare:', e)
    return None, None, None

  try:
    possible_site_regressions = _FindSiteRegressions(bad_id, good_id)
  except Exception as e:
    print('Unable to retrieve site failures from given buildbucket ids:', e)
    return None, None, None

  if not possible_site_regressions:
    print(
        f'Compared build numbers {first_bad_number} and {last_good_number}, but'
        ' found no clear site regressions. Make sure to add the first builder'
        ' that failed as the input. Also note this script cannot handle'
        ' bisecting infra failures.')
  else:
    print('All Site Regressions (%d):%s' %
          (len(possible_site_regressions), ' '.join(possible_site_regressions)))

  if site_name:
    if site_name not in possible_site_regressions:
      print(f'WARNING: given site "{site_name}" did not show as possbile site'
            ' regression but was given as override choice.')
  elif possible_site_regressions:
    site_name = possible_site_regressions[0]

  print(f'Choosing Site:"{site_name}".')
  return bad_hash, good_hash, site_name


def main():
  options = _ParseCommandLine(sys.argv[1:])
  bad_hash, good_hash, site_name = GetBuildInfo(options.bot_name,
                                                options.build_number,
                                                options.site_name)
  if not bad_hash or not good_hash or not site_name:
    print(f'Unable to gather enough info for a bisect, as do not have a'
          f' good hash:"{good_hash}",'
          f' bad hash:"{bad_hash}",'
          f' and/or site name:"{site_name}"".')
    return 1
  DoBisect(bad_hash, good_hash, site_name, options.print_only)
  return 0


if __name__ == '__main__':
  sys.exit(main())
chromium/tools/captured_sites/bisect_failures.py