chromium/tools/binary_size/find_large_commits.py

#!/usr/bin/env python3
# Copyright 2019 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Prints the large commits given a .csv file from a telemetry size graph."""

import argparse
import re
import subprocess


# Commit ranges where the perf bot was producing invalid results.
# range objects implement __contains__, so "rev in range" checks are fast.
_BAD_COMMIT_RANGES = [
    range(1045024, 1045552),  # https://crbug.com/1361952
]
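# Illustrative membership check (mirrors the filter in main()):
#   any(1045100 in r for r in _BAD_COMMIT_RANGES)  # -> True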


def _ReadCsv(path):
  """Returns the contents of the .csv as a list of (int, int)."""
  ret = []
  with open(path) as f:
    for line in f:
      parts = line.rstrip().split(',')
      if len(parts) == 2 and parts[0] != 'revision':
        ret.append((int(parts[0]), int(float(parts[1]))))
  return ret
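
# Example (illustrative data): given a CSV containing
#   revision,value
#   1045000,104857600.0
#   1045001,104892416.0
# _ReadCsv() returns [(1045000, 104857600), (1045001, 104892416)].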


def _FindBigDeltas(revs_and_sizes, increase_threshold, decrease_threshold):
  """Filters revs_and_sizes for entries that grow/shrink too much."""
  big_jumps = []
  prev_rev, prev_size = revs_and_sizes[0]
  for rev, size in revs_and_sizes:
    delta = size - prev_size
    if delta > increase_threshold or -delta > decrease_threshold:
      big_jumps.append((rev, delta, prev_rev))
    prev_rev = rev
    prev_size = size
  return big_jumps
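
# Example (illustrative numbers), with both thresholds at 30 * 1024:
#   _FindBigDeltas([(100, 0), (101, 40960), (103, 40960)], 30720, 30720)
#   -> [(101, 40960, 100)]
# Only the 100 -> 101 step changes by more than a threshold.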


def _LookupCommitInfo(rev):
  sha1 = subprocess.check_output(
      ['git', 'crrev-parse', str(rev)], encoding='utf-8').strip()
  if not sha1:
    raise Exception(f'git crrev-parse for {rev} failed. Probably need to '
                    f'"git fetch origin main"')
  desc = subprocess.check_output(['git', 'log', '-n1', sha1], encoding='utf-8')
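  # Parse the author email, date, and subject line out of the human-readable
  # `git log` output.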
  author = re.search(r'Author: .*?<(.*?)>', desc).group(1)
  day, year = re.search(r'Date:\s+\w+\s+(\w+ \d+)\s+.*?\s+(\d+)', desc).groups()
  date = '{} {}'.format(day, year)
  title = re.search(r'\n +(\S.*)', desc).group(1).replace('\t', ' ')
  return sha1, author, date, title
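
# Example (illustrative) of the `git log -n1` header _LookupCommitInfo parses:
#   commit 3b7a5ea...
#   Author: Jane Doe <jane@chromium.org>
#   Date:   Mon Sep 21 14:00:00 2020 -0700
#
#       Title of the commit.
# Parsed as: author='jane@chromium.org', date='Sep 21 2020',
# title='Title of the commit.'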


def main():
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--increase-threshold',
      type=int,
      default=30 * 1024,
      help='Minimum size increase in bytes to be considered notable.')
  parser.add_argument(
      '--decrease-threshold',
      type=int,
      default=30 * 1024,
      help='Minimum size decrease in bytes to be considered notable.')
  parser.add_argument(
      'points_csv', help='Input .csv file with columns: revision,value')
  options = parser.parse_args()

  revs_and_sizes = _ReadCsv(options.points_csv)
  big_deltas = _FindBigDeltas(revs_and_sizes, options.increase_threshold,
                              options.decrease_threshold)

  print('Printing info for up to {} commits in the range {}-{}'.format(
      len(big_deltas), revs_and_sizes[0][0], revs_and_sizes[-1][0]))
  print('\t'.join(['Revision', 'Hash', 'Title', 'Author', 'Delta', 'Date']))
  num_bad_commits = 0
  for rev, delta, prev_rev in big_deltas:
    if any(rev in r for r in _BAD_COMMIT_RANGES):
      num_bad_commits += 1
      continue
    sha1, author, date, title = _LookupCommitInfo(rev)
    rev_str = str(rev)
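    # The graph may not have a data point for every revision; when the gap to
    # the previous point is more than one revision, report the whole range.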
    if rev - prev_rev > 1:
      rev_str = f'{prev_rev}..{rev}'
    print('\t'.join([rev_str, sha1, title, author, str(delta), date]))

  if num_bad_commits:
    print(f'Ignored {num_bad_commits} commits from bad ranges')


if __name__ == '__main__':
  main()