chromium/tools/git/suggest_owners.py

#!/usr/bin/env python3
# Copyright 2018 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

from __future__ import print_function

import argparse
import subprocess
import pickle
import re
import os
from pathlib import PurePath
from os import path
from datetime import date, timedelta
from collections import namedtuple, defaultdict

# One parsed `git log` entry. `dirs` maps a directory path to an
# (additions, deletions) tuple summed over the changed files that live
# directly in that directory.
Commit = namedtuple('Commit', ['hash', 'author', 'commit_date', 'dirs'])

# Maps directory -> author -> (commit count, additions, deletions), counting
# commits that touched the directory itself or anything below it.
DIRECTORY_AUTHORS = defaultdict(dict)

# Memoisation cache for _GetOwners: directory path -> set of (owner, level).
OWNERS_CACHE = {}

# Name of the pickle file holding DIRECTORY_AUTHORS between runs. The name is
# relative, so the file is read from / written to the working directory.
CACHE_FILENAME = 'suggest_owners.cache'


def _RunGitCommand(options, cmd_args, pipe_output=False):
  """Runs git against the repository at options.repo_path.

  The full command line is echoed to stdout before the command is started.

  Args:
    options: parsed arguments; only `repo_path` is read.
    cmd_args: list of arguments placed after `git --git-dir <repo_path>/.git`.
    pipe_output: if true, start the command and hand back its stdout stream
      immediately rather than waiting for the command to finish.

  Returns:
    The command's output decoded as utf-8, or, when pipe_output is true, the
    text stream attached to the running command's stdout.
  """
  git_dir = path.join(options.repo_path, '.git')
  full_cmd = ['git', '--git-dir', git_dir] + cmd_args
  print('>', ' '.join(full_cmd))
  if pipe_output:
    process = subprocess.Popen(full_cmd,
                               stdout=subprocess.PIPE,
                               encoding='utf-8')
    return process.stdout
  return subprocess.check_output(full_cmd, encoding='utf-8')


def _ValidAuthor(author):
  return author.endswith(
      ('@chromium.org', '@google.com')) and 'roller' not in author


def getEditsForDirectory(commit, directory):
  """Sums a commit's edits inside a directory and all of its descendants.

  Args:
    commit: a Commit whose `dirs` maps directory -> (additions, deletions).
    directory: the directory to total the edits for.

  Returns:
    An (additions, deletions) tuple.
  """
  # Keep only the entries recorded for `directory` itself or a directory
  # below it.
  matching_edits = [
      edits for touched_dir, edits in commit.dirs.items()
      if isSubDirectory(directory, touched_dir)
  ]
  total_additions = sum(added for added, _ in matching_edits)
  total_deletions = sum(removed for _, removed in matching_edits)
  return total_additions, total_deletions


def _PropagateCommit(options, commit):
  """Credits a commit to every directory it touched and to their ancestors.

  For each such directory the author's entry in DIRECTORY_AUTHORS gains one
  commit plus the additions/deletions the commit made at or below that
  directory.

  Args:
    options: unused here.
    commit: the Commit to record.
  """
  # Collect each touched directory together with all of its ancestors.
  affected_dirs = set()
  for touched_dir in commit.dirs:
    current = touched_dir
    # PurePath.parent bottoms out at '.' for relative paths.
    while str(current) != '.':
      affected_dirs.add(str(current))
      current = PurePath(current).parent
  # Fold this commit into the running per-author totals of each directory.
  for affected_dir in affected_dirs:
    author_stats = DIRECTORY_AUTHORS[affected_dir]
    prior_commits, prior_additions, prior_deletions = author_stats.get(
        commit.author, (0, 0, 0))
    added, removed = getEditsForDirectory(commit, affected_dir)
    author_stats[commit.author] = (prior_commits + 1,
                                   prior_additions + added,
                                   prior_deletions + removed)


def isSubDirectory(parent_directory, child_directory):
  """Returns True if child_directory is parent_directory or lies below it.

  The comparison is purely lexical (PurePath); the filesystem is not
  consulted.
  """
  parent = PurePath(parent_directory)
  child = PurePath(child_directory)
  try:
    # relative_to() raises ValueError when `child` is not under `parent`.
    child.relative_to(parent)
  except ValueError:
    return False
  return True


def _GetGitLogCmd(options):
  # TODO(mheikal): git-log with --numstat vs --name-only takes 10x the time to
  # complete. It takes >15 mins for git log --numstat to return the 1 year git
  # history of the full repo. Should probably add a script flag to switch off
  # keeping track of number of modifications per commit.
  date_limit = date.today() - timedelta(days=options.days_ago)
  format_string = "%h,%ae,%cI"
  cmd_args = [
    'log',
    '--since', date_limit.isoformat(),
    '--numstat',
    '--pretty=format:%s'%format_string,
  ]
  # has to be last arg
  if options.subdirectory:
    cmd_args += ['--', options.subdirectory]
  return cmd_args


def _ParseCommitLine(line):
  """Turns one "<hash>,<author>,<commit date>" line into a Commit.

  The returned Commit starts with an empty `dirs` dict. A line that does not
  split into exactly three comma-separated fields raises ValueError.
  """
  fields = line.split(",")
  commit_hash, author, commit_date = fields
  return Commit(commit_hash, author, commit_date, {})


def _ParseFileStatsLine(current_commit, line):
  try:
    additions, deletions, filepath = line.split('\t')
  except ValueError:
    return False
  if additions == '-':
    additions = 0
  else:
    additions = int(additions)
  if deletions == '-':
    deletions = 0
  else:
    deletions = int(deletions)
  if additions == 0 and deletions == 0:
    return True
  dir_path = path.dirname(filepath)
  # For git renames, we count the destination directory
  if '=>' in dir_path:
    dir_path = re.sub(r'\{[^=]* => ([^\}]*)\}', r'\1', dir_path)
    # remove possibly empty path parts.
    dir_path = dir_path.replace('//', '/')
  commit_additions, commit_deletions = \
      current_commit.dirs.get(dir_path, (0,0))
  current_commit.dirs[dir_path] = (
      additions + commit_additions, deletions + commit_deletions)
  return True


def processAllCommits(options):
  """Streams `git log` output and records every commit by a valid author.

  Each commit is a description line followed by its stats lines; a blank line
  ends the commit. Commits whose author passes _ValidAuthor are handed to
  _PropagateCommit.

  Args:
    options: parsed arguments, forwarded to the git helpers.
  """
  if not options.subdirectory and options.days_ago > 100:
    print('git log for your query might take > 5 minutes, limit by a '
          'subdirectory or reduce the number of days of history to low double '
          'digits to make this faster. There is no progress indicator, it is '
          'all waiting for single git log to finish.')
  output_pipe = _RunGitCommand(options,
                               _GetGitLogCmd(options),
                               pipe_output=True)
  current_commit = None
  # Close the pipe as soon as parsing ends (or fails) instead of leaving the
  # descriptor open for the rest of the run.
  with output_pipe:
    for line in output_pipe:
      line = line.rstrip('\n')
      if current_commit is None:
        # Skip stray blank lines rather than trying to parse them as a commit.
        if line:
          current_commit = _ParseCommitLine(line)
        continue
      if line == '':  # all commit details read
        if _ValidAuthor(current_commit.author):
          _PropagateCommit(options, current_commit)
        current_commit = None
        continue
      # Merge commits weird out git-log. If we fail to parse the line, then
      # the last commit was a merge and this line is actually another commit
      # description line.
      if not _ParseFileStatsLine(current_commit, line):
        current_commit = _ParseCommitLine(line)
  # Process the final commit. There is none when git printed nothing or when
  # the output ended on a blank line.
  if current_commit is not None and _ValidAuthor(current_commit.author):
    _PropagateCommit(options, current_commit)
  print('Done parsing commit log.')


def _CountCommits(directory):
  """Returns the number of commits recorded for a directory, over all authors."""
  total = 0
  per_author_stats = DIRECTORY_AUTHORS[directory].values()
  for commit_count, _additions, _deletions in per_author_stats:
    total += commit_count
  return total


def _GetOwnerLevel(options, author, directory):
  """Returns the lowest owner level at which author owns directory.

  Returns -1 when the author is not among the owners that _GetOwners reports
  for the directory.
  """
  author_levels = [
      level for owner, level in _GetOwners(options, directory)
      if owner == author
  ]
  if not author_levels:
    return -1
  return min(author_levels)


# TODO(mheikal): use depot_tools owners.py for parsing owners files.
def _GetOwners(options, directory_path):
  """Returns the owners of a repo subdirectory as a set of (owner, level).

  Walks from directory_path up towards the repository root, reading each
  OWNERS file on the way. `level` is the number of OWNERS files found closer
  to directory_path than the one listing the owner. The walk ends early at a
  file that sets noparent, and it stops before the repository root itself, so
  the top-level OWNERS file is not consulted. Per-file directives are not
  understood. Results are memoised in OWNERS_CACHE.

  Args:
    options: parsed arguments; `repo_path` is the repository root.
    directory_path: directory to look up, relative to the repository root.
  """
  if directory_path in OWNERS_CACHE:
    return OWNERS_CACHE[directory_path]
  owners = set()
  owner_level = 0
  current_dir = directory_path
  while current_dir != '':
    # Always resolve against the repository root. Joining only the relative
    # directory would look ancestors up in the current working directory.
    owners_path = path.join(options.repo_path, current_dir, 'OWNERS')
    if path.isfile(owners_path):
      parsed_owners, noparent = _ParseOwnersFile(options, owners_path)
      owners.update((owner, owner_level) for owner in parsed_owners)
      owner_level += 1
      if noparent:
        break
    current_dir = path.dirname(current_dir)
  # Store a copy so callers mutating the returned set cannot alter the cache.
  OWNERS_CACHE[directory_path] = set(owners)
  return owners


# Parse an OWNERS file, returns set of owners and if the file sets noparent
def _ParseOwnersFile(options, filepath):
  owners = set()
  noparent = False
  with open(filepath) as f:
    for line in f.readlines():
      line = line.strip()
      # The script deals with directories so per-files are ignored.
      if line == '' or line[0] == '#' or line.startswith('per-file'):
        continue
      if line.startswith('file://'):
        relpath = line[7:]
        abspath = path.join(options.repo_path, relpath)
        parsed_owners, _ = _ParseOwnersFile(options, abspath)
        owners.update(parsed_owners)
      if line == 'set noparent':
        noparent = True
      index = line.find('@chromium.org')
      if index > -1:
        owners.add(line[:index + len('@chromium.org')])
  return owners, noparent


# Trivial directories are ones that just contain a single child subdir and
# nothing else.
def _IsTrivialDirectory(options, repo_subdir):
  try:
    return len(os.listdir(path.join(options.repo_path, repo_subdir))) == 1
  except OSError:
    # directory no longer exists
    return False


def computeSuggestions(options):
  """Builds the per-directory list of suggested authors.

  Directories are visited in sorted order. A directory is skipped when it is
  trivial, has fewer than options.dir_commit_limit commits, or lies outside
  options.subdirectory (if one was given).

  Returns:
    A list of (directory, suggestions) tuples, where suggestions is a list of
    (author, (commits, additions, deletions)) ordered by descending commit
    count. Ignored authors and authors below options.author_cl_limit are
    left out.
  """
  results = []
  for directory in sorted(DIRECTORY_AUTHORS):
    authors = DIRECTORY_AUTHORS[directory]
    skip = (_IsTrivialDirectory(options, directory)
            or _CountCommits(directory) < options.dir_commit_limit
            or (options.subdirectory
                and not isSubDirectory(options.subdirectory, directory)))
    if skip:
      continue
    # Most active committers first; ties keep their existing order.
    ranked = sorted(authors.items(),
                    key=lambda entry: entry[1][0],
                    reverse=True)
    kept = []
    for author, stats in ranked:
      if author in options.ignore_authors:
        continue
      if stats[0] < options.author_cl_limit:
        continue
      kept.append((author, stats))
    results.append((directory, kept))
  return results


def _PrintSettings(options):
  print('Showing directories with at least ({}) commits in the last ({}) '
        'days.'.format(options.dir_commit_limit, options.days_ago))
  print('Showing top ({}) committers who have commited at least ({}) commits '
        'to the directory in the last ({}) days.'.format(
            options.max_suggestions, options.author_cl_limit,
            options.days_ago))
  print('(owners+N) represents distance through OWNERS files for said owner\n')


def printSuggestions(options, directory_suggestions):
  """Prints the settings followed by one section per suggested directory.

  Each section lists the suggested authors with their commit, addition and
  deletion counts. Existing owners are tagged with ' (owner+N)'. Listing for
  a directory stops once options.max_suggestions non-owners have been shown.

  Args:
    options: parsed arguments.
    directory_suggestions: list of (directory, suggestions) tuples as built
      by computeSuggestions.
  """
  print('\nCommit stats:')
  _PrintSettings(options)
  for directory, suggestions in directory_suggestions:
    header = '{}: {} commits in the last {} days'.format(
        directory, _CountCommits(directory), options.days_ago)
    print(header)
    shown_non_owners = 0
    for author, stats in suggestions:
      commit_count, additions, deletions = stats
      owner_level = _GetOwnerLevel(options, author, directory)
      is_owner = owner_level > -1
      owner_string = ' (owner+{})'.format(owner_level) if is_owner else ''
      if not is_owner:
        shown_non_owners += 1
      print('{}{}, commits: {}, additions:{}, deletions: {}'.format(
          author, owner_string, commit_count, additions, deletions))
      if shown_non_owners >= options.max_suggestions:
        break
    print()


def _GetHeadCommitHash(options):
  """Returns the output of `git rev-parse HEAD`, stripped of whitespace."""
  raw_output = _RunGitCommand(options, ['rev-parse', 'HEAD'])
  return raw_output.strip()


def _GetCacheMetadata(options):
  """Returns the (HEAD hash, days_ago, subdirectory) tuple stored with the cache."""
  head_hash = _GetHeadCommitHash(options)
  return (head_hash, options.days_ago, options.subdirectory)


def _IsCacheValid(options, metadata):
  """Decides whether cached data answers the current query.

  The cache is valid when it was built at the current HEAD, covers the same
  number of days, and its subdirectory (if not None) is the requested
  subdirectory or one of its ancestors.

  Args:
    options: parsed arguments for the current run.
    metadata: the (head_hash, days_ago, subdirectory) tuple stored with the
      cache.
  """
  head_hash, days_ago, cached_subdirectory = metadata
  same_head = head_hash == _GetHeadCommitHash(options)
  if not same_head or days_ago != options.days_ago:
    return False
  if cached_subdirectory is None:
    return True
  return isSubDirectory(cached_subdirectory, options.subdirectory)


def cacheProcessedCommits(options):
  """Pickles the cache metadata and DIRECTORY_AUTHORS into CACHE_FILENAME."""
  payload = (_GetCacheMetadata(options), DIRECTORY_AUTHORS)
  with open(CACHE_FILENAME, 'wb') as cache_file:
    pickle.dump(payload, cache_file)


def maybeRestoreProcessedCommits(options):
  """Loads DIRECTORY_AUTHORS from the cache file if it matches this query.

  A cache file that cannot be unpickled, or whose contents do not have the
  expected shape, is treated like a stale cache instead of aborting the run.

  Args:
    options: parsed arguments for the current run.

  Returns:
    True if DIRECTORY_AUTHORS was replaced with the cached data, False if
    there is no usable cache and `git log` has to be run.
  """
  global DIRECTORY_AUTHORS
  if not path.exists(CACHE_FILENAME):
    return False
  # NOTE: unpickling can run arbitrary code. This is only acceptable because
  # the file is written by this script; never point it at an untrusted file.
  try:
    with open(CACHE_FILENAME, 'rb') as f:
      stored_metadata, cached_directory_authors = pickle.load(f)
  except (pickle.UnpicklingError, EOFError, AttributeError, ImportError,
          IndexError, TypeError, ValueError):
    # Truncated, corrupted or written in a different format.
    stored_metadata = cached_directory_authors = None
  # _IsCacheValid unpacks exactly three fields; check the shape first so a
  # malformed entry reads as "invalid" rather than raising.
  has_expected_shape = (isinstance(stored_metadata, tuple)
                        and len(stored_metadata) == 3)
  if has_expected_shape and _IsCacheValid(options, stored_metadata):
    print('Loading from cache')
    DIRECTORY_AUTHORS = cached_directory_authors
    return True
  print('Cache is stale or invalid, must rerun `git log`')
  return False

def do(options):
  """Gathers commit stats (from cache when possible) and prints suggestions.

  The git history is parsed and re-cached when options.skip_cache is set or
  when no valid cache could be restored.
  """
  restored = (not options.skip_cache) and maybeRestoreProcessedCommits(options)
  if not restored:
    processAllCommits(options)
    cacheProcessedCommits(options)
  printSuggestions(options, computeSuggestions(options))


def main():
  """Parses the command line and runs the suggestion pipeline.

  --ignore-authors is converted from a comma separated string into a set of
  stripped names (an empty set when the flag is absent or empty) before the
  options are handed to do().
  """
  parser = argparse.ArgumentParser(
      formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  add_arg = parser.add_argument
  add_arg('repo_path')
  add_arg('--days-ago', metavar='DAYS_AGO', default=365, type=int,
          help='Number of days of history to search through.')
  add_arg('--subdirectory', default='',
          help='Limit suggestions to this subdirectory')
  add_arg('--ignore-authors',
          help='Ignore this comma separated list of authors')
  add_arg('--max-suggestions', default=5, type=int,
          help='Maximum number of '
          'suggested authors per directory.')
  add_arg('--author-cl-limit', default=10, type=int,
          help='Do not suggest '
          'authors who have commited less than this to the '
          'directory in the last DAYS_AGO days.')
  add_arg('--dir-commit-limit', default=100, type=int,
          help='Skip directories '
          'with less than this number of commits in the last '
          'DAYS_AGO days.')
  add_arg('--skip-cache', default=False, action='store_true',
          help='Do not read from cache.')
  options = parser.parse_args()
  raw_ignored = options.ignore_authors
  if raw_ignored:
    options.ignore_authors = {name.strip() for name in raw_ignored.split(',')}
  else:
    options.ignore_authors = set()
  do(options)


# Run the tool only when this file is executed as a script, not on import.
if __name__ == '__main__':
  main()