chromium/tools/android/modularization/loc/modularization_loc_stat.py

#!/usr/bin/env python3
# Copyright 2020 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Generates stats on modularization efforts. Stats include:
- Percentage of added lines in modularized files over legacy ones.
- The top 50 contributors to the modularized files.
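
Example invocations (dates and output file name are illustrative):
  modularization_loc_stat.py --past-days 30
  modularization_loc_stat.py --date 2021-01-01 2021-03-31 --json -o stats.json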
"""

import argparse
import datetime
import json
import os
import subprocess
import sys
from collections import OrderedDict
from collections import defaultdict
from typing import List, Tuple

# Output json keys
KEY_LOC_MODULARIZED = 'loc_modularized'
KEY_LOC_LEGACY = 'loc_legacy'
KEY_RANKINGS_MODULARIZED = 'rankings'
KEY_RANKINGS_LEGACY = 'rankings_legacy'
KEY_START_DATE = 'start_date'
KEY_END_DATE = 'end_date'
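
# Illustrative shape of the JSON output produced with --json (all values
# below are made up):
#   {
#     "loc_modularized": 1234,
#     "loc_legacy": 567,
#     "rankings": {"author1": 800, "author2": 434},
#     "rankings_legacy": {"author3": 567},
#     "start_date": "2021-01-01",
#     "end_date": "2021-03-31"
#   }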

_M12N_DIRS = [
    'chrome/browser',
    'components',
]

_LEGACY_DIR = 'chrome/android'


def GenerateLOCStats(start_date,
                     end_date,
                     *,
                     quiet=False,
                     json_format=False,
                     git_dir=None):
  """Generate modulazation LOC stats.

  Args:
    start_date: The date (YYYY-MM-DD) to analyze the stats from.
    end_date: The date (YYYY-MM-DD) to analyze the stats to.
    quiet: If True, no progress messages are printed during processing.
    json_format: If True, the output is in JSON format. Otherwise
        a plain, human-readable table is generated.
    git_dir: Git repo directory to use for stats. If None, the current directory
        is used.

  Returns:
    A string containing the stats in the requested format.
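
  Example (illustrative arguments):
    stats_json = GenerateLOCStats('2021-01-01', '2021-03-31',
                                  quiet=True, json_format=True)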
  """

  #  Each CL is output in the following format:
  #
  #   #:thanhdng:2020-08-17:Use vector icons for zero state file results
  #
  #  118     98      chromeos/ui/base/file_icon_util.cc
  #  2       1       chromeos/ui/base/file_icon_util.h
  #  0       20      chromeos/ui/base/file_icon_util_unittest.cc
  #
  #  i.e.:
  #
  #   #:author:commit-date:subject
  #
  #  added-lines    deleted-lines  file-path1
  #  added-lines    deleted-lines  file-path2
  #  ...
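  #
  #  For reference, roughly the same data can be inspected manually by running
  #  (the dates below are placeholders for start_date/end_date):
  #
  #    git log --numstat --no-renames --format='#:%al:%cs:%s' \
  #        --after=2021-01-01 --before=2021-03-31 chrome components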
  repo_dir = git_dir or os.getcwd()
  command = [
      'git', '-C', repo_dir, 'log', '--numstat', '--no-renames',
      '--format=#:%al:%cs:%s', '--after=' + start_date, '--before=' + end_date,
      'chrome', 'components'
  ]
  try:
    proc = subprocess.Popen(
        command,
        bufsize=1,  # line-buffered (text mode)
        stdout=subprocess.PIPE,
        universal_newlines=True)
  except (OSError, subprocess.SubprocessError) as e:
    print(f'{command} failed to start: {e}', file=sys.stderr)
    raise

  author_stat_m12n = defaultdict(int)
  author_stat_legacy = defaultdict(int)
  total_m12n = 0
  total_legacy = 0
  prev_msg_len = 0
  revert_cl = False
  author = commit_date = ''  # Set by the first commit summary line.
  for raw_line in proc.stdout:
    if raw_line.isspace():
      continue
    line = raw_line.strip()
    if line.startswith('#'):  # patch summary line
      _, author, commit_date, *subject = line.split(':', 4)
      revert_cl = (subject[0].startswith('Revert')
                   or subject[0].startswith('Reland'))
    else:
      if revert_cl or not line.endswith('.java'):
        continue

      # Do not take into account the number of deleted lines, which can
      # turn the overall changes to negative. If a class was renamed,
      # for instance, what's deleted is added somewhere else, so counting
      # only for addition works. Other kinds of deletion will be ignored.
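      # For example, with --no-renames a pure rename of a 120-line file shows
      # up as two numstat lines, roughly (hypothetical paths):
      #   120  0    components/foo/NewName.java
      #   0    120  chrome/android/foo/OldName.java
      # Only the 120 added lines are counted, attributed to the new location.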
      added, _deleted, path = line.split()
      diff = int(added)
      if _is_m12n_path(path):
        total_m12n += diff
        author_stat_m12n[author] += diff
      elif _is_legacy_path(path):
        total_legacy += diff
        author_stat_legacy[author] += diff

    msg = f'\rProcessing {commit_date} by {author}'
    if not quiet: _print_progress(msg, prev_msg_len)
    prev_msg_len = len(msg)

  if not quiet:
    _print_progress('Processing complete', prev_msg_len)
    print('\n')

  rankings_modularized = OrderedDict(
      sorted(author_stat_m12n.items(), key=lambda x: x[1], reverse=True))
  rankings_legacy = OrderedDict(
      sorted(author_stat_legacy.items(), key=lambda x: x[1], reverse=True))

  if json_format:
    return json.dumps({
        KEY_LOC_MODULARIZED: total_m12n,
        KEY_LOC_LEGACY: total_legacy,
        KEY_RANKINGS_MODULARIZED: rankings_modularized,
        KEY_RANKINGS_LEGACY: rankings_legacy,
        KEY_START_DATE: start_date,
        KEY_END_DATE: end_date,
    })
  else:
    output = []
    total = total_m12n + total_legacy
    percentage = 100.0 * total_m12n / total if total > 0 else 0
    output.append(f'# of lines added in modularized files: {total_m12n}')
    output.append(f'# of lines added in non-modularized files: {total_legacy}')
    output.append(f'% of lines landing in modularized files: {percentage:2.2f}')

    # Shows the top 50 contributors in each category.
    output.extend(
        _print_ranking(rankings_modularized, total_m12n,
                       'modules and components'))
    output.extend(
        _print_ranking(rankings_legacy, total_legacy, 'legacy and glue'))

    return '\n'.join(output)


def _print_ranking(rankings: OrderedDict, total: int, label: str) -> List[str]:
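  """Formats per-author added-line counts into a plain-text ranking table.

  Returns the table as a list of output lines. Illustrative output (all
  values are made up):

    Top contributors (modules and components):
    No  lines    %    author
     1  12345  42.1  author1
     2   8000  27.3  author2
  """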
  if not rankings:
    return []

  output = []
  output.append(f'\nTop contributors ({label}):')
  output.append('No  lines    %    author')
  for rank, author in enumerate(list(rankings.keys())[:50], 1):
    lines = rankings[author]
    if lines == 0:
      break
    ratio = 100 * lines / total
    output.append(f'{rank:2d} {lines:6d} {ratio:5.1f}  {author}')
  return output


def _is_m12n_path(path):
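  """Returns True if path is under one of the modularized directories."""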
  for prefix in _M12N_DIRS:
    if path.startswith(prefix):
      return True
  return False


def _is_legacy_path(path):
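  """Returns True if path is under the legacy directory (chrome/android)."""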
  return path.startswith(_LEGACY_DIR)


def _print_progress(msg, prev_msg_len):
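  """Prints msg in place, overwriting the previous progress line."""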
  msg_len = len(msg)

  # Add spaces to remove the previous progress output completely.
  if msg_len < prev_msg_len:
    msg += ' ' * (prev_msg_len - msg_len)
  print(msg, end='\r')


def GetDateRange(*, past_days: int) -> Tuple[str, str]:
  """Returns start and end date for a period of past days.

  Use the results as start_date and end_date of GenerateLOCStats.
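
  Example (with an illustrative 'today' of 2021-03-15):
    GetDateRange(past_days=7) == ('2021-03-08', '2021-03-15')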
  """
  today = datetime.date.today()
  past = today - datetime.timedelta(days=past_days)
  return (past.isoformat(), today.isoformat())


if __name__ == "__main__":
  parser = argparse.ArgumentParser(
      description="Generates LOC stats for modularization effort.")

  date_group = parser.add_mutually_exclusive_group(required=True)
  date_group.add_argument('--date',
                          type=str,
                          metavar=('<date-from>', '<date-to>'),
                          nargs=2,
                          help='Date range to analyze, given as two '
                          'YYYY-MM-DD dates: <date-from> <date-to>.')
  date_group.add_argument('--past-days',
                          type=int,
                          help='The number of days to look back for stats. '
                          '0 for today only.')
  parser.add_argument('-q',
                      '--quiet',
                      action='store_true',
                      help='Do not output any message while processing')
  parser.add_argument('-j',
                      '--json',
                      action='store_true',
                      help='Output result in json format. '
                      'If not specified, output in more human-readable table.')
  parser.add_argument('-o',
                      '--output',
                      type=str,
                      help='File to write the result to in json format. '
                      'If not specified, outputs to console.')
  parser.add_argument('--git-dir',
                      type=str,
                      help='Root directory of the git repo to look into. '
                      'If not specified, use the current directory.')
  args = parser.parse_args()
  if args.past_days is not None and args.past_days < 0:
    parser.error('--past-days must be non-negative.')

  if args.date:
    start_date, end_date = args.date
  else:
    start_date, end_date = GetDateRange(past_days=args.past_days)

  result = GenerateLOCStats(start_date,
                            end_date,
                            quiet=args.quiet,
                            json_format=args.json,
                            git_dir=args.git_dir)
  if args.output:
    with open(args.output, 'w') as f:
      f.write(result)
  else:
    print(result)