chromium/tools/metrics/histograms/generate_expired_histograms_array.py

#!/usr/bin/env python
# Copyright 2017 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import argparse
import datetime
import hashlib
import logging
import os
import re
import sys

import extract_histograms
import merge_xml
import histogram_paths

# Captures the value of the MAJOR_BRANCH_DATE=<value> entry from the content
# of the branch date file.
_DATE_FILE_RE = re.compile(r".*MAJOR_BRANCH_DATE=(.+).*")
# Captures the two- or three-digit milestone number from a "MAJOR=<n>" line.
_CURRENT_MILESTONE_RE = re.compile(r"MAJOR=([0-9]{2,3})\n")
# Matches an expiry that starts with a milestone (e.g. "M85") and captures the
# milestone number.
_MILESTONE_EXPIRY_RE = re.compile(r"\AM([0-9]{2,3})")

# Script name reported in the "Generated from" banner of the output header.
_SCRIPT_NAME = "generate_expired_histograms_array.py"
# C++ element type of the generated array of hashes.
_HASH_DATATYPE = "uint32_t"
# Template of the generated C++ header. It is filled in with str.format(), so
# literal braces are doubled.
_HEADER = """// Generated from {script_name}. Do not edit!

#ifndef {include_guard}
#define {include_guard}

#include <stdint.h>

namespace {namespace} {{

// Contains hashes of expired histograms.
const {hash_datatype} kExpiredHistogramsHashes[] = {{
{hashes}
}};

}}  // namespace {namespace}

#endif  // {include_guard}
"""

# Error message used when an expiry is neither a milestone nor a parseable
# date.
_DATE_FORMAT_ERROR = "Unable to parse expiry {date} in histogram {name}."

# Some extra "grace" time is given to expired histograms during which they
# will continue to be collected and reported.  The dashboard should ignore
# data from this period making the expiry noticeable and giving time for
# owners to re-enable them without any discontinuity of data. Releases are
# generally 6 weeks apart but sometimes 7 so +2 to be safe.
_EXPIRE_GRACE_MSTONES = 2
_EXPIRE_GRACE_WEEKS = _EXPIRE_GRACE_MSTONES * 6 + 2


class Error(Exception):
  """Raised by this script when its inputs cannot be matched or parsed."""


def _GetExpiredHistograms(histograms, base_date, current_milestone):
  """Collects the names of histograms whose expiry has passed.

  Histograms marked "obsolete", histograms without an "expires_after" entry
  and histograms that expire "never" are skipped. An expiry is first
  interpreted as a milestone (e.g. "M85") and, if it does not look like one,
  as a date.

  Args:
    histograms(Dict[str, Dict]): Histogram descriptions in the form
      {name: content}.
    base_date(datetime.date): Histograms with an expiry date strictly earlier
      than this date are expired.
    current_milestone(int): Histograms with an expiry milestone strictly lower
      than this milestone are expired.

  Returns:
    List of strings with names of expired histograms.

  Raises:
    Error if an expiry is neither a milestone nor a date in the expected
    format.
  """
  expired = []
  for histogram_name, description in histograms.items():
    if "expires_after" not in description or "obsolete" in description:
      continue
    expiry = description["expires_after"]
    if expiry == "never":
      continue

    milestone_match = _MILESTONE_EXPIRY_RE.search(expiry)
    if milestone_match is not None:
      # The expiry is given in Chrome milestone format.
      is_expired = int(milestone_match.group(1)) < current_milestone
    else:
      # Not a milestone, so the expiry has to be a date.
      try:
        parsed_expiry = datetime.datetime.strptime(
            expiry, extract_histograms.EXPIRY_DATE_PATTERN)
      except ValueError:
        raise Error(_DATE_FORMAT_ERROR.
                    format(date=expiry, name=histogram_name))
      is_expired = parsed_expiry.date() < base_date

    if is_expired:
      expired.append(histogram_name)
  return expired


def _FindMatch(content, regex, group_num):
  match_result = regex.search(content)
  if not match_result:
    raise Error("Unable to match {pattern} with provided content: {content}".
                format(pattern=regex.pattern, content=content))
  return match_result.group(group_num)


def _GetBaseDate(content, regex):
  """Extracts the base date that expiry dates are compared with.

  Args:
   content: A string with the base date.
   regex: A regular expression object whose first group captures the base
     date.

  Returns:
   The base date as a datetime.date object, or None if the captured group is
   empty.

  Raises:
    Error if |content| doesn't match |regex| or the matched date has invalid
    format.
  """
  date_text = _FindMatch(content, regex, 1)
  if not date_text:
    return None
  try:
    parsed = datetime.datetime.strptime(
        date_text, extract_histograms.EXPIRY_DATE_PATTERN)
  except ValueError:
    raise Error("Unable to parse base date {date} from {content}.".
                format(date=date_text, content=content))
  return parsed.date()


def _GetCurrentMilestone(content, regex):
  """Reads the current milestone number out of |content|.

  Args:
   content: A string with the version information.
   regex: A regular expression object whose first group captures the
     milestone.

  Returns:
   The milestone as an int.

  Raises:
    Error if |content| doesn't match |regex|.
  """
  milestone_text = _FindMatch(content, regex, 1)
  return int(milestone_text)


def _HashName(name):
  """Returns hash for the given histogram |name|."""
  # This corresponds to HashMetricNameAs32Bits() in C++
  return "0x" + hashlib.md5(name.encode()).hexdigest()[:8]


def _GetHashToNameMap(histograms_names):
  """Builds a dictionary {hash: histogram_name} for |histograms_names|.

  If several names produce the same hash, the last one is kept.
  """
  return {_HashName(name): name for name in histograms_names}


def _GenerateHeaderFileContent(header_filename, namespace,
                               histograms_map):
  """Generates header file content.

  The caller's |histograms_map| is never modified: when it is empty, a
  placeholder entry is emitted from a local dictionary instead.

  Args:
    header_filename: A filename of the generated header file. It is
      upper-cased and every character other than A-Z is replaced with "_" to
      build the include guard.
    namespace: A namespace to contain generated array.
    histograms_map(Dict[str, str]): A dictionary {hash: histogram_name}.

  Returns:
    String with the generated content; hashes are listed in sorted order.
  """
  include_guard = re.sub("[^A-Z]", "_", header_filename.upper()) + "_"
  entries = histograms_map
  if not entries:
    # Some platforms don't allow creating empty arrays, so emit a dummy entry.
    # A fresh dictionary is used so the argument is left untouched.
    entries = {"0x00000000": "Dummy.Histogram"}
  hashes = "\n".join(
      "  {hash},  // {name}".format(hash=value, name=entries[value])
      for value in sorted(entries))
  return _HEADER.format(script_name=_SCRIPT_NAME,
                        include_guard=include_guard,
                        namespace=namespace,
                        hash_datatype=_HASH_DATATYPE,
                        hashes=hashes)


def _GenerateFileContent(descriptions, branch_file_content,
                         mstone_file_content, header_filename, namespace):
  """Builds the header content listing hashes of expired histograms.

  Both the base date and the current milestone are moved back by the grace
  period before expiry is evaluated.

  Args:
    descriptions: Combined histogram descriptions.
    branch_file_content: Content of file with base date.
    mstone_file_content: Content of file with milestone information.
    header_filename: A filename of the generated header file.
    namespace: A namespace to contain generated array.

  Returns:
    String with the content of the generated header file.

  Raises:
    Error if there is an error in input xml files.
  """
  extraction = extract_histograms.ExtractHistogramsFromDom(descriptions)
  histograms, had_errors = extraction
  if had_errors:
    raise Error("Error parsing inputs.")

  # Apply the grace period to the date-based cutoff.
  cutoff_date = _GetBaseDate(branch_file_content, _DATE_FILE_RE)
  cutoff_date -= datetime.timedelta(weeks=_EXPIRE_GRACE_WEEKS)

  # Apply the grace period to the milestone-based cutoff.
  cutoff_milestone = _GetCurrentMilestone(
      mstone_file_content, _CURRENT_MILESTONE_RE)
  cutoff_milestone -= _EXPIRE_GRACE_MSTONES

  expired_names = _GetExpiredHistograms(
      histograms, cutoff_date, cutoff_milestone)
  return _GenerateHeaderFileContent(
      header_filename, namespace, _GetHashToNameMap(expired_names))


def CheckUnsyncedHistograms(inputs):
  """Checks whether --inputs is in sync with |histogram_paths.ALL_XMLS|.

  Args:
    inputs: Paths to xml files; each one is converted to an absolute path
      before the comparison.

  Returns:
    A tuple (to_add, to_remove) of sets: paths that are in
    |histogram_paths.ALL_XMLS| but not in |inputs|, and paths that are in
    |inputs| but not in |histogram_paths.ALL_XMLS|.
  """
  expected_paths = set(histogram_paths.ALL_XMLS)
  given_paths = {os.path.abspath(path) for path in inputs}
  missing_paths = expected_paths - given_paths
  extra_paths = given_paths - expected_paths
  return missing_paths, extra_paths


def _GenerateFile(arguments):
  """Generates header file containing array with hashes of expired histograms.

  Args:
    arguments: An object with the following attributes:
      arguments.inputs: A list of xml files with histogram descriptions.
      arguments.header_filename: A filename of the generated header file.
      arguments.namespace: A namespace to contain generated array.
      arguments.output_dir: A directory to put the generated file.
      arguments.major_branch_date_filepath: File path for base date.
      arguments.milestone_filepath: File path for milestone information.

  Raises:
    AssertionError if |arguments.inputs| is not in sync with
    |histogram_paths.ALL_XMLS|.
    Error if the input files cannot be parsed.
  """
  # Verify that the |--inputs| is the same as |histogram_paths.ALL_XMLS| to make
  # sure we have the most updated list of histogram descriptions. Otherwise,
  # inform the cl owner to update the --inputs.
  to_add, to_remove = CheckUnsyncedHistograms(arguments.inputs)
  if to_add or to_remove:
    # Raised explicitly instead of through an `assert` statement so the check
    # still runs when Python is started with -O. The exception type is kept as
    # AssertionError, and the paths are sorted so the message is deterministic.
    raise AssertionError(
        "The --inputs is not in sync with the most updated list of xmls. "
        "Please update the inputs in "
        "components/metrics/generate_expired_histograms_array.gni.\n"
        "  add: %s\n  remove: %s" %
        (", ".join(sorted(to_add)), ", ".join(sorted(to_remove))))

  descriptions = merge_xml.MergeFiles(arguments.inputs)
  with open(arguments.major_branch_date_filepath, "r") as date_file:
    branch_file_content = date_file.read()
  with open(arguments.milestone_filepath, "r") as milestone_file:
    mstone_file_content = milestone_file.read()

  header_file_content = _GenerateFileContent(
      descriptions, branch_file_content, mstone_file_content,
      arguments.header_filename, arguments.namespace)

  with open(os.path.join(arguments.output_dir, arguments.header_filename),
            "w") as generated_file:
    generated_file.write(header_file_content)


def _ParseArguments():
  """Defines and parses arguments from the command line."""
  arg_parser = argparse.ArgumentParser(
      description="Generate array of expired histograms' hashes.")
  arg_parser.add_argument(
      "--output_dir",
      "-o",
      required=True,
      help="Base directory to for generated files.")
  arg_parser.add_argument(
      "--header_filename",
      "-H",
      required=True,
      help="File name of the generated header file.")
  arg_parser.add_argument(
      "--namespace",
      "-n",
      default="",
      help="Namespace of the generated factory function (code will be in "
      "the global namespace if this is omitted).")
  arg_parser.add_argument(
      "--major_branch_date_filepath",
      "-d",
      required=True,
      help="A path to the file with the base date.")
  arg_parser.add_argument(
      "--milestone_filepath",
      "-m",
      required=True,
      help="A path to the file with the milestone information.")
  arg_parser.add_argument(
      "inputs",
      nargs="+",
      help="Paths to .xml files with histogram descriptions.")
  return arg_parser.parse_args()


def main():
  """Parses the command line and generates the header file.

  Returns:
    None, which makes sys.exit() report success.
  """
  _GenerateFile(_ParseArguments())


# Run the generator only when this file is executed as a script.
if __name__ == "__main__":
  sys.exit(main())