chromium/tools/perf/validate_wpr_archives

#!/usr/bin/env vpython3
# Copyright 2017 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import argparse
import os
import sys

from core import path_util
path_util.AddTelemetryToPath()

from core import benchmark_finders

from telemetry.core import optparse_argparse_migration as oam

from py_utils import cloud_storage

path_util.AddAndroidPylibToPath()


def GetAllStorySets():
  """Builds a story set for every benchmark that is not explicitly excluded.

  Each remaining benchmark gets a fresh parser populated with its own
  command-line arguments; the defaults obtained by parsing an empty argument
  list are handed to CreateStorySet().

  Returns:
    A list with one story set per non-excluded benchmark, in the order
    benchmark_finders.GetAllBenchmarks() yields them.
  """
  # Benchmarks excluded by name from WPR archive validation.
  excluded_names = frozenset((
      'analysis_metrics_ct',
      'skpicture_printer_ct',
      'screenshot_ct',
      'rendering.cluster_telemetry',
      'repaint_ct',
      'rasterize_and_record_micro_ct',
      'multipage_skpicture_printer_ct',
      'loading.cluster_telemetry',
      'v8.loading.cluster_telemetry',
      'v8.loading_runtime_stats.cluster_telemetry',
      'memory.cluster_telemetry',
      'skpicture_printer',
      'multipage_skpicture_printer',
      'leak_detection.cluster_telemetry',
      'generic_trace_ct',
      'ad_tagging.cluster_telemetry',
      'layout_shift.cluster_telemetry',
  ))

  collected = []
  for benchmark_cls in benchmark_finders.GetAllBenchmarks():
    if benchmark_cls.Name() not in excluded_names:
      arg_parser = oam.CreateFromOptparseInputs()
      benchmark_cls.AddBenchmarkCommandLineArgs(arg_parser)
      # Parsing no arguments yields the benchmark's default options.
      default_options, _ = arg_parser.parse_args([])
      story_set = benchmark_cls().CreateStorySet(default_options)
      collected.append(story_set)
  return collected


def NormalizedPath(p):
  """Returns `p` as an absolute path with redundant components collapsed."""
  absolute = os.path.abspath(p)
  return os.path.normpath(absolute)

def GetMissingArchivesInCloudStorage(archive_infos, wpr_sha_files):
  """Finds WPR archives that are missing remotely or lack a local .sha1 file.

  Args:
    archive_infos: Iterable of WPR archive info objects. Each one must expose
        `_bucket`, `_base_dir` and `_data['archives']`, where the latter maps
        a story to a mapping whose values are archive paths relative to
        `_base_dir`.
    wpr_sha_files: Paths of the .sha1 files to validate. Only archives whose
        .sha1 file is in this collection are looked up in cloud storage, so
        an empty collection results in no remote lookups.

  Returns:
    A (missing_archives, missing_sha_files) tuple of sets. missing_archives
    holds (archive_path, bucket) pairs whose hash was not found in the
    bucket; missing_sha_files holds normalized paths of .sha1 files that do
    not exist on disk.

  Raises:
    AssertionError: If any path in `wpr_sha_files` does not exist.
  """
  if wpr_sha_files:
    # Normalize once into a set so the per-archive membership test below is a
    # constant-time lookup against comparable paths.
    normalized_sha_files = set()
    for f in wpr_sha_files:
      # Note the explicit 's': a bare '% d...' is read as a space-flagged
      # integer conversion and fails with TypeError for a string path.
      assert os.path.exists(f), '%s does not exist' % f
      normalized_sha_files.add(NormalizedPath(f))
    wpr_sha_files = normalized_sha_files

  cloud_storage_paths = set()
  missing_sha_files = set()
  for wpr_archive_info in archive_infos:
    bucket = wpr_archive_info._bucket
    story_archives = wpr_archive_info._data['archives']
    for archives_for_story in story_archives.values():
      for archive_path in archives_for_story.values():
        archive_path = os.path.join(wpr_archive_info._base_dir, archive_path)
        hash_path = NormalizedPath(archive_path + '.sha1')
        if not os.path.exists(hash_path):
          missing_sha_files.add(hash_path)
          continue
        if hash_path not in wpr_sha_files:
          # Not one of the files the caller asked to validate.
          continue
        remote_path = cloud_storage.ReadHash(hash_path)
        cloud_storage_paths.add((bucket, remote_path, archive_path))

  # Some buckets contain a modest (< 10,000) number of entries and can be
  # processed more efficiently by listing the contents once rather than calling
  # .Exists() for each file. Other buckets contain 600,000+ (chromium-telemetry)
  # or far more files (chrome-telemetry) so .Exists() must be used for them.
  fast_buckets = ['chrome-partner-telemetry']

  # List each referenced fast bucket exactly once.
  listed_files = {}
  for bucket in {b for b, _, _ in cloud_storage_paths}:
    if bucket in fast_buckets:
      listed_files[bucket] = set(cloud_storage.ListFiles(bucket))

  missing_archives = set()
  for bucket, remote_path, archive_path in cloud_storage_paths:
    if bucket in listed_files:
      # Entries in the listing are matched with a leading '/'.
      found = '/' + remote_path in listed_files[bucket]
    else:
      found = cloud_storage.Exists(bucket, remote_path)
    if not found:
      missing_archives.add((archive_path, bucket))
  return missing_archives, missing_sha_files


def main(args):
  """Checks that WPR archives referenced by benchmarks exist in cloud storage.

  Args:
    args: Command-line arguments (without the program name). Positional
        arguments are the .sha1 files to validate.

  Raises:
    AssertionError: If an archive is missing from its expected bucket, or if
        a .sha1 file referenced by a story set is missing on disk.
  """
  # The text must be passed as `description`; the first positional parameter
  # of ArgumentParser is `prog`, which would put it in the usage line instead.
  parser = argparse.ArgumentParser(
      description='Validate whether WPR archives are properly stored in '
      'CloudStorage.')
  parser.add_argument('wpr_sha_files', nargs='*')
  options = parser.parse_args(args)

  # Story sets without WPR archive info have nothing to validate.
  archive_infos = [
      s.wpr_archive_info for s in GetAllStorySets() if s.wpr_archive_info
  ]

  missing_archives, missing_sha_files = GetMissingArchivesInCloudStorage(
      archive_infos, options.wpr_sha_files)
  assert not missing_archives, (
      'Archives not checked in cloud storage properly:\n%s' %
      '\n'.join('%s (expected bucket: %s)' % (p, b) for p, b in missing_archives))
  assert not missing_sha_files, (
      'These SHA files are missing. Did you forget to check them in?\n%s' %
      '\n'.join(missing_sha_files))



# Script entry point. main() has no return statement, so a successful run exits
# with status 0; validation failures surface as an uncaught AssertionError.
if __name__ == '__main__':
  sys.exit(main(sys.argv[1:]))