chromium/tools/code_coverage/download_fuzz_corpora.py

#!/usr/bin/env python3
# Copyright 2023 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Download all the fuzzing corpora associated with all Chromium libfuzzer
targets.

Assumes that fuzzer targets are already built and reside in the BUILD_DIR
directory.

  * Example usage: download_fuzz_corpora.py --download-dir [DOWNLOAD_DIR]
    --build-dir [BUILD_DIR]
"""

# Base gs:// location of the corpus backups; each target's archive is fetched
# from "<base>/<target>/latest.zip" (see _download_corpus).
CORPORA_BUCKET_BASE_URL = "gs://clusterfuzz-libfuzzer-backup/corpus/libfuzzer/"

import argparse
import coverage_consts
import logging
from multiprocessing import cpu_count, Pool
import os
import subprocess
import sys


def _gsutil(cmd):
  """Runs an already assembled command line and waits for it to finish.

  Args:
    cmd: The full argument list to execute (program name first).

  The exit status of the command is not inspected and nothing is returned.
  """
  subprocess.run(args=cmd)


def _download_corpus(args):
  """Downloads the latest corpus archive for a single fuzz target.

  Args:
    args: A sequence whose first item is the fuzz target name and whose second
      item is the directory that holds all downloaded corpora. The two values
      are packed into one argument so the function can be used with Pool.map.
  """
  target = args[0]
  download_dir = args[1]
  target_folder = os.path.join(download_dir, target)
  # Create the folder in-process instead of spawning `mkdir`; a folder that
  # already exists (e.g. from a previous run) is not an error.
  os.makedirs(target_folder, exist_ok=True)
  # The source is a gs:// URL, not a local path, so always join with '/'
  # rather than the platform-specific os.path separator.
  target_path = '/'.join(
      [CORPORA_BUCKET_BASE_URL.rstrip('/'), target, 'latest.zip'])
  _gsutil(['gsutil', 'cp', target_path, target_folder])


def _unzip_corpus(args):
  """Unpacks the downloaded corpus archive of a single fuzz target.

  Extracts `latest.zip` inside the target's folder, deletes the archive, and
  then removes the extracted "regressions" folder if there is one.

  Args:
    args: A sequence whose first item is the fuzz target name and whose second
      item is the directory that holds all downloaded corpora. The two values
      are packed into one argument so the function can be used with Pool.map.
  """
  target = args[0]
  download_dir = args[1]
  target_folder = os.path.join(download_dir, target)
  subprocess.run(['unzip', 'latest.zip'], cwd=target_folder)
  subprocess.run(['rm', 'latest.zip'], cwd=target_folder)
  try:
    # Unzipping the corpora often also contains a "regressions" folder, which
    # is a subset of the total corpus, so can be ignored/removed
    subprocess.run(['rm', '-rf', 'regressions'], cwd=target_folder)
  except OSError as e:
    # Removal stays best-effort, but only the failure to launch `rm` is
    # tolerated; other exceptions (e.g. KeyboardInterrupt) still propagate.
    logging.warning("Could not remove regressions folder in %s: %s",
                    target_folder, e)


def unzip_corpora(download_dir, corpora_to_download):
  """Unpacks the downloaded archives of several fuzz targets in parallel.

  Args:
    download_dir: Directory that holds one sub-folder per downloaded corpus.
    corpora_to_download: Iterable of fuzz target names whose archives should
      be unpacked.
  """
  # Use the download_dir parameter; the previous code read an undefined
  # `args` name here and raised NameError for any non-empty input.
  jobs = [(corpus, download_dir) for corpus in corpora_to_download]
  with Pool(cpu_count()) as p:
    p.map(_unzip_corpus, jobs)


def _ParseCommandArguments():
  """Builds the command-line parser for this tool and parses the arguments.

  Returns:
    The argparse.Namespace produced by parse_args(), carrying the
    `download_dir` and `build_dir` values.
  """
  parser = argparse.ArgumentParser(usage=__doc__)

  # Both flags are mandatory string values and differ only in name and help.
  flag_help = (
      ('--download-dir', 'Directory into which corpora are downloaded.'),
      ('--build-dir', 'Directory where fuzzers were built.'),
  )
  for flag, help_text in flag_help:
    parser.add_argument(flag, type=str, required=True, help=help_text)

  return parser.parse_args()


def Main():
  """Downloads and unpacks the corpus of every fuzzer found in the build dir.

  Every entry of the build directory whose name ends in "_fuzzer" is treated
  as a fuzz target. Archives are first downloaded in parallel and then
  unpacked in parallel.

  Returns:
    0 on success, or 1 when either directory argument is empty or does not
    name an existing directory.
  """
  args = _ParseCommandArguments()

  # Return a failing status on invalid input. The previous bare `exit`
  # expressions were no-ops, so execution continued after logging the error.
  if not args.download_dir:
    logging.error("No download_dir given")
    return 1
  if not os.path.isdir(args.download_dir):
    logging.error("%s does not exist or is not a directory" % args.download_dir)
    return 1
  if not args.build_dir:
    logging.error("No build_dir given")
    return 1
  if not os.path.isdir(args.build_dir):
    logging.error("%s does not exist or is not a directory" % args.build_dir)
    return 1

  corpora_to_download = [
      target for target in os.listdir(args.build_dir)
      if target.endswith('_fuzzer')
  ]

  print("Corpora to download: " + str(corpora_to_download))

  # Both phases take the same (target, download_dir) pairs.
  jobs = [(corpus, args.download_dir) for corpus in corpora_to_download]
  with Pool(cpu_count()) as p:
    p.map(_download_corpus, jobs)
  with Pool(cpu_count()) as p:
    p.map(_unzip_corpus, jobs)
  return 0


# When executed as a script, run Main() and use its return value as the
# process exit status.
if __name__ == '__main__':
  sys.exit(Main())