# extract_licenses_from_apk.py | Explore in Territory

#!/usr/bin/env python3
# Copyright 2021 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Extracts licenses from a Chrome apk.

1) Extracts assets/resources.pak
2) Extracts entries using tools/grit/pak_util.py
3) Finds about_credits.html
4) Converts it to plaintext

This script requires the "brotli" executable.
Option 1) sudo apt-get install brotli
Option 2) "ninja clang_x64/brotli", and use "--brotli clang_x64/brotli"
"""
import argparse
import os
import pathlib
import re
import subprocess
import sys
import tempfile
import zipfile

# Directory two levels above the one containing this script (parents[0] is the
# script's own directory). NOTE(review): presumably the checkout's source
# root -- confirm against the script's location in the tree.
_DIR_SOURCE_ROOT = pathlib.Path(__file__).parents[2]
# Path to the pak_util.py helper under tools/grit, relative to that root.
_PAK_UTIL = _DIR_SOURCE_ROOT / 'tools' / 'grit' / 'pak_util.py'


def _extract_pak(pak_path, output_dir, brotli):
  """Runs pak_util.py to unpack a .pak file into a directory.

  Args:
    pak_path: Path of the .pak file to extract.
    output_dir: Directory handed to pak_util.py via --output-dir.
    brotli: Path to a brotli executable forwarded via --brotli, or a falsy
      value to leave the flag off.

  Exits the process with status 1 (after echoing the error to stderr) when
  pak_util.py returns a non-zero exit code.
  """
  brotli_args = ['--brotli', brotli] if brotli else []
  argv = [
      sys.executable,
      _PAK_UTIL,
      'extract',
      pak_path,
      '--output-dir',
      output_dir,
      *brotli_args,
  ]
  try:
    subprocess.run(argv, check=True)
  except subprocess.CalledProcessError as err:
    sys.stderr.write(f'{err}\n')
    sys.exit(1)


def _find_licenses_file(output_dir):
  """Returns the text of the credits HTML found among extracted pak entries.

  A file is treated as the credits page when the marker
  "Generated by licenses.py" appears within its first 100 bytes.

  Args:
    output_dir: Directory containing the entries extracted from the pak file.

  Returns:
    The full contents of the matching file, decoded as UTF-8.

  Exits the process with status 1 if no file matches.
  """
  # Sorted so the result is deterministic should more than one file match;
  # os.listdir() returns entries in arbitrary order.
  for name in sorted(os.listdir(output_dir)):
    path = os.path.join(output_dir, name)
    # Skip anything that is not a regular file; open() raises on a directory.
    if not os.path.isfile(path):
      continue
    with open(path, 'rb') as f:
      # Sniff only the header so unrelated resources are not read in full.
      if b'Generated by licenses.py' in f.read(100):
        f.seek(0)
        return f.read().decode('utf8')
  sys.stderr.write('Could not find credits html in pak file.\n')
  sys.exit(1)


def _extract_licenses(chrome_apk, brotli):
  """Returns the credits HTML embedded in an APK's resources.pak.

  Copies assets/resources.pak out of the APK into a temporary file, unpacks
  it into a temporary directory and returns the credits page found there.
  Both temporaries are removed before returning.

  Args:
    chrome_apk: Path to the .apk (zip) file.
    brotli: Path to a brotli executable, or a falsy value; passed through to
      the pak extraction step.

  Returns:
    The credits HTML as a str.

  Exits the process with status 1 if the APK has no assets/resources.pak.
  """
  pak_entry = 'assets/resources.pak'
  with zipfile.ZipFile(chrome_apk) as z:
    try:
      pak_data = z.read(pak_entry)
    except KeyError:
      # ZipFile raises KeyError for a missing member; report it the same way
      # the other steps report failures instead of dumping a traceback.
      sys.stderr.write(f'{chrome_apk} does not contain {pak_entry}.\n')
      sys.exit(1)

  with tempfile.NamedTemporaryFile() as temp_pak:
    temp_pak.write(pak_data)
    # Flush so the subprocess reading temp_pak.name sees the whole file.
    temp_pak.flush()
    with tempfile.TemporaryDirectory() as temp_d:
      _extract_pak(temp_pak.name, temp_d, brotli)
      return _find_licenses_file(temp_d)


def _transform_html(html):
  """Converts the credits HTML into a plain-text list of licenses.

  Each entry in the HTML is expected to look like:
    <span class="title">TITLE</span>
    <span class="homepage"><a href="URL">homepage</a></span>
    <pre>LICENSE</pre>

  Args:
    html: The credits page markup as a str.

  Returns:
    A str starting with a header line, followed by one "Project / URL /
    license text" section per entry, separated by lines of 80 '=' characters.
    HTML character references (e.g. &amp;, &lt;) are decoded.

  Exits the process with status 1 if the number of parsed entries differs
  from the number of </pre> tags in the input.
  """
  # Imported locally under the function name only: the parameter |html|
  # shadows the stdlib module of the same name inside this function.
  from html import unescape

  pattern = re.compile(r'"title".*?>(.*?)<.*?href="(.*?)".*?<pre>(.*?)</pre>',
                       re.DOTALL)
  entries = ['Open-source libraries used by Chrome:\n']
  for title, url, text in pattern.findall(html):
    # The markup is HTML-escaped; decode entities so the output is real
    # plain text rather than text littered with &lt; / &amp; sequences.
    entry = [
        'Project: ' + unescape(title), 'URL: ' + unescape(url), '',
        unescape(text)
    ]
    entries.append('\n'.join(entry))

  # Every license lives in exactly one <pre> block, so the tag count is a
  # cheap sanity check that the regex did not skip any entry.
  actual_count = len(entries) - 1
  expected_count = html.count('</pre>')
  if expected_count != actual_count:
    sys.stderr.write(
        f'Parsed {actual_count} but should have parsed {expected_count}\n')
    sys.exit(1)

  sep = '\n' + '=' * 80 + '\n'
  return sep.join(entries)


def main():
  """Parses command-line flags, extracts the licenses and writes them out.

  Flags:
    --brotli: Optional path to a brotli executable.
    --chrome-apk: Required path to the .apk to read.
    --output: Required path of the file to write.
    --raw: Write the credits HTML as-is instead of converting to plain text.
  """
  parser = argparse.ArgumentParser()
  parser.add_argument('--brotli',
                      help='Path to brotli executable if not on PATH.')
  parser.add_argument('--chrome-apk',
                      required=True,
                      help='Path to .apk with assets/resources.pak in it.')
  parser.add_argument('--output', required=True, help='Output file path.')
  parser.add_argument('--raw',
                      action='store_true',
                      help='Do not convert to plain text.')
  args = parser.parse_args()

  data = _extract_licenses(args.chrome_apk, args.brotli)
  # Convert before opening the output: mode 'w' truncates the file on open,
  # so a failed conversion must not leave an empty file behind.
  if not args.raw:
    data = _transform_html(data)
  with open(args.output, 'w', encoding='utf8') as f:
    f.write(data)


# Run main() only when executed as a script, not when imported as a module.
if __name__ == '__main__':
  main()
# chromium/tools/android/extract_licenses_from_apk.py