chromium/tools/licenses/spdx_writer.py

# Copyright 2022 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import collections
import dataclasses
import json
import os
import pathlib
import re
from typing import Callable, DefaultDict, Tuple


class SpdxWriter:
  """Collects package license metadata and renders it as an SPDX document."""

  def __init__(self,
               root: str,
               root_package_name: str,
               root_license: str,
               link_prefix: str,
               doc_name: str = None,
               doc_namespace: str = None,
               read_file=lambda x: pathlib.Path(x).read_text(encoding='utf-8')):
    """Stores the document settings and creates the root package.

    Args:
      root: directory handed to the JSON writer as the SPDX root.
      root_package_name: name of the root package.
      root_license: path to the root package's license file.
      link_prefix: prefix handed to the JSON writer for license links.
      doc_name: document name; the root package name is used when this is
        empty or None.
      doc_namespace: document namespace handed to the JSON writer.
      read_file: callable taking a path and returning that file's text.
    """
    self.root = root
    self.link_prefix = link_prefix
    self.doc_namespace = doc_namespace
    self.read_file = read_file
    # An empty or missing document name falls back to the root package name.
    self.doc_name = doc_name or root_package_name

    self.root_package = _Package(root_package_name, root_license)
    # A dict (insertion-ordered in py>=3.7) is used as an ordered set so the
    # same package is never recorded twice.
    self.packages = {}

  def add_package(self, name: str, license_file: str):
    """Records a package and its license file for the SPDX output."""
    package = _Package(name, license_file)
    self.packages.setdefault(package, None)

  def write_to_file(self, file_path: str):
    """Serialises the document and stores it at file_path as UTF-8."""
    with open(file_path, 'w', encoding='utf-8') as out:
      spdx_json = self.write()
      out.write(spdx_json)

  def write(self) -> str:
    """Returns the SPDX document as a JSON string."""
    json_writer = _SPDXJSONWriter(self.root, self.root_package,
                                  self.link_prefix, self.doc_name,
                                  self.doc_namespace, self.read_file)

    # Keys of self.packages are the packages, in the order they were added.
    for package in self.packages.keys():
      json_writer.add_package(package)

    return json_writer.write()


@dataclasses.dataclass(frozen=True)
class _Package:
  """Immutable, hashable record of what is needed to emit a package.

  The derived SPDX identifiers only contain ASCII letters, digits, '-' and
  '.'; every other character of the name is replaced with '-'.
  """
  # Package name; both SPDX identifiers are derived from it.
  name: str
  # Path to the package's license file.
  file: str

  def _escape_id(self, spdx_id: str) -> str:
    """Returns spdx_id with every disallowed character replaced by '-'."""
    sanitized = re.sub(r'[^a-zA-Z0-9-\.]', '-', spdx_id)
    return sanitized

  @property
  def license_spdx_id(self) -> str:
    """Identifier for this package's license, prefixed 'LicenseRef-'."""
    raw_id = f'LicenseRef-{self.name}'
    return self._escape_id(raw_id)

  @property
  def package_spdx_id(self) -> str:
    """Identifier for the package element, prefixed 'SPDXRef-Package-'."""
    raw_id = f'SPDXRef-Package-{self.name}'
    return self._escape_id(raw_id)


def _get_spdx_path(root: str, license_file_path: str) -> str:
  """Get relative path from the spdx root.

  Both arguments are normalised with os.path.abspath first, which resolves
  relative paths and '..' segments.

  Args:
    root: directory acting as the SPDX root.
    license_file_path: path of a license file expected to be under root.

  Returns:
    The portion of the absolute license path that follows the absolute root.
    It keeps its leading path separator (e.g. '/third_party/LICENSE'), unless
    the root itself ends with a separator (a filesystem root). An empty
    string is returned when both paths are the same.

  Raises:
    ValueError: if license_file_path is not located under root.
  """
  # remove rel path things in path
  abs_path = os.path.abspath(license_file_path)
  abs_root = os.path.abspath(root)

  # A bare startswith() would also accept siblings that merely share a name
  # prefix with the root (e.g. '/src2/LICENSE' for root '/src'), so require the
  # match to end on a path separator boundary.
  root_prefix = abs_root if abs_root.endswith(os.sep) else abs_root + os.sep
  if abs_path != abs_root and not abs_path.startswith(root_prefix):
    raise ValueError(f'spdx root not valid. {abs_path} is not under {abs_root}')
  return abs_path[len(abs_root):]


class _SPDXJSONWriter():
  """Writes SPDX data in JSON format.

  Produce SPDX JSON output adhering to this schema:
    https://github.com/spdx/spdx-spec/blob/development/v2.2.2/schemas/spdx-schema.json
    See example:
    https://github.com/spdx/spdx-spec/blob/development/v2.2.2/examples/SPDXJSONExample-v2.2.spdx.json
  """

  def __init__(self, root: str, root_package: _Package, link_prefix: str,
               doc_name: str, doc_namespace: str,
               read_file: Callable[[str], str]):
    """Builds the document skeleton and registers the root package.

    Args:
      root: directory that license file paths are made relative to when
        building their cross-reference URLs.
      root_package: the package listed under 'documentDescribes'.
      link_prefix: string prepended to each root-relative license path to
        form its cross-reference URL.
      doc_name: value of the document's 'name' field.
      doc_namespace: value of the document's 'documentNamespace' field.
      read_file: callable taking a path and returning that file's text.
    """
    self.root = root
    self.root_package_id = root_package.package_spdx_id
    self.link_prefix = link_prefix

    self.read_file = read_file

    self.content = {
        # Actually 2.2.2, but only SPDX-N.M is needed.
        'spdxVersion': 'SPDX-2.2',
        'SPDXID': 'SPDXRef-DOCUMENT',
        'name': doc_name,
        'documentNamespace': doc_namespace,
        'creationInfo': {
            'creators': [f'Tool: {os.path.basename(__file__)}'],
        },
        'dataLicense': 'CC0-1.0',
        'documentDescribes': [self.root_package_id],
        'packages': [],
        'hasExtractedLicensingInfos': [],
        'relationships': [],
    }

    # Used to dedup license files based on file path.
    self.existing_license_files = {}  # 'file path': 'licenseId'
    # Used to make sure that there are no duplicate ids. Keys are lower-cased
    # ids, see _get_dedup_id.
    self.existing_package_ids = collections.defaultdict(int)  # 'packageId': num
    self.existing_license_ids = collections.defaultdict(int)  # 'licenseId': num

    # Add the root package to make sure that its ID isn't taken.
    self.add_package(root_package)

  def write(self) -> str:
    """Returns a JSON string for the current state of the writer."""
    return json.dumps(self.content, indent=4)

  def _get_dedup_id(self, elem_id: str, id_dict: DefaultDict[str, int]) -> str:
    """Returns a unique id given a dictionary with existing ids.

    Uniqueness ignores casing: ids that differ only by case are treated as the
    same id, so usage counts are keyed on the lower-cased id. Ids generated
    here are recorded as used as well, so an element that later asks for
    exactly that id is given a different one.

    Args:
      elem_id: the requested id to use for the element.
      id_dict: dictionary holding already used ids (lower-cased) and how many
        times each one was requested. Updated in place.

    Returns:
      When the elem_id is already unique, return elem_id.
      When the elem_id has been used, return elem_id + '-[next num]', skipping
      any number whose resulting id is already taken.
    """
    key = elem_id.lower()
    suffix = id_dict[key]
    id_dict[key] += 1
    if suffix == 0:
      return elem_id

    candidate = f'{elem_id}-{suffix}'
    # The suffixed id may have been claimed directly by another element
    # (e.g. a package literally named 'foo-1'); keep counting until it's free.
    while id_dict[candidate.lower()] > 0:
      suffix = id_dict[key]
      id_dict[key] += 1
      candidate = f'{elem_id}-{suffix}'
    # Reserve the generated id so that it cannot be handed out again.
    id_dict[candidate.lower()] += 1
    return candidate

  def _get_package_id(self, pkg: _Package) -> str:
    """Makes sure that there are no pkg id duplicates."""
    return self._get_dedup_id(pkg.package_spdx_id, self.existing_package_ids)

  def _get_license_id(self, pkg: _Package) -> Tuple[str, bool]:
    """Handles license deduplication.

    If this pkg.file has already been seen, reuse that same id instead. If
    there are two packages with the same name but different license files,
    handle deduping the names.

    Args:
      pkg: The package to get a license id for.

    Returns:
      First return value is the id, second is whether the license needs to be
        added to the SPDX doc (False if it already exists in the doc).
    """
    existing = self.existing_license_files.get(pkg.file)
    if existing:
      return existing, False

    license_id = self._get_dedup_id(pkg.license_spdx_id,
                                    self.existing_license_ids)
    self.existing_license_files[pkg.file] = license_id
    return license_id, True

  def add_package(self, pkg: _Package):
    """Writes a package to the file (package metadata).

    Appends the package element, a CONTAINS relationship from the root package
    (for every package other than the root itself) and, when the license file
    has not been seen before, the extracted license text.

    Args:
      pkg: the package to add.
    """
    pkg_id = self._get_package_id(pkg)
    license_id, need_to_add_license = self._get_license_id(pkg)

    self.content['packages'].append({
        'SPDXID': pkg_id,
        'name': pkg.name,
        'licenseConcluded': license_id,
    })

    # Compare the deduplicated id: a different package that happens to share
    # the root's name gets a suffixed id and must still be linked to the root.
    if pkg_id != self.root_package_id:
      self.content['relationships'].append({
          'spdxElementId': self.root_package_id,
          'relationshipType': 'CONTAINS',
          'relatedSpdxElement': pkg_id,
      })

    if need_to_add_license:
      self._add_license_file(pkg, license_id)

  def _add_license_file(self, pkg: _Package, license_id: str):
    """Writes a license to the file (raw license text).

    Args:
      pkg: package whose license file is read through self.read_file.
      license_id: id under which the license text is recorded.

    Raises:
      ValueError: if pkg.file is not located under self.root.
    """
    spdx_path = _get_spdx_path(self.root, pkg.file)
    # URLs always use forward slashes, whatever the local path separator is.
    url = f'{self.link_prefix}{spdx_path.replace(os.sep, "/")}'
    self.content['hasExtractedLicensingInfos'].append({
        'name': pkg.name,
        'licenseId': license_id,
        'extractedText': self.read_file(pkg.file),
        'crossRefs': [{
            'url': url,
        }],
    })