# Copyright 2022 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
import collections
import dataclasses
import json
import os
import pathlib
import re
from typing import Callable, DefaultDict, Tuple
class SpdxWriter:
    """Collects package license metadata and renders it as an SPDX document.

    Packages are registered through add_package() and serialized on demand
    by write() or write_to_file().
    """

    def __init__(self,
                 root: str,
                 root_package_name: str,
                 root_license: str,
                 link_prefix: str,
                 doc_name: str = None,
                 doc_namespace: str = None,
                 read_file=lambda x: pathlib.Path(x).read_text(encoding='utf-8')):
        """Stores the document settings and creates the root package.

        Args:
          root: directory handed to the JSON writer as the SPDX root.
          root_package_name: name of the package the document describes.
          root_license: license file path of the root package.
          link_prefix: prefix handed to the JSON writer for license links.
          doc_name: document name; a falsy value falls back to
            root_package_name.
          doc_namespace: document namespace, forwarded unchanged.
          read_file: callable returning the text of a file given its path;
            the default reads the file as UTF-8.
        """
        self.root = root
        self.root_package = _Package(root_package_name, root_license)
        self.link_prefix = link_prefix
        self.doc_name = doc_name or root_package_name
        self.doc_namespace = doc_namespace
        self.read_file = read_file
        # A dict (values unused) rather than a list: keys reject duplicate
        # packages, and on Python >= 3.7 iteration follows insertion order.
        self.packages = {}

    def add_package(self, name: str, license_file: str):
        """Registers a package; repeating a (name, license_file) pair is a no-op."""
        self.packages.setdefault(_Package(name, license_file))

    def write_to_file(self, file_path: str):
        """Serializes the document and stores it at file_path as UTF-8."""
        with open(file_path, 'w', encoding='utf-8') as out:
            out.write(self.write())

    def write(self) -> str:
        """Returns the SPDX document as a JSON string."""
        json_writer = _SPDXJSONWriter(self.root, self.root_package,
                                      self.link_prefix, self.doc_name,
                                      self.doc_namespace, self.read_file)
        for package in self.packages:
            json_writer.add_package(package)
        return json_writer.write()
@dataclasses.dataclass(frozen=True)
class _Package:
    """Immutable (name, license file) pair from which SPDX ids are derived."""
    # Package name; seeds both the package id and the license id.
    name: str
    # Path of the package's license file.
    file: str

    def _escape_id(self, spdx_id: str) -> str:
        """Returns spdx_id with every char outside letters, digits, '-', '.' turned into '-'."""
        disallowed = r'[^a-zA-Z0-9-\.]'
        return re.sub(disallowed, '-', spdx_id)

    @property
    def license_spdx_id(self) -> str:
        """Escaped 'LicenseRef-<name>' identifier for this package's license."""
        raw_id = f'LicenseRef-{self.name}'
        return self._escape_id(raw_id)

    @property
    def package_spdx_id(self) -> str:
        """Escaped 'SPDXRef-Package-<name>' identifier for this package."""
        raw_id = f'SPDXRef-Package-{self.name}'
        return self._escape_id(raw_id)
def _get_spdx_path(root: str, license_file_path: str) -> str:
    """Returns the license file's path relative to the SPDX root.

    Both paths are made absolute and normalized first, so '.' and '..'
    segments are resolved before the comparison.

    Args:
      root: directory treated as the SPDX root.
      license_file_path: path of a license file expected to live under root.

    Returns:
      The portion of the absolute license path that follows the absolute
      root. It keeps its leading path separator (e.g. '/third_party/LICENSE')
      unless the root itself ends with a separator, as a filesystem root
      does.

    Raises:
      ValueError: if license_file_path is not located under root.
    """
    abs_path = os.path.abspath(license_file_path)
    abs_root = os.path.abspath(root)

    # A bare startswith() would also accept siblings that merely share a name
    # prefix with the root ('/src/chromium-internal' vs '/src/chromium'), so
    # require the match to end on a path separator boundary.
    root_prefix = abs_root if abs_root.endswith(os.sep) else abs_root + os.sep
    if abs_path != abs_root and not abs_path.startswith(root_prefix):
        raise ValueError(
            f'spdx root not valid. {abs_path} is not under {abs_root}')

    return abs_path[len(abs_root):]
class _SPDXJSONWriter():
    """Writes SPDX data in JSON format.

    Produce SPDX JSON output adhering to this schema:
    https://github.com/spdx/spdx-spec/blob/development/v2.2.2/schemas/spdx-schema.json

    See example:
    https://github.com/spdx/spdx-spec/blob/development/v2.2.2/examples/SPDXJSONExample-v2.2.spdx.json
    """

    def __init__(self, root: str, root_package: '_Package', link_prefix: str,
                 doc_name: str, doc_namespace: str,
                 read_file: Callable[[str], str]):
        """Builds the document skeleton and registers the root package.

        Args:
          root: directory that license file paths are made relative to.
          root_package: the package the document describes; it is added
            first so that its SPDX id is reserved.
          link_prefix: prefix prepended to each relative license path to
            form the license's cross-reference URL.
          doc_name: value of the document's 'name' field.
          doc_namespace: value of the document's 'documentNamespace' field.
          read_file: callable returning the text of a file given its path.
        """
        self.root = root
        self.root_package_id = root_package.package_spdx_id
        self.link_prefix = link_prefix
        self.read_file = read_file

        self.content = {
            # Actually 2.2.2, but only SPDX-N.M is needed.
            'spdxVersion': 'SPDX-2.2',
            'SPDXID': 'SPDXRef-DOCUMENT',
            'name': doc_name,
            'documentNamespace': doc_namespace,
            'creationInfo': {
                'creators': [f'Tool: {os.path.basename(__file__)}'],
            },
            'dataLicense': 'CC0-1.0',
            'documentDescribes': [self.root_package_id],
            'packages': [],
            'hasExtractedLicensingInfos': [],
            'relationships': [],
        }

        # Used to dedup license files based on file path.
        self.existing_license_files = {}  # 'file path': 'licenseId'

        # Used to make sure that there are no duplicate ids. Keys are
        # lower-cased ids, values count how often that id was requested.
        self.existing_package_ids = collections.defaultdict(int)
        self.existing_license_ids = collections.defaultdict(int)

        # Add the root package to make sure that its ID isn't taken.
        self.add_package(root_package)

    def write(self) -> str:
        """Returns a JSON string for the current state of the writer."""
        return json.dumps(self.content, indent=4)

    def _get_dedup_id(self, elem_id: str,
                      id_dict: DefaultDict[str, int]) -> str:
        """Returns a unique id given a dictionary with existing ids.

        Uniqueness ignores casing: the usage counter is keyed on the
        lower-cased id, so two ids that differ only by case are never both
        handed out unsuffixed. The returned id keeps the caller's casing.

        Args:
          elem_id: the requested id to use for the element.
          id_dict: dictionary holding already used ids (lower-cased) and how
            many times each has been requested; updated in place.

        Returns:
          When the elem_id is already unique, return elem_id.
          When the elem_id has been used, return elem_id + '-[next num]'.
        """
        key = elem_id.lower()
        suffix = id_dict[key]
        id_dict[key] += 1
        return f'{elem_id}-{suffix}' if suffix > 0 else elem_id

    def _get_package_id(self, pkg: '_Package') -> str:
        """Makes sure that there are no pkg id duplicates."""
        return self._get_dedup_id(pkg.package_spdx_id,
                                  self.existing_package_ids)

    def _get_license_id(self, pkg: '_Package') -> Tuple[str, bool]:
        """Handles license deduplication.

        If this pkg.file has already been seen, reuse that same id instead.
        If there are two packages with the same name but different license
        files, handle deduping the names.

        Args:
          pkg: The package to get a license id for.

        Returns:
          First return value is the id, second is whether the license needs
          to be added to the SPDX doc (False if it already exists in the
          doc).
        """
        existing = self.existing_license_files.get(pkg.file)
        if existing is not None:
            return existing, False

        license_id = self._get_dedup_id(pkg.license_spdx_id,
                                        self.existing_license_ids)
        self.existing_license_files[pkg.file] = license_id
        return license_id, True

    def add_package(self, pkg: '_Package'):
        """Writes a package to the file (package metadata).

        Appends the package entry, a CONTAINS relationship from the root
        package (skipped when pkg shares the root package's SPDX id), and,
        the first time its license file is seen, the extracted license text.
        """
        pkg_id = self._get_package_id(pkg)
        license_id, need_to_add_license = self._get_license_id(pkg)

        self.content['packages'].append({
            'SPDXID': pkg_id,
            'name': pkg.name,
            'licenseConcluded': license_id,
        })

        # The root package must not be listed as containing itself.
        if pkg.package_spdx_id != self.root_package_id:
            self.content['relationships'].append({
                'spdxElementId': self.root_package_id,
                'relationshipType': 'CONTAINS',
                'relatedSpdxElement': pkg_id,
            })

        if need_to_add_license:
            self._add_license_file(pkg, license_id)

    def _add_license_file(self, pkg: '_Package', license_id: str):
        """Writes a license to the file (raw license text).

        The cross-reference URL is link_prefix followed by the license path
        relative to the SPDX root, using '/' as the separator.
        """
        spdx_path = _get_spdx_path(self.root, pkg.file)
        url = f'{self.link_prefix}{spdx_path.replace(os.sep, "/")}'
        self.content['hasExtractedLicensingInfos'].append({
            'name':
            f'{pkg.name}',
            'licenseId':
            license_id,
            'extractedText':
            self.read_file(pkg.file),
            'crossRefs': [{
                'url': url,
            }],
        })