# Copyright 2021 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Checks that collected symbols are not missing things."""
import logging
import models
import os
class QualityCheckError(Exception):
  """Raised by CheckDataQuality() when at least one check has failed."""
def _Divide(a, b):
return float(a) / b if b else 0
def CheckDataQuality(size_info, track_string_literals):
  """Checks every (container, section) symbol group for suspicious data.

  Symbols are grouped via GroupedByContainerAndSectionName(). For each group
  the function verifies that the section size is a positive int matching the
  rounded sum of symbol |pss| values, that padding stays under a limit, and,
  for groups that are not considered small, that placeholders and unattributed
  symbols stay under their limits and that some expected kinds of symbols
  exist. Absolute source / object paths are always reported.

  Args:
    size_info: Object exposing |raw_symbols| and |section_sizes|.
    track_string_literals: When true, the models.SECTION_RODATA section is
        expected to attribute at least 5% of its bytes to string literals.

  Raises:
    QualityCheckError: If any check failed. The individual failures are
        written with logging.error() (capped at 40 lines) before raising.
  """
  logging.debug('Grouping symbols')
  groups = size_info.raw_symbols.GroupedByContainerAndSectionName()
  overall_section_sizes = size_info.section_sizes
  logging.debug('computing')
  errors = []
  for symbols in groups:
    container = symbols[0].container
    section_name = symbols[0].section_name
    segment_size = container.section_sizes[section_name]
    is_other = section_name == models.SECTION_OTHER
    is_native = section_name in models.NATIVE_SECTIONS
    is_dex = section_name in models.DEX_SECTIONS
    logging.debug('checking section %s<%s>', section_name, container.name)

    # Running byte totals (sums of |pss|) for the categories checked below.
    total_bytes = 0.0
    padding_bytes = 0.0
    placeholder_bytes = 0.0
    unnamed_bytes = 0.0
    pathless_bytes = 0.0
    unattributed_bytes = 0.0
    componentless_bytes = 0.0
    literal_bytes = 0.0
    # Number of symbols having each trait.
    num_aliases = 0
    num_generated = 0
    num_unlikely = 0
    num_startup = 0

    for sym in symbols:
      pss = sym.pss
      total_bytes += pss
      padding_bytes += sym.padding_pss
      if sym.full_name.startswith('**'):
        placeholder_bytes += pss
      if not sym.full_name:
        unnamed_bytes += pss
      if not sym.source_path:
        pathless_bytes += pss
      if not (sym.full_name or sym.source_path or sym.object_path):
        unattributed_bytes += pss
      if not sym.component:
        componentless_bytes += pss
      if sym.IsStringLiteral():
        literal_bytes += pss
      # Count each alias group once, via its first member.
      if sym.aliases and sym is sym.aliases[0]:
        num_aliases += 1
      if sym.flags & models.FLAG_GENERATED_SOURCE:
        num_generated += 1
      if sym.flags & models.FLAG_UNLIKELY:
        num_unlikely += 1
      if sym.flags & models.FLAG_STARTUP:
        num_startup += 1

      # Absolute paths are reported per symbol, without a segment header.
      if os.path.isabs(sym.source_path):
        errors.append('Abs path found in source_path: ' + repr(sym))
      if os.path.isabs(sym.object_path):
        errors.append('Abs path found in object_path: ' + repr(sym))

    # Messages reported for this segment so far. Empty means that the header
    # line has not been emitted yet.
    segment_messages = []

    def report_error(msg, *args):
      if not segment_messages:
        header = ('Error(s) found in container "{}", section "{}", '
                  'which has {} symbols totalling {} bytes: ')
        errors.append(
            header.format(container.name, section_name, len(symbols),
                          segment_size))
      full_msg = msg.format(*args)
      segment_messages.append(full_msg)
      errors.append(' ' + full_msg)

    def check_size(kind, size, limit_fraction):
      # |limit_fraction| is relative to the size of this segment.
      if size > limit_fraction * segment_size:
        report_error(
            'Abnormally high number of bytes attributed to {}: {:.0f} '
            '({:.0%}, limit was {:.0%}).', kind, size,
            _Divide(size, segment_size), limit_fraction)

    def check_some_exist(kind, count, limit=1):
      if count < limit:
        report_error(
            'Expected at least {} {} to exist. '
            'Found only {} out of {} symbols.', limit, kind, count,
            len(symbols))

    # Basic consistency checks. Any failure skips the remaining checks for
    # this segment.
    if not isinstance(segment_size, int):
      report_error('Section size should be a whole number.')
      continue
    if segment_size < 1:
      report_error('Section size less than one.')
      continue
    rounded_total = round(total_bytes)
    if rounded_total != segment_size:
      report_error('Sum of symbols sizes do not match section size. Sum={}',
                   rounded_total)
      continue

    padding_limit = 0.05 if is_other else 0.01
    check_size('padding', padding_bytes, padding_limit)

    # One bad symbol can mess up small containers.
    if (len(symbols) < 10 or
        _Divide(segment_size, overall_section_sizes[section_name]) < .1):
      continue

    # Dex string tables show up as placeholders.
    placeholder_limit = 0.2 if is_dex else 0.01
    size_checks = (
        ('placeholders', placeholder_bytes, placeholder_limit),
        ('symbols without names', unnamed_bytes, 0.01),
        ('symbols without source paths', pathless_bytes, 0.1),
        ('symbols without name or path', unattributed_bytes, 0.01),
        ('symbols without component', componentless_bytes, 0.20),
    )
    for kind, size, limit_fraction in size_checks:
      check_size(kind, size, limit_fraction)

    if track_string_literals and section_name == models.SECTION_RODATA:
      literal_fraction = _Divide(literal_bytes, segment_size)
      if literal_fraction < .05:
        report_error(
            'Expected more size from string literals. Found only {} ({:.1%})',
            literal_bytes, literal_fraction)

    if is_native:
      check_some_exist('symbol aliases', num_aliases)
    if is_native or is_dex:
      check_some_exist('generated symbols', num_generated)
    if section_name == models.SECTION_TEXT:
      check_some_exist('symbols annotated by AFDO profile', num_unlikely)
      check_some_exist('static initializers', num_startup)

  if not errors:
    return

  # Cap the number of log messages.
  MAX_ERRORS = 40
  logging.error('--check-data-quality Found %d errors:', len(errors))
  for msg in errors[:MAX_ERRORS]:
    logging.error('Failed: %s', msg)
  num_hidden = len(errors) - MAX_ERRORS
  if num_hidden > 0:
    logging.error('... and %d more.', num_hidden)
  raise QualityCheckError()
# TODO(agrieve): Have this utilize the stats collected by CheckDataQuality().
def _DescribeSizeInfoContainerCoverage(raw_symbols, container):
"""Yields lines describing how accurate |size_info| is."""
for section, section_name in models.SECTION_TO_SECTION_NAME.items():
expected_size = container.section_sizes.get(section_name)
in_section = raw_symbols.WhereInSection(section_name, container=container)
actual_size = in_section.size
if expected_size is None:
yield 'Section {}: {} bytes from {} symbols.'.format(
section_name, actual_size, len(in_section))
else:
size_fraction = _Divide(actual_size, expected_size)
yield ('Section {}: has {:.1%} of {} bytes accounted for from '
'{} symbols. {} bytes are unaccounted for.').format(
section_name, size_fraction, actual_size, len(in_section),
expected_size - actual_size)
padding = in_section.padding
yield '* Padding accounts for {} bytes ({:.1%})'.format(
padding, _Divide(padding, actual_size))
def size_msg(syms, show_padding=False):
size = syms.size if not show_padding else syms.size_without_padding
size_msg = 'Accounts for {} bytes ({:.1%}).'.format(
size, _Divide(size, actual_size))
if show_padding:
size_msg = size_msg[:-1] + ' padding is {} bytes.'.format(syms.padding)
return size_msg
syms = in_section.Filter(lambda s: s.source_path)
yield '* {} have source paths. {}'.format(len(syms), size_msg(syms))
syms = in_section.WhereHasComponent()
yield '* {} have a component assigned. {}'.format(len(syms), size_msg(syms))
syms = in_section.WhereIsPlaceholder()
if syms:
yield '* {} placeholders exist (symbols that start with **). {}'.format(
len(syms), size_msg(syms))
syms = syms.Inverted().WhereHasAnyAttribution().Inverted()
if syms:
yield '* {} symbols have no name or path. {}'.format(
len(syms), size_msg(syms))
if section == 'r':
syms = in_section.Filter(lambda s: s.IsStringLiteral())
yield '* {} string literals exist. {}'.format(
len(syms), size_msg(syms, show_padding=True))
syms = in_section.Filter(lambda s: s.aliases)
if syms:
uniques = sum(1 for s in syms.IterUniqueSymbols())
saved = sum(s.size_without_padding * (s.num_aliases - 1)
for s in syms.IterUniqueSymbols())
yield ('* {} aliases exist, mapped to {} unique addresses '
'({} bytes saved)').format(len(syms), uniques, saved)
syms = in_section.WhereObjectPathMatches('{shared}')
if syms:
yield '* {} symbols have shared ownership. {}'.format(
len(syms), size_msg(syms))
else:
yield '* 0 symbols have shared ownership.'
for flag, desc in ((models.FLAG_HOT, 'marked as "hot"'),
(models.FLAG_UNLIKELY, 'marked as "unlikely"'),
(models.FLAG_STARTUP,
'marked as "startup"'), (models.FLAG_CLONE, 'clones'),
(models.FLAG_GENERATED_SOURCE,
'from generated sources')):
syms = in_section.WhereHasFlag(flag)
if syms:
yield '* {} symbols are {}. {}'.format(len(syms), desc, size_msg(syms))
spam_counter = 0
i = 1
count = len(in_section)
while i < count:
prev_sym = in_section[i - 1]
sym = in_section[i]
if (not sym.full_name.startswith('*')
# Assembly symbol are iffy.
and not prev_sym.source_path.endswith('.S') and
not sym.source_path.endswith('.S')
# String literal symbol creation is imperfect.
and not prev_sym.IsStringLiteral() and not sym.IsStringLiteral()
# Thresholds found by experimenting with arm32 Chrome.
# E.g.: Set to 0 and see what warnings appear, then take max value.
and ((sym.section in 'rd' and sym.padding >= 256) or
(sym.section in 't' and sym.padding >= 64))):
# TODO(crbug.com/40626114): We should synthesize symbols for these gaps
# rather than attribute them as padding.
spam_counter += 1
if spam_counter > 5:
break
yield 'Large padding of {} between:'.format(sym.padding)
yield ' A) ' + repr(in_section[i - 1])
yield ' B) ' + repr(sym)
# All aliases will have the same padding.
i += sym.num_aliases
def DescribeSizeInfoCoverage(size_info):
  """Yields coverage report lines for every container of |size_info|.

  Reports of consecutive containers are separated by an empty line, and a
  container with a non-empty name gets a 'Container <name>' heading.

  Args:
    size_info: Object exposing |containers| and |raw_symbols|.

  Yields:
    One string per report line.
  """
  is_first = True
  for container in size_info.containers:
    if not is_first:
      yield ''
    is_first = False
    if container.name:
      yield 'Container <%s>' % container.name
    # TODO(huangs): Change to use "yield from" once linters allow this.
    lines = _DescribeSizeInfoContainerCoverage(size_info.raw_symbols, container)
    for line in lines:
      yield line