chromium/tools/binary_size/libsupersize/r8_disassembly.py

#!/usr/bin/env python3
# Copyright 2022 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Parses R8 disassembly outputs."""

import argparse
import code
import collections
import dataclasses
import logging
import readline  # Makes code.InteractiveConsole works better.

# R8 disassembly performs best-effort deobfuscation of symbols while dumping
# DEX bytecode for each class, each method. The high-level structure (in pseudo
# RegExp) is "H(CM*)*", where H = Header (not parsed); C = Class info;
# M = Method info.
#
# Class info has ({} = data; {{}} = comment; else literals) format:
#
#   # Bytecode for
#   # Class: "{class_name}"
#
# Method info has legacy format (pre 2022-11-10):
#
#   #
#   # Method: "{method_name}":
#   # {access_flags...}
#   #
#
# {{The rest are optional}}
#   {return_type} {class_name}.{method_name}({param_types})  {{= signature}}
#   registers: {...}, inputs: {...}, outputs: {...}
#   {bytecode...} {{multiple lines}}}
#
# Note that {signature} may be broken, and become:
#   {class_name} {method_name}
#
# Method info, has updated format (post 2022-11-10):
#
#   #
#   # Method: "{return_type} {class_name}.{method_name}({param_types})":
#   # {access_flags...}
#   # Residual: "{{Optional line: Similar to Method but obfuscated}}"
#   #
#
# {{Same optional lines as legacy format}}
#
# Notes:
# * Blank lines separate Class info and Method info blocks
# * {param_types...} is delimited by ", ", and may be empty.
# * {access_flags...} is a space-delimited list of words (unused by SuperSize).
# * {bytecode...} may have blank lines, but no lines starting with "#".
#
# Caveats:
# * If ambiguity exists, alternatives are delimited by " <OR> ".
# * Updated format for method info:
#   * "Method:" can be broken, just like {signature}.
#   * Signature data, if it exists, is redundant since it's just "Method:" data.
# * "Residual:" data might be inconsistent with "Method:". Therefore
#   heuristics should be applied to reconcile these.


@dataclasses.dataclass
class DexMethod:
  name: str
  class_name: str
  param_types: list = None
  return_type: str = None
  bytecode: list = None


class DexClass:
  def __init__(self, name):
    self.name = name
    self.methods = []

  def FindMethodByteCode(self, class_name, method_name, param_types,
                         return_type):
    for method in self.methods:
      if (method.name == method_name and method.class_name == class_name
          and method.return_type == return_type
          and method.param_types == param_types):
        return method.bytecode
    return None


class _WrapPeekableNoNewLine:
  """Line iterator decorator with peek(), and strips new line."""

  def __init__(self, it):
    self._it = it
    self._peek_lineno = 1
    self._buf = None  # Look-ahead cache to support peek().
    self._cur = None  # Most recent next() value.

  def _next_internal(self):
    ret = next(self._it, None)
    return None if ret is None else ret.rstrip('\n')

  def __next__(self):
    if self._buf is None:
      self._cur = self._next_internal()
    else:
      self._cur, self._buf = self._buf, None
    self._peek_lineno += 1
    return self._cur

  def peek(self):
    if self._buf is None:
      self._buf = self._next_internal()
    return self._buf

  def format_error(self, expected, is_peek=False):
    return ' '.join(('Line %d:' % (self._peek_lineno - int(not is_peek)),
                     'Expected %s,' % expected,
                     'got %r.' % (self.peek() if is_peek else self._cur)))


# pylint: disable=stop-iteration-return
def _ExtractMethodInfo(it):
  """Extracts coarse method data from R8 DEX dump.

  "Coarse" meaning detailed Method info data are left unparsed.

  Args:
    it: _WrapPeekableNoNewLine iterator for lines of R8 DEX dump.

  Yields:
    method_str: De-quoted string after "# Methods:".
    residual_str: De-quoted string after "# Residual:".
    signature: Signature string.
    byte_code: Lines of disassembled code (blank lines removed).
  """

  def is_end():
    return it.peek() is None or it.peek().startswith('# Class:')

  method_str = None
  while True:
    method_str = ''
    residual_str = ''
    signature = ''
    byte_code = []
    # Seek and read "Method:", possibly exit.
    while not is_end():
      if it.peek().startswith('# Method:'):
        break
      next(it)
    else:
      break
    method_str = next(it).split('\'')[1]
    # Skip access flags, e.g., "# public abstract".
    line = next(it)
    assert line.startswith('# '), it.format_error('comment with access flags')
    # Read "Residual:", made optional for compatibility.
    line = next(it)
    if line.startswith('# Residual:'):
      residual_str = line.split('\'')[1]
      line = next(it)
    # Eat line "#".
    assert line == '#', it.format_error('empty comment')
    # Eat blank line.
    assert next(it) == '', it.format_error('empty line')
    # Seek and read signature, possibly advance to next or exit.
    if it.peek() is None:
      break
    if it.peek().startswith('#'):
      yield (method_str, residual_str, signature, byte_code)
      continue
    assert it.peek() != '', it.format_error('comment or signature', True)
    signature = next(it)
    # Accumulate code, advance to next or exit.
    assert it.peek().startswith('registers:'), it.format_error(
        'registers/inputs/outputs', True)
    while not is_end():
      if it.peek().startswith('#'):
        break
      line = next(it)
      if line:
        byte_code.append(line + '\n')
    else:
      break
    yield (method_str, residual_str, signature, byte_code)

  if method_str:
    yield (method_str, residual_str, signature, byte_code)


def _ExtractClassNameAndMethodInfos(r8_lines_it):
  """Extracts Class and coarse Method info of R8 DEX dump.

  Args:
    r8_lines_it: Iterator for lines of R8 DEX dump.

  Yields:
    class_name: Name of classes.
    method_info: List of coarse Method info from _ExtractMethodInfo().
  """
  it = _WrapPeekableNoNewLine(r8_lines_it)

  while it.peek() is not None:
    line = next(it)
    if not line.startswith('# Class:'):
      continue
    yield line.split('\'')[1], list(_ExtractMethodInfo(it))


def _SplitMethod(signature):
  """Splits a method signature into components.

  Args:
    signature: "{return_type} {class_name}.{method_name}({param_types})".

  Returns:
    return_type: String for "{return_type}".
    class_name: String for "{class_name}".
    method_name: Sting for "{method_name}".
    param_types: List of strings for "{param_types}".
  """
  return_type_name, param_str = signature[:-1].split('(')
  return_type, method_full_name = return_type_name.split(' ')
  last_dot_pos = method_full_name.rfind('.')
  if last_dot_pos < 0:
    class_name = ''
    method_name = method_full_name
  else:
    class_name = method_full_name[:last_dot_pos]
    method_name = method_full_name[last_dot_pos + 1:]
  param_types = param_str.split(', ') if param_str else []
  return return_type, class_name, method_name, param_types


def _ExtractMethodLegacy(method_name, signature):
  """Extracts method info for legacy format.

  Returns: Same as _SplitMethod().
  """
  (return_type, class_name, _, param_types) = _SplitMethod(signature)
  return return_type, class_name, method_name, param_types


def _CompareSignatures(return_type1, param_types1, return_type2, param_types2):
  """Computes similarity score of two method signatures."""
  # Compute score component of return type.
  ret_score = 1 if return_type1 == return_type2 else 0
  # Compute score component of param types.
  if param_types1 == param_types2:
    # Fast path for common case (identical types).
    param_score = 1.0
  else:
    size = min(len(param_types1), len(param_types2))
    if size == 0:
      # Empty params vs. non-empty params: Score 0. Also prevents divide by 0.
      param_score = 0.0
    else:
      # Favor (normalized) param type matches in the same positions.
      pos_matches = sum(param_types1[i] == param_types2[i] for i in range(size))
      pos_matches /= size
      # Favor (normalized) param type multi-set matches.
      counter1 = collections.Counter(param_types1)
      counter2 = collections.Counter(param_types2)
      multi_set_matches = sum((counter1 & counter2).values()) / size
      # Penalize mismatched number of params.
      diff = abs(len(param_types1) - len(param_types2))
      # Estimated weighing factors.
      param_score = pos_matches * 0.6 + multi_set_matches * 0.3 - diff * 0.02
  return ret_score * 0.1 + param_score * 0.9


def _ExtractMethod(method_str, residual_str):
  """Extracts method info for updated format.

  Returns: Same as _SplitMethod().
  """
  residual = _SplitMethod(residual_str) if residual_str else None
  alternatives = method_str.split(' <OR> ')

  if alternatives[0].endswith(')'):
    # {return_type} {class_name}.{method_name}({param_types})
    assert all(f.endswith(')') for f in alternatives)
    method_parts = [_SplitMethod(f) for f in alternatives]
    scores = [(_CompareSignatures(method_parts[i][0], method_parts[i][3],
                                  residual[0], residual[3]), i)
              for i in range(len(alternatives))]
    return method_parts[max(scores)[1]]

  # Broken: {class_name} {method_name}
  assert len(alternatives) == 1
  if residual:
    return residual
  assert False, 'Failed to extract method.'
  return None


def Parse(lines):
  """Parses R8 disassembly lines into DexClass.

  Args:
    lines: Iterator for lines of R8 DEX dump.

  Returns:
    class_obj_map: Dict from class name to DexClass instances.
    anomalies: List [(class_name, extracted_class_name, method_name)] with
      unexplained mismatch between "{class_name}" from "Class:" and class name
      extracted from method signatures.
  """
  class_obj_map = {}
  total_methods = 0
  count_with_code = 0
  count_synthetics = 0
  warning_quota_signature = 8
  warning_quota_synthetic = 8
  anomalies = []
  for class_name, method_infos in _ExtractClassNameAndMethodInfos(lines):
    base_class_name = class_name.split('$$')[0]
    class_obj = DexClass(class_name)
    class_obj_map[class_name] = class_obj
    for method_str, residual_str, signature, byte_code in method_infos:
      total_methods += 1
      if not byte_code:
        continue
      count_with_code += 1
      if ' ' not in method_str:
        return_type, extracted_class_name, method_name, param_types = (
            _ExtractMethodLegacy(method_str, signature))
      else:
        if signature not in ('', method_str):
          if warning_quota_signature > 0:
            warning_quota_signature -= 1
            logging.warning('Found signature not matching "# Method":')
            logging.warning('  %s', signature)
            logging.warning('  %s', method_str)
        return_type, extracted_class_name, method_name, param_types = (
            _ExtractMethod(method_str, residual_str))
      if extracted_class_name != class_name:
        if '.' not in extracted_class_name:
          # |extracted_class_name| is obfuscated,
          pass
        elif extracted_class_name.split('$$')[0] == base_class_name:
          # Name aftet "$$" changed: These are synthetics examples showing
          # "true value" from |class_name| vs. "extracted value":
          # * org.chromium.base.FileUtils$$ExternalSyntheticLambda0 vs.
          #   org.chromium.base.FileUtils$$InternalSyntheticLambda$1${...}$0
          if 'Synthetic' not in class_name:
            if warning_quota_synthetic > 0:
              warning_quota_synthetic -= 1
              logging.warning('Found "$$" in non-synthetic class: %s',
                              class_name)
          count_synthetics += 1
        else:
          # Anomalies: Might be from class merging, but other oddities exist:
          # * androidx.activity.ComponentActivity vs.
          #   androidx.core.app.ComponentActivity;
          # * androidx.recyclerview.widget.SimpleItemAnimator vs.
          #   androidx.recyclerview.widget.RecyclerView$ItemAnimator
          # * kotlin.sequences.SequencesKt vs.
          #   kotlin.sequences.SequencesKt___SequencesKt
          # * com.google.android.play.core.splitinstall.a vs. (!)
          #   com.google.android.play.core.splitinstall.SplitInstallSessionState
          anomalies.append((class_name, extracted_class_name, method_name))
      method_obj = DexMethod(method_name, class_name, param_types, return_type,
                             byte_code)
      class_obj.methods.append(method_obj)
  logging.debug(
      'R8 disassembler method stats: Found %d total, %d with code, '
      '%d affected by synthetics, %d anomalies.', total_methods,
      count_with_code, count_synthetics, len(anomalies))
  return class_obj_map, anomalies


def main():
  parser = argparse.ArgumentParser()
  parser.add_argument('input',
                      type=str,
                      help='File containing outputs of deobfuscated R8 '
                      'disassembly output.')
  args = parser.parse_args()
  with open(args.input, 'rt', encoding='utf-8') as fh:
    class_obj_map, anomalies = Parse(fh)
  variables = {'class_obj_map': class_obj_map, 'anomalies': anomalies}
  banner = []
  banner.append('=' * 80)
  banner.append('class_obj_map: {method: DexClass obj}')
  banner.append('anomalies: [(class_name, extracted_class_name, method_name)]')
  code.InteractiveConsole(variables).interact('\n'.join(banner))


if __name__ == '__main__':
  main()