chromium/tools/dump_process_memory/analyze_dumps.py

#!/usr/bin/env python3
#
# Copyright 2018 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""From a dump generated by dump_process.cc dump, prints statistics about
its content.
"""

import argparse
import array
import collections
import hashlib
import logging
import os
import zlib

import sys
from os import path
sys.path.append('tools/android/native_lib_memory')

import parse_smaps

# Size of one memory page in bytes (4096). Dumps are processed page by page.
PAGE_SIZE = 2**12


# 32-bit marker words, each mapped to a label for the kind of freed/zapped data
# it denotes. These are typically only populated with DCHECK() on.
FREED_PATTERNS_32 = dict((
    (0xcccccccc, 'V8'),
    (0xcdcdcdcd, 'PartitionAlloc zapped'),
    (0xabababab, 'PartitionAlloc uninitialized'),
    (0xdeadbeef, 'V8 zapped'),
    (0x0baddeaf, 'V8 zapped handles'),
    (0x0baffedf, 'V8 zapped global handles'),
    (0x0beefdaf, 'V8 zapped from space'),
    (0xbeefdeef, 'V8 zapped slots'),
    (0xbadbaddb, 'V8 debug zapped'),
    (0xfeed1eaf, 'V8 zapped freelist'),
))

# 64-bit variant: the same 32-bit pattern repeated in both halves of the word.
FREED_PATTERNS_64 = dict(((pattern << 32) | pattern, description)
                         for pattern, description in FREED_PATTERNS_32.items())


def _ReadPage(f, bitness):
  """Reads the next page worth of machine words from a file.

  Args:
    f: (file) An opened binary file, positioned at the page to read.
    bitness: (int) 32 or 64 bit.

  Returns:
    An array.array() with the page content. Its typecode is 'Q' (unsigned long
    long) when bitness is 64, and 'I' (unsigned int) for any other value.

  Raises:
    EOFError: (from array.fromfile) if less than a full page is available.
  """
  typecode = 'I'
  if bitness == 64:
    typecode = 'Q'
  words = array.array(typecode)
  # The item count depends on the word size so that exactly PAGE_SIZE bytes
  # are consumed from the file.
  words_per_page = PAGE_SIZE // words.itemsize
  words.fromfile(f, words_per_page)
  return words


def _PrettyPrintSize(x):
  """Formats a size in bytes using decimal (power of 1000) units.

  Sizes below 1000 are printed as a whole number of bytes; anything larger is
  scaled to kB, MB or GB, e.g. 123456 -> 123.46kB.

  Args:
    x: (int) size

  Returns:
    (str) Pretty printed version, 2 decimal places for scaled units.
  """
  if x < 1e3:
    return '%dB' % x
  # (exclusive upper bound, divisor, unit suffix), smallest unit first.
  for upper_bound, divisor, unit in ((1e6, 1e3, 'kB'), (1e9, 1e6, 'MB')):
    if x < upper_bound:
      return '%.2f%s' % (x / divisor, unit)
  return '%.2fGB' % (x / 1e9)


class MappingStats:
  """Per-page statistics about one mapping, computed from its dump.

  All per-page lists have one entry per page of the mapping.

  Slots:
    filename: (str) Dump filename.
    start: (int) Start address of the mapping.
    end: (int) End address of the mapping.
    pages: (int) Size of the mapping in pages.
    pointers: (Counter) Number of pointer-like words found in the mapping,
      keyed by the value Ranges.Contains() returned for them (a pathname).
    is_zero: ([bool]) For each page, whether it's a zero page.
    is_present: ([bool]) For each page, whether it's present.
    is_swapped: ([bool]) For each page, whether it has been swapped out.
    compressed_size: ([int]) If a page is not zero, its compressed size
      (0 otherwise).
    hashes: ([bytes]) If a page is not zero, its SHA1 digest (None otherwise).
    freed: ({'description (str)': size (int)}) Size of freed data, per type.
  """
  __slots__ = ('filename', 'start', 'end', 'pages', 'pointers', 'is_zero',
               'is_present', 'is_swapped', 'compressed_size', 'hashes', 'freed')

  def __init__(self, filename, start, end):
    """Creates empty statistics for the mapping [start, end).

    Args:
      filename: (str) Dump filename.
      start: (int) Start address.
      end: (int) End address; end - start must be a multiple of PAGE_SIZE.
    """
    page_count, leftover_bytes = divmod(end - start, PAGE_SIZE)
    assert leftover_bytes == 0
    self.filename = filename
    self.start = start
    self.end = end
    self.pages = page_count
    self.pointers = collections.Counter()
    # Every per-page list starts with a neutral value for each page; the
    # elements are immutable so list repetition is safe.
    self.is_zero = [False] * page_count
    self.is_present = [False] * page_count
    self.is_swapped = [False] * page_count
    self.compressed_size = [0] * page_count
    self.hashes = [None] * page_count
    self.freed = collections.defaultdict(int)


class Ranges:
  """Represents a set of discontiguous address ranges.

  Allows simple operations such as looking up whether an address falls inside
  any of the ranges. The lookup is done through a dictionary with one entry
  per page of every range.

  Does not do checking to ensure that the ranges are discontiguous.
  """

  def __init__(self, ranges):
    """Indexes every page covered by the given ranges.

    Args:
      ranges: (list) Objects with `start`, `end` and `pathname` attributes.
        The list is sorted in place by start address and kept as
        `self.ranges`.
    """
    ranges.sort(key=lambda r: r.start)

    # Maps the start address of each covered page to the pathname of the
    # range it belongs to. If ranges overlap, the later one wins.
    self.valid_pages = {}
    for r in ranges:
      page_starts = range(r.start, r.end, PAGE_SIZE)
      self.valid_pages.update(dict.fromkeys(page_starts, r.pathname))

    self.ranges = ranges

  def Contains(self, val):
    """Looks up the range covering an address.

    Args:
      val: (int) Address to look up.

    Returns:
      The pathname of the range whose page contains `val`, or False if no
      range covers it. Callers test the result for truthiness, so a covering
      range with a falsy pathname (e.g. an empty string) looks like a miss.
    """
    page_start = val - (val % PAGE_SIZE)
    return self.valid_pages.get(page_start, False)


def _OpenRanges(directory, filename, pred=lambda _: True):
  """Builds a Ranges object from the mappings listed in a file.

  Args:
    directory: (str) Directory containing the file.
    filename: (str) Name of the file, parsed with
      parse_smaps._ParseProcSmapsLines().
    pred: (callable) Called with each parsed mapping; only the mappings for
      which it returns a truthy value are kept. Keeps everything by default.

  Returns:
    (Ranges) The selected mappings.
  """
  smaps_path = os.path.join(directory, filename)
  with open(smaps_path) as smaps_file:
    lines = smaps_file.readlines()

  selected = []
  for mapping in parse_smaps._ParseProcSmapsLines(lines):
    if pred(mapping):
      selected.append(mapping)
  return Ranges(selected)


def _IsPointer(x, ptr_ranges):
  """Tells whether a word looks like a pointer into one of the given ranges.

  As a side effect, each hit is echoed to stdout as '\\r0x<hex>' without a
  trailing newline, so successive hits overwrite each other on a terminal.

  Args:
    x: (int) Word to test.
    ptr_ranges: (Ranges) Ranges containing valid pointers.

  Returns:
    The (truthy) value returned by ptr_ranges.Contains(x), or False.
  """
  hit = ptr_ranges.Contains(x)
  if not hit:
    return False
  print(f'\r0x{x:x}', end='')
  return hit


def _GetFreedPatterns(bitness):
  """Returns the freed-data pattern table matching the word size.

  Args:
    bitness: (int) 32 or 64 bit.

  Returns:
    FREED_PATTERNS_64 when bitness is 64, FREED_PATTERNS_32 otherwise.
  """
  if bitness == 64:
    return FREED_PATTERNS_64
  return FREED_PATTERNS_32


def _GetStatsFromFileDump(filename, ptr_ranges, pa_ranges, bitness):
  """Computes per-dump statistics.

  The dump must be named '<pid>-<start>-<end>.dump', with decimal addresses,
  and come with a '<filename>.metadata' file holding one '[01][01]\\n' line per
  page: the first digit tells whether the page is present, the second whether
  it is swapped.

  Args:
    filename: (str) Path to the dump.
    ptr_ranges: (Ranges) Ranges containing valid pointers.
    pa_ranges: (Ranges) Ranges containing PA pointers only.
    bitness: (int) 32 or 64 bit.

  Returns:
    MappingStats for the mapping, or None if the first or the last page of the
    mapping is not covered by pa_ranges.
  """
  # Dump integrity checks.
  metadata_filename = filename + '.metadata'
  pid_start_end = os.path.basename(filename)[:-len('.dump')]
  (_, start, end) = [int(x, 10) for x in pid_start_end.split('-')]
  file_stat = os.stat(filename)
  assert start % PAGE_SIZE == 0
  assert end % PAGE_SIZE == 0
  assert file_stat.st_size == (end - start)
  metadata_file_stat = os.stat(metadata_filename)
  result = MappingStats(filename, start, end)
  # each line is [01]{2}\n, eg '10\n', 1 line per page.
  assert metadata_file_stat.st_size == 3 * result.pages

  # |end| is exclusive (the dump holds exactly end - start bytes), so the last
  # address that belongs to the mapping is end - 1. Looking up |end| itself
  # would test the page *after* the mapping and wrongly skip any mapping that
  # is not immediately followed by another PA mapping.
  last_address = max(start, end - 1)
  if not pa_ranges.Contains(start) or not pa_ranges.Contains(last_address):
    return None

  bytes_per_word = bitness // 8
  freed_patterns = _GetFreedPatterns(bitness)
  # The metadata is read as text: iterating over a bytes line yields the byte
  # values (48 for '0', 49 for '1'), which are all truthy, so every page would
  # be reported as both present and swapped.
  with open(filename, 'rb') as f, \
       open(metadata_filename, 'r', encoding='ascii') as metadata_f:
    for i in range(result.pages):
      page = _ReadPage(f, bitness)
      assert len(page) == PAGE_SIZE // bytes_per_word
      for x in page:
        if x in freed_patterns:
          result.freed[freed_patterns[x]] += bytes_per_word
        if ptr := _IsPointer(x, ptr_ranges):
          result.pointers[ptr] += 1
      is_zero = not any(page)
      present, swapped = (bool(int(x)) for x in metadata_f.readline().strip())
      # Not present, not swapped private anonymous == lazily initialized zero
      # page.
      if not present and not swapped:
        assert is_zero
      result.is_zero[i] = is_zero
      result.is_present[i] = present
      result.is_swapped[i] = swapped
      if not is_zero:
        # Only non-zero pages are hashed (to find duplicates) and compressed
        # (zlib level 1) to estimate their compressed size.
        result.hashes[i] = hashlib.sha1(page).digest()
        result.compressed_size[i] = len(zlib.compress(page, 1))
  return result


def _FindPageFromHash(mappings, page_hash):
  """Returns a page with a given hash from a list of mappings.

  The page is read back from the dump file of the first mapping that recorded
  this hash, and its content is checked against the hash.

  Args:
    mappings: ([MappingStats]) List of mappings.
    page_hash: (bytes) SHA1 digest of the page to look for.

  Returns:
    array.array of unsigned int (typecode 'I') with the page content, or None
    if no mapping contains a page with this hash.
  """
  for mapping in mappings:
    for i, candidate_hash in enumerate(mapping.hashes):
      if candidate_hash != page_hash:
        continue
      # Dumps are binary files: array.fromfile() needs bytes, so the file must
      # not be opened in text mode.
      with open(mapping.filename, 'rb') as f:
        f.seek(i * PAGE_SIZE)
        # The digest is computed over the raw bytes and so does not depend on
        # the word size. The page is read as 32-bit words to match the 8 hex
        # digit format used by _PrintPage().
        page = _ReadPage(f, 32)
      assert page_hash == hashlib.sha1(page).digest()
      return page
  return None


def _PrintPage(page):
  """Prints the content of a page as hexadecimal words, 16 per line.

  Each word is zero-padded to at least 8 hex digits and followed by a space.

  Args:
    page: (iterable of int) Words of the page.
  """
  words_per_line = 16
  for position, word in enumerate(page, start=1):
    print(f'{word:08x}', end=' ')
    if position % words_per_line == 0:
      print()


# Totals over all analyzed mappings, as built by _AggregateStats().
AggregateStats = collections.namedtuple('AggregateStats', [
    'content_to_count',
    'pages',
    'zero_pages',
    'compressed_size',
    'swapped_pages',
    'not_present_pages',
    'present_zero_pages',
    'freed',
    'pointers',
])


def _AggregateStats(dump_stats, bitness):
  """Sums up the statistics of all mappings.

  Args:
    dump_stats: ([MappingStats]) Stats from all mappings.
    bitness: (int) 32 or 64 bit.

  Returns:
    An instance of AggregateStats. Its `freed` field has an entry for every
    known freed-data type (0 when none was found), and `content_to_count`
    maps the hash of each non-zero page to the number of pages having it.
  """
  page_count = 0
  zero_page_count = 0
  compressed_bytes = 0
  swapped_page_count = 0
  absent_page_count = 0
  present_zero_page_count = 0
  freed_by_type = dict.fromkeys(_GetFreedPatterns(bitness).values(), 0)
  pointer_counts = collections.Counter()
  pages_per_content = collections.defaultdict(int)

  for stats in dump_stats:
    page_count += stats.pages
    # The per-page lists hold booleans, so summing them counts the pages
    # having the property.
    zero_page_count += sum(stats.is_zero)
    compressed_bytes += sum(stats.compressed_size)
    swapped_page_count += sum(stats.is_swapped)
    absent_page_count += stats.pages - sum(stats.is_present)
    present_zero_page_count += sum(
        flags == (True, True)
        for flags in zip(stats.is_zero, stats.is_present))
    pointer_counts += stats.pointers
    for freed_data_type, size in stats.freed.items():
      freed_by_type[freed_data_type] += size
    for page_hash in stats.hashes:
      # Zero pages have no hash.
      if page_hash:
        pages_per_content[page_hash] += 1

  return AggregateStats(content_to_count=pages_per_content,
                        pages=page_count,
                        zero_pages=zero_page_count,
                        compressed_size=compressed_bytes,
                        swapped_pages=swapped_page_count,
                        not_present_pages=absent_page_count,
                        present_zero_pages=present_zero_page_count,
                        freed=freed_by_type,
                        pointers=pointer_counts)


def _Percentage(part, whole):
  """Returns `part` as a percentage of `whole`, or 0.0 if `whole` is zero."""
  return (100. * part) / whole if whole else 0.


def PrintStats(dumps, directory, verbose, bitness):
  """Prints statistics about a process mappings dump.

  Only the dumps lying inside a '[anon:partition_alloc]' mapping of
  <directory>/smaps.txt are taken into account. Handles the case where no dump
  or no non-zero page is left to report on.

  Args:
    dumps: ([str]) List of dumps.
    directory: (str) Directory containing dumps.
    verbose: (bool) Verbose output: also prints the content of the (up to) 10
      most duplicated pages.
    bitness: (int) 32 or 64 bit.
  """
  ptr_ranges = _OpenRanges(
      directory, 'smaps.txt', lambda mapping: mapping.permissions != '---p')
  pa_ranges = _OpenRanges(
      directory,
      'smaps.txt', lambda mapping: mapping.pathname == '[anon:partition_alloc]')
  dump_stats = [
      _GetStatsFromFileDump(filename, ptr_ranges, pa_ranges, bitness)
      for filename in dumps
  ]
  # _GetStatsFromFileDump() returns None for dumps outside of pa_ranges.
  dump_stats = [dump_stat for dump_stat in dump_stats if dump_stat is not None]
  total = _AggregateStats(dump_stats, bitness)
  duplicated_pages = sum(x - 1 for x in total.content_to_count.values())
  # Most frequent page content first.
  count_and_hashes = sorted(((v, k) for k, v in total.content_to_count.items()),
                            reverse=True)
  # There may be no non-zero page at all (e.g. every dump was filtered out).
  max_common_pages = count_and_hashes[0][0] - 1 if count_and_hashes else 0
  total_size_non_zero_pages = (total.pages - total.zero_pages) * PAGE_SIZE

  # Moves back to the start of the line, over the progress output written
  # while scanning for pointers.
  print('\r', end='')

  print('Total pages = %d (%s)' % (total.pages,
                                   _PrettyPrintSize(total.pages * PAGE_SIZE)))
  print('Total zero pages = %d (%.02f%%)' %
        (total.zero_pages, _Percentage(total.zero_pages, total.pages)))
  print('Total present zero pages = %d (%s)' %
        (total.present_zero_pages,
         _PrettyPrintSize(total.present_zero_pages * PAGE_SIZE)))
  print(
      'Total size of non-zero pages = %d (%s)' %
      (total_size_non_zero_pages, _PrettyPrintSize(total_size_non_zero_pages)))
  print('Total compressed size = %d (%.02f%%)' %
        (total.compressed_size,
         _Percentage(total.compressed_size, total_size_non_zero_pages)))
  print('Duplicated non-zero pages = %d' % duplicated_pages)
  print('Max non-zero pages with the same content = %d' % max_common_pages)
  print(
      'Swapped pages = %d (%s)' %
      (total.swapped_pages, _PrettyPrintSize(total.swapped_pages * PAGE_SIZE)))
  print('Non-present pages = %d (%s)' %
        (total.not_present_pages,
         _PrettyPrintSize(total.not_present_pages * PAGE_SIZE)))
  print('Freed: ')
  for k in total.freed:
    print('  %s = %d (%s)' % (k, total.freed[k], _PrettyPrintSize(
        total.freed[k])))

  # Equivalent to Counter.total(), which is only available from Python 3.10.
  print(f'# Pointers: {sum(total.pointers.values())}')
  for pathname, count in total.pointers.most_common(10):
    print(f'{pathname}: {count}')

  if verbose:
    print('Top Duplicated Pages:')
    # Slicing copes with fewer than 10 distinct pages.
    for count, page_hash in count_and_hashes[:10]:
      print('%d common pages' % count)
      page = _FindPageFromHash(dump_stats, page_hash)
      _PrintPage(page)
      print()


def _CreateArgumentParser():
  """Creates the command-line parser.

  Returns:
    (argparse.ArgumentParser) Parser accepting --directory (required),
    --bitness (required, 32 or 64) and the optional --verbose flag.
  """
  parser = argparse.ArgumentParser()
  parser.add_argument('--directory', type=str, required=True,
                      help='Dumps directory')
  parser.add_argument('--verbose', action='store_true',
                      help='Also print the content of the most duplicated '
                      'pages')
  # A tuple rather than a set, so that the choices are always listed in the
  # same order in the help and error messages.
  parser.add_argument('--bitness',
                      choices=(32, 64),
                      type=int,
                      required=True,
                      help='Whether the dump is from a 64 or 32 build')
  return parser


def main():
  """Analyzes every '*.dump' file of the directory given on the command line."""
  logging.basicConfig(level=logging.INFO)
  args = _CreateArgumentParser().parse_args()

  dumps = [
      os.path.join(args.directory, entry)
      for entry in os.listdir(args.directory)
      if entry.endswith('.dump')
  ]

  PrintStats(dumps, args.directory, args.verbose, args.bitness)


if __name__ == '__main__':
  main()