pe_summarize.py | Explore in Territory

# Copyright 2016 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""
Parse information about a PE file to summarize the on-disk and
in-memory sizes of the sections, in decimal MB instead of in hex. This
script will also automatically display diffs between two files if they
have the same name. This script relies on having VS 2015 installed and is used
to help investigate binary size regressions and improvements.

Section information printed by dumpbin looks like this:

SECTION HEADER #2
  .rdata name
  5CCD56 virtual size
 1CEF000 virtual address (11CEF000 to 122BBD55)
  5CCE00 size of raw data
 1CEE000 file pointer to raw data (01CEE000 to 022BADFF)
       0 file pointer to relocation table
       0 file pointer to line numbers
       0 number of relocations
       0 number of line numbers
40000040 flags
         Initialized Data
         Read Only

The reports generated by this script look like this:

> python tools\win\pe_summarize.py out\release\chrome.dll
Size of out\release\chrome.dll is 41.190912 MB
      name:   mem size  ,  disk size
     .text: 33.199959 MB
    .rdata:  6.170416 MB
     .data:  0.713864 MB,  0.270336 MB
      .tls:  0.000025 MB
  CPADinfo:  0.000036 MB
   .rodata:  0.003216 MB
  .crthunk:  0.000064 MB
    .gfids:  0.001052 MB
    _RDATA:  0.000288 MB
     .rsrc:  0.130808 MB
    .reloc:  1.410172 MB

Note that the .data section has separate in-memory and on-disk sizes due to
zero-initialized data. Other sections have smaller discrepancies - the disk size
is only printed if it differs from the memory size by more than 512 bytes.

Note that many of the sections - such as .text, .rdata, and .rsrc - are shared
between processes. Some sections - such as .reloc - are discarded after a
process is loaded. Other sections, such as .data, produce private pages and are
therefore objectively 'worse' than the others.
"""

from __future__ import print_function

import os
import subprocess
import sys


def _FindSection(section_list, section_name):
    for i in range(len(section_list)):
        if section_name == section_list[i][0]:
            return i
    return -1


def main():
    if len(sys.argv) < 2:
        print(r'Usage: %s PEFileName [OtherPeFileNames...]' % sys.argv[0])
        print(r'Sample: %s chrome.dll' % sys.argv[0])
        print(r'Sample: %s chrome.dll original\chrome.dll' % sys.argv[0])
        return 0

    # Track the name of the last PE (Portable Executable) file to be processed -
    # file name only, without the path.
    last_pe_filepart = ""

    for pe_path in sys.argv[1:]:
        results = []
        if not os.path.exists(pe_path):
            print('%s does not exist!' % pe_path)
            continue

        print('Size of %s is %1.6f MB' %
              (pe_path, os.path.getsize(pe_path) / 1e6))
        print('%10s:  %9s  ,  %9s' % ('name', 'mem size', 'disk size'))

        sections = None
        # Pass the undocumented /nopdb header to avoid hitting symbol servers
        # for the entrypoint name.
        command = 'dumpbin.exe /nopdb /headers "%s"' % pe_path
        try:
            for line in subprocess.check_output(command).decode().splitlines():
                if line.startswith('SECTION HEADER #'):
                    sections = []
                elif type(sections) == type([]):
                    # We must be processing a section header.
                    sections.append(line.strip())
                    # When we've accumulated four lines of data, process them.
                    if len(sections) == 4:
                        name, memory_size, _, disk_size = sections
                        assert name.count('name') == 1
                        assert memory_size.count('virtual size') == 1
                        assert disk_size.count('size of raw data') == 1
                        name = name.split()[0]
                        memory_size = int(memory_size.split()[0], 16)
                        disk_size = int(disk_size.split()[0], 16)
                        # Print the sizes in decimal MB. This makes large
                        # numbers easier to understand - 33.199959 is easier to
                        # read than 33199959. Decimal MB is used to allow simple
                        # conversions to a precise number of bytes.
                        if abs(memory_size - disk_size) < 512:
                            print('%10s: %9.6f MB' % (name, memory_size / 1e6))
                        else:
                            print('%10s: %9.6f MB, %9.6f MB' %
                                  (name, memory_size / 1e6, disk_size / 1e6))
                        results.append((name, memory_size))
                        sections = None
        except WindowsError as error:
            if error.winerror == 2:
                print(
                    r'Cannot find dumpbin. Run "C:\Program Files\Microsoft '
                    r'Visual Studio\2022\Professional\VC\Auxiliary\Build'
                    r'\vcvarsall.bat amd64" or similar to add dumpbin to the '
                    r'path.')
            else:
                print(error)
            break

        print()
        pe_filepart = os.path.split(pe_path)[1]
        if pe_filepart.lower() == last_pe_filepart.lower():
            # Print out the section-by-section size changes, for memory sizes
            # only.
            print('Memory size change from %s to %s' % (last_pe_path, pe_path))
            total_delta = 0
            for i in range(len(results)):
                section_name = results[i][0]
                # Find a matching section name. Mismatches can occur when
                # comparing 32-bit and 64-bit binaries. They can also occur when
                # one of the binaries pulls in code that defines custom sections
                # such as .rodata.
                last_i = _FindSection(last_results, section_name)
                delta = results[i][1]
                if last_i >= 0:
                    delta -= last_results[last_i][1]
                total_delta += delta
                if delta:
                    print('%12s: %7d bytes change' % (section_name, delta))
            for last_i in range(len(last_results)):
                section_name = last_results[last_i][0]
                # Find sections that exist only in last_results.
                i = _FindSection(results, section_name)
                if i < 0:
                    delta = -last_results[last_i][1]
                    total_delta += delta
                    print('%12s: %7d bytes change' % (section_name, delta))
            print('Total change: %7d bytes' % total_delta)
        last_pe_filepart = pe_filepart
        last_pe_path = pe_path
        last_results = results


if __name__ == '__main__':
    sys.exit(main())
chromium/tools/win/pe_summarize.py