dalvik_bytecode.py | Explore in Territory

# Copyright 2022 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Utilities for parsing Dalvik bytecode."""

import collections
import struct

# Dalvik bytecode specs, copied from the first two columns of the table in:
#   https://source.android.com/docs/core/runtime/dalvik-bytecode#instructions
# with minor modifications (truncating comments).
#
# Each line takes one of three forms, which _ParseByteCodeSpecs() relies on:
#   'OP FMT  name params'      A standalone op, e.g.: '01 12x  move vA, vB'.
#   'LO..HI FMT  kind params'  A range of ops sharing one format and params.
#   'OP: name'                 The name of one op of the preceding range.
# OP, LO and HI are 2-digit hex opcodes. The first digit of FMT is the
# instruction size in 16-bit code units. Text starting at ' (' is treated as a
# comment, so '(unused)' ops get a format but no name or params.
_DALVIK_BYTECODE_SPECS = """00 10x  nop
01 12x  move vA, vB
02 22x  move/from16 vAA, vBBBB
03 32x  move/16 vAAAA, vBBBB
04 12x  move-wide vA, vB
05 22x  move-wide/from16 vAA, vBBBB
06 32x  move-wide/16 vAAAA, vBBBB
07 12x  move-object vA, vB
08 22x  move-object/from16 vAA, vBBBB
09 32x  move-object/16 vAAAA, vBBBB
0a 11x  move-result vAA
0b 11x  move-result-wide vAA
0c 11x  move-result-object vAA
0d 11x  move-exception vAA
0e 10x  return-void
0f 11x  return vAA
10 11x  return-wide vAA
11 11x  return-object vAA
12 11n  const/4 vA, #+B
13 21s  const/16 vAA, #+BBBB
14 31i  const vAA, #+BBBBBBBB
15 21h  const/high16 vAA, #+BBBB0000
16 21s  const-wide/16 vAA, #+BBBB
17 31i  const-wide/32 vAA, #+BBBBBBBB
18 51l  const-wide vAA, #+BBBBBBBBBBBBBBBB
19 21h  const-wide/high16 vAA, #+BBBB000000000000
1a 21c  const-string vAA, string@BBBB
1b 31c  const-string/jumbo vAA, string@BBBBBBBB
1c 21c  const-class vAA, type@BBBB
1d 11x  monitor-enter vAA
1e 11x  monitor-exit vAA
1f 21c  check-cast vAA, type@BBBB
20 22c  instance-of vA, vB, type@CCCC
21 12x  array-length vA, vB
22 21c  new-instance vAA, type@BBBB
23 22c  new-array vA, vB, type@CCCC
24 35c  filled-new-array {vC, vD, vE, vF, vG}, type@BBBB
25 3rc  filled-new-array/range {vCCCC .. vNNNN}, type@BBBB
26 31t  fill-array-data vAA, +BBBBBBBB (with supplemental data...)
27 11x  throw vAA
28 10t  goto +AA
29 20t  goto/16 +AAAA
2a 30t  goto/32 +AAAAAAAA
2b 31t  packed-switch vAA, +BBBBBBBB (with supplemental data...)
2c 31t  sparse-switch vAA, +BBBBBBBB (with supplemental data...)
2d..31 23x  cmpkind vAA, vBB, vCC
2d: cmpl-float (lt bias)
2e: cmpg-float (gt bias)
2f: cmpl-double (lt bias)
30: cmpg-double (gt bias)
31: cmp-long
32..37 22t  if-test vA, vB, +CCCC
32: if-eq
33: if-ne
34: if-lt
35: if-ge
36: if-gt
37: if-le
38..3d 21t  if-testz vAA, +BBBB
38: if-eqz
39: if-nez
3a: if-ltz
3b: if-gez
3c: if-gtz
3d: if-lez
3e..43 10x  (unused)
44..51 23x  arrayop vAA, vBB, vCC
44: aget
45: aget-wide
46: aget-object
47: aget-boolean
48: aget-byte
49: aget-char
4a: aget-short
4b: aput
4c: aput-wide
4d: aput-object
4e: aput-boolean
4f: aput-byte
50: aput-char
51: aput-short
52..5f 22c  iinstanceop vA, vB, field@CCCC
52: iget
53: iget-wide
54: iget-object
55: iget-boolean
56: iget-byte
57: iget-char
58: iget-short
59: iput
5a: iput-wide
5b: iput-object
5c: iput-boolean
5d: iput-byte
5e: iput-char
5f: iput-short
60..6d 21c  sstaticop vAA, field@BBBB
60: sget
61: sget-wide
62: sget-object
63: sget-boolean
64: sget-byte
65: sget-char
66: sget-short
67: sput
68: sput-wide
69: sput-object
6a: sput-boolean
6b: sput-byte
6c: sput-char
6d: sput-short
6e..72 35c  invoke-kind {vC, vD, vE, vF, vG}, meth@BBBB
6e: invoke-virtual
6f: invoke-super
70: invoke-direct
71: invoke-static
72: invoke-interface
73 10x  (unused)
74..78 3rc  invoke-kind/range {vCCCC .. vNNNN}, meth@BBBB
74: invoke-virtual/range
75: invoke-super/range
76: invoke-direct/range
77: invoke-static/range
78: invoke-interface/range
79..7a 10x  (unused)
7b..8f 12x  unop vA, vB
7b: neg-int
7c: not-int
7d: neg-long
7e: not-long
7f: neg-float
80: neg-double
81: int-to-long
82: int-to-float
83: int-to-double
84: long-to-int
85: long-to-float
86: long-to-double
87: float-to-int
88: float-to-long
89: float-to-double
8a: double-to-int
8b: double-to-long
8c: double-to-float
8d: int-to-byte
8e: int-to-char
8f: int-to-short
90..af 23x  binop vAA, vBB, vCC
90: add-int
91: sub-int
92: mul-int
93: div-int
94: rem-int
95: and-int
96: or-int
97: xor-int
98: shl-int
99: shr-int
9a: ushr-int
9b: add-long
9c: sub-long
9d: mul-long
9e: div-long
9f: rem-long
a0: and-long
a1: or-long
a2: xor-long
a3: shl-long
a4: shr-long
a5: ushr-long
a6: add-float
a7: sub-float
a8: mul-float
a9: div-float
aa: rem-float
ab: add-double
ac: sub-double
ad: mul-double
ae: div-double
af: rem-double
b0..cf 12x  binop/2addr vA, vB
b0: add-int/2addr
b1: sub-int/2addr
b2: mul-int/2addr
b3: div-int/2addr
b4: rem-int/2addr
b5: and-int/2addr
b6: or-int/2addr
b7: xor-int/2addr
b8: shl-int/2addr
b9: shr-int/2addr
ba: ushr-int/2addr
bb: add-long/2addr
bc: sub-long/2addr
bd: mul-long/2addr
be: div-long/2addr
bf: rem-long/2addr
c0: and-long/2addr
c1: or-long/2addr
c2: xor-long/2addr
c3: shl-long/2addr
c4: shr-long/2addr
c5: ushr-long/2addr
c6: add-float/2addr
c7: sub-float/2addr
c8: mul-float/2addr
c9: div-float/2addr
ca: rem-float/2addr
cb: add-double/2addr
cc: sub-double/2addr
cd: mul-double/2addr
ce: div-double/2addr
cf: rem-double/2addr
d0..d7 22s  binop/lit16 vA, vB, #+CCCC
d0: add-int/lit16
d1: rsub-int (reverse subtract)
d2: mul-int/lit16
d3: div-int/lit16
d4: rem-int/lit16
d5: and-int/lit16
d6: or-int/lit16
d7: xor-int/lit16
d8..e2 22b  binop/lit8 vAA, vBB, #+CC
d8: add-int/lit8
d9: rsub-int/lit8
da: mul-int/lit8
db: div-int/lit8
dc: rem-int/lit8
dd: and-int/lit8
de: or-int/lit8
df: xor-int/lit8
e0: shl-int/lit8
e1: shr-int/lit8
e2: ushr-int/lit8
e3..f9 10x  (unused)
fa 45cc invoke-polymorphic {vC, vD, vE, vF, vG}, meth@BBBB, proto@HHHH
fb 4rcc invoke-polymorphic/range {vCCCC .. vNNNN}, meth@BBBB, proto@HHHH
fc 35c  invoke-custom {vC, vD, vE, vF, vG}, call_site@BBBB
fd 3rc  invoke-custom/range {vCCCC .. vNNNN}, call_site@BBBB
fe 21c  const-method-handle vAA, method_handle@BBBB
ff 21c  const-method-type vAA, proto@BBBB
"""

# Description of a single opcode, as parsed from _DALVIK_BYTECODE_SPECS:
#   op: Opcode value (int).
#   size: Instruction size in bytes (first digit of |format|, times 2).
#   format: Instruction format ID, e.g.: '21h'.
#   name: Mnemonic, e.g.: 'const/high16'; None for unused opcodes.
#   params: Parameter spec, e.g.: 'vAA, #+BBBB0000'; '' if there are no
#     parameters; None for unused opcodes.
DalvikByteCode = collections.namedtuple(
    'DalvikByteCode', ['op', 'size', 'format', 'name', 'params'])


def _ParseByteCodeSpecs():
  """Parses _DALVIK_BYTECODE_SPECS into a list of DalvikByteCode.

  Each line, after truncating any comment that starts at ' (', is one of:
    * Standalone op, e.g.: '15 21h  const/high16 vAA, #+BBBB0000'.
    * Op range, e.g.: 'b0..cf 12x  binop/2addr vA, vB'. This assigns the format
      and params to every op in the range; the range's own name is not kept.
    * Op inside the latest range, e.g.: 'b0: add-int/2addr'. This names one op.
  Entries that have nothing after the format (i.e., '(unused)' ones) only get
  a format: their name and params stay None. Named entries without params get
  '' as params.

  Returns:
    A list of 256 DalvikByteCode tuples, where element |op| describes opcode
    |op|.
  """
  format_map = [None] * 256
  name_map = [None] * 256
  params_map = [None] * 256
  # Bounds of the op range whose ops are currently being named, if any.
  op_lo = op_hi = None
  for line in _DALVIK_BYTECODE_SPECS.splitlines():
    # Truncate comments, e.g.: ' (lt bias)' or ' (unused)'.
    line = line.partition(' (')[0]
    assert len(line) >= 5, line
    if line[2] == ':':
      # Inside op range, e.g.: 'b0: add-int/2addr'.
      # ['b0', 'add-int/2addr'].
      toks = line.split(': ')
      assert len(toks) == 2, line
      op = int(toks[0], 16)
      # Test for None first so that a line outside of any range fails this
      # assert, rather than raising TypeError from comparing against None.
      assert op_lo is not None and op_lo <= op <= op_hi, line
      name_map[op] = toks[1]  # 'add-int/2addr'.
      if op == op_hi:
        op_lo = op_hi = None
      continue

    # Op range or standalone op, which share the same column layout, e.g.:
    # ['b0..cf', '12x', 'binop/2addr', 'vA, vB'] or
    # ['15', '21h', 'const/high16', 'vAA, #+BBBB0000'].
    toks = line.split(maxsplit=3)
    fmt = toks[1]  # '12x'.
    if len(toks) <= 2:  # If unused.
      params = None
    else:
      # Named entries need not have params (e.g., 'nop'), for ranges too.
      params = toks[3] if len(toks) >= 4 else ''

    if line[2:4] == '..':
      # (0xb0, 0xcf).
      op_lo, op_hi = (int(t, 16) for t in toks[0].split('..'))
      for op in range(op_lo, op_hi + 1):
        format_map[op] = fmt
        params_map[op] = params
    else:
      op = int(toks[0], 16)
      format_map[op] = fmt
      params_map[op] = params
      if len(toks) > 2:  # If not unused.
        name_map[op] = toks[2]  # 'const/high16'.

  # The first digit of the format is the size in 2-byte units: '21h' -> 4.
  return [
      DalvikByteCode(op, int(fmt[0]) * 2, fmt, name_map[op], params_map[op])
      for op, fmt in enumerate(format_map)
  ]


# Opcode table: DALVIK_INSTRUCTIONS[op] is the DalvikByteCode for opcode |op|
# (0 to 255). Parsed once, when this module is loaded.
DALVIK_INSTRUCTIONS = _ParseByteCodeSpecs()


def Split(insns):
  """Yields the bytes of each Dalvik instruction found in `insns`, in order.

  No disassembly takes place: every yielded item is simply the slice of `insns`
  that spans one instruction, so filtering and/or decoding is left to the
  caller. Supplemental data (referenced by the 31t instructions
  {fill-array-data, packed-switch, sparse-switch}) is assumed to be located at
  the end of `insns`. It is detected using the offsets stored in those
  instructions, and is not emitted.

  Args:
    insns: Even-length bytearray data containing valid Dalvik code.

  Yields:
    Slices of `insns`, one per instruction.
  """
  code_end = len(insns)
  assert code_end % 2 == 0
  cursor = 0
  while cursor < code_end:
    bytecode = DALVIK_INSTRUCTIONS[insns[cursor]]
    next_cursor = cursor + bytecode.size
    piece = insns[cursor:next_cursor]
    if bytecode.format == '31t':
      # Bytes 2 to 5 hold the location of the supplemental data, as a
      # little-endian count of 2-byte units relative to this instruction. Code
      # cannot extend past that location, so stop there at the latest.
      (data_offset, ) = struct.unpack_from('<L', piece, 2)
      code_end = min(code_end, cursor + data_offset * 2)
    yield piece
    cursor = next_cursor
  # Whatever lies at or after `code_end` is supplemental data: not emitted.
chromium/tools/binary_size/libsupersize/dalvik_bytecode.py