aes_nohw.c.inc | Explore in Territory

/* Copyright (c) 2019, Google Inc.
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
 * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
 * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */

#include <openssl/aes.h>

#include <assert.h>
#include <string.h>

#include "../../internal.h"
#include "internal.h"

#if defined(OPENSSL_SSE2)
#include <emmintrin.h>
#endif


// This file contains a constant-time implementation of AES, bitsliced with
// 32-bit, 64-bit, or 128-bit words, operating on two-, four-, and eight-block
// batches, respectively. The 128-bit implementation requires SSE2 intrinsics.
//
// This implementation is based on the algorithms described in the following
// references:
// - https://bearssl.org/constanttime.html#aes
// - https://eprint.iacr.org/2009/129.pdf
// - https://eprint.iacr.org/2009/191.pdf


// Word operations.
//
// An aes_word_t is the word used for this AES implementation. Throughout this
// file, bits and bytes are ordered little-endian, though "left" and "right"
// shifts match the operations themselves, which makes them reversed in a
// little-endian, left-to-right reading.
//
// Eight |aes_word_t|s contain |AES_NOHW_BATCH_SIZE| blocks. The bits in an
// |aes_word_t| are divided into 16 consecutive groups of |AES_NOHW_BATCH_SIZE|
// bits each, each corresponding to a byte in an AES block in column-major
// order (AES's byte order). We refer to these as "logical bytes". Note, in the
// 32-bit and 64-bit implementations, they are smaller than a byte. (The
// contents of a logical byte will be described later.)
//
// MSVC does not support C bit operators on |__m128i|, so the wrapper functions
// |aes_nohw_and|, etc., should be used instead. Note |aes_nohw_shift_left| and
// |aes_nohw_shift_right| measure the shift in logical bytes. That is, the shift
// value ranges from 0 to 15 independent of |aes_word_t| and
// |AES_NOHW_BATCH_SIZE|.
//
// This ordering is different from https://eprint.iacr.org/2009/129.pdf, which
// uses row-major order. Matching the AES order was easier to reason about, and
// we do not have PSHUFB available to arbitrarily permute bytes.

#if defined(OPENSSL_SSE2)
aes_word_t;
// AES_NOHW_WORD_SIZE is sizeof(aes_word_t). alignas(sizeof(T)) does not work in
// MSVC, so we define a constant.
#define AES_NOHW_WORD_SIZE …
#define AES_NOHW_BATCH_SIZE …
#define AES_NOHW_ROW0_MASK …
#define AES_NOHW_ROW1_MASK …
#define AES_NOHW_ROW2_MASK …
#define AES_NOHW_ROW3_MASK …
#define AES_NOHW_COL01_MASK …
#define AES_NOHW_COL2_MASK …
#define AES_NOHW_COL3_MASK …

static inline aes_word_t aes_nohw_and(aes_word_t a, aes_word_t b) { … }

static inline aes_word_t aes_nohw_or(aes_word_t a, aes_word_t b) { … }

static inline aes_word_t aes_nohw_xor(aes_word_t a, aes_word_t b) { … }

static inline aes_word_t aes_nohw_not(aes_word_t a) { … }

// These are macros because parameters to |_mm_slli_si128| and |_mm_srli_si128|
// must be constants.
#define aes_nohw_shift_left(/* aes_word_t */ a, /* const */ i) …
#define aes_nohw_shift_right(/* aes_word_t */ a, /* const */ i) …
#else  // !OPENSSL_SSE2
#if defined(OPENSSL_64_BIT)
typedef uint64_t aes_word_t;
#define AES_NOHW_WORD_SIZE …
#define AES_NOHW_BATCH_SIZE …
#define AES_NOHW_ROW0_MASK …
#define AES_NOHW_ROW1_MASK …
#define AES_NOHW_ROW2_MASK …
#define AES_NOHW_ROW3_MASK …
#define AES_NOHW_COL01_MASK …
#define AES_NOHW_COL2_MASK …
#define AES_NOHW_COL3_MASK …
#else  // !OPENSSL_64_BIT
typedef uint32_t aes_word_t;
#define AES_NOHW_WORD_SIZE …
#define AES_NOHW_BATCH_SIZE …
#define AES_NOHW_ROW0_MASK …
#define AES_NOHW_ROW1_MASK …
#define AES_NOHW_ROW2_MASK …
#define AES_NOHW_ROW3_MASK …
#define AES_NOHW_COL01_MASK …
#define AES_NOHW_COL2_MASK …
#define AES_NOHW_COL3_MASK …
#endif  // OPENSSL_64_BIT

static inline aes_word_t aes_nohw_and(aes_word_t a, aes_word_t b) {
  return a & b;
}

static inline aes_word_t aes_nohw_or(aes_word_t a, aes_word_t b) {
  return a | b;
}

static inline aes_word_t aes_nohw_xor(aes_word_t a, aes_word_t b) {
  return a ^ b;
}

static inline aes_word_t aes_nohw_not(aes_word_t a) { return ~a; }

static inline aes_word_t aes_nohw_shift_left(aes_word_t a, aes_word_t i) {
  return a << (i * AES_NOHW_BATCH_SIZE);
}

static inline aes_word_t aes_nohw_shift_right(aes_word_t a, aes_word_t i) {
  return a >> (i * AES_NOHW_BATCH_SIZE);
}
#endif  // OPENSSL_SSE2

static_assert;
static_assert;


// Block representations.
//
// This implementation uses three representations for AES blocks. First, the
// public API represents blocks as uint8_t[16] in the usual way. Second, most
// AES steps are evaluated in bitsliced form, stored in an |AES_NOHW_BATCH|.
// This stores |AES_NOHW_BATCH_SIZE| blocks in bitsliced order. For 64-bit words
// containing bitsliced blocks a, b, c, d, this would be as follows (vertical
// bars divide logical bytes):
//
//   batch.w[0] = a0 b0 c0 d0 |  a8  b8  c8  d8 | a16 b16 c16 d16 ...
//   batch.w[1] = a1 b1 c1 d1 |  a9  b9  c9  d9 | a17 b17 c17 d17 ...
//   batch.w[2] = a2 b2 c2 d2 | a10 b10 c10 d10 | a18 b18 c18 d18 ...
//   batch.w[3] = a3 b3 c3 d3 | a11 b11 c11 d11 | a19 b19 c19 d19 ...
//   ...
//
// Finally, an individual block may be stored as an intermediate form in an
// aes_word_t[AES_NOHW_BLOCK_WORDS]. In this form, we permute the bits in each
// block, so that block[0]'s ith logical byte contains least-significant
// |AES_NOHW_BATCH_SIZE| bits of byte i, block[1] contains the next group of
// |AES_NOHW_BATCH_SIZE| bits, and so on. We refer to this transformation as
// "compacting" the block. Note this is no-op with 128-bit words because then
// |AES_NOHW_BLOCK_WORDS| is one and |AES_NOHW_BATCH_SIZE| is eight. For 64-bit
// words, one block would be stored in two words:
//
//   block[0] = a0 a1 a2 a3 |  a8  a9 a10 a11 | a16 a17 a18 a19 ...
//   block[1] = a4 a5 a6 a7 | a12 a13 a14 a15 | a20 a21 a22 a23 ...
//
// Observe that the distances between corresponding bits in bitsliced and
// compact bit orders match. If we line up corresponding words of each block,
// the bitsliced and compact representations may be converted by tranposing bits
// in corresponding logical bytes. Continuing the 64-bit example:
//
//   block_a[0] = a0 a1 a2 a3 |  a8  a9 a10 a11 | a16 a17 a18 a19 ...
//   block_b[0] = b0 b1 b2 b3 |  b8  b9 b10 b11 | b16 b17 b18 b19 ...
//   block_c[0] = c0 c1 c2 c3 |  c8  c9 c10 c11 | c16 c17 c18 c19 ...
//   block_d[0] = d0 d1 d2 d3 |  d8  d9 d10 d11 | d16 d17 d18 d19 ...
//
//   batch.w[0] = a0 b0 c0 d0 |  a8  b8  c8  d8 | a16 b16 c16 d16 ...
//   batch.w[1] = a1 b1 c1 d1 |  a9  b9  c9  d9 | a17 b17 c17 d17 ...
//   batch.w[2] = a2 b2 c2 d2 | a10 b10 c10 d10 | a18 b18 c18 d18 ...
//   batch.w[3] = a3 b3 c3 d3 | a11 b11 c11 d11 | a19 b19 c19 d19 ...
//
// Note also that bitwise operations and (logical) byte permutations on an
// |aes_word_t| work equally for the bitsliced and compact words.
//
// We use the compact form in the |AES_KEY| representation to save work
// inflating round keys into |AES_NOHW_BATCH|. The compact form also exists
// temporarily while moving blocks in or out of an |AES_NOHW_BATCH|, immediately
// before or after |aes_nohw_transpose|.

#define AES_NOHW_BLOCK_WORDS …

// An AES_NOHW_BATCH stores |AES_NOHW_BATCH_SIZE| blocks. Unless otherwise
// specified, it is in bitsliced form.
AES_NOHW_BATCH;

// An AES_NOHW_SCHEDULE is an expanded bitsliced AES key schedule. It is
// suitable for encryption or decryption. It is as large as |AES_NOHW_BATCH|
// |AES_KEY|s so it should not be used as a long-term key representation.
AES_NOHW_SCHEDULE;

// aes_nohw_batch_set sets the |i|th block of |batch| to |in|. |batch| is in
// compact form.
static inline void aes_nohw_batch_set(AES_NOHW_BATCH *batch,
                                      const aes_word_t in[AES_NOHW_BLOCK_WORDS],
                                      size_t i) { … }

// aes_nohw_batch_get writes the |i|th block of |batch| to |out|. |batch| is in
// compact form.
static inline void aes_nohw_batch_get(const AES_NOHW_BATCH *batch,
                                      aes_word_t out[AES_NOHW_BLOCK_WORDS],
                                      size_t i) { … }

#if !defined(OPENSSL_SSE2)
// aes_nohw_delta_swap returns |a| with bits |a & mask| and
// |a & (mask << shift)| swapped. |mask| and |mask << shift| may not overlap.
static inline aes_word_t aes_nohw_delta_swap(aes_word_t a, aes_word_t mask,
                                             aes_word_t shift) {
  // See
  // https://reflectionsonsecurity.wordpress.com/2014/05/11/efficient-bit-permutation-using-delta-swaps/
  aes_word_t b = (a ^ (a >> shift)) & mask;
  return a ^ b ^ (b << shift);
}

// In the 32-bit and 64-bit implementations, a block spans multiple words.
// |aes_nohw_compact_block| must permute bits across different words. First we
// implement |aes_nohw_compact_word| which performs a smaller version of the
// transformation which stays within a single word.
//
// These transformations are generalizations of the output of
// http://programming.sirrida.de/calcperm.php on smaller inputs.
#if defined(OPENSSL_64_BIT)
static inline uint64_t aes_nohw_compact_word(uint64_t a) {
  // Numbering the 64/2 = 16 4-bit chunks, least to most significant, we swap
  // quartets of those chunks:
  //   0 1 2 3 | 4 5 6 7 | 8  9 10 11 | 12 13 14 15 =>
  //   0 2 1 3 | 4 6 5 7 | 8 10  9 11 | 12 14 13 15
  a = aes_nohw_delta_swap(a, UINT64_C(0x00f000f000f000f0), 4);
  // Swap quartets of 8-bit chunks (still numbering by 4-bit chunks):
  //   0 2 1 3 | 4 6 5 7 | 8 10  9 11 | 12 14 13 15 =>
  //   0 2 4 6 | 1 3 5 7 | 8 10 12 14 |  9 11 13 15
  a = aes_nohw_delta_swap(a, UINT64_C(0x0000ff000000ff00), 8);
  // Swap quartets of 16-bit chunks (still numbering by 4-bit chunks):
  //   0 2 4 6 | 1  3  5  7 | 8 10 12 14 | 9 11 13 15 =>
  //   0 2 4 6 | 8 10 12 14 | 1  3  5  7 | 9 11 13 15
  a = aes_nohw_delta_swap(a, UINT64_C(0x00000000ffff0000), 16);
  return a;
}

static inline uint64_t aes_nohw_uncompact_word(uint64_t a) {
  // Reverse the steps of |aes_nohw_uncompact_word|.
  a = aes_nohw_delta_swap(a, UINT64_C(0x00000000ffff0000), 16);
  a = aes_nohw_delta_swap(a, UINT64_C(0x0000ff000000ff00), 8);
  a = aes_nohw_delta_swap(a, UINT64_C(0x00f000f000f000f0), 4);
  return a;
}
#else   // !OPENSSL_64_BIT
static inline uint32_t aes_nohw_compact_word(uint32_t a) {
  // Numbering the 32/2 = 16 pairs of bits, least to most significant, we swap:
  //   0 1 2 3 | 4 5 6 7 | 8  9 10 11 | 12 13 14 15 =>
  //   0 4 2 6 | 1 5 3 7 | 8 12 10 14 |  9 13 11 15
  // Note:  0x00cc = 0b0000_0000_1100_1100
  //   0x00cc << 6 = 0b0011_0011_0000_0000
  a = aes_nohw_delta_swap(a, 0x00cc00cc, 6);
  // Now we swap groups of four bits (still numbering by pairs):
  //   0 4 2  6 | 1 5 3  7 | 8 12 10 14 | 9 13 11 15 =>
  //   0 4 8 12 | 1 5 9 13 | 2  6 10 14 | 3  7 11 15
  // Note: 0x0000_f0f0 << 12 = 0x0f0f_0000
  a = aes_nohw_delta_swap(a, 0x0000f0f0, 12);
  return a;
}

static inline uint32_t aes_nohw_uncompact_word(uint32_t a) {
  // Reverse the steps of |aes_nohw_uncompact_word|.
  a = aes_nohw_delta_swap(a, 0x0000f0f0, 12);
  a = aes_nohw_delta_swap(a, 0x00cc00cc, 6);
  return a;
}

static inline uint32_t aes_nohw_word_from_bytes(uint8_t a0, uint8_t a1,
                                                uint8_t a2, uint8_t a3) {
  return (uint32_t)a0 | ((uint32_t)a1 << 8) | ((uint32_t)a2 << 16) |
         ((uint32_t)a3 << 24);
}
#endif  // OPENSSL_64_BIT
#endif  // !OPENSSL_SSE2

static inline void aes_nohw_compact_block(aes_word_t out[AES_NOHW_BLOCK_WORDS],
                                          const uint8_t in[16]) { … }

static inline void aes_nohw_uncompact_block(
    uint8_t out[16], const aes_word_t in[AES_NOHW_BLOCK_WORDS]) { … }

// aes_nohw_swap_bits is a variation on a delta swap. It swaps the bits in
// |*a & (mask << shift)| with the bits in |*b & mask|. |mask| and
// |mask << shift| must not overlap. |mask| is specified as a |uint32_t|, but it
// is repeated to the full width of |aes_word_t|.
#if defined(OPENSSL_SSE2)
// This must be a macro because |_mm_srli_epi32| and |_mm_slli_epi32| require
// constant shift values.
#define aes_nohw_swap_bits(/*__m128i* */ a, /*__m128i* */ b,              \
                           /* uint32_t */ mask, /* const */ shift) …
#else
static inline void aes_nohw_swap_bits(aes_word_t *a, aes_word_t *b,
                                      uint32_t mask, aes_word_t shift) {
#if defined(OPENSSL_64_BIT)
  aes_word_t mask_w = (((uint64_t)mask) << 32) | mask;
#else
  aes_word_t mask_w = mask;
#endif
  // This is a variation on a delta swap.
  aes_word_t swap = ((*a >> shift) ^ *b) & mask_w;
  *a ^= swap << shift;
  *b ^= swap;
}
#endif  // OPENSSL_SSE2

// aes_nohw_transpose converts |batch| to and from bitsliced form. It divides
// the 8 × word_size bits into AES_NOHW_BATCH_SIZE × AES_NOHW_BATCH_SIZE squares
// and transposes each square.
static void aes_nohw_transpose(AES_NOHW_BATCH *batch) { … }

// aes_nohw_to_batch initializes |out| with the |num_blocks| blocks from |in|.
// |num_blocks| must be at most |AES_NOHW_BATCH|.
static void aes_nohw_to_batch(AES_NOHW_BATCH *out, const uint8_t *in,
                              size_t num_blocks) { … }

// aes_nohw_to_batch writes the first |num_blocks| blocks in |batch| to |out|.
// |num_blocks| must be at most |AES_NOHW_BATCH|.
static void aes_nohw_from_batch(uint8_t *out, size_t num_blocks,
                                const AES_NOHW_BATCH *batch) { … }


// AES round steps.

static void aes_nohw_add_round_key(AES_NOHW_BATCH *batch,
                                   const AES_NOHW_BATCH *key) { … }

static void aes_nohw_sub_bytes(AES_NOHW_BATCH *batch) { … }

// aes_nohw_sub_bytes_inv_affine inverts the affine transform portion of the AES
// S-box, defined in FIPS PUB 197, section 5.1.1, step 2.
static void aes_nohw_sub_bytes_inv_affine(AES_NOHW_BATCH *batch) { … }

static void aes_nohw_inv_sub_bytes(AES_NOHW_BATCH *batch) { … }

// aes_nohw_rotate_cols_right returns |v| with the columns in each row rotated
// to the right by |n|. This is a macro because |aes_nohw_shift_*| require
// constant shift counts in the SSE2 implementation.
#define aes_nohw_rotate_cols_right(/* aes_word_t */ v, /* const */ n) …

static void aes_nohw_shift_rows(AES_NOHW_BATCH *batch) { … }

static void aes_nohw_inv_shift_rows(AES_NOHW_BATCH *batch) { … }

// aes_nohw_rotate_rows_down returns |v| with the rows in each column rotated
// down by one.
static inline aes_word_t aes_nohw_rotate_rows_down(aes_word_t v) { … }

// aes_nohw_rotate_rows_twice returns |v| with the rows in each column rotated
// by two.
static inline aes_word_t aes_nohw_rotate_rows_twice(aes_word_t v) { … }

static void aes_nohw_mix_columns(AES_NOHW_BATCH *batch) { … }

static void aes_nohw_inv_mix_columns(AES_NOHW_BATCH *batch) { … }

static void aes_nohw_encrypt_batch(const AES_NOHW_SCHEDULE *key,
                                   size_t num_rounds, AES_NOHW_BATCH *batch) { … }

static void aes_nohw_decrypt_batch(const AES_NOHW_SCHEDULE *key,
                                   size_t num_rounds, AES_NOHW_BATCH *batch) { … }


// Key schedule.

static void aes_nohw_expand_round_keys(AES_NOHW_SCHEDULE *out,
                                       const AES_KEY *key) { … }

static const uint8_t aes_nohw_rcon[10] = …;

// aes_nohw_rcon_slice returns the |i|th group of |AES_NOHW_BATCH_SIZE| bits in
// |rcon|, stored in a |aes_word_t|.
static inline aes_word_t aes_nohw_rcon_slice(uint8_t rcon, size_t i) { … }

static void aes_nohw_sub_block(aes_word_t out[AES_NOHW_BLOCK_WORDS],
                               const aes_word_t in[AES_NOHW_BLOCK_WORDS]) { … }

static void aes_nohw_setup_key_128(AES_KEY *key, const uint8_t in[16]) { … }

static void aes_nohw_setup_key_192(AES_KEY *key, const uint8_t in[24]) { … }

static void aes_nohw_setup_key_256(AES_KEY *key, const uint8_t in[32]) { … }


// External API.

int aes_nohw_set_encrypt_key(const uint8_t *key, unsigned bits,
                             AES_KEY *aeskey) { … }

int aes_nohw_set_decrypt_key(const uint8_t *key, unsigned bits,
                             AES_KEY *aeskey) { … }

void aes_nohw_encrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key) { … }

void aes_nohw_decrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key) { … }

static inline void aes_nohw_xor_block(uint8_t out[16], const uint8_t a[16],
                                      const uint8_t b[16]) { … }

void aes_nohw_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out,
                                   size_t blocks, const AES_KEY *key,
                                   const uint8_t ivec[16]) { … }

void aes_nohw_cbc_encrypt(const uint8_t *in, uint8_t *out, size_t len,
                          const AES_KEY *key, uint8_t *ivec, const int enc) { … }
chromium/third_party/boringssl/src/crypto/fipsmodule/aes/aes_nohw.c.inc