gcm_nohw.c.inc | Explore in Territory

/* Copyright (c) 2019, Google Inc.
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
 * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
 * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */

#include <openssl/base.h>

#include "../../internal.h"
#include "internal.h"

#if !defined(BORINGSSL_HAS_UINT128) && defined(OPENSSL_SSE2)
#include <emmintrin.h>
#endif


// This file contains a constant-time implementation of GHASH based on the notes
// in https://bearssl.org/constanttime.html#ghash-for-gcm and the reduction
// algorithm described in
// https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf.
//
// Unlike the BearSSL notes, we use uint128_t in the 64-bit implementation. Our
// primary compilers (clang, clang-cl, and gcc) all support it. MSVC will run
// the 32-bit implementation, but we can use its intrinsics if necessary.

#if defined(BORINGSSL_HAS_UINT128)

static void gcm_mul64_nohw(uint64_t *out_lo, uint64_t *out_hi, uint64_t a,
                           uint64_t b) { … }

#elif defined(OPENSSL_SSE2)

static __m128i gcm_mul32_nohw(uint32_t a, uint32_t b) {
  // One term every four bits means the largest term is 32/4 = 8, which does not
  // overflow into the next term.
  __m128i aa = _mm_setr_epi32(a, 0, a, 0);
  __m128i bb = _mm_setr_epi32(b, 0, b, 0);

  __m128i a0a0 =
      _mm_and_si128(aa, _mm_setr_epi32(0x11111111, 0, 0x11111111, 0));
  __m128i a2a2 =
      _mm_and_si128(aa, _mm_setr_epi32(0x44444444, 0, 0x44444444, 0));
  __m128i b0b1 =
      _mm_and_si128(bb, _mm_setr_epi32(0x11111111, 0, 0x22222222, 0));
  __m128i b2b3 =
      _mm_and_si128(bb, _mm_setr_epi32(0x44444444, 0, 0x88888888, 0));

  __m128i c0c1 =
      _mm_xor_si128(_mm_mul_epu32(a0a0, b0b1), _mm_mul_epu32(a2a2, b2b3));
  __m128i c2c3 =
      _mm_xor_si128(_mm_mul_epu32(a2a2, b0b1), _mm_mul_epu32(a0a0, b2b3));

  __m128i a1a1 =
      _mm_and_si128(aa, _mm_setr_epi32(0x22222222, 0, 0x22222222, 0));
  __m128i a3a3 =
      _mm_and_si128(aa, _mm_setr_epi32(0x88888888, 0, 0x88888888, 0));
  __m128i b3b0 =
      _mm_and_si128(bb, _mm_setr_epi32(0x88888888, 0, 0x11111111, 0));
  __m128i b1b2 =
      _mm_and_si128(bb, _mm_setr_epi32(0x22222222, 0, 0x44444444, 0));

  c0c1 = _mm_xor_si128(c0c1, _mm_mul_epu32(a1a1, b3b0));
  c0c1 = _mm_xor_si128(c0c1, _mm_mul_epu32(a3a3, b1b2));
  c2c3 = _mm_xor_si128(c2c3, _mm_mul_epu32(a3a3, b3b0));
  c2c3 = _mm_xor_si128(c2c3, _mm_mul_epu32(a1a1, b1b2));

  c0c1 = _mm_and_si128(
      c0c1, _mm_setr_epi32(0x11111111, 0x11111111, 0x22222222, 0x22222222));
  c2c3 = _mm_and_si128(
      c2c3, _mm_setr_epi32(0x44444444, 0x44444444, 0x88888888, 0x88888888));

  c0c1 = _mm_xor_si128(c0c1, c2c3);
  // c0 ^= c1
  c0c1 = _mm_xor_si128(c0c1, _mm_srli_si128(c0c1, 8));
  return c0c1;
}

static void gcm_mul64_nohw(uint64_t *out_lo, uint64_t *out_hi, uint64_t a,
                           uint64_t b) {
  uint32_t a0 = a & 0xffffffff;
  uint32_t a1 = a >> 32;
  uint32_t b0 = b & 0xffffffff;
  uint32_t b1 = b >> 32;
  // Karatsuba multiplication.
  __m128i lo = gcm_mul32_nohw(a0, b0);
  __m128i hi = gcm_mul32_nohw(a1, b1);
  __m128i mid = gcm_mul32_nohw(a0 ^ a1, b0 ^ b1);
  mid = _mm_xor_si128(mid, lo);
  mid = _mm_xor_si128(mid, hi);
  __m128i ret = _mm_unpacklo_epi64(lo, hi);
  mid = _mm_slli_si128(mid, 4);
  mid = _mm_and_si128(mid, _mm_setr_epi32(0, 0xffffffff, 0xffffffff, 0));
  ret = _mm_xor_si128(ret, mid);
  memcpy(out_lo, &ret, 8);
  memcpy(out_hi, ((char*)&ret) + 8, 8);
}

#else  // !BORINGSSL_HAS_UINT128 && !OPENSSL_SSE2

static uint64_t gcm_mul32_nohw(uint32_t a, uint32_t b) {
  // One term every four bits means the largest term is 32/4 = 8, which does not
  // overflow into the next term.
  uint32_t a0 = a & 0x11111111;
  uint32_t a1 = a & 0x22222222;
  uint32_t a2 = a & 0x44444444;
  uint32_t a3 = a & 0x88888888;

  uint32_t b0 = b & 0x11111111;
  uint32_t b1 = b & 0x22222222;
  uint32_t b2 = b & 0x44444444;
  uint32_t b3 = b & 0x88888888;

  uint64_t c0 = (a0 * (uint64_t)b0) ^ (a1 * (uint64_t)b3) ^
                (a2 * (uint64_t)b2) ^ (a3 * (uint64_t)b1);
  uint64_t c1 = (a0 * (uint64_t)b1) ^ (a1 * (uint64_t)b0) ^
                (a2 * (uint64_t)b3) ^ (a3 * (uint64_t)b2);
  uint64_t c2 = (a0 * (uint64_t)b2) ^ (a1 * (uint64_t)b1) ^
                (a2 * (uint64_t)b0) ^ (a3 * (uint64_t)b3);
  uint64_t c3 = (a0 * (uint64_t)b3) ^ (a1 * (uint64_t)b2) ^
                (a2 * (uint64_t)b1) ^ (a3 * (uint64_t)b0);

  return (c0 & UINT64_C(0x1111111111111111)) |
         (c1 & UINT64_C(0x2222222222222222)) |
         (c2 & UINT64_C(0x4444444444444444)) |
         (c3 & UINT64_C(0x8888888888888888));
}

static void gcm_mul64_nohw(uint64_t *out_lo, uint64_t *out_hi, uint64_t a,
                           uint64_t b) {
  uint32_t a0 = a & 0xffffffff;
  uint32_t a1 = a >> 32;
  uint32_t b0 = b & 0xffffffff;
  uint32_t b1 = b >> 32;
  // Karatsuba multiplication.
  uint64_t lo = gcm_mul32_nohw(a0, b0);
  uint64_t hi = gcm_mul32_nohw(a1, b1);
  uint64_t mid = gcm_mul32_nohw(a0 ^ a1, b0 ^ b1) ^ lo ^ hi;
  *out_lo = lo ^ (mid << 32);
  *out_hi = hi ^ (mid >> 32);
}

#endif  // BORINGSSL_HAS_UINT128

void gcm_init_nohw(u128 Htable[16], const uint64_t Xi[2]) { … }

static void gcm_polyval_nohw(uint64_t Xi[2], const u128 *H) { … }

void gcm_gmult_nohw(uint8_t Xi[16], const u128 Htable[16]) { … }

void gcm_ghash_nohw(uint8_t Xi[16], const u128 Htable[16], const uint8_t *inp,
                    size_t len) { … }
chromium/third_party/boringssl/src/crypto/fipsmodule/modes/gcm_nohw.c.inc