/* Copyright (c) 2018, Google Inc. * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #include <openssl/hrss.h> #include <assert.h> #include <stdio.h> #include <stdlib.h> #include <openssl/bn.h> #include <openssl/hmac.h> #include <openssl/mem.h> #include <openssl/rand.h> #include <openssl/sha.h> #if defined(_MSC_VER) #define RESTRICT #else #define RESTRICT … #endif #include "../internal.h" #include "internal.h" #if defined(OPENSSL_SSE2) #include <emmintrin.h> #endif #if (defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)) && defined(__ARM_NEON) #include <arm_neon.h> #endif // This is an implementation of [HRSS], but with a KEM transformation based on // [SXY]. The primary references are: // HRSS: https://eprint.iacr.org/2017/667.pdf // HRSSNIST: // https://csrc.nist.gov/CSRC/media/Projects/Post-Quantum-Cryptography/documents/round-1/submissions/NTRU_HRSS_KEM.zip // SXY: https://eprint.iacr.org/2017/1005.pdf // NTRUTN14: // https://assets.onboardsecurity.com/static/downloads/NTRU/resources/NTRUTech014.pdf // NTRUCOMP: https://eprint.iacr.org/2018/1174 // SAFEGCD: https://gcd.cr.yp.to/papers.html#safegcd // Vector operations. // // A couple of functions in this file can use vector operations to meaningful // effect. If we're building for a target that has a supported vector unit, // |HRSS_HAVE_VECTOR_UNIT| will be defined and |vec_t| will be typedefed to a // 128-bit vector. The following functions abstract over the differences between // NEON and SSE2 for implementing some vector operations. // TODO: MSVC can likely also be made to work with vector operations, but ^ must // be replaced with _mm_xor_si128, etc. #if defined(OPENSSL_SSE2) && (defined(__clang__) || !defined(_MSC_VER)) #define HRSS_HAVE_VECTOR_UNIT vec_t; // vec_capable returns one iff the current platform supports SSE2. static int vec_capable(void) { … } // vec_add performs a pair-wise addition of four uint16s from |a| and |b|. static inline vec_t vec_add(vec_t a, vec_t b) { … } // vec_sub performs a pair-wise subtraction of four uint16s from |a| and |b|. static inline vec_t vec_sub(vec_t a, vec_t b) { … } // vec_mul multiplies each uint16_t in |a| by |b| and returns the resulting // vector. static inline vec_t vec_mul(vec_t a, uint16_t b) { … } // vec_fma multiplies each uint16_t in |b| by |c|, adds the result to |a|, and // returns the resulting vector. static inline vec_t vec_fma(vec_t a, vec_t b, uint16_t c) { … } // vec3_rshift_word right-shifts the 24 uint16_t's in |v| by one uint16. static inline void vec3_rshift_word(vec_t v[3]) { … } // vec4_rshift_word right-shifts the 32 uint16_t's in |v| by one uint16. static inline void vec4_rshift_word(vec_t v[4]) { … } // vec_merge_3_5 takes the final three uint16_t's from |left|, appends the first // five from |right|, and returns the resulting vector. static inline vec_t vec_merge_3_5(vec_t left, vec_t right) { … } // poly3_vec_lshift1 left-shifts the 768 bits in |a_s|, and in |a_a|, by one // bit. static inline void poly3_vec_lshift1(vec_t a_s[6], vec_t a_a[6]) { … } // poly3_vec_rshift1 right-shifts the 768 bits in |a_s|, and in |a_a|, by one // bit. static inline void poly3_vec_rshift1(vec_t a_s[6], vec_t a_a[6]) { … } // vec_broadcast_bit duplicates the least-significant bit in |a| to all bits in // a vector and returns the result. static inline vec_t vec_broadcast_bit(vec_t a) { … } // vec_get_word returns the |i|th uint16_t in |v|. (This is a macro because the // compiler requires that |i| be a compile-time constant.) #define vec_get_word(v, i) … #elif (defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)) && defined(__ARM_NEON) #define HRSS_HAVE_VECTOR_UNIT typedef uint16x8_t vec_t; // These functions perform the same actions as the SSE2 function of the same // name, above. static int vec_capable(void) { return CRYPTO_is_NEON_capable(); } static inline vec_t vec_add(vec_t a, vec_t b) { return a + b; } static inline vec_t vec_sub(vec_t a, vec_t b) { return a - b; } static inline vec_t vec_mul(vec_t a, uint16_t b) { return vmulq_n_u16(a, b); } static inline vec_t vec_fma(vec_t a, vec_t b, uint16_t c) { return vmlaq_n_u16(a, b, c); } static inline void vec3_rshift_word(vec_t v[3]) { const uint16x8_t kZero = {0}; v[2] = vextq_u16(v[1], v[2], 7); v[1] = vextq_u16(v[0], v[1], 7); v[0] = vextq_u16(kZero, v[0], 7); } static inline void vec4_rshift_word(vec_t v[4]) { const uint16x8_t kZero = {0}; v[3] = vextq_u16(v[2], v[3], 7); v[2] = vextq_u16(v[1], v[2], 7); v[1] = vextq_u16(v[0], v[1], 7); v[0] = vextq_u16(kZero, v[0], 7); } static inline vec_t vec_merge_3_5(vec_t left, vec_t right) { return vextq_u16(left, right, 5); } static inline uint16_t vec_get_word(vec_t v, unsigned i) { return v[i]; } #if !defined(OPENSSL_AARCH64) static inline vec_t vec_broadcast_bit(vec_t a) { a = (vec_t)vshrq_n_s16(((int16x8_t)a) << 15, 15); return vdupq_lane_u16(vget_low_u16(a), 0); } static inline void poly3_vec_lshift1(vec_t a_s[6], vec_t a_a[6]) { vec_t carry_s = {0}; vec_t carry_a = {0}; const vec_t kZero = {0}; for (int i = 0; i < 6; i++) { vec_t next_carry_s = a_s[i] >> 15; a_s[i] <<= 1; a_s[i] |= vextq_u16(kZero, next_carry_s, 7); a_s[i] |= carry_s; carry_s = vextq_u16(next_carry_s, kZero, 7); vec_t next_carry_a = a_a[i] >> 15; a_a[i] <<= 1; a_a[i] |= vextq_u16(kZero, next_carry_a, 7); a_a[i] |= carry_a; carry_a = vextq_u16(next_carry_a, kZero, 7); } } static inline void poly3_vec_rshift1(vec_t a_s[6], vec_t a_a[6]) { vec_t carry_s = {0}; vec_t carry_a = {0}; const vec_t kZero = {0}; for (int i = 5; i >= 0; i--) { vec_t next_carry_s = a_s[i] << 15; a_s[i] >>= 1; a_s[i] |= vextq_u16(next_carry_s, kZero, 1); a_s[i] |= carry_s; carry_s = vextq_u16(kZero, next_carry_s, 1); vec_t next_carry_a = a_a[i] << 15; a_a[i] >>= 1; a_a[i] |= vextq_u16(next_carry_a, kZero, 1); a_a[i] |= carry_a; carry_a = vextq_u16(kZero, next_carry_a, 1); } } #endif // !OPENSSL_AARCH64 #endif // (ARM || AARCH64) && NEON // Polynomials in this scheme have N terms. // #define N 701 // Underlying data types and arithmetic operations. // ------------------------------------------------ // Binary polynomials. // poly2 represents a degree-N polynomial over GF(2). The words are in little- // endian order, i.e. the coefficient of x^0 is the LSB of the first word. The // final word is only partially used since N is not a multiple of the word size. // Defined in internal.h: // struct poly2 { // crypto_word_t v[WORDS_PER_POLY]; // }; OPENSSL_UNUSED static void hexdump(const void *void_in, size_t len) { … } static void poly2_zero(struct poly2 *p) { … } // word_reverse returns |in| with the bits in reverse order. static crypto_word_t word_reverse(crypto_word_t in) { … } // lsb_to_all replicates the least-significant bit of |v| to all bits of the // word. This is used in bit-slicing operations to make a vector from a fixed // value. static crypto_word_t lsb_to_all(crypto_word_t v) { … } // poly2_mod_phiN reduces |p| by Φ(N). static void poly2_mod_phiN(struct poly2 *p) { … } // poly2_reverse_700 reverses the order of the first 700 bits of |in| and writes // the result to |out|. static void poly2_reverse_700(struct poly2 *out, const struct poly2 *in) { … } // poly2_cswap exchanges the values of |a| and |b| if |swap| is all ones. static void poly2_cswap(struct poly2 *a, struct poly2 *b, crypto_word_t swap) { … } // poly2_fmadd sets |out| to |out| + |in| * m, where m is either // |CONSTTIME_TRUE_W| or |CONSTTIME_FALSE_W|. static void poly2_fmadd(struct poly2 *out, const struct poly2 *in, crypto_word_t m) { … } // poly2_lshift1 left-shifts |p| by one bit. static void poly2_lshift1(struct poly2 *p) { … } // poly2_rshift1 right-shifts |p| by one bit. static void poly2_rshift1(struct poly2 *p) { … } // poly2_clear_top_bits clears the bits in the final word that are only for // alignment. static void poly2_clear_top_bits(struct poly2 *p) { … } // poly2_top_bits_are_clear returns one iff the extra bits in the final words of // |p| are zero. static int poly2_top_bits_are_clear(const struct poly2 *p) { … } // Ternary polynomials. // poly3 represents a degree-N polynomial over GF(3). Each coefficient is // bitsliced across the |s| and |a| arrays, like this: // // s | a | value // ----------------- // 0 | 0 | 0 // 0 | 1 | 1 // 1 | 1 | -1 (aka 2) // 1 | 0 | <invalid> // // ('s' is for sign, and 'a' is the absolute value.) // // Once bitsliced as such, the following circuits can be used to implement // addition and multiplication mod 3: // // (s3, a3) = (s1, a1) × (s2, a2) // a3 = a1 ∧ a2 // s3 = (s1 ⊕ s2) ∧ a3 // // (s3, a3) = (s1, a1) + (s2, a2) // t = s1 ⊕ a2 // s3 = t ∧ (s2 ⊕ a1) // a3 = (a1 ⊕ a2) ∨ (t ⊕ s2) // // (s3, a3) = (s1, a1) - (s2, a2) // t = a1 ⊕ a2 // s3 = (s1 ⊕ a2) ∧ (t ⊕ s2) // a3 = t ∨ (s1 ⊕ s2) // // Negating a value just involves XORing s by a. // // struct poly3 { // struct poly2 s, a; // }; OPENSSL_UNUSED static void poly3_print(const struct poly3 *in) { … } static void poly3_zero(struct poly3 *p) { … } // poly3_reverse_700 reverses the order of the first 700 terms of |in| and // writes them to |out|. static void poly3_reverse_700(struct poly3 *out, const struct poly3 *in) { … } // poly3_word_mul sets (|out_s|, |out_a|) to (|s1|, |a1|) × (|s2|, |a2|). static void poly3_word_mul(crypto_word_t *out_s, crypto_word_t *out_a, const crypto_word_t s1, const crypto_word_t a1, const crypto_word_t s2, const crypto_word_t a2) { … } // poly3_word_add sets (|out_s|, |out_a|) to (|s1|, |a1|) + (|s2|, |a2|). static void poly3_word_add(crypto_word_t *out_s, crypto_word_t *out_a, const crypto_word_t s1, const crypto_word_t a1, const crypto_word_t s2, const crypto_word_t a2) { … } // poly3_word_sub sets (|out_s|, |out_a|) to (|s1|, |a1|) - (|s2|, |a2|). static void poly3_word_sub(crypto_word_t *out_s, crypto_word_t *out_a, const crypto_word_t s1, const crypto_word_t a1, const crypto_word_t s2, const crypto_word_t a2) { … } // poly3_mul_const sets |p| to |p|×m, where m = (ms, ma). static void poly3_mul_const(struct poly3 *p, crypto_word_t ms, crypto_word_t ma) { … } // poly3_fmadd sets |out| to |out| - |in|×m, where m is (ms, ma). static void poly3_fmsub(struct poly3 *RESTRICT out, const struct poly3 *RESTRICT in, crypto_word_t ms, crypto_word_t ma) { … } // final_bit_to_all replicates the bit in the final position of the last word to // all the bits in the word. static crypto_word_t final_bit_to_all(crypto_word_t v) { … } // poly3_top_bits_are_clear returns one iff the extra bits in the final words of // |p| are zero. OPENSSL_UNUSED static int poly3_top_bits_are_clear(const struct poly3 *p) { … } // poly3_mod_phiN reduces |p| by Φ(N). static void poly3_mod_phiN(struct poly3 *p) { … } static void poly3_cswap(struct poly3 *a, struct poly3 *b, crypto_word_t swap) { … } static void poly3_lshift1(struct poly3 *p) { … } static void poly3_rshift1(struct poly3 *p) { … } // poly3_span represents a pointer into a poly3. struct poly3_span { … }; // poly3_span_add adds |n| words of values from |a| and |b| and writes the // result to |out|. static void poly3_span_add(const struct poly3_span *out, const struct poly3_span *a, const struct poly3_span *b, size_t n) { … } // poly3_span_sub subtracts |n| words of |b| from |n| words of |a|. static void poly3_span_sub(const struct poly3_span *a, const struct poly3_span *b, size_t n) { … } // poly3_mul_aux is a recursive function that multiplies |n| words from |a| and // |b| and writes 2×|n| words to |out|. Each call uses 2*ceil(n/2) elements of // |scratch| and the function recurses, except if |n| == 1, when |scratch| isn't // used and the recursion stops. For |n| in {11, 22}, the transitive total // amount of |scratch| needed happens to be 2n+2. static void poly3_mul_aux(const struct poly3_span *out, const struct poly3_span *scratch, const struct poly3_span *a, const struct poly3_span *b, size_t n) { … } // HRSS_poly3_mul sets |*out| to |x|×|y| mod Φ(N). void HRSS_poly3_mul(struct poly3 *out, const struct poly3 *x, const struct poly3 *y) { … } #if defined(HRSS_HAVE_VECTOR_UNIT) && !defined(OPENSSL_AARCH64) // poly3_vec_cswap swaps (|a_s|, |a_a|) and (|b_s|, |b_a|) if |swap| is // |0xff..ff|. Otherwise, |swap| must be zero. static inline void poly3_vec_cswap(vec_t a_s[6], vec_t a_a[6], vec_t b_s[6], vec_t b_a[6], const vec_t swap) { … } // poly3_vec_fmsub subtracts (|ms|, |ma|) × (|b_s|, |b_a|) from (|a_s|, |a_a|). static inline void poly3_vec_fmsub(vec_t a_s[6], vec_t a_a[6], vec_t b_s[6], vec_t b_a[6], const vec_t ms, const vec_t ma) { … } // poly3_invert_vec sets |*out| to |in|^-1, i.e. such that |out|×|in| == 1 mod // Φ(N). static void poly3_invert_vec(struct poly3 *out, const struct poly3 *in) { … } #endif // HRSS_HAVE_VECTOR_UNIT // HRSS_poly3_invert sets |*out| to |in|^-1, i.e. such that |out|×|in| == 1 mod // Φ(N). void HRSS_poly3_invert(struct poly3 *out, const struct poly3 *in) { … } // Polynomials in Q. // Coefficients are reduced mod Q. (Q is clearly not prime, therefore the // coefficients do not form a field.) #define Q … // VECS_PER_POLY is the number of 128-bit vectors needed to represent a // polynomial. #define COEFFICIENTS_PER_VEC … #define VECS_PER_POLY … // poly represents a polynomial with coefficients mod Q. Note that, while Q is a // power of two, this does not operate in GF(Q). That would be a binary field // but this is simply mod Q. Thus the coefficients are not a field. // // Coefficients are ordered little-endian, thus the coefficient of x^0 is the // first element of the array. struct poly { … }; // poly_normalize zeros out the excess elements of |x| which are included only // for alignment. static void poly_normalize(struct poly *x) { … } // poly_assert_normalized asserts that the excess elements of |x| are zeroed out // for the cases that case. (E.g. |poly_mul_vec|.) static void poly_assert_normalized(const struct poly *x) { … } OPENSSL_UNUSED static void poly_print(const struct poly *p) { … } // POLY_MUL_SCRATCH contains space for the working variables needed by // |poly_mul|. The contents afterwards may be discarded, but the object may also // be reused with future |poly_mul| calls to save heap allocations. // // This object must have 32-byte alignment. struct POLY_MUL_SCRATCH { … }; #if defined(HRSS_HAVE_VECTOR_UNIT) // poly_mul_vec_aux is a recursive function that multiplies |n| words from |a| // and |b| and writes 2×|n| words to |out|. Each call uses 2*ceil(n/2) elements // of |scratch| and the function recurses, except if |n| < 3, when |scratch| // isn't used and the recursion stops. If |n| == |VECS_PER_POLY| then |scratch| // needs 172 elements. static void poly_mul_vec_aux(vec_t *restrict out, vec_t *restrict scratch, const vec_t *restrict a, const vec_t *restrict b, const size_t n) { … } // poly_mul_vec sets |*out| to |x|×|y| mod (𝑥^n - 1). static void poly_mul_vec(struct POLY_MUL_SCRATCH *scratch, struct poly *out, const struct poly *x, const struct poly *y) { … } #endif // HRSS_HAVE_VECTOR_UNIT // poly_mul_novec_aux writes the product of |a| and |b| to |out|, using // |scratch| as scratch space. It'll use Karatsuba if the inputs are large // enough to warrant it. Each call uses 2*ceil(n/2) elements of |scratch| and // the function recurses, except if |n| < 64, when |scratch| isn't used and the // recursion stops. If |n| == |N| then |scratch| needs 1318 elements. static void poly_mul_novec_aux(uint16_t *out, uint16_t *scratch, const uint16_t *a, const uint16_t *b, size_t n) { … } // poly_mul_novec sets |*out| to |x|×|y| mod (𝑥^n - 1). static void poly_mul_novec(struct POLY_MUL_SCRATCH *scratch, struct poly *out, const struct poly *x, const struct poly *y) { … } static void poly_mul(struct POLY_MUL_SCRATCH *scratch, struct poly *r, const struct poly *a, const struct poly *b) { … } // poly_mul_x_minus_1 sets |p| to |p|×(𝑥 - 1) mod (𝑥^n - 1). static void poly_mul_x_minus_1(struct poly *p) { … } // poly_mod_phiN sets |p| to |p| mod Φ(N). static void poly_mod_phiN(struct poly *p) { … } // poly_clamp reduces each coefficient mod Q. static void poly_clamp(struct poly *p) { … } // Conversion functions // -------------------- // poly2_from_poly sets |*out| to |in| mod 2. static void poly2_from_poly(struct poly2 *out, const struct poly *in) { … } // mod3 treats |a| as a signed number and returns |a| mod 3. static uint16_t mod3(int16_t a) { … } // poly3_from_poly sets |*out| to |in|. static void poly3_from_poly(struct poly3 *out, const struct poly *in) { … } // poly3_from_poly_checked sets |*out| to |in|, which has coefficients in {0, 1, // Q-1}. It returns a mask indicating whether all coefficients were found to be // in that set. static crypto_word_t poly3_from_poly_checked(struct poly3 *out, const struct poly *in) { … } static void poly_from_poly2(struct poly *out, const struct poly2 *in) { … } static void poly_from_poly3(struct poly *out, const struct poly3 *in) { … } // Polynomial inversion // -------------------- // poly_invert_mod2 sets |*out| to |in^-1| (i.e. such that |*out|×|in| = 1 mod // Φ(N)), all mod 2. This isn't useful in itself, but is part of doing inversion // mod Q. static void poly_invert_mod2(struct poly *out, const struct poly *in) { … } // poly_invert sets |*out| to |in^-1| (i.e. such that |*out|×|in| = 1 mod Φ(N)). static void poly_invert(struct POLY_MUL_SCRATCH *scratch, struct poly *out, const struct poly *in) { … } // Marshal and unmarshal functions for various basic types. // -------------------------------------------------------- #define POLY_BYTES … // poly_marshal serialises all but the final coefficient of |in| to |out|. static void poly_marshal(uint8_t out[POLY_BYTES], const struct poly *in) { … } // poly_unmarshal parses the output of |poly_marshal| and sets |out| such that // all but the final coefficients match, and the final coefficient is calculated // such that evaluating |out| at one results in zero. It returns one on success // or zero if |in| is an invalid encoding. static int poly_unmarshal(struct poly *out, const uint8_t in[POLY_BYTES]) { … } // mod3_from_modQ maps {0, 1, Q-1, 65535} -> {0, 1, 2, 2}. Note that |v| may // have an invalid value when processing attacker-controlled inputs. static uint16_t mod3_from_modQ(uint16_t v) { … } // poly_marshal_mod3 marshals |in| to |out| where the coefficients of |in| are // all in {0, 1, Q-1, 65535} and |in| is mod Φ(N). (Note that coefficients may // have invalid values when processing attacker-controlled inputs.) static void poly_marshal_mod3(uint8_t out[HRSS_POLY3_BYTES], const struct poly *in) { … } // HRSS-specific functions // ----------------------- // poly_short_sample samples a vector of values in {0xffff (i.e. -1), 0, 1}. // This is the same action as the algorithm in [HRSSNIST] section 1.8.1, but // with HRSS-SXY the sampling algorithm is now a private detail of the // implementation (previously it had to match between two parties). This // function uses that freedom to implement a flatter distribution of values. static void poly_short_sample(struct poly *out, const uint8_t in[HRSS_SAMPLE_BYTES]) { … } // poly_short_sample_plus performs the T+ sample as defined in [HRSSNIST], // section 1.8.2. static void poly_short_sample_plus(struct poly *out, const uint8_t in[HRSS_SAMPLE_BYTES]) { … } // poly_lift computes the function discussed in [HRSS], appendix B. static void poly_lift(struct poly *out, const struct poly *a) { … } struct public_key { … }; struct private_key { … }; // public_key_from_external converts an external public key pointer into an // internal one. Externally the alignment is only specified to be eight bytes // but we need 16-byte alignment. We could annotate the external struct with // that alignment but we can only assume that malloced pointers are 8-byte // aligned in any case. (Even if the underlying malloc returns values with // 16-byte alignment, |OPENSSL_malloc| will store an 8-byte size prefix and mess // that up.) static struct public_key *public_key_from_external( struct HRSS_public_key *ext) { … } // private_key_from_external does the same thing as |public_key_from_external|, // but for private keys. See the comment on that function about alignment // issues. static struct private_key *private_key_from_external( struct HRSS_private_key *ext) { … } // malloc_align32 returns a pointer to |size| bytes of 32-byte-aligned heap and // sets |*out_ptr| to a value that can be passed to |OPENSSL_free| to release // it. It returns NULL if out of memory. static void *malloc_align32(void **out_ptr, size_t size) { … } int HRSS_generate_key( struct HRSS_public_key *out_pub, struct HRSS_private_key *out_priv, const uint8_t in[HRSS_SAMPLE_BYTES + HRSS_SAMPLE_BYTES + 32]) { … } static const char kSharedKey[] = …; int HRSS_encap(uint8_t out_ciphertext[POLY_BYTES], uint8_t out_shared_key[32], const struct HRSS_public_key *in_pub, const uint8_t in[HRSS_SAMPLE_BYTES + HRSS_SAMPLE_BYTES]) { … } int HRSS_decap(uint8_t out_shared_key[HRSS_KEY_BYTES], const struct HRSS_private_key *in_priv, const uint8_t *ciphertext, size_t ciphertext_len) { … } void HRSS_marshal_public_key(uint8_t out[HRSS_PUBLIC_KEY_BYTES], const struct HRSS_public_key *in_pub) { … } int HRSS_parse_public_key(struct HRSS_public_key *out, const uint8_t in[HRSS_PUBLIC_KEY_BYTES]) { … }