/*===-- blake3.c - BLAKE3 C Implementation ------------------------*- C -*-===*\
|*                                                                            *|
|* Released into the public domain with CC0 1.0                               *|
|* See 'llvm/lib/Support/BLAKE3/LICENSE' for info.                            *|
|* SPDX-License-Identifier: CC0-1.0                                           *|
|*                                                                            *|
\*===----------------------------------------------------------------------===*/

#include <assert.h>
#include <stdbool.h>
#include <string.h>

#include "blake3_impl.h"

const char *llvm_blake3_version(void) { … }

INLINE void chunk_state_init(blake3_chunk_state *self, const uint32_t key[8],
                             uint8_t flags) { … }

INLINE void chunk_state_reset(blake3_chunk_state *self, const uint32_t key[8],
                              uint64_t chunk_counter) { … }

INLINE size_t chunk_state_len(const blake3_chunk_state *self) { … }

INLINE size_t chunk_state_fill_buf(blake3_chunk_state *self,
                                   const uint8_t *input, size_t input_len) { … }

INLINE uint8_t chunk_state_maybe_start_flag(const blake3_chunk_state *self) { … }

typedef struct { … } output_t;

INLINE output_t make_output(const uint32_t input_cv[8],
                            const uint8_t block[BLAKE3_BLOCK_LEN],
                            uint8_t block_len, uint64_t counter,
                            uint8_t flags) { … }

// Chaining values within a given chunk (specifically the compress_in_place
// interface) are represented as words. This avoids unnecessary bytes<->words
// conversion overhead in the portable implementation. However, the hash_many
// interface handles both user input and parent node blocks, so it accepts
// bytes. For that reason, chaining values in the CV stack are represented as
// bytes.
INLINE void output_chaining_value(const output_t *self, uint8_t cv[32]) { … }

INLINE void output_root_bytes(const output_t *self, uint64_t seek,
                              uint8_t *out, size_t out_len) { … }

INLINE void chunk_state_update(blake3_chunk_state *self, const uint8_t *input,
                               size_t input_len) { … }

INLINE output_t chunk_state_output(const blake3_chunk_state *self) { … }

INLINE output_t parent_output(const uint8_t block[BLAKE3_BLOCK_LEN],
                              const uint32_t key[8], uint8_t flags) { … }

// Given some input larger than one chunk, return the number of bytes that
// should go in the left subtree. This is the largest power-of-2 number of
// chunks that leaves at least 1 byte for the right subtree.
INLINE size_t left_len(size_t content_len) { … }

// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE chunks at the same time
// on a single thread. Write out the chunk chaining values and return the
// number of chunks hashed. These chunks are never the root and never empty;
// those cases use a different codepath.
INLINE size_t compress_chunks_parallel(const uint8_t *input, size_t input_len,
                                       const uint32_t key[8],
                                       uint64_t chunk_counter, uint8_t flags,
                                       uint8_t *out) { … }

// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE parents at the same time
// on a single thread. Write out the parent chaining values and return the
// number of parents hashed. (If there's an odd input chaining value left
// over, return it as an additional output.) These parents are never the root
// and never empty; those cases use a different codepath.
INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values,
                                        size_t num_chaining_values,
                                        const uint32_t key[8], uint8_t flags,
                                        uint8_t *out) { … }
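// For reference while reading blake3_compress_subtree_wide() below: a sketch
// of the left_len() rule described above. This is illustrative only (the real
// body is elided here); round_down_to_power_of_2() and BLAKE3_CHUNK_LEN are
// the helpers from blake3_impl.h. Subtracting 1 before dividing is what
// reserves at least one byte for the right subtree:
//
//   size_t full_chunks = (content_len - 1) / BLAKE3_CHUNK_LEN;
//   return round_down_to_power_of_2(full_chunks) * BLAKE3_CHUNK_LEN;
//
// For example, 2049 bytes of input (2 chunks + 1 byte) gives full_chunks = 2,
// so the left subtree gets 2048 bytes and the right subtree gets 1 byte.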
// The wide helper function returns (writes out) an array of chaining values
// and returns the length of that array. The number of chaining values
// returned is the dynamically detected SIMD degree, at most MAX_SIMD_DEGREE,
// or fewer if the input is shorter than that many chunks. The reason for
// maintaining a wide array of chaining values going back up the tree is to
// allow the implementation to hash as many parents in parallel as possible.
//
// As a special case when the SIMD degree is 1, this function will still
// return at least 2 outputs. This guarantees that this function doesn't
// perform the root compression. (If it did, it would use the wrong flags, and
// also we wouldn't be able to implement extendable output.) Note that this
// function is not used when the whole input is only 1 chunk long; that's a
// different codepath.
//
// Why not just have the caller split the input on the first update(), instead
// of implementing this special rule? Because we don't want to limit SIMD or
// multi-threading parallelism for that update().
static size_t blake3_compress_subtree_wide(const uint8_t *input,
                                           size_t input_len,
                                           const uint32_t key[8],
                                           uint64_t chunk_counter,
                                           uint8_t flags, uint8_t *out) { … }

// Hash a subtree with compress_subtree_wide(), and then condense the
// resulting list of chaining values down to a single parent node. Don't
// compress that last parent node, however. Instead, return its message bytes
// (the concatenated chaining values of its children). This is necessary when
// the first call to update() supplies a complete subtree, because the topmost
// parent node of that subtree could end up being the root. It's also
// necessary for extended output in the general case.
//
// As with compress_subtree_wide(), this function is not used on inputs of 1
// chunk or less. That's a different codepath.
INLINE void compress_subtree_to_parent_node(
    const uint8_t *input, size_t input_len, const uint32_t key[8],
    uint64_t chunk_counter, uint8_t flags, uint8_t out[2 * BLAKE3_OUT_LEN]) { … }

INLINE void hasher_init_base(blake3_hasher *self, const uint32_t key[8],
                             uint8_t flags) { … }

void llvm_blake3_hasher_init(blake3_hasher *self) { … }

void llvm_blake3_hasher_init_keyed(blake3_hasher *self,
                                   const uint8_t key[BLAKE3_KEY_LEN]) { … }

void llvm_blake3_hasher_init_derive_key_raw(blake3_hasher *self,
                                            const void *context,
                                            size_t context_len) { … }

void llvm_blake3_hasher_init_derive_key(blake3_hasher *self,
                                        const char *context) { … }

// As described in hasher_push_cv() below, we do "lazy merging", delaying
// merges until right before the next CV is about to be added. This is
// different from the reference implementation. Another difference is that we
// aren't always merging 1 chunk at a time. Instead, each CV might represent
// any power-of-two number of chunks, as long as the smaller-above-larger
// stack order is maintained. Instead of the "count the trailing 0-bits"
// algorithm described in the spec, we use a "count the total number of
// 1-bits" variant that doesn't require us to retain the subtree size of the
// CV on top of the stack. The principle is the same: each CV that should
// remain in the stack is represented by a 1-bit in the total number of chunks
// (or bytes) so far.
INLINE void hasher_merge_cv_stack(blake3_hasher *self, uint64_t total_len) { … }
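// A sketch of the 1-bits rule above, for illustration only (the real body is
// elided here, and the blake3_hasher field names are assumptions based on the
// public header). popcnt() is the helper from blake3_impl.h. Any CVs on the
// stack beyond the number of 1-bits in the total length get merged into
// parent nodes:
//
//   size_t post_merge_stack_len = (size_t)popcnt(total_len);
//   while (self->cv_stack_len > post_merge_stack_len) {
//     // Merge the top two CVs on the stack into one parent CV, in place.
//     uint8_t *parent_node =
//         &self->cv_stack[(self->cv_stack_len - 2) * BLAKE3_OUT_LEN];
//     output_t output = parent_output(parent_node, self->key, self->chunk.flags);
//     output_chaining_value(&output, parent_node);
//     self->cv_stack_len -= 1;
//   }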
// In reference_impl.rs, we merge the new CV with existing CVs from the stack
// before pushing it. We can do that because we know more input is coming, so
// we know none of the merges are root.
//
// This setting is different. We want to feed as much input as possible to
// compress_subtree_wide(), without setting aside anything for the
// chunk_state. If the user gives us 64 KiB, we want to parallelize over all
// 64 KiB at once as a single subtree, if at all possible.
//
// This leads to two problems:
// 1) This 64 KiB input might be the only call that ever gets made to update.
//    In this case, the root node of the 64 KiB subtree would be the root node
//    of the whole tree, and it would need to be ROOT finalized. We can't
//    compress it until we know.
// 2) This 64 KiB input might complete a larger tree, whose root node is
//    similarly going to be the root of the whole tree. For example, maybe we
//    have 192 KiB (that is, 128 + 64) hashed so far. We can't compress the
//    node at the root of the 256 KiB subtree until we know how to finalize
//    it.
//
// The second problem is solved with "lazy merging". That is, when we're about
// to add a CV to the stack, we don't merge it with anything first, as the
// reference impl does. Instead we do merges using the *previous* CV that was
// added, which is sitting on top of the stack, and we put the new CV
// (unmerged) on top of the stack afterwards. This guarantees that we never
// merge the root node until finalize().
//
// Solving the first problem requires an additional tool,
// compress_subtree_to_parent_node(). That function always returns the top
// *two* chaining values of the subtree it's compressing. We then do lazy
// merging with each of them separately, so that the second CV will always
// remain unmerged. (That also helps us support extendable output when we're
// hashing an input all-at-once.)
INLINE void hasher_push_cv(blake3_hasher *self, uint8_t new_cv[BLAKE3_OUT_LEN],
                           uint64_t chunk_counter) { … }

void llvm_blake3_hasher_update(blake3_hasher *self, const void *input,
                               size_t input_len) { … }

void llvm_blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
                                 size_t out_len) { … }

void llvm_blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek,
                                      uint8_t *out, size_t out_len) { … }

void llvm_blake3_hasher_reset(blake3_hasher *self) { … }
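// Usage sketch (illustrative, not part of this file): the incremental
// init/update/finalize flow for the public functions defined above, assuming
// LLVM's C API header, llvm-c/blake3.h, which declares llvm_blake3_hasher and
// LLVM_BLAKE3_OUT_LEN:
//
//   #include "llvm-c/blake3.h"
//
//   uint8_t out[LLVM_BLAKE3_OUT_LEN];
//   llvm_blake3_hasher hasher;
//   llvm_blake3_hasher_init(&hasher);
//   llvm_blake3_hasher_update(&hasher, "hello, ", 7);
//   llvm_blake3_hasher_update(&hasher, "world", 5);
//   llvm_blake3_hasher_finalize(&hasher, out, LLVM_BLAKE3_OUT_LEN);
//
// Note that finalize() takes a const hasher, so it's possible to extract an
// interim hash and keep feeding input to the same hasher afterwards.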