// Copyright 2011 The Chromium Authors // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #ifndef THIRD_PARTY_HUNSPELL_GOOGLE_BDICT_H_ #define THIRD_PARTY_HUNSPELL_GOOGLE_BDICT_H_ #include <stddef.h> #include <stdint.h> #include "base/containers/span.h" #include "base/hash/md5.h" // BDict (binary dictionary) format. All offsets are little endian. // // Header (32 bytes). // "BDic" Signature (4 bytes) // Version (little endian 4 bytes) // Absolute offset in file of the aff info. (4 bytes) // Absolute offset in file of the dic table. (4 bytes) // (Added by v2.0) MD5 checksum of the aff info and the dic table. (16 bytes) // // Aff information: // Absolute offset in file of the affix group table (4 bytes) // Absolute offset in file of the affix rules table (4 bytes) // Absolute offset in file of the replacements table (4 bytes) // Absolute offset in file of the "other rules" table (4 bytes) // // The data between the aff header and the affix rules table is the comment // from the beginning of the .aff file which often contains copyrights, etc. // // Affix group table: // Array of NULL terminated strings. It will end in a double-NULL. // // Affix rules table: // List of LF terminated lines. NULL terminated. // // Replacements table: // List of pairs of NULL terminated words. The end is indicated by a // double-NULL. The first word in the pair is the replacement source, the // second is what to replace it with. Example: // foo\0bar\0a\0b\0\0 // for replacing ("foo" with "bar") and ("a" with "b"). // // Other rules table: // List of LF terminated lines. NULL terminated. // // // Dic table. This stores the .dic file which contains the words in the // dictionary, and indices for each one that indicate a set of suffixes or // prefixes that can be applied. We store it in a trie to save space. It // replaces Hunspell's hash manager. 
// // 0abxxxxx xxxxxxxx (in binary) Leaf node: // The number stored in the bits represented by x is the affix index. // // If bit <a> is set, the leaf node has an additional string. Following the // 2 byte header is a NULL-terminated (possibly 0-length) string that should // be appended to the node. This allows long unique endings to be handled // efficiently. // // If bit <b> is set, the leaf node has a supplemental list of affix IDs // following the ordinary data for the leaf node. These affix group IDs are // additional rules for the same word. For example, two prefixes may go // with distinct sets of suffixes. // // If the affix index is all 1's, then that means that there is only the // supplemental list, and the 13 bits of affix index built into the node don't // count. This is used to represent numbers greater than 13 bits, since // the supplemental list has 16 bits per entry. The node must have a // supplemental list if this is set. // // This additional array is an array of 16-bit little-endian values, // terminated by 0xFFFF (since 0 is an affix ID meaning "no affix ID"). // // 110000ab: Lookup node. // When <a> is set, addresses are 32-bits relative to the beginning of the // dictionary data. When unset, addresses are 16-bits relative to the // beginning of this node. All values are little endian. // // When <b> is set, there is one additional entry before the table begins. // This is the 0th character. 0 is a common addition (meaning no more data) // and this prevents us from having to store entries for all the control // characters. This magic element is not counted in the table size. // // The ID byte is followed by two bytes: // XX: First character value in the lookup table. // XX: Number of characters in the lookup table. // // This is followed optionally by the entry for 0, and then by a table of // size indicated by the second character after the ID. // // 1110xxxx: List node with 8-bit addresses. 
// The number of items (max 16) in the list is stored in the bits xxxx. // Followed by N (character byte, 8-bit offset) pairs. These offsets are // relative to the end of the list of pairs. // 1111xxxx: List node with 16-bit addresses. Same as above but offsets are // 2-bytes each. LITTLE ENDIAN! namespace hunspell { #pragma pack(push, 1) class BDict { … }; #pragma pack(pop) } // namespace hunspell #endif // THIRD_PARTY_HUNSPELL_GOOGLE_BDICT_H_