chromium/third_party/hunspell/google/bdict.h

// Copyright 2011 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef THIRD_PARTY_HUNSPELL_GOOGLE_BDICT_H_
#define THIRD_PARTY_HUNSPELL_GOOGLE_BDICT_H_

#include <stddef.h>
#include <stdint.h>

#include "base/containers/span.h"
#include "base/hash/md5.h"

// BDict (binary dictionary) format. All offsets are little endian.
//
// Header (32 bytes).
//   "BDic" Signature (4 bytes)
//   Version (little endian 4 bytes)
//   Absolute offset in file of the aff info. (4 bytes)
//   Absolute offset in file of the dic table. (4 bytes)
//   (Added by v2.0) MD5 checksum of the aff info and the dic table. (16 bytes)
//
// Aff information:
//   Absolute offset in file of the affix group table (4 bytes)
//   Absolute offset in file of the affix rules table (4 bytes)
//   Absolute offset in file of the replacements table (4 bytes)
//   Absolute offset in file of the "other rules" table (4 bytes)
//
//   The data between the aff header and the affix rules table is the comment
//   from the beginning of the .aff file which often contains copyrights, etc.
//
//   Affix group table:
//     Array of NULL terminated strings. It will end in a double-NULL.
//
//   Affix rules table:
//     List of LF terminated lines. NULL terminated.
//
//   Replacements table:
//     List of pairs of NULL terminated words. The end is indicated by a
//     double-NULL. The first word in the pair is the replacement source, the
//     second is what to replace it with. Example:
//       foo\0bar\0a\0b\0\0
//     for replacing ("foo" with "bar") and ("a" with "b").
//
//   Other rules table:
//     List of LF terminated lines. NULL terminated.
//
//
// Dic table. This stores the .dic file which contains the words in the
// dictionary, and indices for each one that indicate a set of suffixes or
// prefixes that can be applied. We store it in a trie to save space. It
// replaces Hunspell's hash manager.
//
//   0abxxxxx xxxxxxxx (in binary) Leaf node:
//     The number stored in the bits represented by x is the affix index.
//
//     If bit <a> is set, the leaf node has an additional string. Following the
//     2 byte header is a NULL-terminated (possibly 0-length) string that should
//     be appended to the node. This allows long unique endings to be handled
//     efficiently.
//
//     If bit <b> is set, the leaf node has a supplemental list of affix IDs
//     following the ordinary data for the leaf node. These affix group IDs are
//     additional rules for the same word. For example, two prefixes may go
//     with distinct sets of suffixes.
//
//     If the affix index is all 1's, then that means that there is only the
//     supplemental list, and the 13 bits of affix index built into the node
//     don't count. This is used to represent numbers greater than 13 bits,
//     since the supplemental list has 16 bits per entry. The node must have a
//     supplemental list if this is set.
//
//     This additional array is an array of 16-bit little-endian values,
//     terminated by 0xFFFF (since 0 is an affix ID meaning "no affix ID").
//
//   110000ab (in binary): Lookup node.
//     When <a> is set, addresses are 32-bits relative to the beginning of the
//     dictionary data. When unset, addresses are 16-bits relative to the
//     beginning of this node. All values are little endian.
//
//     When <b> is set, there is one additional entry before the table begins.
//     This is the 0th character. 0 is a common addition (meaning no more data)
//     and this prevents us from having to store entries for all the control
//     characters. This magic element is not counted in the table size.
//
//     The ID byte is followed by two bytes:
//       XX: First character value in the lookup table.
//       XX: Number of characters in the lookup table.
//
//     This is followed optionally by the entry for 0, and then by a table of
//     size indicated by the second byte after the ID.
//
//   1110xxxx: List node with 8-bit addresses.
//     The number of items (max 16) in the list is stored in the bits xxxx.
//     Followed by N (character byte, 8-bit offset) pairs. These offsets are
//     relative to the end of the list of pairs.
//   1111xxxx: List node with 16-bit addresses. Same as above but offsets are
//     2-bytes each. LITTLE ENDIAN!

namespace hunspell {

#pragma pack(push, 1)

// Class named after the BDict (binary dictionary) file format documented in
// the comment at the top of this file. As declared here it has no data
// members and no member functions.
class BDict {};

#pragma pack(pop)

}  // namespace hunspell

#endif  // THIRD_PARTY_HUNSPELL_GOOGLE_BDICT_H_