//! Mapping of characters (codepoints, not graphemes) to nominal glyph identifiers.
//!
//! If you have never run into character-to-glyph mapping before,
//! [Glyph IDs and the 'cmap' table](https://rsheeter.github.io/font101/#glyph-ids-and-the-cmap-table)
//! might be informative.
//!
//! The functionality in this module provides a 1-to-1 mapping from Unicode
//! characters (or [Unicode variation sequences](http://unicode.org/faq/vs.html)) to
//! nominal or "default" internal glyph identifiers for a given font.
//! This is a necessary first step, but generally insufficient for proper layout of
//! [complex text](https://en.wikipedia.org/wiki/Complex_text_layout) or even
//! simple text containing diacritics and ligatures.
//!
//! Comprehensive mapping of characters to positioned glyphs requires a process called
//! shaping. For more detail, see: [Why do I need a shaping engine?](https://harfbuzz.github.io/why-do-i-need-a-shaping-engine.html)
use read_fonts::{
tables::cmap::{
self, Cmap, Cmap12, Cmap12Iter, Cmap14, Cmap14Iter, Cmap4, Cmap4Iter, CmapSubtable,
EncodingRecord, PlatformId,
},
types::GlyphId,
FontData, TableProvider,
};
pub use read_fonts::tables::cmap::MapVariant;
/// Mapping of characters to nominal glyph identifiers.
///
/// The mappings are derived from the [cmap](https://learn.microsoft.com/en-us/typography/opentype/spec/cmap)
/// table.
///
/// ## Obtaining a Charmap
///
/// Typically a Charmap is acquired by calling [charmap](crate::MetadataProvider::charmap) on a [FontRef](crate::FontRef).
///
/// ## Selection strategy
///
/// Fonts may contain multiple subtables in various formats supporting different encodings. The selection
/// strategy implemented here is designed to choose mappings that capture the broadest available Unicode
/// coverage:
///
/// * Unicode characters: a symbol mapping subtable is selected if available. Otherwise, subtables supporting
/// the Unicode full repertoire or Basic Multilingual Plane (BMP) are preferred, in that order. Formats
/// [4](https://learn.microsoft.com/en-us/typography/opentype/spec/cmap#format-4-segment-mapping-to-delta-values)
/// and [12](https://learn.microsoft.com/en-us/typography/opentype/spec/cmap#format-12-segmented-coverage) are
/// supported.
///
/// * Unicode variation sequences: these are provided by a format
/// [14](https://learn.microsoft.com/en-us/typography/opentype/spec/cmap#format-14-unicode-variation-sequences)
/// subtable.
///
/// ## Empty character maps
///
/// The [`Default`] value holds no subtables at all; every lookup on it returns
/// `None` and its iterators yield nothing.
#[derive(Clone, Default)]
pub struct Charmap<'a> {
/// Subtable selected for mapping codepoints, paired with its symbol flag.
/// `None` when no supported codepoint subtable was selected.
codepoint_subtable: Option<CodepointSubtable<'a>>,
/// Format 14 subtable used for mapping Unicode variation sequences, if one
/// was selected.
variant_subtable: Option<Cmap14<'a>>,
}
impl<'a> Charmap<'a> {
/// Creates a new character map from the given font.
pub fn new(font: &impl TableProvider<'a>) -> Self {
let Ok(cmap) = font.cmap() else {
return Default::default();
};
let selection = MappingSelection::new(&cmap);
Self {
codepoint_subtable: selection
.codepoint_subtable
.map(|subtable| CodepointSubtable {
subtable,
is_symbol: selection.mapping_index.codepoint_subtable_is_symbol,
}),
variant_subtable: selection.variant_subtable,
}
}
/// Returns true if a suitable Unicode character mapping is available.
pub fn has_map(&self) -> bool {
self.codepoint_subtable.is_some()
}
/// Returns true if a symbol mapping was selected.
pub fn is_symbol(&self) -> bool {
self.codepoint_subtable
.as_ref()
.map(|x| x.is_symbol)
.unwrap_or(false)
}
/// Returns true if a Unicode variation sequence mapping is available.
pub fn has_variant_map(&self) -> bool {
self.variant_subtable.is_some()
}
/// Maps a character to a nominal glyph identifier.
///
/// Returns `None` if a mapping does not exist.
pub fn map(&self, ch: impl Into<u32>) -> Option<GlyphId> {
self.codepoint_subtable.as_ref()?.map(ch.into())
}
/// Returns an iterator over all mappings of codepoint to nominal glyph
/// identifiers in the character map.
pub fn mappings(&self) -> Mappings<'a> {
self.codepoint_subtable
.as_ref()
.map(|subtable| {
Mappings(match &subtable.subtable {
SupportedSubtable::Format4(cmap4) => MappingsInner::Format4(cmap4.iter()),
SupportedSubtable::Format12(cmap12) => MappingsInner::Format12(cmap12.iter()),
})
})
.unwrap_or(Mappings(MappingsInner::None))
}
/// Maps a character and variation selector to a nominal glyph identifier.
///
/// Returns `None` if a mapping does not exist.
pub fn map_variant(&self, ch: impl Into<u32>, selector: impl Into<u32>) -> Option<MapVariant> {
self.variant_subtable.as_ref()?.map_variant(ch, selector)
}
/// Returns an iterator over all mappings of character and variation
/// selector to nominal glyph identifier in the character map.
pub fn variant_mappings(&self) -> VariantMappings<'a> {
VariantMappings(self.variant_subtable.clone().map(|cmap14| cmap14.iter()))
}
}
/// Cacheable indices of selected mapping tables for materializing a character
/// map.
///
/// Since [`Charmap`] carries a lifetime, it is difficult to store in a cache.
/// This type serves as an acceleration structure that allows for construction
/// of a character map while skipping the search for the most suitable Unicode
/// mappings.
///
/// Each stored index is a position in the `cmap` table's list of encoding
/// records. The [`Default`] value refers to no subtables.
#[derive(Copy, Clone, Default, Debug)]
pub struct MappingIndex {
/// Index of Unicode or symbol mapping subtable.
codepoint_subtable: Option<u16>,
/// True if the above is a symbol mapping.
codepoint_subtable_is_symbol: bool,
/// Index of Unicode variation selector subtable.
variant_subtable: Option<u16>,
}
impl MappingIndex {
/// Finds the indices of the most suitable Unicode mapping tables in the
/// given font.
pub fn new<'a>(font: &impl TableProvider<'a>) -> Self {
let Ok(cmap) = font.cmap() else {
return Default::default();
};
MappingSelection::new(&cmap).mapping_index
}
/// Creates a new character map for the given font using the tables referenced by
/// the precomputed indices.
///
/// The font should be the same as the one used to construct this object.
pub fn charmap<'a>(&self, font: &impl TableProvider<'a>) -> Charmap<'a> {
let Ok(cmap) = font.cmap() else {
return Default::default();
};
let records = cmap.encoding_records();
let data = cmap.offset_data();
Charmap {
codepoint_subtable: self
.codepoint_subtable
.and_then(|index| get_subtable(data, records, index))
.and_then(SupportedSubtable::new)
.map(|subtable| CodepointSubtable {
subtable,
is_symbol: self.codepoint_subtable_is_symbol,
}),
variant_subtable: self
.variant_subtable
.and_then(|index| get_subtable(data, records, index))
.and_then(|subtable| match subtable {
CmapSubtable::Format14(cmap14) => Some(cmap14),
_ => None,
}),
}
}
}
/// Iterator over all mappings of character to nominal glyph identifier
/// in a character map.
///
/// Each item is a `(codepoint, glyph identifier)` pair. The iterator yields
/// nothing when the character map has no codepoint subtable.
///
/// This is created with the [`Charmap::mappings`] method.
#[derive(Clone)]
pub struct Mappings<'a>(MappingsInner<'a>);
impl<'a> Iterator for Mappings<'a> {
type Item = (u32, GlyphId);
fn next(&mut self) -> Option<Self::Item> {
match &mut self.0 {
MappingsInner::None => None,
MappingsInner::Format4(iter) => iter.next(),
MappingsInner::Format12(iter) => iter.next(),
}
}
}
/// Format-specific state backing the public [`Mappings`] iterator.
#[derive(Clone)]
enum MappingsInner<'a> {
/// No codepoint subtable is available; iteration yields nothing.
None,
/// Iteration over a format 4 subtable.
Format4(Cmap4Iter<'a>),
/// Iteration over a format 12 subtable.
Format12(Cmap12Iter<'a>),
}
/// Iterator over all mappings of character and variation selector to
/// nominal glyph identifier in a character map.
///
/// Each item is a `(character, variation selector, mapping)` triple. The
/// wrapped option is `None` when the character map has no variation sequence
/// subtable, in which case the iterator yields nothing.
///
/// This is created with the [`Charmap::variant_mappings`] method.
#[derive(Clone)]
pub struct VariantMappings<'a>(Option<Cmap14Iter<'a>>);
impl<'a> Iterator for VariantMappings<'a> {
    type Item = (u32, u32, MapVariant);

    /// Advances the underlying format 14 iterator, if there is one.
    fn next(&mut self) -> Option<Self::Item> {
        match &mut self.0 {
            Some(iter) => iter.next(),
            None => None,
        }
    }
}
/// Resolves the subtable referenced by the encoding record at `index`.
///
/// `data` is the table data that record offsets are resolved against.
/// Returns `None` when `index` is past the end of `records` or when the
/// subtable fails to parse.
fn get_subtable<'a>(
    data: FontData<'a>,
    records: &[EncodingRecord],
    index: u16,
) -> Option<CmapSubtable<'a>> {
    let record = records.get(usize::from(index))?;
    record.subtable(data).ok()
}
/// A codepoint mapping subtable together with a flag recording whether it
/// was selected as a symbol mapping.
#[derive(Clone)]
struct CodepointSubtable<'a> {
/// The underlying format 4 or format 12 subtable.
subtable: SupportedSubtable<'a>,
/// True if the subtable is a symbol mapping.
is_symbol: bool,
}
impl<'a> CodepointSubtable<'a> {
    /// Maps a codepoint to a glyph identifier, applying the symbol font
    /// fallback when the direct lookup fails.
    fn map(&self, codepoint: u32) -> Option<GlyphId> {
        if let Some(glyph_id) = self.map_impl(codepoint) {
            return Some(glyph_id);
        }
        if !self.is_symbol || codepoint > 0x00FF {
            return None;
        }
        // From HarfBuzz:
        // For symbol-encoded OpenType fonts, we duplicate the
        // U+F000..F0FF range at U+0000..U+00FF. That's what
        // Windows seems to do, and that's hinted about at:
        // https://docs.microsoft.com/en-us/typography/opentype/spec/recom
        // under "Non-Standard (Symbol) Fonts".
        // See <https://github.com/harfbuzz/harfbuzz/blob/453ded05392af38bba9f89587edce465e86ffa6b/src/hb-ot-cmap-table.hh#L1595>
        self.map_impl(codepoint + 0xF000)
    }

    /// Performs the direct lookup in the underlying subtable with no
    /// fallback.
    fn map_impl(&self, codepoint: u32) -> Option<GlyphId> {
        match &self.subtable {
            SupportedSubtable::Format4(cmap4) => cmap4.map_codepoint(codepoint),
            SupportedSubtable::Format12(cmap12) => cmap12.map_codepoint(codepoint),
        }
    }
}
/// The `cmap` subtable formats this module can use for mapping codepoints.
#[derive(Clone)]
enum SupportedSubtable<'a> {
/// A format 4 subtable.
Format4(Cmap4<'a>),
/// A format 12 subtable.
Format12(Cmap12<'a>),
}
impl<'a> SupportedSubtable<'a> {
    /// Wraps a parsed subtable when it is format 4 or format 12; every other
    /// format yields `None`.
    fn new(subtable: CmapSubtable<'a>) -> Option<Self> {
        match subtable {
            CmapSubtable::Format4(cmap4) => Some(Self::Format4(cmap4)),
            CmapSubtable::Format12(cmap12) => Some(Self::Format12(cmap12)),
            _ => None,
        }
    }

    /// Resolves the subtable referenced by `record` against the data of
    /// `cmap` and wraps it when the format is supported.
    ///
    /// Returns `None` when the subtable fails to parse or has an unsupported
    /// format.
    fn from_cmap_record(cmap: &Cmap<'a>, record: &cmap::EncodingRecord) -> Option<Self> {
        let subtable = record.subtable(cmap.offset_data()).ok()?;
        Self::new(subtable)
    }
}
/// The mapping kind of a cmap subtable.
///
/// The ordering is significant and determines the priority of subtable
/// selection (greater is better).
#[derive(Copy, Clone, PartialEq, PartialOrd)]
enum MappingKind {
/// Nothing selected yet; any real mapping kind outranks this.
None = 0,
/// Unicode mapping limited to the Basic Multilingual Plane.
UnicodeBmp = 1,
/// Unicode full repertoire mapping.
UnicodeFull = 2,
/// Symbol mapping; preferred over all Unicode mappings.
Symbol = 3,
}
/// The result of searching the cmap table for the "best" available
/// subtables.
///
/// For `codepoint_subtable`, best means either symbol (which is preferred)
/// or a Unicode subtable with the greatest coverage.
///
/// For `variant_subtable`, best means a format 14 subtable.
///
/// The parsed subtables are kept next to their encoding record indices so
/// that one search can produce both a [`Charmap`] and a [`MappingIndex`].
struct MappingSelection<'a> {
/// The mapping index accelerator that holds indices of the following
/// subtables.
mapping_index: MappingIndex,
/// Either a symbol subtable or the Unicode subtable with the
/// greatest coverage.
codepoint_subtable: Option<SupportedSubtable<'a>>,
/// Subtable that supports mapping Unicode variation sequences.
variant_subtable: Option<Cmap14<'a>>,
}
impl<'a> MappingSelection<'a> {
    /// Scans the encoding records of `cmap` and picks the best codepoint
    /// mapping subtable and the variation sequence subtable.
    ///
    /// Records are visited from last to first. A codepoint subtable replaces
    /// the current choice only when its kind ranks strictly higher, and only
    /// the first format 14 subtable encountered is kept.
    fn new(cmap: &Cmap<'a>) -> Self {
        const ENCODING_MS_SYMBOL: u16 = 0;
        const ENCODING_MS_UNICODE_CS: u16 = 1;
        const ENCODING_APPLE_ID_UNICODE_32: u16 = 4;
        const ENCODING_APPLE_ID_VARIANT_SELECTOR: u16 = 5;
        const ENCODING_MS_ID_UCS_4: u16 = 10;

        let mut selection = Self {
            mapping_index: MappingIndex::default(),
            codepoint_subtable: None,
            variant_subtable: None,
        };
        let mut best_kind = MappingKind::None;

        // This generally follows the same strategy as FreeType, searching the encoding
        // records in reverse and prioritizing UCS-4 subtables over UCS-2.
        // See <https://gitlab.freedesktop.org/freetype/freetype/-/blob/ac5babe87629107c43f627e2cd17c6cf4f2ecd43/src/base/ftobjs.c#L1370>
        // The exception is that we prefer a symbol subtable over all others which matches the behavior
        // of HarfBuzz.
        // See <https://github.com/harfbuzz/harfbuzz/blob/453ded05392af38bba9f89587edce465e86ffa6b/src/hb-ot-cmap-table.hh#L1818>
        for (i, record) in cmap.encoding_records().iter().enumerate().rev() {
            // Classify the record; variation sequence records are handled in
            // place and never compete for the codepoint slot.
            let kind = match (record.platform_id(), record.encoding_id()) {
                (PlatformId::Unicode, ENCODING_APPLE_ID_VARIANT_SELECTOR) => {
                    // Unicode variation sequences
                    if let Ok(CmapSubtable::Format14(cmap14)) =
                        record.subtable(cmap.offset_data())
                    {
                        if selection.variant_subtable.is_none() {
                            selection.mapping_index.variant_subtable = Some(i as u16);
                            selection.variant_subtable = Some(cmap14);
                        }
                    }
                    continue;
                }
                // Symbol
                (PlatformId::Windows, ENCODING_MS_SYMBOL) => MappingKind::Symbol,
                // Unicode full repertoire
                (PlatformId::Windows, ENCODING_MS_ID_UCS_4)
                | (PlatformId::Unicode, ENCODING_APPLE_ID_UNICODE_32) => MappingKind::UnicodeFull,
                // Unicode BMP only
                (PlatformId::ISO, _)
                | (PlatformId::Unicode, _)
                | (PlatformId::Windows, ENCODING_MS_UNICODE_CS) => MappingKind::UnicodeBmp,
                _ => continue,
            };
            // Records whose subtable does not parse or is not format 4/12
            // are ignored.
            let Some(subtable) = SupportedSubtable::from_cmap_record(cmap, record) else {
                continue;
            };
            if kind > best_kind {
                best_kind = kind;
                selection.mapping_index.codepoint_subtable_is_symbol =
                    kind == MappingKind::Symbol;
                selection.mapping_index.codepoint_subtable = Some(i as u16);
                selection.codepoint_subtable = Some(subtable);
            }
        }
        selection
    }
}
// Unit tests covering subtable selection, codepoint and variation sequence
// lookups, and agreement between the iterators and the lookup methods.
#[cfg(test)]
mod tests {
use super::*;
use crate::MetadataProvider;
use read_fonts::FontRef;
// The selected codepoint subtable for this font must be format 12.
#[test]
fn choose_format_12_over_4() {
let font = FontRef::new(font_test_data::CMAP12_FONT1).unwrap();
let charmap = font.charmap();
assert!(matches!(
charmap.codepoint_subtable.unwrap().subtable,
SupportedSubtable::Format12(..)
));
}
// The selected codepoint subtable for this font must be format 4.
#[test]
fn choose_format_4() {
let font = FontRef::new(font_test_data::VAZIRMATN_VAR).unwrap();
let charmap = font.charmap();
assert!(matches!(
charmap.codepoint_subtable.unwrap().subtable,
SupportedSubtable::Format4(..)
));
}
// A symbol mapping must be selected, and it must be backed by format 4.
#[test]
fn choose_symbol() {
let font = FontRef::new(font_test_data::CMAP4_SYMBOL_PUA).unwrap();
let charmap = font.charmap();
assert!(charmap.is_symbol());
assert!(matches!(
charmap.codepoint_subtable.unwrap().subtable,
SupportedSubtable::Format4(..)
));
}
// Codepoint lookups through a format 4 subtable, including a miss.
#[test]
fn map_format_4() {
let font = FontRef::new(font_test_data::VAZIRMATN_VAR).unwrap();
let charmap = font.charmap();
assert_eq!(charmap.map('A'), Some(GlyphId::new(1)));
assert_eq!(charmap.map('À'), Some(GlyphId::new(2)));
assert_eq!(charmap.map('`'), Some(GlyphId::new(3)));
assert_eq!(charmap.map('B'), None);
}
// Codepoint lookups through a format 12 subtable, using codepoints outside
// the BMP, plus a miss.
#[test]
fn map_format_12() {
let font = FontRef::new(font_test_data::CMAP12_FONT1).unwrap();
let charmap = font.charmap();
assert_eq!(charmap.map(' '), None);
assert_eq!(charmap.map(0x101723_u32), Some(GlyphId::new(1)));
assert_eq!(charmap.map(0x101725_u32), Some(GlyphId::new(3)));
assert_eq!(charmap.map(0x102523_u32), Some(GlyphId::new(6)));
assert_eq!(charmap.map(0x102526_u32), Some(GlyphId::new(9)));
assert_eq!(charmap.map(0x102527_u32), Some(GlyphId::new(10)));
}
// Symbol fonts: direct lookups in U+F000..F0FF and the fallback that
// remaps U+0000..00FF into that range.
#[test]
fn map_symbol_pua() {
let font = FontRef::new(font_test_data::CMAP4_SYMBOL_PUA).unwrap();
let charmap = font.charmap();
assert!(charmap.codepoint_subtable.as_ref().unwrap().is_symbol);
assert_eq!(charmap.map(0xF001_u32), Some(GlyphId::new(1)));
assert_eq!(charmap.map(0xF002_u32), Some(GlyphId::new(2)));
assert_eq!(charmap.map(0xF003_u32), Some(GlyphId::new(3)));
assert_eq!(charmap.map(0xF0FE_u32), Some(GlyphId::new(4)));
// The following don't exist in the cmap table and are remapped into the U+F000..F0FF range
// due to the selection of a symbol mapping subtable.
assert_eq!(charmap.map(0x1_u32), Some(GlyphId::new(1)));
assert_eq!(charmap.map(0x2_u32), Some(GlyphId::new(2)));
assert_eq!(charmap.map(0x3_u32), Some(GlyphId::new(3)));
assert_eq!(charmap.map(0xFE_u32), Some(GlyphId::new(4)));
}
// Variation sequence lookups: a miss, two `UseDefault` results and two
// explicit variant glyphs.
#[test]
fn map_variants() {
use super::MapVariant::*;
let font = FontRef::new(font_test_data::CMAP14_FONT1).unwrap();
let charmap = font.charmap();
let selector = '\u{e0100}';
assert_eq!(charmap.map_variant('a', selector), None);
assert_eq!(charmap.map_variant('\u{4e00}', selector), Some(UseDefault));
assert_eq!(charmap.map_variant('\u{4e06}', selector), Some(UseDefault));
assert_eq!(
charmap.map_variant('\u{4e08}', selector),
Some(Variant(GlyphId::new(25)))
);
assert_eq!(
charmap.map_variant('\u{4e09}', selector),
Some(Variant(GlyphId::new(26)))
);
}
// Every pair produced by `mappings()` must agree with `map()` for the same
// codepoint, across several test fonts.
#[test]
fn mappings() {
for font_data in [
font_test_data::VAZIRMATN_VAR,
font_test_data::CMAP12_FONT1,
font_test_data::SIMPLE_GLYF,
font_test_data::CMAP4_SYMBOL_PUA,
] {
let font = FontRef::new(font_data).unwrap();
let charmap = font.charmap();
for (codepoint, glyph_id) in charmap.mappings() {
assert_eq!(charmap.map(codepoint), Some(glyph_id));
}
}
}
// Every triple produced by `variant_mappings()` must agree with
// `map_variant()` for the same character and selector.
#[test]
fn variant_mappings() {
let font = FontRef::new(font_test_data::CMAP14_FONT1).unwrap();
let charmap = font.charmap();
for (codepoint, selector, variant) in charmap.variant_mappings() {
assert_eq!(charmap.map_variant(codepoint, selector), Some(variant));
}
}
}