/* Basic UTF-8 manipulation routines by Jeff Bezanson placed in the public domain Fall 2005 This code is designed to provide the utilities you need to manipulate UTF-8 as an internal string encoding. These functions do not perform the error checking normally needed when handling UTF-8 data, so if you happen to be from the Unicode Consortium you will want to flay me alive. I do this because error checking can be performed at the boundaries (I/O), with these routines reserved for higher performance on data known to be valid. modified by Bryan Jurish (moo) March 2009 + removed some unneeded functions (escapes, printf etc), added others modified by IOhannes m zmölnig (umlaeute) Nov 2021 + convert native strings to UTF-8 */ #include <string.h> #include <stdarg.h> #include "s_utf8.h" static const uint32_t offsetsFromUTF8[6] = …; static const char trailingBytesForUTF8[256] = …; /* returns length of next utf-8 sequence */ int u8_seqlen(const char *s) { … } /* conversions without error checking only works for valid UTF-8, i.e. no 5- or 6-byte sequences srcsz = source size in bytes, or -1 if 0-terminated sz = dest size in # of wide characters returns # characters converted dest will always be L'\0'-terminated, even if there isn't enough room for all the characters. if sz = srcsz+1 (i.e. 4*srcsz+4 bytes), there will always be enough space. */ int u8_utf8toucs2(uint16_t *dest, int sz, const char *src, int srcsz) { … } /* srcsz = number of source characters, or -1 if 0-terminated sz = size of dest buffer in bytes returns # characters converted dest will only be '\0'-terminated if there is enough space. this is for consistency; imagine there are 2 bytes of space left, but the next character requires 3 bytes. in this case we could NUL-terminate, but in general we can't when there's insufficient space. therefore this function only NUL-terminates if all the characters fit, and there's space for the NUL as well. the destination string will never be bigger than the source string. */ int u8_ucs2toutf8(char *dest, int sz, const uint16_t *src, int srcsz) { … } /* moo: get byte length of character number, or 0 if not supported */ int u8_wc_nbytes(uint32_t ch) { … } int u8_wc_toutf8(char *dest, uint32_t ch) { … } /*-- moo --*/ int u8_wc_toutf8_nul(char *dest, uint32_t ch) { … } /* charnum => byte offset */ int u8_offset(const char *str, int charnum) { … } /* byte offset => charnum */ int u8_charnum(const char *s, int offset) { … } /* reads the next utf-8 sequence out of a string, updating an index */ uint32_t u8_nextchar(const char *s, int *i) { … } /* number of characters */ int u8_strlen(const char *s) { … } void u8_inc(const char *s, int *i) { … } void u8_dec(const char *s, int *i) { … } /* srcsz = number of source characters, or -1 if 0-terminated sz = size of dest buffer in bytes returns # characters converted */ #ifdef _WIN32 #include <windows.h> #endif int u8_nativetoutf8(char *dest, int sz, const char *src, int srcsz) { … }