utf8.c | Explore in Territory

#include "git-compat-util.h"
#include "strbuf.h"
#include "utf8.h"

/* This code is originally from https://www.cl.cam.ac.uk/~mgk25/ucs/ */

static const char utf16_be_bom[] = …;
static const char utf16_le_bom[] = …;
static const char utf32_be_bom[] = …;
static const char utf32_le_bom[] = …;

struct interval { … };

size_t display_mode_esc_sequence_len(const char *s)
{ … }

/* auxiliary function for binary search in interval table */
static int bisearch(ucs_char_t ucs, const struct interval *table, int max)
{ … }

/* The following two functions define the column width of an ISO 10646
 * character as follows:
 *
 *    - The null character (U+0000) has a column width of 0.
 *
 *    - Other C0/C1 control characters and DEL will lead to a return
 *      value of -1.
 *
 *    - Non-spacing and enclosing combining characters (general
 *      category code Mn or Me in the Unicode database) have a
 *      column width of 0.
 *
 *    - SOFT HYPHEN (U+00AD) has a column width of 1.
 *
 *    - Other format characters (general category code Cf in the Unicode
 *      database) and ZERO WIDTH SPACE (U+200B) have a column width of 0.
 *
 *    - Hangul Jamo medial vowels and final consonants (U+1160-U+11FF)
 *      have a column width of 0.
 *
 *    - Spacing characters in the East Asian Wide (W) or East Asian
 *      Full-width (F) category as defined in Unicode Technical
 *      Report #11 have a column width of 2.
 *
 *    - All remaining characters (including all printable
 *      ISO 8859-1 and WGL4 characters, Unicode control characters,
 *      etc.) have a column width of 1.
 *
 * This implementation assumes that ucs_char_t characters are encoded
 * in ISO 10646.
 */

static int git_wcwidth(ucs_char_t ch)
{ … }

/*
 * Pick one ucs character starting from the location *start points at,
 * and return it, while updating the *start pointer to point at the
 * end of that character.  When remainder_p is not NULL, the location
 * holds the number of bytes remaining in the string that we are allowed
 * to pick from.  Otherwise we are allowed to pick up to the NUL that
 * would eventually appear in the string.  *remainder_p is also reduced
 * by the number of bytes we have consumed.
 *
 * If the string was not valid UTF-8, the *start pointer is set to NULL
 * and the return value is undefined.
 */
static ucs_char_t pick_one_utf8_char(const char **start, size_t *remainder_p)
{ … }

/*
 * This function returns the number of columns occupied by the character
 * pointed to by the variable start. The pointer is updated to point at
 * the next character. When remainder_p is not NULL, it points at the
 * location that stores the number of remaining bytes we can use to pick
 * a character (see pick_one_utf8_char() above).
 */
int utf8_width(const char **start, size_t *remainder_p)
{ … }

/*
 * Returns the total number of columns required by a null-terminated
 * string, assuming that the string is utf8.  Returns strlen() instead
 * if the string does not look like a valid utf8 string.
 */
int utf8_strnwidth(const char *string, size_t len, int skip_ansi)
{ … }

int utf8_strwidth(const char *string)
{ … }

int is_utf8(const char *text)
{ … }

static void strbuf_add_indented_text(struct strbuf *buf, const char *text,
				     int indent, int indent2)
{ … }

/*
 * Wrap the text, if necessary. The variable indent1 is the indent for the
 * first line, indent2 is the indent for all other lines.
 * If indent1 is negative, assume that already -indent1 columns have been
 * consumed (and no extra indent is necessary for the first line).
 */
void strbuf_add_wrapped_text(struct strbuf *buf,
		const char *text, int indent1, int indent2, int width)
{ … }

void strbuf_add_wrapped_bytes(struct strbuf *buf, const char *data, int len,
			     int indent, int indent2, int width)
{ … }

void strbuf_utf8_replace(struct strbuf *sb_src, int pos, int width,
			 const char *subst)
{ … }

/*
 * Returns true (1) if the src encoding name matches the dst encoding
 * name directly or one of its alternative names. E.g. UTF-16BE is the
 * same as UTF16BE.
 */
static int same_utf_encoding(const char *src, const char *dst)
{ … }

int is_encoding_utf8(const char *name)
{ … }

int same_encoding(const char *src, const char *dst)
{ … }

/*
 * Wrapper for fprintf that returns the total number of columns required
 * for the printed string, assuming that the string is utf8.
 */
int utf8_fprintf(FILE *stream, const char *format, ...)
{ … }

/*
 * Given a buffer and its encoding, return it re-encoded
 * with iconv.  If the conversion fails, returns NULL.
 */
#ifndef NO_ICONV
/*
 * iconv_ibp is the pointer type used for iconv()'s input buffer.
 * Platforms selected by the condition below get the const-qualified
 * variant (presumably because their iconv() prototype takes
 * `const char **` -- confirm against the platform headers); everywhere
 * else the plain `char *` form is used.
 */
#if defined(OLD_ICONV) || (defined(__sun__) && !defined(_XPG6))
	typedef const char * iconv_ibp;
#else
	typedef char * iconv_ibp;
#endif
char *reencode_string_iconv(const char *in, size_t insz, iconv_t conv,
			    size_t bom_len, size_t *outsz_p)
{ … }

static const char *fallback_encoding(const char *name)
{ … }

char *reencode_string_len(const char *in, size_t insz,
			  const char *out_encoding, const char *in_encoding,
			  size_t *outsz)
{ … }
#endif

static int has_bom_prefix(const char *data, size_t len,
			  const char *bom, size_t bom_len)
{ … }

int has_prohibited_utf_bom(const char *enc, const char *data, size_t len)
{ … }

int is_missing_required_utf_bom(const char *enc, const char *data, size_t len)
{ … }

/*
 * Returns first character length in bytes for multi-byte `text` according to
 * `encoding`.
 *
 * - The `text` pointer is updated to point at the next character.
 * - When `remainder_p` is not NULL, on entry `*remainder_p` is how many bytes
 *   we can consume from text, and on exit `*remainder_p` is reduced by the
 *   returned character length. Otherwise `text` is treated as limited by NUL.
 */
int mbs_chrlen(const char **text, size_t *remainder_p, const char *encoding)
{ … }

/*
 * Pick the next char from the stream, ignoring codepoints an HFS+ would.
 * Note that this is _not_ complete by any means. It's just enough
 * to make is_hfs_dotgit() work, and should not be used otherwise.
 */
static ucs_char_t next_hfs_char(const char **in)
{ … }

static int is_hfs_dot_generic(const char *path,
			      const char *needle, size_t needle_len)
{ … }

/*
 * Inline wrapper to make sure the compiler resolves strlen() on literals at
 * compile time.
 */
static inline int is_hfs_dot_str(const char *path, const char *needle)
{ … }

int is_hfs_dotgit(const char *path)
{ … }

int is_hfs_dotgitmodules(const char *path)
{ … }

int is_hfs_dotgitignore(const char *path)
{ … }

int is_hfs_dotgitattributes(const char *path)
{ … }

int is_hfs_dotmailmap(const char *path)
{ … }

const char utf8_bom[] = …;

int skip_utf8_bom(char **text, size_t len)
{ … }

void strbuf_utf8_align(struct strbuf *buf, align_type position, unsigned int width,
		       const char *s)
{ … }
git/utf8.c