// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * * Copyright (C) 2008-2011, International Business Machines * Corporation, Google and others. All Rights Reserved. * ******************************************************************************* */ // Author : [email protected] (Mohamed Eldawy) // ucnvsel.cpp // // Purpose: To generate a list of encodings capable of handling // a given Unicode text // // Started 09-April-2008 /** * \file * * This is an implementation of an encoding selector. * The goal is, given a unicode string, find the encodings * this string can be mapped to. To make processing faster * a trie is built when you call ucnvsel_open() that * stores all encodings a codepoint can map to */ #include "unicode/ucnvsel.h" #if !UCONFIG_NO_CONVERSION #include <string.h> #include "unicode/uchar.h" #include "unicode/uniset.h" #include "unicode/ucnv.h" #include "unicode/ustring.h" #include "unicode/uchriter.h" #include "utrie2.h" #include "propsvec.h" #include "uassert.h" #include "ucmndata.h" #include "udataswp.h" #include "uenumimp.h" #include "cmemory.h" #include "cstring.h" U_NAMESPACE_USE struct UConverterSelector { … }; static void generateSelectorData(UConverterSelector* result, UPropsVectors *upvec, const USet* excludedCodePoints, const UConverterUnicodeSet whichSet, UErrorCode* status) { … } /* open a selector. If converterListSize is 0, build for all converters. 
If excludedCodePoints is nullptr, don't exclude any codepoints */ U_CAPI UConverterSelector* U_EXPORT2 ucnvsel_open(const char* const* converterList, int32_t converterListSize, const USet* excludedCodePoints, const UConverterUnicodeSet whichSet, UErrorCode* status) { … } /* close opened selector */ U_CAPI void U_EXPORT2 ucnvsel_close(UConverterSelector *sel) { … } static const UDataInfo dataInfo = …; enum { … }; /* * Serialized form of a UConverterSelector, formatVersion 1: * * The serialized form begins with a standard ICU DataHeader with a UDataInfo * as the template above. * This is followed by: * int32_t indexes[UCNVSEL_INDEX_COUNT]; // see index entry constants above * serialized UTrie2; // indexes[UCNVSEL_INDEX_TRIE_SIZE] bytes * uint32_t pv[indexes[UCNVSEL_INDEX_PV_COUNT]]; // bit vectors * char* encodingNames[indexes[UCNVSEL_INDEX_NAMES_LENGTH]]; // NUL-terminated strings + padding */ /* serialize a selector */ U_CAPI int32_t U_EXPORT2 ucnvsel_serialize(const UConverterSelector* sel, void* buffer, int32_t bufferCapacity, UErrorCode* status) { … } /** * swap a selector into the desired Endianness and Asciiness of * the system. Note that selectors are always saved in the format * of the system that created them. They are only converted if used * on another system. In other words, selectors created on different * systems can differ even if the params are identical (endianness * and Asciiness differences only) * * @param ds pointer to data swapper containing swapping info * @param inData pointer to incoming data * @param length length of inData in bytes * @param outData pointer to output data. 
Capacity should * be at least equal to capacity of inData * @param status an in/out ICU UErrorCode * @return 0 on failure, number of bytes swapped on success * number of bytes swapped can be smaller than length */ static int32_t ucnvsel_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData, UErrorCode *status) { … } /* unserialize a selector */ U_CAPI UConverterSelector* U_EXPORT2 ucnvsel_openFromSerialized(const void* buffer, int32_t length, UErrorCode* status) { … } // functions for the enumeration of the selected encodings. Nothing fancy here; they just // iterate over the selected encodings struct Enumerator { … }; U_CDECL_BEGIN static void U_CALLCONV ucnvsel_close_selector_iterator(UEnumeration *enumerator) { … } static int32_t U_CALLCONV ucnvsel_count_encodings(UEnumeration *enumerator, UErrorCode *status) { … } static const char* U_CALLCONV ucnvsel_next_encoding(UEnumeration* enumerator, int32_t* resultLength, UErrorCode* status) { … } static void U_CALLCONV ucnvsel_reset_iterator(UEnumeration* enumerator, UErrorCode* status) { … } U_CDECL_END static const UEnumeration defaultEncodings = …; // internal fn to intersect two sets of masks // returns whether the mask has reduced to all zeros static UBool intersectMasks(uint32_t* dest, const uint32_t* source1, int32_t len) { … } // internal fn to count how many 1 bits there are in a mask // algorithm taken from http://graphics.stanford.edu/~seander/bithacks.html static int16_t countOnes(uint32_t* mask, int32_t len) { … } /* internal function! 
*/ static UEnumeration *selectForMask(const UConverterSelector* sel, uint32_t *theMask, UErrorCode *status) { … } /* check a string against the selector - UTF16 version */ U_CAPI UEnumeration * U_EXPORT2 ucnvsel_selectForString(const UConverterSelector* sel, const char16_t *s, int32_t length, UErrorCode *status) { … } /* check a string against the selector - UTF8 version */ U_CAPI UEnumeration * U_EXPORT2 ucnvsel_selectForUTF8(const UConverterSelector* sel, const char *s, int32_t length, UErrorCode *status) { … } #endif // !UCONFIG_NO_CONVERSION