// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ****************************************************************************** * * Copyright (C) 2000-2016, International Business Machines * Corporation and others. All Rights Reserved. * ****************************************************************************** * file name: ushape.cpp * encoding: UTF-8 * tab size: 8 (not used) * indentation:4 * * created on: 2000jun29 * created by: Markus W. Scherer * * Arabic letter shaping implemented by Ayman Roshdy */ #include "unicode/utypes.h" #include "unicode/uchar.h" #include "unicode/ustring.h" #include "unicode/ushape.h" #include "cmemory.h" #include "putilimp.h" #include "ustr_imp.h" #include "ubidi_props.h" #include "uassert.h" /* * This implementation is designed for 16-bit Unicode strings. * The main assumption is that the Arabic characters and their * presentation forms each fit into a single char16_t. * With UTF-8, they occupy 2 or 3 bytes, and more than the ASCII * characters. */ /* * ### TODO in general for letter shaping: * - the letter shaping code is UTF-16-unaware; needs update * + especially invertBuffer()?! 
* - needs to handle the "Arabic Tail" that is used in some legacy codepages * as a glyph fragment of wide-glyph letters * + IBM Unicode conversion tables map it to U+200B (ZWSP) * + IBM Egypt has proposed to encode the tail in Unicode among Arabic Presentation Forms * + Unicode 3.2 added U+FE73 ARABIC TAIL FRAGMENT */ /* Definitions for Arabic letter shaping: constant macros, the shaping state * struct and the file-local lookup tables --------------------------------- */ #define IRRELEVANT … #define LAMTYPE … #define ALEFTYPE … #define LINKR … #define LINKL … #define APRESENT … #define SHADDA … #define CSHADDA … #define COMBINE … #define HAMZAFE_CHAR … #define HAMZA06_CHAR … #define YEH_HAMZA_CHAR … #define YEH_HAMZAFE_CHAR … #define LAMALEF_SPACE_SUB … #define TASHKEEL_SPACE_SUB … #define NEW_TAIL_CHAR … #define OLD_TAIL_CHAR … #define LAM_CHAR … #define SPACE_CHAR … #define SHADDA_CHAR … #define TATWEEL_CHAR … #define SHADDA_TATWEEL_CHAR … #define SHADDA06_CHAR … #define SHAPE_MODE … #define DESHAPE_MODE … /* Shaping state. It is passed by value, as the shapeVars parameter, to * handleGeneratedSpaces, expandCompositCharAtNear, expandCompositChar, * shapeUnicode and deShapeUnicode. */ struct uShapeVariables { … }; /* File-local constant tables. * NOTE(review): the names suggest character-class and conversion data for * the 06xx, FBxx and FExx ranges -- confirm against the table contents. */ static const uint8_t tailFamilyIsolatedFinal[] = …; static const uint8_t tashkeelMedial[] = …; static const char16_t yehHamzaToYeh[] = …; static const uint8_t IrrelevantPos[] = …; static const char16_t convertLamAlef[] = …; static const char16_t araLink[178]= …; static const uint8_t presALink[] = …; static const uint8_t presBLink[]= …; static const char16_t convertFBto06[] = …; static const char16_t convertFEto06[] = …; static const uint8_t shapeTable[4][4][4]= …; /* * This function shapes European digits to Arabic-Indic digits * in-place, writing over the input characters. * Since we know that we are only looking for BMP code points, * we can safely just work with code units (again, at least UTF-16). 
*/ static void _shapeToArabicDigitsWithContext(char16_t *s, int32_t length, char16_t digitBase, UBool isLogical, UBool lastStrongWasAL) { … } /* *Name : invertBuffer *Function : Inverts (reverses) the buffer. It is used when the * caller specifies the text direction as * U_SHAPE_TEXT_DIRECTION_LOGICAL. * The options parameter is unnamed and therefore unused. * NOTE(review): lowlimit and highlimit presumably exclude * a number of code units at each end of the buffer from * the inversion -- confirm against the function body. */ static void invertBuffer(char16_t *buffer, int32_t size, uint32_t /*options*/, int32_t lowlimit, int32_t highlimit) { … } /* *Name : changeLamAlef *Function : Converts an Alef character into the equivalent * LamAlef location in the 0x06xx range. This is an * intermediate stage in the operation of the program; * the result is later converted into the 0xFExx LamAlefs * in the shaping function. */ static inline char16_t changeLamAlef(char16_t ch) { … } /* *Name : getLink *Function : Resolves the link between characters for ch, since * Arabic characters have four forms: * Isolated, Initial, Middle (medial) and Final. */ static char16_t getLink(char16_t ch) { … } /* *Name : countSpaces *Function : Counts the number of spaces at each end of the * logical buffer. The function returns void; the two * counts are delivered through the spacesCountl and * spacesCountr pointers. * The options parameter is unnamed and therefore unused. */ static void countSpaces(char16_t *dest, int32_t size, uint32_t /*options*/, int32_t *spacesCountl, int32_t *spacesCountr) { … } /* *Name : isTashkeelChar *Function : Returns 1 for Tashkeel characters in the 06 range, * otherwise returns 0. */ static inline int32_t isTashkeelChar(char16_t ch) { … } /* *Name : isTashkeelCharFE *Function : Returns 1 for Tashkeel characters in the FE range, * otherwise returns 0. */ static inline int32_t isTashkeelCharFE(char16_t ch) { … } /* *Name : isAlefChar *Function : Returns 1 for Alef characters, otherwise returns 0. */ static inline int32_t isAlefChar(char16_t ch) { … } /* *Name : isLamAlefChar *Function : Returns 1 for LamAlef characters, otherwise returns 0. */ static inline int32_t isLamAlefChar(char16_t ch) { … } /*BIDI *Name : isTailChar *Function : Returns 1 if the character matches one of the tail * characters (0xfe73 or 0x200b), otherwise returns 0. */ static inline int32_t isTailChar(char16_t ch) { … } /*BIDI *Name : 
isSeenTailFamilyChar *Function : returns 1 if the character is a seen family isolated character * in the FE range otherwise returns 0 */ static inline int32_t isSeenTailFamilyChar(char16_t ch) { … } /* Name : isSeenFamilyChar * Function : Returns 1 if the character is a Seen family character in the * Unicode 06 range, otherwise returns 0. */ static inline int32_t isSeenFamilyChar(char16_t ch){ … } /*Start of BIDI*/ /* *Name : isAlefMaksouraChar *Function : Returns 1 if the character is an Alef Maksoura in final or * isolated form, otherwise returns 0. */ static inline int32_t isAlefMaksouraChar(char16_t ch) { … } /* * Name : isYehHamzaChar * Function : Returns 1 if the character is a YehHamza in isolated or * final form, otherwise returns 0. */ static inline int32_t isYehHamzaChar(char16_t ch) { … } /* * Name: isTashkeelOnTatweelChar * Function: Checks whether the Tashkeel character is on a Tatweel. * Returns 1 for a Tashkeel on Tatweel (FE range), * 2 for a Tashkeel with Shadda on Tatweel (FC range), * otherwise returns 0. */ static inline int32_t isTashkeelOnTatweelChar(char16_t ch){ … } /* * Name: isIsolatedTashkeelChar * Function: Checks whether the Tashkeel character is in the isolated form. * Returns 1 for an isolated Tashkeel (i.e. Unicode FE range), * 2 for an isolated Tashkeel with Shadda (i.e. Unicode FC * range), otherwise returns 0. */ static inline int32_t isIsolatedTashkeelChar(char16_t ch){ … } /* *Name : calculateSize *Function : Calculates the destSize to be used in preflighting, * i.e. when the destSize is equal to 0. * It is also used to calculate the new destSize in case the * destination buffer will be resized. */ static int32_t calculateSize(const char16_t *source, int32_t sourceLength, int32_t destSize,uint32_t options) { … } /* *Name : handleTashkeelWithTatweel *Function : Replaces Tashkeel as following: * Case 1 :if the Tashkeel on tatweel, replace it with Tatweel. 
* Case 2 :if the Tashkeel aggregated with Shadda on Tatweel, replace * it with Shadda on Tatweel. * Case 3: if the Tashkeel is isolated replace it with Space. * */ /* The destSize, options and pErrorCode parameters are unnamed and therefore unused. */ static int32_t handleTashkeelWithTatweel(char16_t *dest, int32_t sourceLength, int32_t /*destSize*/, uint32_t /*options*/, UErrorCode * /*pErrorCode*/) { … } /* *Name : handleGeneratedSpaces *Function : The shapeUnicode function converts Lam + Alef into LamAlef + space, * and Tashkeel into space. * handleGeneratedSpaces places these generated spaces according to * the options the user specifies. LamAlef and Tashkeel spaces can * be placed at begin, at end or at near, or the buffer size can * be decreased instead. * * There is also an Auto option for LamAlef and Tashkeel, which puts * the spaces at the end of the buffer (or at the end of the text if * the user used the option U_SHAPE_SPACES_RELATIVE_TO_TEXT_BEGIN_END). * * If the text type is visual_LTR and the option * U_SHAPE_SPACES_RELATIVE_TO_TEXT_BEGIN_END is selected, the END * option places the space at the beginning of the buffer and * BEGIN places the space at the end of the buffer. * * shapeVars is received by value. */ static int32_t handleGeneratedSpaces(char16_t *dest, int32_t sourceLength, int32_t destSize, uint32_t options, UErrorCode *pErrorCode,struct uShapeVariables shapeVars ) { … } /* *Name : expandCompositCharAtBegin *Function : Expands the LamAlef character into Lam and Alef, consuming the * required space from the beginning of the buffer. If the text type * is visual_LTR and the option U_SHAPE_SPACES_RELATIVE_TO_TEXT_BEGIN_END * is selected, the spaces are located at the end of the buffer. * If there are no spaces to expand the LamAlef, the error is set * to U_NO_SPACE_AVAILABLE as defined in utypes.h. */ static int32_t expandCompositCharAtBegin(char16_t *dest, int32_t sourceLength, int32_t destSize,UErrorCode *pErrorCode) { … } /* *Name : expandCompositCharAtEnd *Function : Expands the LamAlef character to Lam and Alef consuming the * required space from end of the buffer. 
If the text type was * Visual LTR and the option U_SHAPE_SPACES_RELATIVE_TO_TEXT_BEGIN_END * was used, the spaces will be consumed from begin of buffer. If * there are no spaces to expand the LamAlef, an error * will be set to U_NO_SPACE_AVAILABLE as defined in utypes.h */ static int32_t expandCompositCharAtEnd(char16_t *dest, int32_t sourceLength, int32_t destSize,UErrorCode *pErrorCode) { … } /* *Name : expandCompositCharAtNear *Function : Expands a LamAlef character into Lam + Alef, a YehHamza character * into Yeh + Hamza, and a SeenFamily character into the SeenFamily * character + Tail, consuming the space next to the character. * If there is no space next to the character, the error is set * to U_NO_SPACE_AVAILABLE as defined in utypes.h. * NOTE(review): yehHamzaOption, seenTailOption and lamAlefOption * presumably select which of the three expansions are performed -- * confirm against the function body. * shapeVars is received by value. */ static int32_t expandCompositCharAtNear(char16_t *dest, int32_t sourceLength, int32_t destSize,UErrorCode *pErrorCode, int yehHamzaOption, int seenTailOption, int lamAlefOption, struct uShapeVariables shapeVars) { … } /* * Name : expandCompositChar * Function : LamAlef, need special handling, since it expands from one * character into two characters while shaping or deshaping. * In order to expand it, near or far spaces according to the * options user specifies. Also buffer size can be increased. * * For SeenFamily characters and YehHamza only the near option is * supported, while for LamAlef we can take spaces from begin, end, * near or even increase the buffer size. * There is also the Auto option for LamAlef only, which will first * search for a space at end, begin then near, respectively. 
* If there are no spaces to expand these characters, an error will be set to * U_NO_SPACE_AVAILABLE as defined in utypes.h */ /* NOTE(review): shapingMode is presumably one of SHAPE_MODE / DESHAPE_MODE -- confirm against the callers. shapeVars is received by value. */ static int32_t expandCompositChar(char16_t *dest, int32_t sourceLength, int32_t destSize,uint32_t options, UErrorCode *pErrorCode, int shapingMode,struct uShapeVariables shapeVars) { … } /* *Name : shapeUnicode *Function : Converts an Arabic Unicode buffer in the 06xx range into a shaped * Arabic Unicode buffer in the FExx range. dest is the only buffer * parameter; shapeVars is received by value. */ static int32_t shapeUnicode(char16_t *dest, int32_t sourceLength, int32_t destSize,uint32_t options, UErrorCode *pErrorCode, int tashkeelFlag, struct uShapeVariables shapeVars) { … } /* *Name : deShapeUnicode *Function : Converts an Arabic Unicode buffer in the FExx range into an * unshaped Arabic Unicode buffer in the 06xx range. dest is the * only buffer parameter; shapeVars is received by value. */ static int32_t deShapeUnicode(char16_t *dest, int32_t sourceLength, int32_t destSize,uint32_t options, UErrorCode *pErrorCode, struct uShapeVariables shapeVars) { … } /* **************************************** * u_shapeArabic * * Exported function (U_CAPI ... U_EXPORT2); the functions above it in this * file are declared static. * NOTE(review): the parameter and return-value contract is presumably the * one documented in unicode/ushape.h -- confirm there. **************************************** */ U_CAPI int32_t U_EXPORT2 u_shapeArabic(const char16_t *source, int32_t sourceLength, char16_t *dest, int32_t destCapacity, uint32_t options, UErrorCode *pErrorCode) { … }