#include "unicode/utf.h"
Go to the source code of this file.
Defines | |
#define | UTF_SIZE 16 |
Number of bits in a Unicode string code unit - ICU uses 16-bit Unicode. | |
#define | UTF_SAFE |
The default choice for general Unicode string macros is to use the ..._SAFE macro implementations with strict=FALSE. | |
#define | UTF8_ERROR_VALUE_1 0x15 |
#define | UTF8_ERROR_VALUE_2 0x9f |
See documentation on UTF8_ERROR_VALUE_1 for details. | |
#define | UTF_ERROR_VALUE 0xffff |
Error value for all UTFs. | |
#define | UTF_IS_ERROR(c) (((c)&0xfffe)==0xfffe || (c)==UTF8_ERROR_VALUE_1 || (c)==UTF8_ERROR_VALUE_2) |
Is a given 32-bit code an error value as returned by one of the macros for any UTF? | |
#define | UTF_IS_VALID(c) |
This is a combined macro: Is c a valid Unicode value _and_ not an error code? | |
#define | UTF_IS_SURROGATE(uchar) (((uchar)&0xfffff800)==0xd800) |
Is this code unit or code point a surrogate (U+d800..U+dfff)? | |
#define | UTF_IS_UNICODE_NONCHAR(c) |
Is a given 32-bit code point a Unicode noncharacter? | |
#define | UTF_IS_UNICODE_CHAR(c) |
Is a given 32-bit value a Unicode code point value (0..U+10ffff) that can be assigned a character? | |
#define | UTF8_COUNT_TRAIL_BYTES(leadByte) (utf8_countTrailBytes[(uint8_t)leadByte]) |
Count the trail bytes for a UTF-8 lead byte. | |
#define | UTF8_MASK_LEAD_BYTE(leadByte, countTrailBytes) ((leadByte)&=(1<<(6-(countTrailBytes)))-1) |
Mask a UTF-8 lead byte, leave only the lower bits that form part of the code point value. | |
#define | UTF8_IS_SINGLE(uchar) (((uchar)&0x80)==0) |
Is this code point a single code unit (byte)? | |
#define | UTF8_IS_LEAD(uchar) ((uint8_t)((uchar)-0xc0)<0x3e) |
Is this code unit the lead code unit (byte) of a code point? | |
#define | UTF8_IS_TRAIL(uchar) (((uchar)&0xc0)==0x80) |
Is this code unit a trailing code unit (byte) of a code point? | |
#define | UTF8_NEED_MULTIPLE_UCHAR(c) ((uint32_t)(c)>0x7f) |
Does this scalar Unicode value need multiple code units for storage? | |
#define | UTF8_CHAR_LENGTH(c) |
Given the code point c, how many bytes (code units) are taken by this code point in UTF-8. | |
#define | UTF8_MAX_CHAR_LENGTH 4 |
The maximum number of bytes per code point. | |
#define | UTF8_ARRAY_SIZE(size) ((5*(size))/2) |
Average number of code units compared to UTF-16. | |
#define | UTF8_GET_CHAR_UNSAFE(s, i, c) |
#define | UTF8_GET_CHAR_SAFE(s, start, i, length, c, strict) |
#define | UTF8_NEXT_CHAR_UNSAFE(s, i, c) |
#define | UTF8_APPEND_CHAR_UNSAFE(s, i, c) |
#define | UTF8_FWD_1_UNSAFE(s, i) |
#define | UTF8_FWD_N_UNSAFE(s, i, n) |
#define | UTF8_SET_CHAR_START_UNSAFE(s, i) |
#define | UTF8_NEXT_CHAR_SAFE(s, i, length, c, strict) |
#define | UTF8_APPEND_CHAR_SAFE(s, i, length, c) |
#define | UTF8_FWD_1_SAFE(s, i, length) U8_FWD_1(s, i, length) |
#define | UTF8_FWD_N_SAFE(s, i, length, n) U8_FWD_N(s, i, length, n) |
#define | UTF8_SET_CHAR_START_SAFE(s, start, i) U8_SET_CP_START(s, start, i) |
#define | UTF8_PREV_CHAR_UNSAFE(s, i, c) |
#define | UTF8_BACK_1_UNSAFE(s, i) |
#define | UTF8_BACK_N_UNSAFE(s, i, n) |
#define | UTF8_SET_CHAR_LIMIT_UNSAFE(s, i) |
#define | UTF8_PREV_CHAR_SAFE(s, start, i, c, strict) |
#define | UTF8_BACK_1_SAFE(s, start, i) U8_BACK_1(s, start, i) |
#define | UTF8_BACK_N_SAFE(s, start, i, n) U8_BACK_N(s, start, i, n) |
#define | UTF8_SET_CHAR_LIMIT_SAFE(s, start, i, length) U8_SET_CP_LIMIT(s, start, i, length) |
#define | UTF_IS_FIRST_SURROGATE(uchar) (((uchar)&0xfffffc00)==0xd800) |
Is uchar a first/lead surrogate? | |
#define | UTF_IS_SECOND_SURROGATE(uchar) (((uchar)&0xfffffc00)==0xdc00) |
Is uchar a second/trail surrogate? | |
#define | UTF_IS_SURROGATE_FIRST(c) (((c)&0x400)==0) |
Assuming c is a surrogate, is it a first/lead surrogate? | |
#define | UTF_SURROGATE_OFFSET ((0xd800<<10UL)+0xdc00-0x10000) |
Helper constant for UTF16_GET_PAIR_VALUE. | |
#define | UTF16_GET_PAIR_VALUE(first, second) (((first)<<10UL)+(second)-UTF_SURROGATE_OFFSET) |
Get the UTF-32 value from the surrogate code units. | |
#define | UTF_FIRST_SURROGATE(supplementary) (UChar)(((supplementary)>>10)+0xd7c0) |
#define | UTF_SECOND_SURROGATE(supplementary) (UChar)(((supplementary)&0x3ff)|0xdc00) |
#define | UTF16_LEAD(supplementary) UTF_FIRST_SURROGATE(supplementary) |
#define | UTF16_TRAIL(supplementary) UTF_SECOND_SURROGATE(supplementary) |
#define | UTF16_IS_SINGLE(uchar) !UTF_IS_SURROGATE(uchar) |
#define | UTF16_IS_LEAD(uchar) UTF_IS_FIRST_SURROGATE(uchar) |
#define | UTF16_IS_TRAIL(uchar) UTF_IS_SECOND_SURROGATE(uchar) |
#define | UTF16_NEED_MULTIPLE_UCHAR(c) ((uint32_t)(c)>0xffff) |
Does this scalar Unicode value need multiple code units for storage? | |
#define | UTF16_CHAR_LENGTH(c) ((uint32_t)(c)<=0xffff ? 1 : 2) |
#define | UTF16_MAX_CHAR_LENGTH 2 |
#define | UTF16_ARRAY_SIZE(size) (size) |
Average number of code units compared to UTF-16. | |
#define | UTF16_GET_CHAR_UNSAFE(s, i, c) |
Get a single code point from an offset that points to any of the code units that belong to that code point. | |
#define | UTF16_GET_CHAR_SAFE(s, start, i, length, c, strict) |
#define | UTF16_NEXT_CHAR_UNSAFE(s, i, c) |
#define | UTF16_APPEND_CHAR_UNSAFE(s, i, c) |
#define | UTF16_FWD_1_UNSAFE(s, i) |
#define | UTF16_FWD_N_UNSAFE(s, i, n) |
#define | UTF16_SET_CHAR_START_UNSAFE(s, i) |
#define | UTF16_NEXT_CHAR_SAFE(s, i, length, c, strict) |
#define | UTF16_APPEND_CHAR_SAFE(s, i, length, c) |
#define | UTF16_FWD_1_SAFE(s, i, length) U16_FWD_1(s, i, length) |
#define | UTF16_FWD_N_SAFE(s, i, length, n) U16_FWD_N(s, i, length, n) |
#define | UTF16_SET_CHAR_START_SAFE(s, start, i) U16_SET_CP_START(s, start, i) |
#define | UTF16_PREV_CHAR_UNSAFE(s, i, c) |
#define | UTF16_BACK_1_UNSAFE(s, i) |
#define | UTF16_BACK_N_UNSAFE(s, i, n) |
#define | UTF16_SET_CHAR_LIMIT_UNSAFE(s, i) |
#define | UTF16_PREV_CHAR_SAFE(s, start, i, c, strict) |
#define | UTF16_BACK_1_SAFE(s, start, i) U16_BACK_1(s, start, i) |
#define | UTF16_BACK_N_SAFE(s, start, i, n) U16_BACK_N(s, start, i, n) |
#define | UTF16_SET_CHAR_LIMIT_SAFE(s, start, i, length) U16_SET_CP_LIMIT(s, start, i, length) |
#define | UTF32_IS_SAFE(c, strict) |
#define | UTF32_IS_SINGLE(uchar) 1 |
#define | UTF32_IS_LEAD(uchar) 0 |
#define | UTF32_IS_TRAIL(uchar) 0 |
#define | UTF32_NEED_MULTIPLE_UCHAR(c) 0 |
#define | UTF32_CHAR_LENGTH(c) 1 |
#define | UTF32_MAX_CHAR_LENGTH 1 |
#define | UTF32_ARRAY_SIZE(size) (size) |
#define | UTF32_GET_CHAR_UNSAFE(s, i, c) |
#define | UTF32_GET_CHAR_SAFE(s, start, i, length, c, strict) |
#define | UTF32_NEXT_CHAR_UNSAFE(s, i, c) |
#define | UTF32_APPEND_CHAR_UNSAFE(s, i, c) |
#define | UTF32_FWD_1_UNSAFE(s, i) |
#define | UTF32_FWD_N_UNSAFE(s, i, n) |
#define | UTF32_SET_CHAR_START_UNSAFE(s, i) |
#define | UTF32_NEXT_CHAR_SAFE(s, i, length, c, strict) |
#define | UTF32_APPEND_CHAR_SAFE(s, i, length, c) |
#define | UTF32_FWD_1_SAFE(s, i, length) |
#define | UTF32_FWD_N_SAFE(s, i, length, n) |
#define | UTF32_SET_CHAR_START_SAFE(s, start, i) |
#define | UTF32_PREV_CHAR_UNSAFE(s, i, c) |
#define | UTF32_BACK_1_UNSAFE(s, i) |
#define | UTF32_BACK_N_UNSAFE(s, i, n) |
#define | UTF32_SET_CHAR_LIMIT_UNSAFE(s, i) |
#define | UTF32_PREV_CHAR_SAFE(s, start, i, c, strict) |
#define | UTF32_BACK_1_SAFE(s, start, i) |
#define | UTF32_BACK_N_SAFE(s, start, i, n) |
#define | UTF32_SET_CHAR_LIMIT_SAFE(s, i, length) |
#define | UTF_ARRAY_SIZE(size) UTF16_ARRAY_SIZE(size) |
Estimate the number of code units for a string based on the number of UTF-16 code units. | |
#define | UTF_GET_CHAR_UNSAFE(s, i, c) UTF16_GET_CHAR_UNSAFE(s, i, c) |
#define | UTF_GET_CHAR_SAFE(s, start, i, length, c, strict) UTF16_GET_CHAR_SAFE(s, start, i, length, c, strict) |
#define | UTF_NEXT_CHAR_UNSAFE(s, i, c) UTF16_NEXT_CHAR_UNSAFE(s, i, c) |
#define | UTF_NEXT_CHAR_SAFE(s, i, length, c, strict) UTF16_NEXT_CHAR_SAFE(s, i, length, c, strict) |
#define | UTF_APPEND_CHAR_UNSAFE(s, i, c) UTF16_APPEND_CHAR_UNSAFE(s, i, c) |
#define | UTF_APPEND_CHAR_SAFE(s, i, length, c) UTF16_APPEND_CHAR_SAFE(s, i, length, c) |
#define | UTF_FWD_1_UNSAFE(s, i) UTF16_FWD_1_UNSAFE(s, i) |
#define | UTF_FWD_1_SAFE(s, i, length) UTF16_FWD_1_SAFE(s, i, length) |
#define | UTF_FWD_N_UNSAFE(s, i, n) UTF16_FWD_N_UNSAFE(s, i, n) |
#define | UTF_FWD_N_SAFE(s, i, length, n) UTF16_FWD_N_SAFE(s, i, length, n) |
#define | UTF_SET_CHAR_START_UNSAFE(s, i) UTF16_SET_CHAR_START_UNSAFE(s, i) |
#define | UTF_SET_CHAR_START_SAFE(s, start, i) UTF16_SET_CHAR_START_SAFE(s, start, i) |
#define | UTF_PREV_CHAR_UNSAFE(s, i, c) UTF16_PREV_CHAR_UNSAFE(s, i, c) |
#define | UTF_PREV_CHAR_SAFE(s, start, i, c, strict) UTF16_PREV_CHAR_SAFE(s, start, i, c, strict) |
#define | UTF_BACK_1_UNSAFE(s, i) UTF16_BACK_1_UNSAFE(s, i) |
#define | UTF_BACK_1_SAFE(s, start, i) UTF16_BACK_1_SAFE(s, start, i) |
#define | UTF_BACK_N_UNSAFE(s, i, n) UTF16_BACK_N_UNSAFE(s, i, n) |
#define | UTF_BACK_N_SAFE(s, start, i, n) UTF16_BACK_N_SAFE(s, start, i, n) |
#define | UTF_SET_CHAR_LIMIT_UNSAFE(s, i) UTF16_SET_CHAR_LIMIT_UNSAFE(s, i) |
#define | UTF_SET_CHAR_LIMIT_SAFE(s, start, i, length) UTF16_SET_CHAR_LIMIT_SAFE(s, start, i, length) |
#define | UTF_IS_SINGLE(uchar) U16_IS_SINGLE(uchar) |
Does this code unit alone encode a code point (BMP, not a surrogate)? Same as UTF16_IS_SINGLE. | |
#define | UTF_IS_LEAD(uchar) U16_IS_LEAD(uchar) |
Is this code unit the first one of several (a lead surrogate)? Same as UTF16_IS_LEAD. | |
#define | UTF_IS_TRAIL(uchar) U16_IS_TRAIL(uchar) |
Is this code unit one of several but not the first one (a trail surrogate)? Same as UTF16_IS_TRAIL. | |
#define | UTF_NEED_MULTIPLE_UCHAR(c) UTF16_NEED_MULTIPLE_UCHAR(c) |
Does this code point require multiple code units (is it a supplementary code point)? Same as UTF16_NEED_MULTIPLE_UCHAR. | |
#define | UTF_CHAR_LENGTH(c) U16_LENGTH(c) |
How many code units are used to encode this code point (1 or 2)? Same as UTF16_CHAR_LENGTH. | |
#define | UTF_MAX_CHAR_LENGTH U16_MAX_LENGTH |
How many code units are used at most for any Unicode code point (2)? Same as UTF16_MAX_CHAR_LENGTH. | |
#define | UTF_GET_CHAR(s, start, i, length, c) U16_GET(s, start, i, length, c) |
Set c to the code point that contains the code unit i. | |
#define | UTF_NEXT_CHAR(s, i, length, c) U16_NEXT(s, i, length, c) |
Set c to the code point that starts at code unit i and advance i to beyond the code units of this code point (post-increment). | |
#define | UTF_APPEND_CHAR(s, i, length, c) UTF16_APPEND_CHAR_SAFE(s, i, length, c) |
Append the code units of code point c to the string at index i and advance i to beyond the new code units (post-increment). | |
#define | UTF_FWD_1(s, i, length) U16_FWD_1(s, i, length) |
Advance i to beyond the code units of the code point that begins at i. | |
#define | UTF_FWD_N(s, i, length, n) U16_FWD_N(s, i, length, n) |
Advance i to beyond the code units of the n code points where the first one begins at i. | |
#define | UTF_SET_CHAR_START(s, start, i) U16_SET_CP_START(s, start, i) |
Take the random-access index i and adjust it so that it points to the beginning of a code point. | |
#define | UTF_PREV_CHAR(s, start, i, c) U16_PREV(s, start, i, c) |
Set c to the code point that has code units before i and move i backward (towards the beginning of the string) to the first code unit of this code point (pre-decrement). | |
#define | UTF_BACK_1(s, start, i) U16_BACK_1(s, start, i) |
Move i backward (towards the beginning of the string) to the first code unit of the code point that has code units before i. | |
#define | UTF_BACK_N(s, start, i, n) U16_BACK_N(s, start, i, n) |
Move i backward (towards the beginning of the string) to the first code unit of the n code points that have code units before i. | |
#define | UTF_SET_CHAR_LIMIT(s, start, i, length) U16_SET_CP_LIMIT(s, start, i, length) |
Take the random-access index i and adjust it so that it points beyond a code point. |
Definition in file utf_old.h.
#define UTF16_APPEND_CHAR_SAFE | ( | s, | |||
i, | |||||
length, | |||||
c | ) |
Value:
{ \ if((uint32_t)(c)<=0xffff) { \ (s)[(i)++]=(uint16_t)(c); \ } else if((uint32_t)(c)<=0x10ffff) { \ if((i)+1<(length)) { \ (s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \ (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \ } else /* not enough space */ { \ (s)[(i)++]=UTF_ERROR_VALUE; \ } \ } else /* c>0x10ffff, write error value */ { \ (s)[(i)++]=UTF_ERROR_VALUE; \ } \ }
#define UTF16_APPEND_CHAR_UNSAFE | ( | s, | |||
i, | |||||
c | ) |
#define UTF16_ARRAY_SIZE | ( | size | ) | (size) |
#define UTF16_BACK_1_SAFE | ( | s, | |||
start, | |||||
i | ) | U16_BACK_1(s, start, i) |
#define UTF16_BACK_1_UNSAFE | ( | s, | |||
i | ) |
Value:
{ \ if(UTF_IS_SECOND_SURROGATE((s)[--(i)])) { \ --(i); \ } \ }
#define UTF16_BACK_N_SAFE | ( | s, | |||
start, | |||||
i, | |||||
n | ) | U16_BACK_N(s, start, i, n) |
#define UTF16_BACK_N_UNSAFE | ( | s, | |||
i, | |||||
n | ) |
Value:
{ \ int32_t __N=(n); \ while(__N>0) { \ UTF16_BACK_1_UNSAFE(s, i); \ --__N; \ } \ }
#define UTF16_CHAR_LENGTH | ( | c | ) | ((uint32_t)(c)<=0xffff ? 1 : 2) |
#define UTF16_FWD_1_UNSAFE | ( | s, | |||
i | ) |
Value:
{ \ if(UTF_IS_FIRST_SURROGATE((s)[(i)++])) { \ ++(i); \ } \ }
#define UTF16_FWD_N_UNSAFE | ( | s, | |||
i, | |||||
n | ) |
Value:
{ \ int32_t __N=(n); \ while(__N>0) { \ UTF16_FWD_1_UNSAFE(s, i); \ --__N; \ } \ }
#define UTF16_GET_CHAR_SAFE | ( | s, | |||
start, | |||||
i, | |||||
length, | |||||
c, | |||||
strict | ) |
Value:
{ \ (c)=(s)[i]; \ if(UTF_IS_SURROGATE(c)) { \ uint16_t __c2; \ if(UTF_IS_SURROGATE_FIRST(c)) { \ if((i)+1<(length) && UTF_IS_SECOND_SURROGATE(__c2=(s)[(i)+1])) { \ (c)=UTF16_GET_PAIR_VALUE((c), __c2); \ /* strict: ((c)&0xfffe)==0xfffe is caught by UTF_IS_ERROR() and UTF_IS_UNICODE_CHAR() */ \ } else if(strict) {\ /* unmatched first surrogate */ \ (c)=UTF_ERROR_VALUE; \ } \ } else { \ if((i)-1>=(start) && UTF_IS_FIRST_SURROGATE(__c2=(s)[(i)-1])) { \ (c)=UTF16_GET_PAIR_VALUE(__c2, (c)); \ /* strict: ((c)&0xfffe)==0xfffe is caught by UTF_IS_ERROR() and UTF_IS_UNICODE_CHAR() */ \ } else if(strict) {\ /* unmatched second surrogate */ \ (c)=UTF_ERROR_VALUE; \ } \ } \ } else if((strict) && !UTF_IS_UNICODE_CHAR(c)) { \ (c)=UTF_ERROR_VALUE; \ } \ }
#define UTF16_GET_CHAR_UNSAFE | ( | s, | |||
i, | |||||
c | ) |
Value:
{ \ (c)=(s)[i]; \ if(UTF_IS_SURROGATE(c)) { \ if(UTF_IS_SURROGATE_FIRST(c)) { \ (c)=UTF16_GET_PAIR_VALUE((c), (s)[(i)+1]); \ } else { \ (c)=UTF16_GET_PAIR_VALUE((s)[(i)-1], (c)); \ } \ } \ }
Assume 0<=i<length.
This could be used for iteration together with UTF16_CHAR_LENGTH() and UTF_IS_ERROR(), but the use of UTF16_NEXT_CHAR[_UNSAFE]() and UTF16_PREV_CHAR[_UNSAFE]() is more efficient for that.
#define UTF16_GET_PAIR_VALUE | ( | first, | |||
second | ) | (((first)<<10UL)+(second)-UTF_SURROGATE_OFFSET) |
Get the UTF-32 value from the surrogate code units.
#define UTF16_IS_LEAD | ( | uchar | ) | UTF_IS_FIRST_SURROGATE(uchar) |
#define UTF16_IS_SINGLE | ( | uchar | ) | !UTF_IS_SURROGATE(uchar) |
#define UTF16_IS_TRAIL | ( | uchar | ) | UTF_IS_SECOND_SURROGATE(uchar) |
#define UTF16_LEAD | ( | supplementary | ) | UTF_FIRST_SURROGATE(supplementary) |
#define UTF16_MAX_CHAR_LENGTH 2 |
#define UTF16_NEED_MULTIPLE_UCHAR | ( | c | ) | ((uint32_t)(c)>0xffff) |
Does this scalar Unicode value need multiple code units for storage?
#define UTF16_NEXT_CHAR_SAFE | ( | s, | |||
i, | |||||
length, | |||||
c, | |||||
strict | ) |
Value:
{ \ (c)=(s)[(i)++]; \ if(UTF_IS_FIRST_SURROGATE(c)) { \ uint16_t __c2; \ if((i)<(length) && UTF_IS_SECOND_SURROGATE(__c2=(s)[(i)])) { \ ++(i); \ (c)=UTF16_GET_PAIR_VALUE((c), __c2); \ /* strict: ((c)&0xfffe)==0xfffe is caught by UTF_IS_ERROR() and UTF_IS_UNICODE_CHAR() */ \ } else if(strict) {\ /* unmatched first surrogate */ \ (c)=UTF_ERROR_VALUE; \ } \ } else if((strict) && !UTF_IS_UNICODE_CHAR(c)) { \ /* unmatched second surrogate or other non-character */ \ (c)=UTF_ERROR_VALUE; \ } \ }
#define UTF16_NEXT_CHAR_UNSAFE | ( | s, | |||
i, | |||||
c | ) |
Value:
{ \ (c)=(s)[(i)++]; \ if(UTF_IS_FIRST_SURROGATE(c)) { \ (c)=UTF16_GET_PAIR_VALUE((c), (s)[(i)++]); \ } \ }
#define UTF16_PREV_CHAR_SAFE | ( | s, | |||
start, | |||||
i, | |||||
c, | |||||
strict | ) |
Value:
{ \ (c)=(s)[--(i)]; \ if(UTF_IS_SECOND_SURROGATE(c)) { \ uint16_t __c2; \ if((i)>(start) && UTF_IS_FIRST_SURROGATE(__c2=(s)[(i)-1])) { \ --(i); \ (c)=UTF16_GET_PAIR_VALUE(__c2, (c)); \ /* strict: ((c)&0xfffe)==0xfffe is caught by UTF_IS_ERROR() and UTF_IS_UNICODE_CHAR() */ \ } else if(strict) {\ /* unmatched second surrogate */ \ (c)=UTF_ERROR_VALUE; \ } \ } else if((strict) && !UTF_IS_UNICODE_CHAR(c)) { \ /* unmatched first surrogate or other non-character */ \ (c)=UTF_ERROR_VALUE; \ } \ }
#define UTF16_PREV_CHAR_UNSAFE | ( | s, | |||
i, | |||||
c | ) |
Value:
{ \ (c)=(s)[--(i)]; \ if(UTF_IS_SECOND_SURROGATE(c)) { \ (c)=UTF16_GET_PAIR_VALUE((s)[--(i)], (c)); \ } \ }
#define UTF16_SET_CHAR_LIMIT_UNSAFE | ( | s, | |||
i | ) |
Value:
{ \ if(UTF_IS_FIRST_SURROGATE((s)[(i)-1])) { \ ++(i); \ } \ }
#define UTF16_SET_CHAR_START_SAFE | ( | s, | |||
start, | |||||
i | ) | U16_SET_CP_START(s, start, i) |
#define UTF16_SET_CHAR_START_UNSAFE | ( | s, | |||
i | ) |
Value:
{ \ if(UTF_IS_SECOND_SURROGATE((s)[i])) { \ --(i); \ } \ }
#define UTF16_TRAIL | ( | supplementary | ) | UTF_SECOND_SURROGATE(supplementary) |
#define UTF32_APPEND_CHAR_SAFE | ( | s, | |||
i, | |||||
length, | |||||
c | ) |
Value:
{ \ if((uint32_t)(c)<=0x10ffff) { \ (s)[(i)++]=(c); \ } else /* c>0x10ffff, write 0xfffd */ { \ (s)[(i)++]=0xfffd; \ } \ }
#define UTF32_APPEND_CHAR_UNSAFE | ( | s, | |||
i, | |||||
c | ) |
#define UTF32_ARRAY_SIZE | ( | size | ) | (size) |
#define UTF32_BACK_1_SAFE | ( | s, | |||
start, | |||||
i | ) |
#define UTF32_BACK_1_UNSAFE | ( | s, | |||
i | ) |
#define UTF32_BACK_N_SAFE | ( | s, | |||
start, | |||||
i, | |||||
n | ) |
Value:
{ \
(i)-=(n); \
if((i)<(start)) { \
(i)=(start); \
} \
}
#define UTF32_BACK_N_UNSAFE | ( | s, | |||
i, | |||||
n | ) |
#define UTF32_CHAR_LENGTH | ( | c | ) | 1 |
#define UTF32_FWD_1_SAFE | ( | s, | |||
i, | |||||
length | ) |
#define UTF32_FWD_1_UNSAFE | ( | s, | |||
i | ) |
#define UTF32_FWD_N_SAFE | ( | s, | |||
i, | |||||
length, | |||||
n | ) |
#define UTF32_FWD_N_UNSAFE | ( | s, | |||
i, | |||||
n | ) |
#define UTF32_GET_CHAR_SAFE | ( | s, | |||
start, | |||||
i, | |||||
length, | |||||
c, | |||||
strict | ) |
Value:
{ \ (c)=(s)[i]; \ if(!UTF32_IS_SAFE(c, strict)) { \ (c)=UTF_ERROR_VALUE; \ } \ }
#define UTF32_GET_CHAR_UNSAFE | ( | s, | |||
i, | |||||
c | ) |
#define UTF32_IS_LEAD | ( | uchar | ) | 0 |
#define UTF32_IS_SAFE | ( | c, | |||
strict | ) |
Value:
(!(strict) ? \ (uint32_t)(c)<=0x10ffff : \ UTF_IS_UNICODE_CHAR(c))
#define UTF32_IS_SINGLE | ( | uchar | ) | 1 |
#define UTF32_IS_TRAIL | ( | uchar | ) | 0 |
#define UTF32_MAX_CHAR_LENGTH 1 |
#define UTF32_NEED_MULTIPLE_UCHAR | ( | c | ) | 0 |
#define UTF32_NEXT_CHAR_SAFE | ( | s, | |||
i, | |||||
length, | |||||
c, | |||||
strict | ) |
Value:
{ \ (c)=(s)[(i)++]; \ if(!UTF32_IS_SAFE(c, strict)) { \ (c)=UTF_ERROR_VALUE; \ } \ }
#define UTF32_NEXT_CHAR_UNSAFE | ( | s, | |||
i, | |||||
c | ) |
#define UTF32_PREV_CHAR_SAFE | ( | s, | |||
start, | |||||
i, | |||||
c, | |||||
strict | ) |
Value:
{ \ (c)=(s)[--(i)]; \ if(!UTF32_IS_SAFE(c, strict)) { \ (c)=UTF_ERROR_VALUE; \ } \ }
#define UTF32_PREV_CHAR_UNSAFE | ( | s, | |||
i, | |||||
c | ) |
#define UTF32_SET_CHAR_LIMIT_SAFE | ( | s, | |||
i, | |||||
length | ) |
#define UTF32_SET_CHAR_LIMIT_UNSAFE | ( | s, | |||
i | ) |
#define UTF32_SET_CHAR_START_SAFE | ( | s, | |||
start, | |||||
i | ) |
#define UTF32_SET_CHAR_START_UNSAFE | ( | s, | |||
i | ) |
#define UTF8_APPEND_CHAR_SAFE | ( | s, | |||
i, | |||||
length, | |||||
c | ) |
#define UTF8_APPEND_CHAR_UNSAFE | ( | s, | |||
i, | |||||
c | ) |
Value:
{ \ if((uint32_t)(c)<=0x7f) { \ (s)[(i)++]=(uint8_t)(c); \ } else { \ if((uint32_t)(c)<=0x7ff) { \ (s)[(i)++]=(uint8_t)(((c)>>6)|0xc0); \ } else { \ if((uint32_t)(c)<=0xffff) { \ (s)[(i)++]=(uint8_t)(((c)>>12)|0xe0); \ } else { \ (s)[(i)++]=(uint8_t)(((c)>>18)|0xf0); \ (s)[(i)++]=(uint8_t)((((c)>>12)&0x3f)|0x80); \ } \ (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80); \ } \ (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); \ } \ }
#define UTF8_ARRAY_SIZE | ( | size | ) | ((5*(size))/2) |
#define UTF8_BACK_1_SAFE | ( | s, | |||
start, | |||||
i | ) | U8_BACK_1(s, start, i) |
#define UTF8_BACK_1_UNSAFE | ( | s, | |||
i | ) |
Value:
{ \ while(UTF8_IS_TRAIL((s)[--(i)])) {} \ }
#define UTF8_BACK_N_SAFE | ( | s, | |||
start, | |||||
i, | |||||
n | ) | U8_BACK_N(s, start, i, n) |
#define UTF8_BACK_N_UNSAFE | ( | s, | |||
i, | |||||
n | ) |
Value:
{ \ int32_t __N=(n); \ while(__N>0) { \ UTF8_BACK_1_UNSAFE(s, i); \ --__N; \ } \ }
#define UTF8_CHAR_LENGTH | ( | c | ) |
Value:
((uint32_t)(c)<=0x7f ? 1 : \ ((uint32_t)(c)<=0x7ff ? 2 : \ ((uint32_t)((c)-0x10000)>0xfffff ? 3 : 4) \ ) \ )
ICU does not deal with code points >0x10ffff unless necessary for advancing in the byte stream.
These length macros take into account that for values >0x10ffff the UTF8_APPEND_CHAR_SAFE macros would write the error code point 0xffff with 3 bytes. Code point comparisons need to be in uint32_t because UChar32 may be a signed type, and negative values must be recognized.
#define UTF8_COUNT_TRAIL_BYTES | ( | leadByte | ) | (utf8_countTrailBytes[(uint8_t)leadByte]) |
Count the trail bytes for a UTF-8 lead byte.
#define UTF8_ERROR_VALUE_1 0x15 |
UTF8_ERROR_VALUE_1 and UTF8_ERROR_VALUE_2 are special error values for UTF-8, which need 1 or 2 bytes in UTF-8:
U+0015 = NAK = Negative Acknowledge, C0 control character
U+009f = highest C1 control character
These are used by UTF8_..._SAFE macros so that they can return an error value that needs the same number of code units (bytes) as were seen by a macro. They should be tested with UTF_IS_ERROR() or UTF_IS_VALID().
#define UTF8_ERROR_VALUE_2 0x9f |
#define UTF8_FWD_1_UNSAFE | ( | s, | |||
i | ) |
Value:
{ \ (i)+=1+UTF8_COUNT_TRAIL_BYTES((s)[i]); \ }
#define UTF8_FWD_N_UNSAFE | ( | s, | |||
i, | |||||
n | ) |
Value:
{ \ int32_t __N=(n); \ while(__N>0) { \ UTF8_FWD_1_UNSAFE(s, i); \ --__N; \ } \ }
#define UTF8_GET_CHAR_SAFE | ( | s, | |||
start, | |||||
i, | |||||
length, | |||||
c, | |||||
strict | ) |
Value:
{ \ int32_t _utf8_get_char_safe_index=(int32_t)(i); \ UTF8_SET_CHAR_START_SAFE(s, start, _utf8_get_char_safe_index); \ UTF8_NEXT_CHAR_SAFE(s, _utf8_get_char_safe_index, length, c, strict); \ }
#define UTF8_GET_CHAR_UNSAFE | ( | s, | |||
i, | |||||
c | ) |
Value:
{ \ int32_t _utf8_get_char_unsafe_index=(int32_t)(i); \ UTF8_SET_CHAR_START_UNSAFE(s, _utf8_get_char_unsafe_index); \ UTF8_NEXT_CHAR_UNSAFE(s, _utf8_get_char_unsafe_index, c); \ }
#define UTF8_IS_LEAD | ( | uchar | ) | ((uint8_t)((uchar)-0xc0)<0x3e) |
Is this code unit the lead code unit (byte) of a code point?
#define UTF8_IS_SINGLE | ( | uchar | ) | (((uchar)&0x80)==0) |
Is this code point a single code unit (byte)?
#define UTF8_IS_TRAIL | ( | uchar | ) | (((uchar)&0xc0)==0x80) |
Is this code unit a trailing code unit (byte) of a code point?
#define UTF8_MASK_LEAD_BYTE | ( | leadByte, | |||
countTrailBytes | ) | ((leadByte)&=(1<<(6-(countTrailBytes)))-1) |
Mask a UTF-8 lead byte, leave only the lower bits that form part of the code point value.
#define UTF8_MAX_CHAR_LENGTH 4 |
The maximum number of bytes per code point.
#define UTF8_NEED_MULTIPLE_UCHAR | ( | c | ) | ((uint32_t)(c)>0x7f) |
Does this scalar Unicode value need multiple code units for storage?
#define UTF8_NEXT_CHAR_SAFE | ( | s, | |||
i, | |||||
length, | |||||
c, | |||||
strict | ) |
Value:
{ \ (c)=(s)[(i)++]; \ if((c)>=0x80) { \ if(UTF8_IS_LEAD(c)) { \ (c)=utf8_nextCharSafeBody(s, &(i), (int32_t)(length), c, strict); \ } else { \ (c)=UTF8_ERROR_VALUE_1; \ } \ } \ }
#define UTF8_NEXT_CHAR_UNSAFE | ( | s, | |||
i, | |||||
c | ) |
Value:
{ \ (c)=(s)[(i)++]; \ if((uint8_t)((c)-0xc0)<0x35) { \ uint8_t __count=UTF8_COUNT_TRAIL_BYTES(c); \ UTF8_MASK_LEAD_BYTE(c, __count); \ switch(__count) { \ /* each following branch falls through to the next one */ \ case 3: \ (c)=((c)<<6)|((s)[(i)++]&0x3f); \ case 2: \ (c)=((c)<<6)|((s)[(i)++]&0x3f); \ case 1: \ (c)=((c)<<6)|((s)[(i)++]&0x3f); \ /* no other branches to optimize switch() */ \ break; \ } \ } \ }
#define UTF8_PREV_CHAR_SAFE | ( | s, | |||
start, | |||||
i, | |||||
c, | |||||
strict | ) |
Value:
{ \ (c)=(s)[--(i)]; \ if((c)>=0x80) { \ if((c)<=0xbf) { \ (c)=utf8_prevCharSafeBody(s, start, &(i), c, strict); \ } else { \ (c)=UTF8_ERROR_VALUE_1; \ } \ } \ }
#define UTF8_PREV_CHAR_UNSAFE | ( | s, | |||
i, | |||||
c | ) |
Value:
{ \ (c)=(s)[--(i)]; \ if(UTF8_IS_TRAIL(c)) { \ uint8_t __b, __count=1, __shift=6; \ \ /* c is a trail byte */ \ (c)&=0x3f; \ for(;;) { \ __b=(s)[--(i)]; \ if(__b>=0xc0) { \ UTF8_MASK_LEAD_BYTE(__b, __count); \ (c)|=(UChar32)__b<<__shift; \ break; \ } else { \ (c)|=(UChar32)(__b&0x3f)<<__shift; \ ++__count; \ __shift+=6; \ } \ } \ } \ }
#define UTF8_SET_CHAR_LIMIT_UNSAFE | ( | s, | |||
i | ) |
Value:
{ \ UTF8_BACK_1_UNSAFE(s, i); \ UTF8_FWD_1_UNSAFE(s, i); \ }
#define UTF8_SET_CHAR_START_SAFE | ( | s, | |||
start, | |||||
i | ) | U8_SET_CP_START(s, start, i) |
#define UTF8_SET_CHAR_START_UNSAFE | ( | s, | |||
i | ) |
Value:
{ \ while(UTF8_IS_TRAIL((s)[i])) { --(i); } \ }
Append the code units of code point c to the string at index i and advance i to beyond the new code units (post-increment).
The code units beginning at index i will be overwritten. Same as UTF16_APPEND_CHAR.
0<=i<length
#define UTF_APPEND_CHAR_UNSAFE | ( | s, | |||
i, | |||||
c | ) | UTF16_APPEND_CHAR_UNSAFE(s, i, c) |
#define UTF_ARRAY_SIZE | ( | size | ) | UTF16_ARRAY_SIZE(size) |
Estimate the number of code units for a string based on the number of UTF-16 code units.
#define UTF_BACK_1 | ( | s, | |||
start, | |||||
i | ) | U16_BACK_1(s, start, i) |
Move i backward (towards the beginning of the string) to the first code unit of the code point that has code units before i.
I.e., move i backward by one code point. i must point to the first code unit after the last unit of a code point (i==length is allowed). Same as UTF16_BACK_1.
#define UTF_BACK_1_SAFE | ( | s, | |||
start, | |||||
i | ) | UTF16_BACK_1_SAFE(s, start, i) |
#define UTF_BACK_1_UNSAFE | ( | s, | |||
i | ) | UTF16_BACK_1_UNSAFE(s, i) |
#define UTF_BACK_N | ( | s, | |||
start, | |||||
i, | |||||
n | ) | U16_BACK_N(s, start, i, n) |
Move i backward (towards the beginning of the string) to the first code unit of the n code points that have code units before i.
I.e., move i backward by n code points. i must point to the first code unit after the last unit of a code point (i==length is allowed). Same as UTF16_BACK_N.
#define UTF_BACK_N_SAFE | ( | s, | |||
start, | |||||
i, | |||||
n | ) | UTF16_BACK_N_SAFE(s, start, i, n) |
#define UTF_BACK_N_UNSAFE | ( | s, | |||
i, | |||||
n | ) | UTF16_BACK_N_UNSAFE(s, i, n) |
#define UTF_CHAR_LENGTH | ( | c | ) | U16_LENGTH(c) |
How many code units are used to encode this code point (1 or 2)? Same as UTF16_CHAR_LENGTH.
#define UTF_ERROR_VALUE 0xffff |
Error value for all UTFs.
This code point value will be set by macros with error checking if an error is detected.
#define UTF_FIRST_SURROGATE | ( | supplementary | ) | (UChar)(((supplementary)>>10)+0xd7c0) |
Advance i to beyond the code units of the code point that begins at i.
I.e., advance i by one code point. Same as UTF16_FWD_1.
#define UTF_FWD_1_UNSAFE | ( | s, | |||
i | ) | UTF16_FWD_1_UNSAFE(s, i) |
Advance i to beyond the code units of the n code points where the first one begins at i.
I.e., advance i by n code points. Same as UTF16_FWD_N.
#define UTF_FWD_N_UNSAFE | ( | s, | |||
i, | |||||
n | ) | UTF16_FWD_N_UNSAFE(s, i, n) |
Set c to the code point that contains the code unit i.
i could point to the lead or the trail surrogate for the code point. i is not modified. Same as UTF16_GET_CHAR.
#define UTF_GET_CHAR_UNSAFE | ( | s, | |||
i, | |||||
c | ) | UTF16_GET_CHAR_UNSAFE(s, i, c) |
#define UTF_IS_ERROR | ( | c | ) | (((c)&0xfffe)==0xfffe || (c)==UTF8_ERROR_VALUE_1 || (c)==UTF8_ERROR_VALUE_2) |
Is a given 32-bit code an error value as returned by one of the macros for any UTF?
#define UTF_IS_FIRST_SURROGATE | ( | uchar | ) | (((uchar)&0xfffffc00)==0xd800) |
Is uchar a first/lead surrogate?
#define UTF_IS_LEAD | ( | uchar | ) | U16_IS_LEAD(uchar) |
Is this code unit the first one of several (a lead surrogate)? Same as UTF16_IS_LEAD.
#define UTF_IS_SECOND_SURROGATE | ( | uchar | ) | (((uchar)&0xfffffc00)==0xdc00) |
Is uchar a second/trail surrogate?
#define UTF_IS_SINGLE | ( | uchar | ) | U16_IS_SINGLE(uchar) |
Does this code unit alone encode a code point (BMP, not a surrogate)? Same as UTF16_IS_SINGLE.
#define UTF_IS_SURROGATE | ( | uchar | ) | (((uchar)&0xfffff800)==0xd800) |
Is this code unit or code point a surrogate (U+d800..U+dfff)?
#define UTF_IS_SURROGATE_FIRST | ( | c | ) | (((c)&0x400)==0) |
Assuming c is a surrogate, is it a first/lead surrogate?
#define UTF_IS_TRAIL | ( | uchar | ) | U16_IS_TRAIL(uchar) |
Is this code unit one of several but not the first one (a trail surrogate)? Same as UTF16_IS_TRAIL.
#define UTF_IS_UNICODE_CHAR | ( | c | ) |
Value:
((uint32_t)(c)<0xd800 || \ ((uint32_t)(c)>0xdfff && \ (uint32_t)(c)<=0x10ffff && \ !UTF_IS_UNICODE_NONCHAR(c)))
Is a given 32-bit value a Unicode code point value (0..U+10ffff) that can be assigned a character?
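Code points that are not characters include: surrogate code points (U+d800..U+dfff), values above U+10ffff, and Unicode noncharacters (as tested by UTF_IS_UNICODE_NONCHAR).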
This means that all code points below U+d800 are character code points, and that boundary is tested first for performance.
#define UTF_IS_UNICODE_NONCHAR | ( | c | ) |
#define UTF_IS_VALID | ( | c | ) |
Value:
(UTF_IS_UNICODE_CHAR(c) && \ (c)!=UTF8_ERROR_VALUE_1 && (c)!=UTF8_ERROR_VALUE_2)
#define UTF_MAX_CHAR_LENGTH U16_MAX_LENGTH |
How many code units are used at most for any Unicode code point (2)? Same as UTF16_MAX_CHAR_LENGTH.
#define UTF_NEED_MULTIPLE_UCHAR | ( | c | ) | UTF16_NEED_MULTIPLE_UCHAR(c) |
Does this code point require multiple code units (is it a supplementary code point)? Same as UTF16_NEED_MULTIPLE_UCHAR.
Set c to the code point that starts at code unit i and advance i to beyond the code units of this code point (post-increment).
i must point to the first code unit of a code point. Otherwise c is set to the trail unit (surrogate) itself. Same as UTF16_NEXT_CHAR.
#define UTF_NEXT_CHAR_UNSAFE | ( | s, | |||
i, | |||||
c | ) | UTF16_NEXT_CHAR_UNSAFE(s, i, c) |
#define UTF_PREV_CHAR | ( | s, | |||
start, | |||||
i, | |||||
c | ) | U16_PREV(s, start, i, c) |
Set c to the code point that has code units before i and move i backward (towards the beginning of the string) to the first code unit of this code point (pre-decrement).
i must point to the first code unit after the last unit of a code point (i==length is allowed). Same as UTF16_PREV_CHAR.
#define UTF_PREV_CHAR_SAFE | ( | s, | |||
start, | |||||
i, | |||||
c, | |||||
strict | ) | UTF16_PREV_CHAR_SAFE(s, start, i, c, strict) |
#define UTF_PREV_CHAR_UNSAFE | ( | s, | |||
i, | |||||
c | ) | UTF16_PREV_CHAR_UNSAFE(s, i, c) |
#define UTF_SAFE |
The default choice for general Unicode string macros is to use the ..._SAFE macro implementations with strict=FALSE.
#define UTF_SECOND_SURROGATE | ( | supplementary | ) | (UChar)(((supplementary)&0x3ff)|0xdc00) |
Take the random-access index i and adjust it so that it points beyond a code point.
The input index points beyond any code unit of a code point and is moved to point beyond the last code unit of the same code point. i is never decremented. In other words, if i points to a trail surrogate that is preceded by a matching lead surrogate, then i is incremented. Otherwise it is not modified. This can be used to start an iteration with UTF_PREV_CHAR() from a random index. Same as UTF16_SET_CHAR_LIMIT.
#define UTF_SET_CHAR_LIMIT_SAFE | ( | s, | |||
start, | |||||
i, | |||||
length | ) | UTF16_SET_CHAR_LIMIT_SAFE(s, start, i, length) |
#define UTF_SET_CHAR_LIMIT_UNSAFE | ( | s, | |||
i | ) | UTF16_SET_CHAR_LIMIT_UNSAFE(s, i) |
#define UTF_SET_CHAR_START | ( | s, | |||
start, | |||||
i | ) | U16_SET_CP_START(s, start, i) |
Take the random-access index i and adjust it so that it points to the beginning of a code point.
The input index points to any code unit of a code point and is moved to point to the first code unit of the same code point. i is never incremented. In other words, if i points to a trail surrogate that is preceded by a matching lead surrogate, then i is decremented. Otherwise it is not modified. This can be used to start an iteration with UTF_NEXT_CHAR() from a random index. Same as UTF16_SET_CHAR_START.
#define UTF_SET_CHAR_START_SAFE | ( | s, | |||
start, | |||||
i | ) | UTF16_SET_CHAR_START_SAFE(s, start, i) |
#define UTF_SET_CHAR_START_UNSAFE | ( | s, | |||
i | ) | UTF16_SET_CHAR_START_UNSAFE(s, i) |
#define UTF_SIZE 16 |
Number of bits in a Unicode string code unit - ICU uses 16-bit Unicode.
#define UTF_SURROGATE_OFFSET ((0xd800<<10UL)+0xdc00-0x10000) |
Helper constant for UTF16_GET_PAIR_VALUE.