#include "unicode/utypes.h"
#include "unicode/uchar.h"
Go to the source code of this file.
Data Structures | |
struct | USerializedSet |
A serialized form of a Unicode set. More... | |
Typedefs | |
typedef USet | USet |
A UnicodeSet. | |
Enumerations | |
enum | { USET_IGNORE_SPACE = 1, USET_CASE_INSENSITIVE = 2, USET_ADD_CASE_MAPPINGS = 4, USET_SERIALIZED_STATIC_ARRAY_CAPACITY = 8 } |
Bitmask values to be passed to uset_openPatternOptions() or uset_applyPattern() taking an option parameter. More... | |
Functions | |
USet * | uset_open (UChar32 start, UChar32 end) |
Creates a USet object that contains the range of characters start..end, inclusive. | |
USet * | uset_openPattern (const UChar *pattern, int32_t patternLength, UErrorCode *ec) |
Creates a set from the given pattern. | |
USet * | uset_openPatternOptions (const UChar *pattern, int32_t patternLength, uint32_t options, UErrorCode *ec) |
Creates a set from the given pattern. | |
void | uset_close (USet *set) |
Disposes of the storage used by a USet object. | |
void | uset_set (USet *set, UChar32 start, UChar32 end) |
Causes the USet object to represent the range start - end. | |
int32_t | uset_applyPattern (USet *set, const UChar *pattern, int32_t patternLength, uint32_t options, UErrorCode *status) |
Modifies the set to represent the set specified by the given pattern. | |
void | uset_applyIntPropertyValue (USet *set, UProperty prop, int32_t value, UErrorCode *ec) |
Modifies the set to contain those code points which have the given value for the given binary or enumerated property, as returned by u_getIntPropertyValue. | |
void | uset_applyPropertyAlias (USet *set, const UChar *prop, int32_t propLength, const UChar *value, int32_t valueLength, UErrorCode *ec) |
Modifies the set to contain those code points which have the given value for the given property. | |
UBool | uset_resemblesPattern (const UChar *pattern, int32_t patternLength, int32_t pos) |
Return true if the given position, in the given pattern, appears to be the start of a UnicodeSet pattern. | |
int32_t | uset_toPattern (const USet *set, UChar *result, int32_t resultCapacity, UBool escapeUnprintable, UErrorCode *ec) |
Returns a string representation of this set. | |
void | uset_add (USet *set, UChar32 c) |
Adds the given character to the given USet. | |
void | uset_addAll (USet *set, const USet *additionalSet) |
Adds all of the elements in the specified set to this set if they're not already present. | |
void | uset_addRange (USet *set, UChar32 start, UChar32 end) |
Adds the given range of characters to the given USet. | |
void | uset_addString (USet *set, const UChar *str, int32_t strLen) |
Adds the given string to the given USet. | |
void | uset_addAllCodePoints (USet *set, const UChar *str, int32_t strLen) |
Adds each of the characters in this string to the set. | |
void | uset_remove (USet *set, UChar32 c) |
Removes the given character from the given USet. | |
void | uset_removeRange (USet *set, UChar32 start, UChar32 end) |
Removes the given range of characters from the given USet. | |
void | uset_removeString (USet *set, const UChar *str, int32_t strLen) |
Removes the given string from the given USet. | |
void | uset_removeAll (USet *set, const USet *removeSet) |
Removes from this set all of its elements that are contained in the specified set. | |
void | uset_retain (USet *set, UChar32 start, UChar32 end) |
Retain only the elements in this set that are contained in the specified range. | |
void | uset_retainAll (USet *set, const USet *retain) |
Retains only the elements in this set that are contained in the specified set. | |
void | uset_compact (USet *set) |
Reallocate this object's internal structures to take up the least possible space, without changing this object's value. | |
void | uset_complement (USet *set) |
Inverts this set. | |
void | uset_complementAll (USet *set, const USet *complement) |
Complements in this set all elements contained in the specified set. | |
void | uset_clear (USet *set) |
Removes all of the elements from this set. | |
UBool | uset_isEmpty (const USet *set) |
Returns TRUE if the given USet contains no characters and no strings. | |
UBool | uset_contains (const USet *set, UChar32 c) |
Returns TRUE if the given USet contains the given character. | |
UBool | uset_containsRange (const USet *set, UChar32 start, UChar32 end) |
Returns TRUE if the given USet contains all characters c where start <= c && c <= end. | |
UBool | uset_containsString (const USet *set, const UChar *str, int32_t strLen) |
Returns TRUE if the given USet contains the given string. | |
int32_t | uset_indexOf (const USet *set, UChar32 c) |
Returns the index of the given character within this set, where the set is ordered by ascending code point. | |
UChar32 | uset_charAt (const USet *set, int32_t index) |
Returns the character at the given index within this set, where the set is ordered by ascending code point. | |
int32_t | uset_size (const USet *set) |
Returns the number of characters and strings contained in the given USet. | |
int32_t | uset_getItemCount (const USet *set) |
Returns the number of items in this set. | |
int32_t | uset_getItem (const USet *set, int32_t itemIndex, UChar32 *start, UChar32 *end, UChar *str, int32_t strCapacity, UErrorCode *ec) |
Returns an item of this set. | |
UBool | uset_containsAll (const USet *set1, const USet *set2) |
Returns true if set1 contains all the characters and strings of set2. | |
UBool | uset_containsAllCodePoints (const USet *set, const UChar *str, int32_t strLen) |
Returns true if this set contains all the characters of the given string. | |
UBool | uset_containsNone (const USet *set1, const USet *set2) |
Returns true if set1 contains none of the characters and strings of set2. | |
UBool | uset_containsSome (const USet *set1, const USet *set2) |
Returns true if set1 contains some of the characters and strings of set2. | |
UBool | uset_equals (const USet *set1, const USet *set2) |
Returns true if set1 contains all of the characters and strings of set2, and vice versa. | |
int32_t | uset_serialize (const USet *set, uint16_t *dest, int32_t destCapacity, UErrorCode *pErrorCode) |
Serializes this set into an array of 16-bit integers. | |
UBool | uset_getSerializedSet (USerializedSet *fillSet, const uint16_t *src, int32_t srcLength) |
Given a serialized array, fill in the given serialized set object. | |
void | uset_setSerializedToOne (USerializedSet *fillSet, UChar32 c) |
Set the USerializedSet to contain the given character (and nothing else). | |
UBool | uset_serializedContains (const USerializedSet *set, UChar32 c) |
Returns TRUE if the given USerializedSet contains the given character. | |
int32_t | uset_getSerializedRangeCount (const USerializedSet *set) |
Returns the number of disjoint ranges of characters contained in the given serialized set. | |
UBool | uset_getSerializedRange (const USerializedSet *set, int32_t rangeIndex, UChar32 *pStart, UChar32 *pEnd) |
Returns a range of characters contained in the given serialized set. |
This is a C wrapper around the C++ UnicodeSet class.
Definition in file uset.h.
A UnicodeSet.
Use the uset_* API to manipulate. Create with uset_open*, and destroy with uset_close.
anonymous enum |
Bitmask values to be passed to uset_openPatternOptions() or uset_applyPattern() taking an option parameter.
USET_IGNORE_SPACE |
Ignore white space within patterns unless quoted or escaped.
|
USET_CASE_INSENSITIVE |
Enable case insensitive matching.
E.g., "[ab]" with this flag will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will match all except 'a', 'A', 'b', and 'B'. This performs a full closure over case mappings, e.g. U+017F for s. The resulting set is a superset of the input for the code points but not for the strings. It performs a case mapping closure of the code points and adds full case folding strings for the code points, and reduces strings of the original set to their full case folding equivalents. This is designed for case-insensitive matches, for example in regular expressions. The full code point case closure allows checking of an input character directly against the closure set. Strings are matched by comparing the case-folded form from the closure set with an incremental case folding of the string in question. The closure set will also contain single code points if the original set contained case-equivalent strings (like U+00DF for "ss" or "Ss" etc.). This is not necessary (that is, redundant) for the above matching method but results in the same closure sets regardless of whether the original set contained the code point or a string.
|
USET_ADD_CASE_MAPPINGS |
Enable case insensitive matching.
E.g., "[ab]" with this flag will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will match all except 'a', 'A', 'b', and 'B'. This adds the lower-, title-, and uppercase mappings as well as the case folding of each existing element in the set.
|
USET_SERIALIZED_STATIC_ARRAY_CAPACITY |
Enough for any single-code point set.
|
Adds the given character to the given USet.
After this call, uset_contains(set, c) will return TRUE.
set | the object to which to add the character | |
c | the character to add |
Adds all of the elements in the specified set to this set if they're not already present.
This operation effectively modifies this set so that its value is the union of the two sets. The behavior of this operation is unspecified if the specified collection is modified while the operation is in progress.
set | the object to which to add the set | |
additionalSet | the source set whose elements are to be added to this set. |
Adds each of the characters in this string to the set.
Thus "ch" => {"c", "h"}. If this set already contains any particular character, it has no effect on that character.
set | the object to which to add the characters | |
str | the source string | |
strLen | the length of the string or -1 if null terminated. |
Adds the given range of characters to the given USet.
After this call, uset_containsRange(set, start, end) will return TRUE.
set | the object to which to add the range | |
start | the first character of the range to add, inclusive | |
end | the last character of the range to add, inclusive |
Adds the given string to the given USet.
After this call, uset_containsString(set, str, strLen) will return TRUE.
set | the object to which to add the string | |
str | the string to add | |
strLen | the length of the string or -1 if null terminated. |
void uset_applyIntPropertyValue | ( | USet * | set, | |
UProperty | prop, | |||
int32_t | value, | |||
UErrorCode * | ec | |||
) |
Modifies the set to contain those code points which have the given value for the given binary or enumerated property, as returned by u_getIntPropertyValue.
Prior contents of this set are lost.
set | the object to contain the code points defined by the property | |
prop | a property in the range UCHAR_BIN_START..UCHAR_BIN_LIMIT-1 or UCHAR_INT_START..UCHAR_INT_LIMIT-1 or UCHAR_MASK_START..UCHAR_MASK_LIMIT-1. | |
value | a value in the range u_getIntPropertyMinValue(prop).. u_getIntPropertyMaxValue(prop), with one exception. If prop is UCHAR_GENERAL_CATEGORY_MASK, then value should not be a UCharCategory, but rather a mask value produced by U_GET_GC_MASK(). This allows grouped categories such as [:L:] to be represented. | |
ec | error code input/output parameter |
int32_t uset_applyPattern | ( | USet * | set, | |
const UChar * | pattern, | |||
int32_t | patternLength, | |||
uint32_t | options, | |||
UErrorCode * | status | |||
) |
Modifies the set to represent the set specified by the given pattern.
See the UnicodeSet class description for the syntax of the pattern language. See also the User Guide chapter about UnicodeSet. Empties the set passed before applying the pattern.
set | The set to which the pattern is to be applied. | |
pattern | A pointer to UChar string specifying what characters are in the set. The character at pattern[0] must be a '['. | |
patternLength | The length of the UChar string. -1 if NUL terminated. | |
options | A bitmask for options to apply to the pattern. Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE. | |
status | Returns an error if the pattern cannot be parsed. |
void uset_applyPropertyAlias | ( | USet * | set, | |
const UChar * | prop, | |||
int32_t | propLength, | |||
const UChar * | value, | |||
int32_t | valueLength, | |||
UErrorCode * | ec | |||
) |
Modifies the set to contain those code points which have the given value for the given property.
Prior contents of this set are lost.
set | the object to contain the code points defined by the given property and value alias | |
prop | a string specifying a property alias, either short or long. The name is matched loosely. See PropertyAliases.txt for names and a description of loose matching. If the value string is empty, then this string is interpreted as either a General_Category value alias, a Script value alias, a binary property alias, or a special ID. Special IDs are matched loosely and correspond to the following sets: "ANY" = [\u0000-\U0010FFFF], "ASCII" = [\u0000-\u007F]. | |
propLength | the length of the prop, or -1 if NUL-terminated | |
value | a string specifying a value alias, either short or long. The name is matched loosely. See PropertyValueAliases.txt for names and a description of loose matching. In addition to aliases listed, numeric values and canonical combining classes may be expressed numerically, e.g., ("nv", "0.5") or ("ccc", "220"). The value string may also be empty. | |
valueLength | the length of the value, or -1 if NUL-terminated | |
ec | error code input/output parameter |
Returns the character at the given index within this set, where the set is ordered by ascending code point.
If the index is out of range, return (UChar32)-1. The inverse of this method is uset_indexOf().
set | the set | |
index | an index from 0..size()-1 to obtain the char for |
void uset_clear | ( | USet * | set | ) |
Removes all of the elements from this set.
This set will be empty after this call returns.
set | the set |
void uset_close | ( | USet * | set | ) |
Disposes of the storage used by a USet object.
This function should be called exactly once for objects returned by uset_open().
set | the object to dispose of |
void uset_compact | ( | USet * | set | ) |
Reallocate this object's internal structures to take up the least possible space, without changing this object's value.
set | the object on which to perform the compact |
void uset_complement | ( | USet * | set | ) |
Inverts this set.
This operation modifies this set so that its value is its complement. This operation does not affect the multicharacter strings, if any.
set | the set |
Complements in this set all elements contained in the specified set.
Any character in the other set will be removed if it is in this set, or will be added if it is not in this set.
set | the set with which to complement | |
complement | set that defines which elements will be xor'ed with this set. |
Returns TRUE if the given USet contains the given character.
set | the set | |
c | The codepoint to check for within the set |
Returns true if set1 contains all the characters and strings of set2.
It answers the question, 'Is set1 a superset of set2?'
set1 | set to be checked for containment | |
set2 | set to be checked for containment |
Returns true if this set contains all the characters of the given string.
This does not check containment of grapheme clusters, like uset_containsString.
set | set of characters to be checked for containment | |
str | string containing codepoints to be checked for containment | |
strLen | the length of the string or -1 if null terminated. |
Returns true if set1 contains none of the characters and strings of set2.
It answers the question, 'Are set1 and set2 disjoint?'
set1 | set to be checked for containment | |
set2 | set to be checked for containment |
Returns TRUE if the given USet contains all characters c where start <= c && c <= end.
set | the set | |
start | the first character of the range to test, inclusive | |
end | the last character of the range to test, inclusive |
Returns true if set1 contains some of the characters and strings of set2.
It answers the question, 'Do set1 and set2 have an intersection?'
set1 | set to be checked for containment | |
set2 | set to be checked for containment |
Returns TRUE if the given USet contains the given string.
set | the set | |
str | the string | |
strLen | the length of the string or -1 if null terminated. |
Returns true if set1 contains all of the characters and strings of set2, and vice versa.
It answers the question, 'Is set1 equal to set2?'
set1 | set to be checked for containment | |
set2 | set to be checked for containment |
int32_t uset_getItem | ( | const USet * | set, | |
int32_t | itemIndex, | |||
UChar32 * | start, | |||
UChar32 * | end, | |||
UChar * | str, | |||
int32_t | strCapacity, | |||
UErrorCode * | ec | |||
) |
Returns an item of this set.
An item is either a range of characters or a single multicharacter string.
set | the set | |
itemIndex | a non-negative integer in the range 0.. uset_getItemCount(set)-1 | |
start | pointer to variable to receive first character in range, inclusive | |
end | pointer to variable to receive last character in range, inclusive | |
str | buffer to receive the string, may be NULL | |
strCapacity | capacity of str, or 0 if str is NULL | |
ec | error code |
Returns the number of items in this set.
An item is either a range of characters or a single multicharacter string.
set | the set |
UBool uset_getSerializedRange | ( | const USerializedSet * | set, | |
int32_t | rangeIndex, | |||
UChar32 * | pStart, | |||
UChar32 * | pEnd | |||
) |
Returns a range of characters contained in the given serialized set.
set | the serialized set | |
rangeIndex | a non-negative integer in the range 0.. uset_getSerializedRangeCount(set)-1 | |
pStart | pointer to variable to receive first character in range, inclusive | |
pEnd | pointer to variable to receive last character in range, inclusive |
int32_t uset_getSerializedRangeCount | ( | const USerializedSet * | set | ) |
Returns the number of disjoint ranges of characters contained in the given serialized set.
Ignores any strings contained in the set.
set | the serialized set |
UBool uset_getSerializedSet | ( | USerializedSet * | fillSet, | |
const uint16_t * | src, | |||
int32_t | srcLength | |||
) |
Given a serialized array, fill in the given serialized set object.
fillSet | pointer to result | |
src | pointer to start of array | |
srcLength | length of array |
Returns the index of the given character within this set, where the set is ordered by ascending code point.
If the character is not in this set, return -1. The inverse of this method is uset_charAt().
set | the set | |
c | the character to obtain the index for |
Returns TRUE if the given USet contains no characters and no strings.
set | the set |
Creates a USet object that contains the range of characters start..end, inclusive.
start | first character of the range, inclusive | |
end | last character of the range, inclusive |
USet* uset_openPattern | ( | const UChar * | pattern, | |
int32_t | patternLength, | |||
UErrorCode * | ec | |||
) |
Creates a set from the given pattern.
See the UnicodeSet class description for the syntax of the pattern language.
pattern | a string specifying what characters are in the set | |
patternLength | the length of the pattern, or -1 if null terminated | |
ec | the error code |
USet* uset_openPatternOptions | ( | const UChar * | pattern, | |
int32_t | patternLength, | |||
uint32_t | options, | |||
UErrorCode * | ec | |||
) |
Creates a set from the given pattern.
See the UnicodeSet class description for the syntax of the pattern language.
pattern | a string specifying what characters are in the set | |
patternLength | the length of the pattern, or -1 if null terminated | |
options | bitmask for options to apply to the pattern. Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE. | |
ec | the error code |
Removes the given character from the given USet.
After this call, uset_contains(set, c) will return FALSE.
set | the object from which to remove the character | |
c | the character to remove |
Removes from this set all of its elements that are contained in the specified set.
This operation effectively modifies this set so that its value is the asymmetric set difference of the two sets.
set | the object from which the elements are to be removed | |
removeSet | the object that defines which elements will be removed from this set |
Removes the given range of characters from the given USet.
After this call, uset_contains(set, c) will return FALSE for every character c in the range start..end.
set | the object from which to remove the range | |
start | the first character of the range to remove, inclusive | |
end | the last character of the range to remove, inclusive |
Removes the given string from the given USet.
After this call, uset_containsString(set, str, strLen) will return FALSE.
set | the object from which to remove the string | |
str | the string to remove | |
strLen | the length of the string or -1 if null terminated. |
Return true if the given position, in the given pattern, appears to be the start of a UnicodeSet pattern.
pattern | a string specifying the pattern | |
patternLength | the length of the pattern, or -1 if NUL-terminated | |
pos | the given position |
Retain only the elements in this set that are contained in the specified range.
If start > end then an empty range is retained, leaving the set empty. This is equivalent to a boolean logic AND, or a set INTERSECTION.
set | the object for which to retain only the specified range | |
start | first character, inclusive, of range to be retained to this set. | |
end | last character, inclusive, of range to be retained to this set. |
Retains only the elements in this set that are contained in the specified set.
In other words, removes from this set all of its elements that are not contained in the specified set. This operation effectively modifies this set so that its value is the intersection of the two sets.
set | the object on which to perform the retain | |
retain | set that defines which elements this set will retain |
int32_t uset_serialize | ( | const USet * | set, | |
uint16_t * | dest, | |||
int32_t | destCapacity, | |||
UErrorCode * | pErrorCode | |||
) |
Serializes this set into an array of 16-bit integers.
Serialization (currently) only records the characters in the set; multicharacter strings are ignored.
The array has following format (each line is one 16-bit integer):
length = (n+2*m) | (m!=0?0x8000:0)
bmpLength = n; present if m!=0
bmp[0]
bmp[1]
...
bmp[n-1]
supp-high[0]
supp-low[0]
supp-high[1]
supp-low[1]
...
supp-high[m-1]
supp-low[m-1]
The array starts with a header. After the header are n bmp code points, then m supplementary code points. Either n or m or both may be zero. n+2*m is always <= 0x7FFF.
If there are no supplementary characters (if m==0) then the header is one 16-bit integer, 'length', with value n.
If there are supplementary characters (if m!=0) then the header is two 16-bit integers. The first, 'length', has value (n+2*m)|0x8000. The second, 'bmpLength', has value n.
After the header the code points are stored in ascending order. Supplementary code points are stored as most significant 16 bits followed by least significant 16 bits.
set | the set | |
dest | pointer to buffer of destCapacity 16-bit integers. May be NULL only if destCapacity is zero. | |
destCapacity | size of dest, or zero. Must not be negative. | |
pErrorCode | pointer to the error code. Will be set to U_INDEX_OUTOFBOUNDS_ERROR if n+2*m > 0x7FFF. Will be set to U_BUFFER_OVERFLOW_ERROR if n+2*m+(m!=0?2:1) > destCapacity. |
UBool uset_serializedContains | ( | const USerializedSet * | set, | |
UChar32 | c | |||
) |
Returns TRUE if the given USerializedSet contains the given character.
set | the serialized set | |
c | The codepoint to check for within the set |
Causes the USet object to represent the range start - end.
If start > end then this USet is set to an empty range.
set | the object to set to the given range | |
start | first character in the set, inclusive | |
end | last character in the set, inclusive |
void uset_setSerializedToOne | ( | USerializedSet * | fillSet, | |
UChar32 | c | |||
) |
Set the USerializedSet to contain the given character (and nothing else).
fillSet | pointer to result | |
c | The codepoint to set |
Returns the number of characters and strings contained in the given USet.
set | the set |
int32_t uset_toPattern | ( | const USet * | set, | |
UChar * | result, | |||
int32_t | resultCapacity, | |||
UBool | escapeUnprintable, | |||
UErrorCode * | ec | |||
) |
Returns a string representation of this set.
If the result of calling this function is passed to uset_openPattern(), it will produce another set that is equal to this one.
set | the set | |
result | the string to receive the rules, may be NULL | |
resultCapacity | the capacity of result, may be 0 if result is NULL | |
escapeUnprintable | if TRUE then convert unprintable characters to their hex escape representations, \uxxxx or \Uxxxxxxxx. Unprintable characters are those other than U+000A, U+0020..U+007E. | |
ec | error code. |