• Main Page
  • Modules
  • Data Structures
  • Files
  • File List
  • Globals

enc/utf_16le.c

Go to the documentation of this file.
00001 /**********************************************************************
00002   utf_16le.c -  Oniguruma (regular expression library)
00003 **********************************************************************/
00004 /*-
00005  * Copyright (c) 2002-2008  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp>
00006  * All rights reserved.
00007  *
00008  * Redistribution and use in source and binary forms, with or without
00009  * modification, are permitted provided that the following conditions
00010  * are met:
00011  * 1. Redistributions of source code must retain the above copyright
00012  *    notice, this list of conditions and the following disclaimer.
00013  * 2. Redistributions in binary form must reproduce the above copyright
00014  *    notice, this list of conditions and the following disclaimer in the
00015  *    documentation and/or other materials provided with the distribution.
00016  *
00017  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
00018  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00019  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
00020  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
00021  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
00022  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
00023  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
00024  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00025  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
00026  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
00027  * SUCH DAMAGE.
00028  */
00029 
00030 #include "regenc.h"
00031 
/*
 * Surrogate tests on the HIGH byte of a UTF-16 code unit (in this
 * little-endian encoding that is the byte at offset 1 of the unit).
 *
 *   UTF16_IS_SURROGATE_FIRST   high byte 0xd8..0xdb  (bits 1101 10xx)
 *   UTF16_IS_SURROGATE_SECOND  high byte 0xdc..0xdf  (bits 1101 11xx)
 *   UTF16_IS_SURROGATE         high byte 0xd8..0xdf  (bits 1101 1xxx)
 *
 * Only the bits selected by the mask are compared; any other bits of
 * the argument are ignored.
 */
#define UTF16_IS_SURROGATE_FIRST(c)    (0xd8 == ((c) & 0xfc))
#define UTF16_IS_SURROGATE_SECOND(c)   (0xdc == ((c) & 0xfc))
#define UTF16_IS_SURROGATE(c)          (0xd8 == ((c) & 0xf8))
00035 
/*
 * Encoded length in bytes of a UTF-16 character, indexed by the high
 * byte of its first code unit.  Every entry is 2 except 0xd8..0xdb
 * (the lead-surrogate range), which is 4.
 */
static const int EncLen_UTF16[256] = {
  /* 0x00 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x10 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x20 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x30 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x40 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x50 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x60 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x70 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x80 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x90 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xa0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xb0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xc0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xd0 */ 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2,
  /* 0xe0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xf0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
};
00054 
00055 static int
00056 utf16le_mbc_enc_len(const UChar* p, const OnigUChar* e,
00057                     OnigEncoding enc ARG_UNUSED)
00058 {
00059   int len = e-p, byte;
00060   if (len < 2)
00061     return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(1);
00062   byte = p[1];
00063   if (!UTF16_IS_SURROGATE(byte)) {
00064     return ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(2);
00065   }
00066   if (UTF16_IS_SURROGATE_FIRST(byte)) {
00067     if (len < 4)
00068       return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(4-len);
00069     if (UTF16_IS_SURROGATE_SECOND(p[3]))
00070       return ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(4);
00071   }
00072   return ONIGENC_CONSTRUCT_MBCLEN_INVALID();
00073 }
00074 
00075 static int
00076 utf16le_is_mbc_newline(const UChar* p, const UChar* end,
00077                        OnigEncoding enc ARG_UNUSED)
00078 {
00079   if (p + 1 < end) {
00080     if (*p == 0x0a && *(p+1) == 0x00)
00081       return 1;
00082 #ifdef USE_UNICODE_ALL_LINE_TERMINATORS
00083     if ((
00084 #ifndef USE_CRNL_AS_LINE_TERMINATOR
00085          *p == 0x0d ||
00086 #endif
00087          *p == 0x85) && *(p+1) == 0x00)
00088       return 1;
00089     if (*(p+1) == 0x20 && (*p == 0x29 || *p == 0x28))
00090       return 1;
00091 #endif
00092   }
00093   return 0;
00094 }
00095 
00096 static OnigCodePoint
00097 utf16le_mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED,
00098                     OnigEncoding enc ARG_UNUSED)
00099 {
00100   OnigCodePoint code;
00101   UChar c0 = *p;
00102   UChar c1 = *(p+1);
00103 
00104   if (UTF16_IS_SURROGATE_FIRST(c1)) {
00105     code = ((((c1 << 8) + c0) & 0x03ff) << 10)
00106          + (((p[3] << 8) + p[2]) & 0x03ff) + 0x10000;
00107   }
00108   else {
00109     code = c1 * 256 + p[0];
00110   }
00111   return code;
00112 }
00113 
00114 static int
00115 utf16le_code_to_mbclen(OnigCodePoint code,
00116                        OnigEncoding enc ARG_UNUSED)
00117 {
00118   return (code > 0xffff ? 4 : 2);
00119 }
00120 
00121 static int
00122 utf16le_code_to_mbc(OnigCodePoint code, UChar *buf,
00123                     OnigEncoding enc ARG_UNUSED)
00124 {
00125   UChar* p = buf;
00126 
00127   if (code > 0xffff) {
00128     unsigned int high = (code >> 10) + 0xD7C0;
00129     unsigned int low = (code & 0x3FF) + 0xDC00;
00130     *p++ = high & 0xFF;
00131     *p++ = (high >> 8) & 0xFF;
00132     *p++ = low & 0xFF;
00133     *p++ = (low >> 8) & 0xFF;
00134     return 4;
00135   }
00136   else {
00137     *p++ = (UChar )(code & 0xff);
00138     *p++ = (UChar )((code & 0xff00) >> 8);
00139     return 2;
00140   }
00141 }
00142 
00143 static int
00144 utf16le_mbc_case_fold(OnigCaseFoldType flag,
00145                       const UChar** pp, const UChar* end, UChar* fold,
00146                       OnigEncoding enc)
00147 {
00148   const UChar* p = *pp;
00149 
00150   if (ONIGENC_IS_ASCII_CODE(*p) && *(p+1) == 0) {
00151 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
00152     if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {
00153       if (*p == 0x49) {
00154         *fold++ = 0x31;
00155         *fold   = 0x01;
00156         (*pp) += 2;
00157         return 2;
00158       }
00159     }
00160 #endif
00161 
00162     *fold++ = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
00163     *fold   = 0;
00164     *pp += 2;
00165     return 2;
00166   }
00167   else
00168     return onigenc_unicode_mbc_case_fold(enc, flag, pp,
00169                                          end, fold);
00170 }
00171 
#if 0
/*
 * Disabled code (compiled out by the surrounding #if 0).
 *
 * Advances *pp past the character at its current position and reports
 * whether that character is case-ambiguous.  Only code units whose
 * high byte is 0 (U+0000..U+00FF) are examined; anything else yields
 * FALSE.
 */
static int
utf16le_is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp,
                         const UChar* end)
{
  const UChar* p = *pp;

  /* Skip the whole character: 2 bytes, or 4 for a lead surrogate. */
  (*pp) += EncLen_UTF16[*(p+1)];

  if (*(p+1) == 0) {
    int c, v;

    if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) {
      return TRUE;
    }

    c = *p;
    v = ONIGENC_IS_UNICODE_ISO_8859_1_BIT_CTYPE(c,
                       (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER));
    /* Test the LOWER bit with '&'.  The previous '|' made this
       condition true for every value of v, so non-letters were
       reported as ambiguous and the final return was unreachable. */
    if ((v & BIT_CTYPE_LOWER) != 0) {
      /* 0xaa, 0xb5, 0xba are lower case letter, but can't convert. */
      if (c >= 0xaa && c <= 0xba)
        return FALSE;
      else
        return TRUE;
    }
    return (v != 0 ? TRUE : FALSE);
  }

  return FALSE;
}
#endif
00204 
00205 static UChar*
00206 utf16le_left_adjust_char_head(const UChar* start, const UChar* s, const UChar* end,
00207                               OnigEncoding enc ARG_UNUSED)
00208 {
00209   if (s <= start) return (UChar* )s;
00210 
00211   if ((s - start) % 2 == 1) {
00212     s--;
00213   }
00214 
00215   if (UTF16_IS_SURROGATE_SECOND(*(s+1)) && s > start + 1)
00216     s -= 2;
00217 
00218   return (UChar* )s;
00219 }
00220 
00221 static int
00222 utf16le_get_case_fold_codes_by_str(OnigCaseFoldType flag,
00223                                    const OnigUChar* p, const OnigUChar* end,
00224                                    OnigCaseFoldCodeItem items[],
00225                                    OnigEncoding enc)
00226 {
00227   return onigenc_unicode_get_case_fold_codes_by_str(enc,
00228                                                     flag, p, end, items);
00229 }
00230 
00231 OnigEncodingDefine(utf_16le, UTF_16LE) = {
00232   utf16le_mbc_enc_len,
00233   "UTF-16LE",   /* name */
00234   4,            /* max byte length */
00235   2,            /* min byte length */
00236   utf16le_is_mbc_newline,
00237   utf16le_mbc_to_code,
00238   utf16le_code_to_mbclen,
00239   utf16le_code_to_mbc,
00240   utf16le_mbc_case_fold,
00241   onigenc_unicode_apply_all_case_fold,
00242   utf16le_get_case_fold_codes_by_str,
00243   onigenc_unicode_property_name_to_ctype,
00244   onigenc_unicode_is_code_ctype,
00245   onigenc_utf16_32_get_ctype_code_range,
00246   utf16le_left_adjust_char_head,
00247   onigenc_always_false_is_allowed_reverse_match
00248 };
00249 

Generated on Wed Sep 8 2010 21:53:11 for Ruby by  doxygen 1.7.1