00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030 #include "regint.h"
00031
00032
/*
 * Non-zero when byte c lies OUTSIDE the range 0xa1..0xfe, zero when it lies
 * inside it.  The subtraction is narrowed to UChar so that values below
 * 0xa1 wrap around to large numbers and a single comparison covers both
 * ends of the range.  Despite the name, bytes 0xa1..0xfe (which are also
 * valid first bytes of two-byte characters) yield zero here.
 */
#define eucjp_islead(c)    ((UChar )((c) - 0xa1) > (0xfe - 0xa1))
00034
/*
 * Expected total byte length of a character, indexed by its first byte:
 *   0x8e        -> 2
 *   0x8f        -> 3
 *   0xa1..0xfe  -> 2
 *   all others  -> 1
 * mbc_enc_len() uses it to report how many bytes are still missing when
 * the input ends in the middle of a character.
 */
static const int EncLen_EUCJP[] = {
  /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3,
  /* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xa0 */ 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xb0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xc0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xd0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xe0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xf0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1
};
00053
/*
 * States of the byte-sequence recogniser used by mbc_enc_len().
 * Negative values are terminal; S0..S2 double as row indexes into trans[].
 */
typedef enum {
  FAILURE = -2,   /* byte not permitted in the current state */
  ACCEPT  = -1,   /* byte completes a character */
  S0      = 0,    /* at the first byte of a character */
  S1,             /* one more byte (0xa1..0xfe) completes the character */
  S2              /* two more bytes (0xa1..0xfe each) are required */
} state_t;

#define A ACCEPT
#define F FAILURE
/*
 * trans[state][byte] -> next state.  The literals 1 and 2 are S1 and S2.
 *
 *   S0: 0x00..0x7f accept as single bytes, 0x8e and 0xa1..0xfe go to S1,
 *       0x8f goes to S2, everything else fails.
 *   S1: 0xa1..0xfe accept, everything else fails.
 *   S2: 0xa1..0xfe go to S1, everything else fails.
 */
static const signed char trans[][0x100] = {
  { /* S0 */
    /* 0x00 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x10 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x20 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x30 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x40 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x50 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x60 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x70 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x80 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, 1, 2,
    /* 0x90 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0xa0 */ F, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xb0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xc0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xd0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xe0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xf0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, F
  },
  { /* S1 */
    /* 0x00 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0x10 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0x20 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0x30 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0x40 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0x50 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0x60 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0x70 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0x80 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0x90 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0xa0 */ F, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0xb0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0xc0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0xd0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0xe0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0xf0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, F
  },
  { /* S2 */
    /* 0x00 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0x10 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0x20 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0x30 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0x40 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0x50 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0x60 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0x70 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0x80 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0x90 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0xa0 */ F, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xb0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xc0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xd0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xe0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xf0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, F
  },
};
#undef A
#undef F
00116
00117 static int
00118 mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc ARG_UNUSED)
00119 {
00120 int firstbyte = *p++;
00121 state_t s;
00122 s = trans[0][firstbyte];
00123 if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(1) :
00124 ONIGENC_CONSTRUCT_MBCLEN_INVALID();
00125 if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_EUCJP[firstbyte]-1);
00126 s = trans[s][*p++];
00127 if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(2) :
00128 ONIGENC_CONSTRUCT_MBCLEN_INVALID();
00129 if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_EUCJP[firstbyte]-2);
00130 s = trans[s][*p++];
00131 return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(3) :
00132 ONIGENC_CONSTRUCT_MBCLEN_INVALID();
00133 }
00134
00135 static OnigCodePoint
00136 mbc_to_code(const UChar* p, const UChar* end, OnigEncoding enc)
00137 {
00138 int c, i, len;
00139 OnigCodePoint n;
00140
00141 len = enclen(enc, p, end);
00142 n = (OnigCodePoint )*p++;
00143 if (len == 1) return n;
00144
00145 for (i = 1; i < len; i++) {
00146 if (p >= end) break;
00147 c = *p++;
00148 n <<= 8; n += c;
00149 }
00150 return n;
00151 }
00152
00153 static int
00154 code_to_mbclen(OnigCodePoint code, OnigEncoding enc ARG_UNUSED)
00155 {
00156 if (ONIGENC_IS_CODE_ASCII(code)) return 1;
00157 else if (code > 0xffffff) return 0;
00158 else if ((code & 0xff0000) >= 0x800000) return 3;
00159 else if ((code & 0xff00) >= 0x8000) return 2;
00160 else
00161 return ONIGERR_INVALID_CODE_POINT_VALUE;
00162 }
00163
#if 0
/*
 * Disabled helper (compiled out): returns the most significant non-zero
 * byte among the low three bytes of `code`, or `code` itself when bits
 * 8..23 are all zero.
 */
static int
code_to_mbc_first(OnigCodePoint code)
{
  if ((code & 0xff0000) != 0)
    return (int )((code >> 16) & 0xff);

  if ((code & 0xff00) != 0)
    return (int )((code >> 8) & 0xff);

  return (int )code;
}
#endif
00182
00183 static int
00184 code_to_mbc(OnigCodePoint code, UChar *buf, OnigEncoding enc)
00185 {
00186 UChar *p = buf;
00187
00188 if ((code & 0xff0000) != 0) *p++ = (UChar )(((code >> 16) & 0xff));
00189 if ((code & 0xff00) != 0) *p++ = (UChar )(((code >> 8) & 0xff));
00190 *p++ = (UChar )(code & 0xff);
00191
00192 #if 1
00193 if (enclen(enc, buf, p) != (p - buf))
00194 return ONIGERR_INVALID_CODE_POINT_VALUE;
00195 #endif
00196 return p - buf;
00197 }
00198
00199 static int
00200 mbc_case_fold(OnigCaseFoldType flag,
00201 const UChar** pp, const UChar* end, UChar* lower,
00202 OnigEncoding enc)
00203 {
00204 int len;
00205 const UChar* p = *pp;
00206
00207 if (ONIGENC_IS_MBC_ASCII(p)) {
00208 *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
00209 (*pp)++;
00210 return 1;
00211 }
00212 else {
00213 int i;
00214
00215 len = enclen(enc, p, end);
00216 for (i = 0; i < len; i++) {
00217 *lower++ = *p++;
00218 }
00219 (*pp) += len;
00220 return len;
00221 }
00222 }
00223
00224 static UChar*
00225 left_adjust_char_head(const UChar* start, const UChar* s, const UChar* end, OnigEncoding enc)
00226 {
00227
00228
00229
00230 const UChar *p;
00231 int len;
00232
00233 if (s <= start) return (UChar* )s;
00234 p = s;
00235
00236 while (!eucjp_islead(*p) && p > start) p--;
00237 len = enclen(enc, p, end);
00238 if (p + len > s) return (UChar* )p;
00239 p += len;
00240 return (UChar* )(p + ((s - p) & ~1));
00241 }
00242
00243 static int
00244 is_allowed_reverse_match(const UChar* s, const UChar* end, OnigEncoding enc ARG_UNUSED)
00245 {
00246 const UChar c = *s;
00247 if (c <= 0x7e || c == 0x8e || c == 0x8f)
00248 return TRUE;
00249 else
00250 return FALSE;
00251 }
00252
00253
00254 static int PropertyInited = 0;
00255 static const OnigCodePoint** PropertyList;
00256 static int PropertyListNum;
00257 static int PropertyListSize;
00258 static hash_table_type* PropertyNameTable;
00259
00260 static const OnigCodePoint CR_Hiragana[] = {
00261 1,
00262 0xa4a1, 0xa4f3
00263 };
00264
00265 static const OnigCodePoint CR_Katakana[] = {
00266 3,
00267 0xa5a1, 0xa5f6,
00268 0xaaa6, 0xaaaf,
00269 0xaab1, 0xaadd
00270 };
00271
/*
 * Adds the "Hiragana" and "Katakana" code-range tables to the property
 * registry and then sets PropertyInited to 1.
 *
 * NOTE(review): PROPERTY_LIST_ADD_PROP is defined outside this file.  `r`
 * is never assigned in the visible code and nothing visible jumps to the
 * `end` label, so the macro presumably stores its status in `r` and does
 * `goto end` on failure.  Confirm against the macro before renaming either
 * name or reordering these statements.
 *
 * Returns the value left in `r`.
 */
static int
init_property_list(void)
{
  int r;

  PROPERTY_LIST_ADD_PROP("Hiragana", CR_Hiragana);
  PROPERTY_LIST_ADD_PROP("Katakana", CR_Katakana);
  PropertyInited = 1;

 end:
  return r;
}
00284
00285 static int
00286 property_name_to_ctype(OnigEncoding enc, UChar* p, UChar* end)
00287 {
00288 st_data_t ctype;
00289
00290 PROPERTY_LIST_INIT_CHECK;
00291
00292 if (onig_st_lookup_strend(PropertyNameTable, p, end, &ctype) == 0) {
00293 return onigenc_minimum_property_name_to_ctype(enc, p, end);
00294 }
00295
00296 return ctype;
00297 }
00298
00299 static int
00300 is_code_ctype(OnigCodePoint code, unsigned int ctype, OnigEncoding enc ARG_UNUSED)
00301 {
00302 if (ctype <= ONIGENC_MAX_STD_CTYPE) {
00303 if (code < 128)
00304 return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
00305 else {
00306 if (CTYPE_IS_WORD_GRAPH_PRINT(ctype)) {
00307 return (code_to_mbclen(code, enc) > 1 ? TRUE : FALSE);
00308 }
00309 }
00310 }
00311 else {
00312 PROPERTY_LIST_INIT_CHECK;
00313
00314 ctype -= (ONIGENC_MAX_STD_CTYPE + 1);
00315 if (ctype >= (unsigned int )PropertyListNum)
00316 return ONIGERR_TYPE_BUG;
00317
00318 return onig_is_in_code_range((UChar* )PropertyList[ctype], code);
00319 }
00320
00321 return FALSE;
00322 }
00323
00324 static int
00325 get_ctype_code_range(OnigCtype ctype, OnigCodePoint* sb_out,
00326 const OnigCodePoint* ranges[], OnigEncoding enc ARG_UNUSED)
00327 {
00328 if (ctype <= ONIGENC_MAX_STD_CTYPE) {
00329 return ONIG_NO_SUPPORT_CONFIG;
00330 }
00331 else {
00332 *sb_out = 0x80;
00333
00334 PROPERTY_LIST_INIT_CHECK;
00335
00336 ctype -= (ONIGENC_MAX_STD_CTYPE + 1);
00337 if (ctype >= (OnigCtype )PropertyListNum)
00338 return ONIGERR_TYPE_BUG;
00339
00340 *ranges = PropertyList[ctype];
00341 return 0;
00342 }
00343 }
00344
00345
/*
 * Encoding descriptor for EUC-JP.  The initializer is positional: the
 * order of the entries must match the encoding structure declared by
 * OnigEncodingDefine (defined outside this file).
 */
OnigEncodingDefine(euc_jp, EUC_JP) = {
  mbc_enc_len,                      /* byte-sequence classifier */
  "EUC-JP",                         /* encoding name */
  3,                                /* presumably max bytes per character - confirm */
  1,                                /* presumably min bytes per character - confirm */
  onigenc_is_mbc_newline_0x0a,      /* shared helper; by its name, newline is 0x0a */
  mbc_to_code,                      /* bytes -> code point */
  code_to_mbclen,                   /* code point -> byte count */
  code_to_mbc,                      /* code point -> bytes */
  mbc_case_fold,                    /* folds ASCII only */
  onigenc_ascii_apply_all_case_fold,          /* shared ASCII helper */
  onigenc_ascii_get_case_fold_codes_by_str,   /* shared ASCII helper */
  property_name_to_ctype,           /* property name -> ctype */
  is_code_ctype,                    /* class membership test */
  get_ctype_code_range,             /* range table for extended ctypes */
  left_adjust_char_head,            /* find start of character containing a position */
  is_allowed_reverse_match,         /* per-byte reverse-match predicate */
  0                                 /* NOTE(review): meaning of this last slot is not visible here */
};
00365
00366
00367
00368
00369
00370
/* NOTE(review): ENC_ALIAS is defined outside this file; presumably it
   declares "eucJP" as an alternative name for "EUC-JP" - confirm. */
ENC_ALIAS("eucJP", "EUC-JP")
00372
00373
00374
00375
00376
00377
00378
/* NOTE(review): ENC_REPLICATE and ENC_ALIAS are defined outside this file;
   presumably "eucJP-ms" is declared as a distinct encoding that reuses the
   EUC-JP implementation, and "euc-jp-ms" as another name for it - confirm. */
ENC_REPLICATE("eucJP-ms", "EUC-JP")
ENC_ALIAS("euc-jp-ms", "eucJP-ms")
00381
00382
00383
00384
00385
00386
00387
/* NOTE(review): ENC_REPLICATE is defined outside this file; presumably
   "CP51932" is declared as a distinct encoding that reuses the EUC-JP
   implementation - confirm. */
ENC_REPLICATE("CP51932", "EUC-JP")
00389