• Main Page
  • Modules
  • Data Structures
  • Files
  • File List
  • Globals

enc/shift_jis.c

Go to the documentation of this file.
00001 /**********************************************************************
00002   sjis.c -  Oniguruma (regular expression library)
00003 **********************************************************************/
00004 /*-
00005  * Copyright (c) 2002-2008  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp>
00006  * All rights reserved.
00007  *
00008  * Redistribution and use in source and binary forms, with or without
00009  * modification, are permitted provided that the following conditions
00010  * are met:
00011  * 1. Redistributions of source code must retain the above copyright
00012  *    notice, this list of conditions and the following disclaimer.
00013  * 2. Redistributions in binary form must reproduce the above copyright
00014  *    notice, this list of conditions and the following disclaimer in the
00015  *    documentation and/or other materials provided with the distribution.
00016  *
00017  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
00018  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00019  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
00020  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
00021  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
00022  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
00023  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
00024  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00025  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
00026  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
00027  * SUCH DAMAGE.
00028  */
00029 
00030 #include "regint.h"
00031 
/*
 * Character length in bytes, indexed by the first byte of a character.
 * Bytes 0x81-0x9f and 0xe0-0xfc introduce a two-byte character; every
 * other byte value (including 0x80, 0xa0 and 0xfd-0xff) is listed as 1.
 */
static const int EncLen_SJIS[] = {
  /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x80 */ 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0x90 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xa0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xb0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xc0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xd0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xe0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xf0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1
};
00050 
/*
 * Flags, indexed by byte value, telling whether the byte may appear as the
 * second byte of a two-byte character: 1 for 0x40-0x7e and 0x80-0xfc,
 * 0 for everything else.
 */
static const char SJIS_CAN_BE_TRAIL_TABLE[256] = {
  /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
  /* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xa0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xb0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xc0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xd0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xe0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0xf0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0
};
00069 
/*
 * Byte classification helpers built on the two lookup tables.
 * The argument and the whole expansion are parenthesised so the macros stay
 * safe inside larger expressions; `byte` must be a value in 0..255 because
 * it is used directly as a table index.
 */
/* True when `byte` starts a two-byte character (its EncLen_SJIS entry is > 1). */
#define SJIS_ISMB_FIRST(byte)  (EncLen_SJIS[(byte)] > 1)
/* Non-zero when `byte` is flagged in SJIS_CAN_BE_TRAIL_TABLE. */
#define SJIS_ISMB_TRAIL(byte)  (SJIS_CAN_BE_TRAIL_TABLE[(byte)])
00072 
/*
 * Byte-wise validation automaton used by mbc_enc_len().
 *
 * trans[state][byte] yields the next state.  Negative values are terminal:
 * ACCEPT means the bytes consumed so far form a complete character,
 * FAILURE means they cannot form one.  S0 is the state before the first
 * byte, S1 the state after a lead byte of a two-byte character.
 */
typedef enum { FAILURE = -2, ACCEPT = -1, S0 = 0, S1 } state_t;
#define A ACCEPT
#define F FAILURE
static const signed char trans[][0x100] = {
  { /* S0: looking at the first byte (entries equal to 1 mean "go to S1") */
    /*         _0 _1 _2 _3 _4 _5 _6 _7 _8 _9 _a _b _c _d _e _f */
    /* 0x0_ */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x1_ */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x2_ */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x3_ */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x4_ */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x5_ */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x6_ */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x7_ */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x8_ */ F, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x9_ */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xa_ */ F, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0xb_ */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0xc_ */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0xd_ */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0xe_ */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xf_ */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, F, F, F
  },
  { /* S1: looking at the byte that follows a lead byte */
    /*         _0 _1 _2 _3 _4 _5 _6 _7 _8 _9 _a _b _c _d _e _f */
    /* 0x0_ */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0x1_ */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0x2_ */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0x3_ */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
    /* 0x4_ */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x5_ */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x6_ */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x7_ */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, F,
    /* 0x8_ */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0x9_ */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0xa_ */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0xb_ */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0xc_ */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0xd_ */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0xe_ */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
    /* 0xf_ */ A, A, A, A, A, A, A, A, A, A, A, A, A, F, F, F
  }
};
#undef A
#undef F
00116 
00117 static int
00118 mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc ARG_UNUSED)
00119 {
00120   int firstbyte = *p++;
00121   state_t s;
00122   s = trans[0][firstbyte];
00123   if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(1) :
00124                                   ONIGENC_CONSTRUCT_MBCLEN_INVALID();
00125   if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_SJIS[firstbyte]-1);
00126   s = trans[s][*p++];
00127   return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(2) :
00128                        ONIGENC_CONSTRUCT_MBCLEN_INVALID();
00129 }
00130 
00131 static int
00132 code_to_mbclen(OnigCodePoint code, OnigEncoding enc ARG_UNUSED)
00133 {
00134   if (code < 256) {
00135     if (EncLen_SJIS[(int )code] == 1)
00136       return 1;
00137     else
00138       return 0;
00139   }
00140   else if (code <= 0xffff) {
00141     return 2;
00142   }
00143   else
00144     return ONIGERR_INVALID_CODE_POINT_VALUE;
00145 }
00146 
00147 static OnigCodePoint
00148 mbc_to_code(const UChar* p, const UChar* end, OnigEncoding enc)
00149 {
00150   int c, i, len;
00151   OnigCodePoint n;
00152 
00153   len = enclen(enc, p, end);
00154   c = *p++;
00155   n = c;
00156   if (len == 1) return n;
00157 
00158   for (i = 1; i < len; i++) {
00159     if (p >= end) break;
00160     c = *p++;
00161     n <<= 8;  n += c;
00162   }
00163   return n;
00164 }
00165 
00166 static int
00167 code_to_mbc(OnigCodePoint code, UChar *buf, OnigEncoding enc)
00168 {
00169   UChar *p = buf;
00170 
00171   if ((code & 0xff00) != 0) *p++ = (UChar )(((code >>  8) & 0xff));
00172   *p++ = (UChar )(code & 0xff);
00173 
00174 #if 0
00175   if (enclen(enc, buf) != (p - buf))
00176     return REGERR_INVALID_CODE_POINT_VALUE;
00177 #endif
00178   return p - buf;
00179 }
00180 
00181 static int
00182 mbc_case_fold(OnigCaseFoldType flag,
00183               const UChar** pp, const UChar* end, UChar* lower,
00184               OnigEncoding enc)
00185 {
00186   const UChar* p = *pp;
00187 
00188   if (ONIGENC_IS_MBC_ASCII(p)) {
00189     *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
00190     (*pp)++;
00191     return 1;
00192   }
00193   else {
00194     int i;
00195     int len = enclen(enc, p, end);
00196 
00197     for (i = 0; i < len; i++) {
00198       *lower++ = *p++;
00199     }
00200     (*pp) += len;
00201     return len; /* return byte length of converted char to lower */
00202   }
00203 }
00204 
/*
 * Disabled code: everything between #if 0 and #endif is excluded from the
 * build.  As written it would not compile if re-enabled, because it passes
 * `enc`, which is not among the function's parameters.
 */
#if 0
static int
is_mbc_ambiguous(OnigCaseFoldType flag,
                 const UChar** pp, const UChar* end)
{
  return onigenc_mbn_is_mbc_ambiguous(enc, flag, pp, end);

}
#endif
00214 
/*
 * Disabled code: an is_code_ctype() variant without the encoding argument,
 * excluded from the build by #if 0.  It calls code_to_mbclen() with a single
 * argument, which does not match the two-parameter definition in this file.
 */
#if 0
static int
is_code_ctype(OnigCodePoint code, unsigned int ctype)
{
  if (code < 128)
    return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
  else {
    if (CTYPE_IS_WORD_GRAPH_PRINT(ctype)) {
      return (code_to_mbclen(code) > 1 ? TRUE : FALSE);
    }
  }

  return FALSE;
}
#endif
00230 
00231 static UChar*
00232 left_adjust_char_head(const UChar* start, const UChar* s, const UChar* end, OnigEncoding enc)
00233 {
00234   const UChar *p;
00235   int len;
00236 
00237   if (s <= start) return (UChar* )s;
00238   p = s;
00239 
00240   if (SJIS_ISMB_TRAIL(*p)) {
00241     while (p > start) {
00242       if (! SJIS_ISMB_FIRST(*--p)) {
00243         p++;
00244         break;
00245       }
00246     } 
00247   }
00248   len = enclen(enc, p, end);
00249   if (p + len > s) return (UChar* )p;
00250   p += len;
00251   return (UChar* )(p + ((s - p) & ~1));
00252 }
00253 
00254 static int
00255 is_allowed_reverse_match(const UChar* s, const UChar* end, OnigEncoding enc ARG_UNUSED)
00256 {
00257   const UChar c = *s;
00258   return (SJIS_ISMB_TRAIL(c) ? FALSE : TRUE);
00259 }
00260 
00261 
/*
 * File-local state for the named character properties of this encoding.
 * PropertyInited is set to 1 by init_property_list(); PropertyList and
 * PropertyListNum are read by is_code_ctype() and get_ctype_code_range();
 * PropertyNameTable is queried by property_name_to_ctype().
 * NOTE(review): nothing in this file assigns PropertyList, PropertyListNum,
 * PropertyListSize or PropertyNameTable directly -- presumably the
 * PROPERTY_LIST_* macros (defined elsewhere) maintain them; confirm.
 */
static int PropertyInited = 0;
static const OnigCodePoint** PropertyList;
static int PropertyListNum;
static int PropertyListSize;
static hash_table_type* PropertyNameTable;
00267 
/*
 * Code point ranges registered under the property name "Hiragana" by
 * init_property_list().  Layout: a leading count (1) followed by that many
 * (first, last) pairs.
 */
static const OnigCodePoint CR_Hiragana[] = {
  1,
  0x829f, 0x82f1
}; /* CR_Hiragana */
00272 
/*
 * Code point ranges registered under the property name "Katakana" by
 * init_property_list().  Layout: a leading count (4) followed by that many
 * (first, last) pairs; the first two pairs lie in the single-byte area, the
 * last two in the two-byte area.
 */
static const OnigCodePoint CR_Katakana[] = {
  4,
  0x00a6, 0x00af,
  0x00b1, 0x00dd,
  0x8340, 0x837e,
  0x8380, 0x8396,
}; /* CR_Katakana */
00280 
/*
 * Registers the "Hiragana" and "Katakana" properties and then marks the
 * property list as initialised.  Returns the value left in `r`.
 *
 * NOTE(review): `r` is never assigned and the `end` label is never targeted
 * in the visible code, so PROPERTY_LIST_ADD_PROP (defined outside this file)
 * presumably stores its status in `r` and jumps to `end` on failure, which
 * would skip setting PropertyInited -- confirm against the macro. Do not
 * rename `r` or `end` without checking it.
 */
static int
init_property_list(void)
{
  int r;

  PROPERTY_LIST_ADD_PROP("Hiragana", CR_Hiragana);
  PROPERTY_LIST_ADD_PROP("Katakana", CR_Katakana);
  PropertyInited = 1;

 end:
  return r;
}
00293 
00294 static int
00295 property_name_to_ctype(OnigEncoding enc, UChar* p, UChar* end)
00296 {
00297   hash_data_type ctype;
00298 
00299   PROPERTY_LIST_INIT_CHECK;
00300 
00301   if (onig_st_lookup_strend(PropertyNameTable, p, end, &ctype) == 0) {
00302     return onigenc_minimum_property_name_to_ctype(enc, p, end);
00303   }
00304 
00305   return (int)ctype;
00306 }
00307 
00308 static int
00309 is_code_ctype(OnigCodePoint code, unsigned int ctype, OnigEncoding enc)
00310 {
00311   if (ctype <= ONIGENC_MAX_STD_CTYPE) {
00312     if (code < 128)
00313       return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
00314     else {
00315       if (CTYPE_IS_WORD_GRAPH_PRINT(ctype)) {
00316         return TRUE;
00317       }
00318     }
00319   }
00320   else {
00321     PROPERTY_LIST_INIT_CHECK;
00322 
00323     ctype -= (ONIGENC_MAX_STD_CTYPE + 1);
00324     if (ctype >= (unsigned int )PropertyListNum)
00325       return ONIGERR_TYPE_BUG;
00326 
00327     return onig_is_in_code_range((UChar* )PropertyList[ctype], code);
00328   }
00329 
00330   return FALSE;
00331 }
00332 
00333 static int
00334 get_ctype_code_range(OnigCtype ctype, OnigCodePoint* sb_out,
00335                      const OnigCodePoint* ranges[], OnigEncoding enc ARG_UNUSED)
00336 {
00337   if (ctype <= ONIGENC_MAX_STD_CTYPE) {
00338     return ONIG_NO_SUPPORT_CONFIG;
00339   }
00340   else {
00341     *sb_out = 0x80;
00342 
00343     PROPERTY_LIST_INIT_CHECK;
00344 
00345     ctype -= (ONIGENC_MAX_STD_CTYPE + 1);
00346     if (ctype >= (OnigCtype )PropertyListNum)
00347       return ONIGERR_TYPE_BUG;
00348 
00349     *ranges = PropertyList[ctype];
00350     return 0;
00351   }
00352 }
00353 
/*
 * Encoding descriptor for Shift_JIS: ties the static callbacks of this file
 * to shared onigenc_* helpers.
 * NOTE(review): the initializer is positional, so the entry order must match
 * the structure behind OnigEncodingDefine (declared outside this file).
 */
OnigEncodingDefine(shift_jis, Shift_JIS) = {
  mbc_enc_len,
  "Shift_JIS",   /* name */
  2,             /* max byte length */
  1,             /* min byte length */
  onigenc_is_mbc_newline_0x0a,
  mbc_to_code,
  code_to_mbclen,
  code_to_mbc,
  mbc_case_fold,
  onigenc_ascii_apply_all_case_fold,
  onigenc_ascii_get_case_fold_codes_by_str,
  property_name_to_ctype,
  is_code_ctype,
  get_ctype_code_range,
  left_adjust_char_head,
  is_allowed_reverse_match,
  0              /* NOTE(review): meaning of this last slot is not visible here */
};
/*
 * Name: Shift_JIS
 * MIBenum: 17
 * Link: http://www.iana.org/assignments/character-sets
 * Link: http://ja.wikipedia.org/wiki/Shift_JIS
 */
/* NOTE(review): ENC_ALIAS and ENC_REPLICATE are defined outside this file;
 * presumably they declare an extra name for, or a copy of, the encoding
 * named in the second argument -- confirm. */
ENC_ALIAS("SJIS", "Shift_JIS")

/*
 * Name: Windows-31J
 * MIBenum: 2024
 * Link: http://www.iana.org/assignments/character-sets
 * Link: http://www.microsoft.com/globaldev/reference/dbcs/932.mspx
 * Link: http://ja.wikipedia.org/wiki/Windows-31J
 * Link: http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/windows-932-2000.ucm
 */
ENC_REPLICATE("Windows-31J", "Shift_JIS")
ENC_ALIAS("CP932", "Windows-31J")
ENC_ALIAS("csWindows31J", "Windows-31J") /* IANA.  IE6 doesn't accept Windows-31J, only csWindows31J. */

/*
 * Name: MacJapanese
 * Link: http://unicode.org/Public/MAPPINGS/VENDORS/APPLE/JAPANESE.TXT
 * Link: http://ja.wikipedia.org/wiki/MacJapanese
 */
ENC_REPLICATE("MacJapanese", "Shift_JIS")
ENC_ALIAS("MacJapan", "MacJapanese")
00400 

Generated on Wed Sep 8 2010 21:51:49 for Ruby by  doxygen 1.7.1