• Main Page
  • Modules
  • Data Structures
  • Files
  • File List
  • Globals

re.c

Go to the documentation of this file.
00001 /**********************************************************************
00002 
00003   re.c -
00004 
00005   $Author: yugui $
00006   created at: Mon Aug  9 18:24:49 JST 1993
00007 
00008   Copyright (C) 1993-2007 Yukihiro Matsumoto
00009 
00010 **********************************************************************/
00011 
00012 #include "ruby/ruby.h"
00013 #include "ruby/re.h"
00014 #include "ruby/encoding.h"
00015 #include "ruby/util.h"
00016 #include "regint.h"
00017 #include <ctype.h>
00018 
/* Exception class used by this file when reporting regexp errors
 * (passed to rb_raise() and rb_exc_new3() further down). */
00019 VALUE rb_eRegexpError;
00020 
/* Fixed-size buffer type for an Oniguruma error message. */
00021 typedef char onig_errmsg_buffer[ONIG_MAX_ERROR_MESSAGE_LEN];
/* Copy msg into err, bounded by ONIG_MAX_ERROR_MESSAGE_LEN (via strlcpy). */
00022 #define errcpy(err, msg) strlcpy((err), (msg), ONIG_MAX_ERROR_MESSAGE_LEN)
00023 
/* Shorthand for the begin/end entry of capture group `no`.
 * Both macros expand to an access through a variable named `regs`,
 * so they can only be used where such a variable is in scope. */
00024 #define BEG(no) regs->beg[no]
00025 #define END(no) regs->end[no]
00026 
/*
 * Case-folding table indexed by byte value, used by rb_memcicmp().
 * 'A'..'Z' ('\101'..'\132') map to 'a'..'z' ('\141'..'\172'); every other
 * byte value maps to itself.  Only compiled when the execution character
 * set is ASCII ('a' == 97); any other character set is a compile-time error.
 * NOTE(review): the element type is plain char, so whether the entries
 * '\200'..'\377' are negative depends on the platform's char signedness.
 */
00027 #if 'a' == 97   /* it's ascii */
00028 static const char casetable[] = {
00029         '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
00030         '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
00031         '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
00032         '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
00033         /* ' '     '!'     '"'     '#'     '$'     '%'     '&'     ''' */
00034         '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
00035         /* '('     ')'     '*'     '+'     ','     '-'     '.'     '/' */
00036         '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
00037         /* '0'     '1'     '2'     '3'     '4'     '5'     '6'     '7' */
00038         '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
00039         /* '8'     '9'     ':'     ';'     '<'     '='     '>'     '?' */
00040         '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
00041         /* '@'     'A'     'B'     'C'     'D'     'E'     'F'     'G' */
00042         '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
00043         /* 'H'     'I'     'J'     'K'     'L'     'M'     'N'     'O' */
00044         '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
00045         /* 'P'     'Q'     'R'     'S'     'T'     'U'     'V'     'W' */
00046         '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
00047         /* 'X'     'Y'     'Z'     '['     '\'     ']'     '^'     '_' */
00048         '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
00049         /* '`'     'a'     'b'     'c'     'd'     'e'     'f'     'g' */
00050         '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
00051         /* 'h'     'i'     'j'     'k'     'l'     'm'     'n'     'o' */
00052         '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
00053         /* 'p'     'q'     'r'     's'     't'     'u'     'v'     'w' */
00054         '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
00055         /* 'x'     'y'     'z'     '{'     '|'     '}'     '~'     DEL */
00056         '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
        /* bytes 0x80..0xFF: identity mapping */
00057         '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
00058         '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
00059         '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
00060         '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
00061         '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
00062         '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
00063         '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
00064         '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
00065         '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
00066         '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
00067         '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
00068         '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
00069         '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
00070         '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
00071         '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
00072         '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
00073 };
00074 #else
00075 # error >>> "You lose. You will need a translation table for your character set." <<<
00076 #endif
00077 
00078 int
00079 rb_memcicmp(const void *x, const void *y, long len)
00080 {
00081     const unsigned char *p1 = x, *p2 = y;
00082     int tmp;
00083 
00084     while (len--) {
00085         if ((tmp = casetable[(unsigned)*p1++] - casetable[(unsigned)*p2++]))
00086             return tmp;
00087     }
00088     return 0;
00089 }
00090 
/* Drop any macro of the same name so a real function can be defined. */
#undef rb_memcmp

/*
 * Function form of memcmp(): compares the first len bytes of p1 and p2
 * and returns memcmp()'s result unchanged.
 */
int
rb_memcmp(const void *p1, const void *p2, long len)
{
    const int order = memcmp(p1, p2, len);

    return order;
}
00098 
00099 static inline long
00100 rb_memsearch_ss(const unsigned char *xs, long m, const unsigned char *ys, long n)
00101 {
00102     const unsigned char *x = xs, *xe = xs + m;
00103     const unsigned char *y = ys, *ye = ys + n;
00104 #ifndef VALUE_MAX
00105 # if SIZEOF_VALUE == 8
00106 #  define VALUE_MAX 0xFFFFFFFFFFFFFFFFULL
00107 # elif SIZEOF_VALUE == 4
00108 #  define VALUE_MAX 0xFFFFFFFFUL
00109 # endif
00110 #endif
00111     VALUE hx, hy, mask = VALUE_MAX >> ((SIZEOF_VALUE - m) * CHAR_BIT);
00112 
00113     if (m > SIZEOF_VALUE)
00114         rb_bug("!!too long pattern string!!");
00115 
00116     /* Prepare hash value */
00117     for (hx = *x++, hy = *y++; x < xe; ++x, ++y) {
00118         hx <<= CHAR_BIT;
00119         hy <<= CHAR_BIT;
00120         hx |= *x;
00121         hy |= *y;
00122     }
00123     /* Searching */
00124     while (hx != hy) {
00125         if (y == ye)
00126             return -1;
00127         hy <<= CHAR_BIT;
00128         hy |= *y;
00129         hy &= mask;
00130         y++;
00131     }
00132     return y - ys - m;
00133 }
00134 
00135 static inline long
00136 rb_memsearch_qs(const unsigned char *xs, long m, const unsigned char *ys, long n)
00137 {
00138     const unsigned char *x = xs, *xe = xs + m;
00139     const unsigned char *y = ys;
00140     VALUE i, qstable[256];
00141 
00142     /* Preprocessing */
00143     for (i = 0; i < 256; ++i)
00144         qstable[i] = m + 1;
00145     for (; x < xe; ++x)
00146         qstable[*x] = xe - x;
00147     /* Searching */
00148     for (; y + m <= ys + n; y += *(qstable + y[m])) {
00149         if (*xs == *y && memcmp(xs, y, m) == 0)
00150             return y - ys;
00151     }
00152     return -1;
00153 }
00154 
/*
 * Map the byte sequence starting at x to a skip-table slot.
 *
 * Lead bytes below 0xC0 and lead bytes of 0xF5 or above yield the byte
 * value plus 256.  Lead bytes 0xC0..0xDF, 0xE0..0xEF and 0xF0..0xF4 are
 * mixed with the following 1, 2 or 3 bytes respectively and the result is
 * reduced to its low 8 bits.
 */
static inline unsigned int
rb_memsearch_qs_utf8_hash(const unsigned char *x)
{
    const unsigned int mix = 8353;
    const unsigned int lead = x[0];
    unsigned int acc = lead;
    int trailing;
    int k;

    if (lead < 0xC0 || lead >= 0xF5) {
        return lead + 256;
    }

    if (lead < 0xE0) {
        trailing = 1;
    }
    else if (lead < 0xF0) {
        trailing = 2;
    }
    else {
        trailing = 3;
    }

    for (k = 1; k <= trailing; k++) {
        acc = acc * mix + x[k];
    }
    return (unsigned char)acc;
}
00186 
00187 static inline long
00188 rb_memsearch_qs_utf8(const unsigned char *xs, long m, const unsigned char *ys, long n)
00189 {
00190     const unsigned char *x = xs, *xe = xs + m;
00191     const unsigned char *y = ys;
00192     VALUE i, qstable[512];
00193 
00194     /* Preprocessing */
00195     for (i = 0; i < 512; ++i) {
00196         qstable[i] = m + 1;
00197     }
00198     for (; x < xe; ++x) {
00199         qstable[rb_memsearch_qs_utf8_hash(x)] = xe - x;
00200     }
00201     /* Searching */
00202     for (; y + m <= ys + n; y += qstable[rb_memsearch_qs_utf8_hash(y+m)]) {
00203         if (*xs == *y && memcmp(xs, y, m) == 0)
00204             return y - ys;
00205     }
00206     return -1;
00207 }
00208 
00209 long
00210 rb_memsearch(const void *x0, long m, const void *y0, long n, rb_encoding *enc)
00211 {
00212     const unsigned char *x = x0, *y = y0;
00213 
00214     if (m > n) return -1;
00215     else if (m == n) {
00216         return memcmp(x0, y0, m) == 0 ? 0 : -1;
00217     }
00218     else if (m < 1) {
00219         return 0;
00220     }
00221     else if (m == 1) {
00222         const unsigned char *ys = y, *ye = ys + n;
00223         for (; y < ye; ++y) {
00224             if (*x == *y)
00225                 return y - ys;
00226         }
00227         return -1;
00228     }
00229     else if (m <= SIZEOF_VALUE) {
00230         return rb_memsearch_ss(x0, m, y0, n);
00231     }
00232     else if (enc == rb_utf8_encoding()){
00233         return rb_memsearch_qs_utf8(x0, m, y0, n);
00234     }
00235     else {
00236         return rb_memsearch_qs(x0, m, y0, n);
00237     }
00238 }
00239 
/* Flag bits built from FL_USERn.  REG_ENCODING_NONE is tested against
 * RBASIC(re)->flags in rb_reg_desc(), which appends "n" to the description
 * when it is set.  The uses of REG_LITERAL and KCODE_FIXED are not visible
 * in this part of the file -- TODO confirm their meaning at the use sites. */
00240 #define REG_LITERAL FL_USER5
00241 #define REG_ENCODING_NONE FL_USER6
00242 
00243 #define KCODE_FIXED FL_USER4
00244 
/* Union of the three Oniguruma option bits that char_to_option() can return. */
00245 #define ARG_REG_OPTION_MASK \
00246     (ONIG_OPTION_IGNORECASE|ONIG_OPTION_MULTILINE|ONIG_OPTION_EXTEND)
/* Option values stored through *option by rb_char_to_option_kcode():
 * ARG_ENCODING_NONE for 'n', ARG_ENCODING_FIXED for 'e', 's' and 'u'. */
00247 #define ARG_ENCODING_FIXED    16
00248 #define ARG_ENCODING_NONE     32
00249 
00250 static int
00251 char_to_option(int c)
00252 {
00253     int val;
00254 
00255     switch (c) {
00256       case 'i':
00257         val = ONIG_OPTION_IGNORECASE;
00258         break;
00259       case 'x':
00260         val = ONIG_OPTION_EXTEND;
00261         break;
00262       case 'm':
00263         val = ONIG_OPTION_MULTILINE;
00264         break;
00265       default:
00266         val = 0;
00267         break;
00268     }
00269     return val;
00270 }
00271 
00272 static char *
00273 option_to_str(char str[4], int options)
00274 {
00275     char *p = str;
00276     if (options & ONIG_OPTION_MULTILINE) *p++ = 'm';
00277     if (options & ONIG_OPTION_IGNORECASE) *p++ = 'i';
00278     if (options & ONIG_OPTION_EXTEND) *p++ = 'x';
00279     *p = 0;
00280     return str;
00281 }
00282 
00283 extern int
00284 rb_char_to_option_kcode(int c, int *option, int *kcode)
00285 {
00286     *option = 0;
00287 
00288     switch (c) {
00289       case 'n':
00290         *kcode = rb_ascii8bit_encindex();
00291         return (*option = ARG_ENCODING_NONE);
00292       case 'e':
00293         *kcode = rb_enc_find_index("EUC-JP");
00294         break;
00295       case 's':
00296         *kcode = rb_enc_find_index("Windows-31J");
00297         break;
00298       case 'u':
00299         *kcode = rb_utf8_encindex();
00300         break;
00301       default:
00302         *kcode = -1;
00303         return (*option = char_to_option(c));
00304     }
00305     *option = ARG_ENCODING_FIXED;
00306     return 1;
00307 }
00308 
00309 static void
00310 rb_reg_check(VALUE re)
00311 {
00312     if (!RREGEXP(re)->ptr || !RREGEXP_SRC(re) || !RREGEXP_SRC_PTR(re)) {
00313         rb_raise(rb_eTypeError, "uninitialized Regexp");
00314     }
00315 }
00316 
/* Defined outside this part of the file; called below with a code point
 * for a multibyte character (presumably appends an escaped form of c to
 * result -- confirm at its definition). */
00317 int rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p);
00318 
/*
 * Append the regexp source s[0, len) to str, escaping where necessary.
 *
 *   str    - string buffer that receives the text
 *   s, len - source bytes of the pattern
 *   enc    - encoding of the source bytes
 *   resenc - encoding of the result; may be NULL (rb_reg_to_s() passes NULL)
 *
 * Works in two passes: the first only decides whether anything needs
 * escaping; if not, the source is appended verbatim.  Otherwise the second
 * pass copies the source character by character, escaping as it goes.
 */
00319 static void
00320 rb_reg_expr_str(VALUE str, const char *s, long len,
00321         rb_encoding *enc, rb_encoding *resenc)
00322 {
00323     const char *p, *pend;
00324     int need_escape = 0;
00325     int c, clen;
00326 
00327     p = s; pend = p + len;
    /* Pass 1: look for anything that cannot be copied as is. */
00328     if (rb_enc_asciicompat(enc)) {
00329         while (p < pend) {
00330             c = rb_enc_ascget(p, pend, &clen, enc);
            /* c == -1: rb_enc_ascget() did not yield an ASCII character
             * (presumably a multibyte/non-ASCII char -- confirm). */
00331             if (c == -1) {
00332                 if (enc == resenc) {
                    /* same encoding on both sides: skip the character */
00333                     p += mbclen(p, pend, enc);
00334                 }
00335                 else {
00336                     need_escape = 1;
00337                     break;
00338                 }
00339             }
00340             else if (c != '/' && rb_enc_isprint(c, enc)) {
00341                 p += clen;
00342             }
00343             else {
                /* a '/' or a non-printable character */
00344                 need_escape = 1;
00345                 break;
00346             }
00347         }
00348     }
00349     else {
        /* not ASCII-compatible: always take the escaping path */
00350         need_escape = 1;
00351     }
00352 
00353     if (!need_escape) {
00354         rb_str_buf_cat(str, s, len);
00355     }
00356     else {
        /* Pass 2: copy with escaping. */
00357         int unicode_p = rb_enc_unicode_p(enc);
00358         p = s;
00359         while (p<pend) {
00360             c = rb_enc_ascget(p, pend, &clen, enc);
00361             if (c == '\\' && p+clen < pend) {
                /* backslash that is not the last character: copy it
                 * together with the character that follows, unchanged */
00362                 int n = clen + mbclen(p+clen, pend, enc);
00363                 rb_str_buf_cat(str, p, n);
00364                 p += n;
00365                 continue;
00366             }
00367             else if (c == '/') {
                /* emit "\/"; note this local `c` shadows the outer int c */
00368                 char c = '\\';
00369                 rb_str_buf_cat(str, &c, 1);
00370                 rb_str_buf_cat(str, p, clen);
00371             }
00372             else if (c == -1) {
00373                 clen = rb_enc_precise_mbclen(p, pend, enc);
00374                 if (!MBCLEN_CHARFOUND_P(clen)) {
                    /* no complete character here: emit this one byte
                     * as \xHH via the shared `hex` label below */
00375                     c = (unsigned char)*p;
00376                     clen = 1;
00377                     goto hex;
00378                 }
00379                 if (resenc) {
                    /* a result encoding was given: append through
                     * rb_str_buf_cat_escaped_char() */
00380                     unsigned int c = rb_enc_mbc_to_codepoint(p, pend, enc);
00381                     rb_str_buf_cat_escaped_char(str, c, unicode_p);
00382                 }
00383                 else {
                    /* no result encoding: copy the raw bytes */
00384                     clen = MBCLEN_CHARFOUND_LEN(clen);
00385                     rb_str_buf_cat(str, p, clen);
00386                 }
00387             }
00388             else if (rb_enc_isprint(c, enc)) {
00389                 rb_str_buf_cat(str, p, clen);
00390             }
00391             else if (!rb_enc_isspace(c, enc)) {
                /* neither printable nor whitespace: emit as \xHH */
00392                 char b[8];
00393 
                /* also the jump target for the invalid-byte case above */
00394               hex:
00395                 snprintf(b, sizeof(b), "\\x%02X", c);
00396                 rb_str_buf_cat(str, b, 4);
00397             }
00398             else {
                /* whitespace is copied unchanged */
00399                 rb_str_buf_cat(str, p, clen);
00400             }
00401             p += clen;
00402         }
00403     }
00404 }
00405 
00406 static VALUE
00407 rb_reg_desc(const char *s, long len, VALUE re)
00408 {
00409     rb_encoding *enc = rb_enc_get(re);
00410     VALUE str = rb_str_buf_new2("/");
00411     rb_encoding *resenc = rb_default_internal_encoding();
00412     if (resenc == NULL) resenc = rb_default_external_encoding();
00413 
00414     if (re && rb_enc_asciicompat(enc)) {
00415         rb_enc_copy(str, re);
00416     }
00417     else {
00418         rb_enc_associate(str, rb_usascii_encoding());
00419     }
00420     rb_reg_expr_str(str, s, len, enc, resenc);
00421     rb_str_buf_cat2(str, "/");
00422     if (re) {
00423         char opts[4];
00424         rb_reg_check(re);
00425         if (*option_to_str(opts, RREGEXP(re)->ptr->options))
00426             rb_str_buf_cat2(str, opts);
00427         if (RBASIC(re)->flags & REG_ENCODING_NONE)
00428             rb_str_buf_cat2(str, "n");
00429     }
00430     OBJ_INFECT(str, re);
00431     return str;
00432 }
00433 
00434 
00435 /*
00436  *  call-seq:
00437  *      rxp.source   -> str
00438  *
00439  *  Returns the original string of the pattern.
00440  *
00441  *      /ab+c/ix.source #=> "ab+c"
00442  *
00443  *  Note that escape sequences are retained as is.
00444  *
00445  *     /\x20\+/.source  #=> "\\x20\\+"
00446  *
00447  */
00448 
00449 static VALUE
00450 rb_reg_source(VALUE re)
00451 {
00452     VALUE str;
00453 
00454     rb_reg_check(re);
00455     str = rb_enc_str_new(RREGEXP_SRC_PTR(re),RREGEXP_SRC_LEN(re), rb_enc_get(re));
00456     if (OBJ_TAINTED(re)) OBJ_TAINT(str);
00457     return str;
00458 }
00459 
00460 /*
00461  * call-seq:
00462  *    rxp.inspect   -> string
00463  *
00464  * Produce a nicely formatted string-version of _rxp_. Perhaps surprisingly,
00465  * <code>#inspect</code> actually produces the more natural version of
00466  * the string than <code>#to_s</code>.
00467  *
00468  *      /ab+c/ix.inspect        #=> "/ab+c/ix"
00469  *
00470  */
00471 
00472 static VALUE
00473 rb_reg_inspect(VALUE re)
00474 {
00475     if (!RREGEXP(re)->ptr || !RREGEXP_SRC(re) || !RREGEXP_SRC_PTR(re)) {
00476         return rb_any_to_s(re);
00477     }
00478     return rb_reg_desc(RREGEXP_SRC_PTR(re), RREGEXP_SRC_LEN(re), re);
00479 }
00480 
00481 
00482 /*
00483  *  call-seq:
00484  *     rxp.to_s   -> str
00485  *
00486  *  Returns a string containing the regular expression and its options (using the
00487  *  <code>(?opts:source)</code> notation). This string can be fed back in to
00488  *  <code>Regexp::new</code> to produce a regular expression with the same semantics as
00489  *  the original. (However, <code>Regexp#==</code> may not return true when
00490  *  comparing the two, as the source of the regular expression itself may
00491  *  differ, as the example shows).  <code>Regexp#inspect</code> produces a
00492  *  generally more readable version of <i>rxp</i>.
00493  *
00494  *      r1 = /ab+c/ix           #=> /ab+c/ix
00495  *      s1 = r1.to_s            #=> "(?ix-m:ab+c)"
00496  *      r2 = Regexp.new(s1)     #=> /(?ix-m:ab+c)/
00497  *      r1 == r2                #=> false
00498  *      r1.source               #=> "ab+c"
00499  *      r2.source               #=> "(?ix-m:ab+c)"
00500  */
00501 
00502 static VALUE
00503 rb_reg_to_s(VALUE re)
00504 {
00505     int options, opt;
    /* the three option bits that can be written as letters inside (?...) */
00506     const int embeddable = ONIG_OPTION_MULTILINE|ONIG_OPTION_IGNORECASE|ONIG_OPTION_EXTEND;
00507     long len;
00508     const UChar* ptr;
00509     VALUE str = rb_str_buf_new2("(?");
    /* room for '-' plus up to three option letters plus the NUL */
00510     char optbuf[5];
00511     rb_encoding *enc = rb_enc_get(re);
00512 
00513     rb_reg_check(re);
00514 
00515     rb_enc_copy(str, re);
00516     options = RREGEXP(re)->ptr->options;
00517     ptr = (UChar*)RREGEXP_SRC_PTR(re);
00518     len = RREGEXP_SRC_LEN(re);
    /*
     * If the source itself starts with an option group "(?...", fold that
     * group's options into `options` and strip it, so the output does not
     * nest a redundant group.  Re-entered after each bare "(?opts)" prefix.
     */
00519   again:
00520     if (len >= 4 && ptr[0] == '(' && ptr[1] == '?') {
        /* stays non-zero unless the "(?opts:...)" form below compiles */
00521         int err = 1;
00522         ptr += 2;
00523         if ((len -= 2) > 0) {
            /* letters before '-': options being switched on */
00524             do {
00525                 opt = char_to_option((int )*ptr);
00526                 if (opt != 0) {
00527                     options |= opt;
00528                 }
00529                 else {
00530                     break;
00531                 }
00532                 ++ptr;
00533             } while (--len > 0);
00534         }
00535         if (len > 1 && *ptr == '-') {
            /* letters after '-': options being switched off */
00536             ++ptr;
00537             --len;
00538             do {
00539                 opt = char_to_option((int )*ptr);
00540                 if (opt != 0) {
00541                     options &= ~opt;
00542                 }
00543                 else {
00544                     break;
00545                 }
00546                 ++ptr;
00547             } while (--len > 0);
00548         }
00549         if (*ptr == ')') {
            /* bare "(?opts)" prefix: skip it and examine what follows */
00550             --len;
00551             ++ptr;
00552             goto again;
00553         }
00554         if (*ptr == ':' && ptr[len-1] == ')') {
            /* "(?opts:body)" spanning the rest of the source: accept the
             * unwrapped body only if it compiles on its own */
00555             Regexp *rp;
00556 
00557             ++ptr;
00558             len -= 2;
00559             err = onig_new(&rp, ptr, ptr + len, ONIG_OPTION_DEFAULT,
00560                            enc, OnigDefaultSyntax, NULL);
00561             onig_free(rp);
00562         }
00563         if (err) {
            /* unwrapping failed or did not apply: start over from the
             * regexp's own options and full source */
00564             options = RREGEXP(re)->ptr->options;
00565             ptr = (UChar*)RREGEXP_SRC_PTR(re);
00566             len = RREGEXP_SRC_LEN(re);
00567         }
00568     }
00569 
    /* letters for the options that are set */
00570     if (*option_to_str(optbuf, options)) rb_str_buf_cat2(str, optbuf);
00571 
00572     if ((options & embeddable) != embeddable) {
        /* "-" followed by the letters for the options that are NOT set */
00573         optbuf[0] = '-';
00574         option_to_str(optbuf + 1, ~options);
00575         rb_str_buf_cat2(str, optbuf);
00576     }
00577 
00578     rb_str_buf_cat2(str, ":");
    /* NULL result encoding: multibyte characters are copied as raw bytes */
00579     rb_reg_expr_str(str, (char*)ptr, len, enc, NULL);
00580     rb_str_buf_cat2(str, ")");
00581     rb_enc_copy(str, re);
00582 
00583     OBJ_INFECT(str, re);
00584     return str;
00585 }
00586 
00587 static void
00588 rb_reg_raise(const char *s, long len, const char *err, VALUE re)
00589 {
00590     VALUE desc = rb_reg_desc(s, len, re);
00591 
00592     rb_raise(rb_eRegexpError, "%s: %s", err, RSTRING_PTR(desc));
00593 }
00594 
00595 static VALUE
00596 rb_enc_reg_error_desc(const char *s, long len, rb_encoding *enc, int options, const char *err)
00597 {
00598     char opts[6];
00599     VALUE desc = rb_str_buf_new2(err);
00600     rb_encoding *resenc = rb_default_internal_encoding();
00601     if (resenc == NULL) resenc = rb_default_external_encoding();
00602 
00603     rb_enc_associate(desc, enc);
00604     rb_str_buf_cat2(desc, ": /");
00605     rb_reg_expr_str(desc, s, len, enc, resenc);
00606     opts[0] = '/';
00607     option_to_str(opts + 1, options);
00608     rb_str_buf_cat2(desc, opts);
00609     return rb_exc_new3(rb_eRegexpError, desc);
00610 }
00611 
00612 static void
00613 rb_enc_reg_raise(const char *s, long len, rb_encoding *enc, int options, const char *err)
00614 {
00615     rb_exc_raise(rb_enc_reg_error_desc(s, len, enc, options, err));
00616 }
00617 
00618 static VALUE
00619 rb_reg_error_desc(VALUE str, int options, const char *err)
00620 {
00621     return rb_enc_reg_error_desc(RSTRING_PTR(str), RSTRING_LEN(str),
00622                                  rb_enc_get(str), options, err);
00623 }
00624 
00625 static void
00626 rb_reg_raise_str(VALUE str, int options, const char *err)
00627 {
00628     rb_exc_raise(rb_reg_error_desc(str, options, err));
00629 }
00630 
00631 
00632 /*
00633  *  call-seq:
00634  *     rxp.casefold?   -> true or false
00635  *
00636  *  Returns the value of the case-insensitive flag.
00637  *
00638  *      /a/.casefold?           #=> false
00639  *      /a/i.casefold?          #=> true
00640  *      /(?i:a)/.casefold?      #=> false
00641  */
00642 
00643 static VALUE
00644 rb_reg_casefold_p(VALUE re)
00645 {
00646     rb_reg_check(re);
00647     if (RREGEXP(re)->ptr->options & ONIG_OPTION_IGNORECASE) return Qtrue;
00648     return Qfalse;
00649 }
00650 
00651 
00652 /*
00653  *  call-seq:
00654  *     rxp.options   -> fixnum
00655  *
00656  *  Returns the set of bits corresponding to the options used when creating this
00657  *  Regexp (see <code>Regexp::new</code> for details). Note that additional bits
00658  *  may be set in the returned options: these are used internally by the regular
00659  *  expression code. These extra bits are ignored if the options are passed to
00660  *  <code>Regexp::new</code>.
00661  *
00662  *     Regexp::IGNORECASE                  #=> 1
00663  *     Regexp::EXTENDED                    #=> 2
00664  *     Regexp::MULTILINE                   #=> 4
00665  *
00666  *     /cat/.options                       #=> 0
00667  *     /cat/ix.options                     #=> 3
00668  *     Regexp.new('cat', true).options     #=> 1
00669  *     /\xa1\xa2/e.options                 #=> 16
00670  *
00671  *     r = /cat/ix
00672  *     Regexp.new(r.source, r.options)     #=> /cat/ix
00673  */
00674 
00675 static VALUE
00676 rb_reg_options_m(VALUE re)
00677 {
00678     int options = rb_reg_options(re);
00679     return INT2NUM(options);
00680 }
00681 
00682 static int
00683 reg_names_iter(const OnigUChar *name, const OnigUChar *name_end,
00684           int back_num, int *back_refs, OnigRegex regex, void *arg)
00685 {
00686     VALUE ary = (VALUE)arg;
00687     rb_ary_push(ary, rb_str_new((const char *)name, name_end-name));
00688     return 0;
00689 }
00690 
00691 /*
00692  * call-seq:
00693  *    rxp.names   -> [name1, name2, ...]
00694  *
00695  * Returns a list of names of captures as an array of strings.
00696  *
00697  *     /(?<foo>.)(?<bar>.)(?<baz>.)/.names
00698  *     #=> ["foo", "bar", "baz"]
00699  *
00700  *     /(?<foo>.)(?<foo>.)/.names
00701  *     #=> ["foo"]
00702  *
00703  *     /(.)(.)/.names
00704  *     #=> []
00705  */
00706 
00707 static VALUE
00708 rb_reg_names(VALUE re)
00709 {
00710     VALUE ary = rb_ary_new();
00711     rb_reg_check(re);
00712     onig_foreach_name(RREGEXP(re)->ptr, reg_names_iter, (void*)ary);
00713     return ary;
00714 }
00715 
00716 static int
00717 reg_named_captures_iter(const OnigUChar *name, const OnigUChar *name_end,
00718           int back_num, int *back_refs, OnigRegex regex, void *arg)
00719 {
00720     VALUE hash = (VALUE)arg;
00721     VALUE ary = rb_ary_new2(back_num);
00722     int i;
00723 
00724     for(i = 0; i < back_num; i++)
00725         rb_ary_store(ary, i, INT2NUM(back_refs[i]));
00726 
00727     rb_hash_aset(hash, rb_str_new((const char*)name, name_end-name),ary);
00728 
00729     return 0;
00730 }
00731 
00732 /*
00733  * call-seq:
00734  *    rxp.named_captures  -> hash
00735  *
00736  * Returns a hash representing information about named captures of <i>rxp</i>.
00737  *
00738  * A key of the hash is a name of the named captures.
00739  * A value of the hash is an array which is a list of indexes of corresponding
00740  * named captures.
00741  *
00742  *    /(?<foo>.)(?<bar>.)/.named_captures
00743  *    #=> {"foo"=>[1], "bar"=>[2]}
00744  *
00745  *    /(?<foo>.)(?<foo>.)/.named_captures
00746  *    #=> {"foo"=>[1, 2]}
00747  *
00748  * If there are no named captures, an empty hash is returned.
00749  *
00750  *    /(.)(.)/.named_captures
00751  *    #=> {}
00752  */
00753 
00754 static VALUE
00755 rb_reg_named_captures(VALUE re)
00756 {
00757     VALUE hash = rb_hash_new();
00758     rb_reg_check(re);
00759     onig_foreach_name(RREGEXP(re)->ptr, reg_named_captures_iter, (void*)hash);
00760     return hash;
00761 }
00762 
00763 static int
00764 onig_new_with_source(regex_t** reg, const UChar* pattern, const UChar* pattern_end,
00765           OnigOptionType option, OnigEncoding enc, const OnigSyntaxType* syntax,
00766           OnigErrorInfo* einfo, const char *sourcefile, int sourceline)
00767 {
00768   int r;
00769 
00770   *reg = (regex_t* )xmalloc(sizeof(regex_t));
00771   if (IS_NULL(*reg)) return ONIGERR_MEMORY;
00772 
00773   r = onig_reg_init(*reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax);
00774   if (r) goto err;
00775 
00776   r = onig_compile(*reg, pattern, pattern_end, einfo, sourcefile, sourceline);
00777   if (r) {
00778   err:
00779     onig_free(*reg);
00780     *reg = NULL;
00781   }
00782   return r;
00783 }
00784 
00785 static Regexp*
00786 make_regexp(const char *s, long len, rb_encoding *enc, int flags, onig_errmsg_buffer err,
00787         const char *sourcefile, int sourceline)
00788 {
00789     Regexp *rp;
00790     int r;
00791     OnigErrorInfo einfo;
00792 
00793     /* Handle escaped characters first. */
00794 
00795     /* Build a copy of the string (in dest) with the
00796        escaped characters translated,  and generate the regex
00797        from that.
00798     */
00799 
00800     r = onig_new_with_source(&rp, (UChar*)s, (UChar*)(s + len), flags,
00801                  enc, OnigDefaultSyntax, &einfo, sourcefile, sourceline);
00802     if (r) {
00803         onig_error_code_to_str((UChar*)err, r, &einfo);
00804         return 0;
00805     }
00806     return rp;
00807 }
00808 
00809 
00810 /*
00811  *  Document-class: MatchData
00812  *
00813  *  <code>MatchData</code> is the type of the special variable <code>$~</code>,
00814  *  and is the type of the object returned by <code>Regexp#match</code> and
00815  *  <code>Regexp.last_match</code>. It encapsulates all the results of a pattern
00816  *  match, results normally accessed through the special variables
00817  *  <code>$&</code>, <code>$'</code>, <code>$`</code>, <code>$1</code>,
00818  *  <code>$2</code>, and so on.
00819  *
00820  */
00821 
00822 VALUE rb_cMatch;
00823 
00824 static VALUE
00825 match_alloc(VALUE klass)
00826 {
00827     NEWOBJ(match, struct RMatch);
00828     OBJSETUP(match, klass, T_MATCH);
00829 
00830     match->str = 0;
00831     match->rmatch = 0;
00832     match->regexp = 0;
00833     match->rmatch = ALLOC(struct rmatch);
00834     MEMZERO(match->rmatch, struct rmatch, 1);
00835 
00836     return (VALUE)match;
00837 }
00838 
/* A byte offset into a string paired with the matching character offset. */
typedef struct {
    long byte_pos;
    long char_pos;
} pair_t;

/*
 * qsort()/bsearch() comparator ordering pair_t entries by byte_pos.
 *
 * Returns -1, 0 or 1.  The result is derived from comparisons rather than
 * from a subtraction, so it never overflows, needs no narrowing from long
 * to int, and is the same on every platform regardless of the relative
 * sizes of long and int.  The inputs are only read, so they are accessed
 * through const-qualified pointers.
 */
static int
pair_byte_cmp(const void *pair1, const void *pair2)
{
    const long lhs = ((const pair_t *)pair1)->byte_pos;
    const long rhs = ((const pair_t *)pair2)->byte_pos;

    return (lhs > rhs) - (lhs < rhs);
}
00854 
/*
 * Fill rm->char_offset[] with the character-based begin/end offset of each
 * capture register of `match`, derived from the byte offsets in rm->regs.
 * Does nothing when rm->char_offset_updated is already set; sets it on exit.
 */
00855 static void
00856 update_char_offset(VALUE match)
00857 {
00858     struct rmatch *rm = RMATCH(match)->rmatch;
00859     struct re_registers *regs;
00860     int i, num_regs, num_pos;
00861     long c;
    /* NOTE(review): `e` is assigned below but not read in this function. */
00862     char *s, *p, *q, *e;
00863     rb_encoding *enc;
00864     pair_t *pairs;
00865 
00866     if (rm->char_offset_updated)
00867         return;
00868 
    /* the BEG()/END() macros used below read through this `regs` */
00869     regs = &rm->regs;
00870     num_regs = rm->regs.num_regs;
00871 
    /* grow the offset array when it has fewer slots than registers */
00872     if (rm->char_offset_num_allocated < num_regs) {
00873         REALLOC_N(rm->char_offset, struct rmatch_offset, num_regs);
00874         rm->char_offset_num_allocated = num_regs;
00875     }
00876 
00877     enc = rb_enc_get(RMATCH(match)->str);
00878     if (rb_enc_mbmaxlen(enc) == 1) {
        /* at most one byte per character: the byte offsets are copied as is */
00879         for (i = 0; i < num_regs; i++) {
00880             rm->char_offset[i].beg = BEG(i);
00881             rm->char_offset[i].end = END(i);
00882         }
00883         rm->char_offset_updated = 1;
00884         return;
00885     }
00886 
    /* Collect the begin and end byte position of every register that
     * matched (BEG >= 0); at most two entries per register. */
00887     pairs = ALLOCA_N(pair_t, num_regs*2);
00888     num_pos = 0;
00889     for (i = 0; i < num_regs; i++) {
00890         if (BEG(i) < 0)
00891             continue;
00892         pairs[num_pos++].byte_pos = BEG(i);
00893         pairs[num_pos++].byte_pos = END(i);
00894     }
    /* sort by byte position so the string can be walked once, left to right */
00895     qsort(pairs, num_pos, sizeof(pair_t), pair_byte_cmp);
00896 
00897     s = p = RSTRING_PTR(RMATCH(match)->str);
00898     e = s + RSTRING_LEN(RMATCH(match)->str);
00899     c = 0;
    /* c accumulates the character count from the start of the string;
     * each step only measures the stretch since the previous position */
00900     for (i = 0; i < num_pos; i++) {
00901         q = s + pairs[i].byte_pos;
00902         c += rb_enc_strlen(p, q, enc);
00903         pairs[i].char_pos = c;
00904         p = q;
00905     }
00906 
    /* Look each register's byte offsets up in the sorted table. */
00907     for (i = 0; i < num_regs; i++) {
00908         pair_t key, *found;
00909         if (BEG(i) < 0) {
            /* register did not take part in the match */
00910             rm->char_offset[i].beg = -1;
00911             rm->char_offset[i].end = -1;
00912             continue;
00913         }
00914 
        /* every key searched for here was inserted into pairs[] above */
00915         key.byte_pos = BEG(i);
00916         found = bsearch(&key, pairs, num_pos, sizeof(pair_t), pair_byte_cmp);
00917         rm->char_offset[i].beg = found->char_pos;
00918 
00919         key.byte_pos = END(i);
00920         found = bsearch(&key, pairs, num_pos, sizeof(pair_t), pair_byte_cmp);
00921         rm->char_offset[i].end = found->char_pos;
00922     }
00923 
00924     rm->char_offset_updated = 1;
00925 }
00926 
00927 static void
00928 match_check(VALUE match)
00929 {
00930     if (!RMATCH(match)->regexp) {
00931         rb_raise(rb_eTypeError, "uninitialized Match");
00932     }
00933 }
00934 
00935 /* :nodoc: */
00936 static VALUE
00937 match_init_copy(VALUE obj, VALUE orig)
00938 {
00939     struct rmatch *rm;
00940 
00941     if (obj == orig) return obj;
00942 
00943     if (!rb_obj_is_instance_of(orig, rb_obj_class(obj))) {
00944         rb_raise(rb_eTypeError, "wrong argument class");
00945     }
00946     RMATCH(obj)->str = RMATCH(orig)->str;
00947     RMATCH(obj)->regexp = RMATCH(orig)->regexp;
00948 
00949     rm = RMATCH(obj)->rmatch;
00950     onig_region_copy(&rm->regs, RMATCH_REGS(orig));
00951 
00952     if (!RMATCH(orig)->rmatch->char_offset_updated) {
00953         rm->char_offset_updated = 0;
00954     }
00955     else {
00956         if (rm->char_offset_num_allocated < rm->regs.num_regs) {
00957             REALLOC_N(rm->char_offset, struct rmatch_offset, rm->regs.num_regs);
00958             rm->char_offset_num_allocated = rm->regs.num_regs;
00959         }
00960         MEMCPY(rm->char_offset, RMATCH(orig)->rmatch->char_offset,
00961                struct rmatch_offset, rm->regs.num_regs);
00962         rm->char_offset_updated = 1;
00963     }
00964 
00965     return obj;
00966 }
00967 
00968 
00969 /*
00970  * call-seq:
00971  *    mtch.regexp   -> regexp
00972  *
00973  * Returns the regexp.
00974  *
00975  *     m = /a.*b/.match("abc")
00976  *     m.regexp #=> /a.*b/
00977  */
00978 
00979 static VALUE
00980 match_regexp(VALUE match)
00981 {
00982     match_check(match);
00983     return RMATCH(match)->regexp;
00984 }
00985 
00986 /*
00987  * call-seq:
00988  *    mtch.names   -> [name1, name2, ...]
00989  *
00990  * Returns a list of names of captures as an array of strings.
00991  * It is the same as mtch.regexp.names.
00992  *
00993  *     /(?<foo>.)(?<bar>.)(?<baz>.)/.match("hoge").names
00994  *     #=> ["foo", "bar", "baz"]
00995  *
00996  *     m = /(?<x>.)(?<y>.)?/.match("a") #=> #<MatchData "a" x:"a" y:nil>
00997  *     m.names                          #=> ["x", "y"]
00998  */
00999 
01000 static VALUE
01001 match_names(VALUE match)
01002 {
01003     match_check(match);
01004     return rb_reg_names(RMATCH(match)->regexp);
01005 }
01006 
01007 /*
01008  *  call-seq:
01009  *     mtch.length   -> integer
01010  *     mtch.size     -> integer
01011  *
01012  *  Returns the number of elements in the match array.
01013  *
01014  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
01015  *     m.length   #=> 5
01016  *     m.size     #=> 5
01017  */
01018 
01019 static VALUE
01020 match_size(VALUE match)
01021 {
01022     match_check(match);
01023     return INT2FIX(RMATCH_REGS(match)->num_regs);
01024 }
01025 
01026 static int
01027 match_backref_number(VALUE match, VALUE backref)
01028 {
01029     const char *name;
01030     int num;
01031 
01032     struct re_registers *regs = RMATCH_REGS(match);
01033     VALUE regexp = RMATCH(match)->regexp;
01034 
01035     match_check(match);
01036     switch(TYPE(backref)) {
01037       default:
01038         return NUM2INT(backref);
01039 
01040       case T_SYMBOL:
01041         name = rb_id2name(SYM2ID(backref));
01042         break;
01043 
01044       case T_STRING:
01045         name = StringValueCStr(backref);
01046         break;
01047     }
01048 
01049     num = onig_name_to_backref_number(RREGEXP(regexp)->ptr,
01050               (const unsigned char*)name,
01051               (const unsigned char*)name + strlen(name),
01052               regs);
01053 
01054     if (num < 1) {
01055         rb_raise(rb_eIndexError, "undefined group name reference: %s", name);
01056     }
01057 
01058     return num;
01059 }
01060 
01061 int
01062 rb_reg_backref_number(VALUE match, VALUE backref)
01063 {
01064     return match_backref_number(match, backref);
01065 }
01066 
01067 /*
01068  *  call-seq:
01069  *     mtch.offset(n)   -> array
01070  *
01071  *  Returns a two-element array containing the beginning and ending offsets of
01072  *  the <em>n</em>th match.
01073  *  <em>n</em> can be a string or symbol to reference a named capture.
01074  *
01075  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
01076  *     m.offset(0)      #=> [1, 7]
01077  *     m.offset(4)      #=> [6, 7]
01078  *
01079  *     m = /(?<foo>.)(.)(?<bar>.)/.match("hoge")
01080  *     p m.offset(:foo) #=> [0, 1]
01081  *     p m.offset(:bar) #=> [2, 3]
01082  *
01083  */
01084 
01085 static VALUE
01086 match_offset(VALUE match, VALUE n)
01087 {
01088     int i = match_backref_number(match, n);
01089     struct re_registers *regs = RMATCH_REGS(match);
01090 
01091     match_check(match);
01092     if (i < 0 || regs->num_regs <= i)
01093         rb_raise(rb_eIndexError, "index %d out of matches", i);
01094 
01095     if (BEG(i) < 0)
01096         return rb_assoc_new(Qnil, Qnil);
01097 
01098     update_char_offset(match);
01099     return rb_assoc_new(INT2FIX(RMATCH(match)->rmatch->char_offset[i].beg),
01100                         INT2FIX(RMATCH(match)->rmatch->char_offset[i].end));
01101 }
01102 
01103 
01104 /*
01105  *  call-seq:
01106  *     mtch.begin(n)   -> integer
01107  *
01108  *  Returns the offset of the start of the <em>n</em>th element of the match
01109  *  array in the string.
01110  *  <em>n</em> can be a string or symbol to reference a named capture.
01111  *
01112  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
01113  *     m.begin(0)       #=> 1
01114  *     m.begin(2)       #=> 2
01115  *
01116  *     m = /(?<foo>.)(.)(?<bar>.)/.match("hoge")
01117  *     p m.begin(:foo)  #=> 0
01118  *     p m.begin(:bar)  #=> 2
01119  */
01120 
01121 static VALUE
01122 match_begin(VALUE match, VALUE n)
01123 {
01124     int i = match_backref_number(match, n);
01125     struct re_registers *regs = RMATCH_REGS(match);
01126 
01127     match_check(match);
01128     if (i < 0 || regs->num_regs <= i)
01129         rb_raise(rb_eIndexError, "index %d out of matches", i);
01130 
01131     if (BEG(i) < 0)
01132         return Qnil;
01133 
01134     update_char_offset(match);
01135     return INT2FIX(RMATCH(match)->rmatch->char_offset[i].beg);
01136 }
01137 
01138 
01139 /*
01140  *  call-seq:
01141  *     mtch.end(n)   -> integer
01142  *
01143  *  Returns the offset of the character immediately following the end of the
01144  *  <em>n</em>th element of the match array in the string.
01145  *  <em>n</em> can be a string or symbol to reference a named capture.
01146  *
01147  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
01148  *     m.end(0)         #=> 7
01149  *     m.end(2)         #=> 3
01150  *
01151  *     m = /(?<foo>.)(.)(?<bar>.)/.match("hoge")
01152  *     p m.end(:foo)    #=> 1
01153  *     p m.end(:bar)    #=> 3
01154  */
01155 
01156 static VALUE
01157 match_end(VALUE match, VALUE n)
01158 {
01159     int i = match_backref_number(match, n);
01160     struct re_registers *regs = RMATCH_REGS(match);
01161 
01162     match_check(match);
01163     if (i < 0 || regs->num_regs <= i)
01164         rb_raise(rb_eIndexError, "index %d out of matches", i);
01165 
01166     if (BEG(i) < 0)
01167         return Qnil;
01168 
01169     update_char_offset(match);
01170     return INT2FIX(RMATCH(match)->rmatch->char_offset[i].end);
01171 }
01172 
01173 #define MATCH_BUSY FL_USER2
01174 
01175 void
01176 rb_match_busy(VALUE match)
01177 {
01178     FL_SET(match, MATCH_BUSY);
01179 }
01180 
01181 /*
01182  *  call-seq:
01183  *     rxp.fixed_encoding?   -> true or false
01184  *
01185  *  Returns false if rxp is applicable to
01186  *  a string with any ASCII compatible encoding.
01187  *  Returns true otherwise.
01188  *
01189  *      r = /a/
01190  *      r.fixed_encoding?                               #=> false
01191  *      r =~ "\u{6666} a"                               #=> 2
01192  *      r =~ "\xa1\xa2 a".force_encoding("euc-jp")      #=> 2
01193  *      r =~ "abc".force_encoding("euc-jp")             #=> 0
01194  *
01195  *      r = /a/u
01196  *      r.fixed_encoding?                               #=> true
01197  *      r.encoding                                      #=> #<Encoding:UTF-8>
01198  *      r =~ "\u{6666} a"                               #=> 2
01199  *      r =~ "\xa1\xa2".force_encoding("euc-jp")        #=> ArgumentError
01200  *      r =~ "abc".force_encoding("euc-jp")             #=> 0
01201  *
01202  *      r = /\u{6666}/
01203  *      r.fixed_encoding?                               #=> true
01204  *      r.encoding                                      #=> #<Encoding:UTF-8>
01205  *      r =~ "\u{6666} a"                               #=> 0
01206  *      r =~ "\xa1\xa2".force_encoding("euc-jp")        #=> ArgumentError
01207  *      r =~ "abc".force_encoding("euc-jp")             #=> nil
01208  */
01209 
01210 static VALUE
01211 rb_reg_fixed_encoding_p(VALUE re)
01212 {
01213     if (FL_TEST(re, KCODE_FIXED))
01214         return Qtrue;
01215     else
01216         return Qfalse;
01217 }
01218 
01219 static VALUE
01220 rb_reg_preprocess(const char *p, const char *end, rb_encoding *enc,
01221         rb_encoding **fixed_enc, onig_errmsg_buffer err);
01222 
01223 
01224 static void
01225 reg_enc_error(VALUE re, VALUE str)
01226 {
01227     rb_raise(rb_eEncCompatError,
01228              "incompatible encoding regexp match (%s regexp with %s string)",
01229              rb_enc_name(rb_enc_get(re)),
01230              rb_enc_name(rb_enc_get(str)));
01231 }
01232 
01233 static rb_encoding*
01234 rb_reg_prepare_enc(VALUE re, VALUE str, int warn)
01235 {
01236     rb_encoding *enc = 0;
01237 
01238     if (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN) {
01239         rb_raise(rb_eArgError,
01240             "invalid byte sequence in %s",
01241             rb_enc_name(rb_enc_get(str)));
01242     }
01243 
01244     rb_reg_check(re);
01245     enc = rb_enc_get(str);
01246     if (!rb_enc_str_asciicompat_p(str)) {
01247         if (RREGEXP(re)->ptr->enc != enc) {
01248             reg_enc_error(re, str);
01249         }
01250     }
01251     else if (rb_reg_fixed_encoding_p(re)) {
01252         if (RREGEXP(re)->ptr->enc != enc &&
01253             (!rb_enc_asciicompat(RREGEXP(re)->ptr->enc) ||
01254              rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT)) {
01255             reg_enc_error(re, str);
01256         }
01257         enc = RREGEXP(re)->ptr->enc;
01258     }
01259     if (warn && (RBASIC(re)->flags & REG_ENCODING_NONE) &&
01260         enc != rb_ascii8bit_encoding() &&
01261         rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) {
01262         rb_warn("regexp match /.../n against to %s string",
01263                 rb_enc_name(enc));
01264     }
01265     return enc;
01266 }
01267 
01268 regex_t *
01269 rb_reg_prepare_re(VALUE re, VALUE str)
01270 {
01271     regex_t *reg = RREGEXP(re)->ptr;
01272     onig_errmsg_buffer err = "";
01273     int r;
01274     OnigErrorInfo einfo;
01275     const char *pattern;
01276     VALUE unescaped;
01277     rb_encoding *fixed_enc = 0;
01278     rb_encoding *enc = rb_reg_prepare_enc(re, str, 1);
01279 
01280     if (reg->enc == enc) return reg;
01281 
01282     rb_reg_check(re);
01283     reg = RREGEXP(re)->ptr;
01284     pattern = RREGEXP_SRC_PTR(re);
01285 
01286     unescaped = rb_reg_preprocess(
01287         pattern, pattern + RREGEXP_SRC_LEN(re), enc,
01288         &fixed_enc, err);
01289 
01290     if (unescaped == Qnil) {
01291         rb_raise(rb_eArgError, "regexp preprocess failed: %s", err);
01292     }
01293 
01294     r = onig_new(&reg, (UChar* )RSTRING_PTR(unescaped),
01295                  (UChar* )(RSTRING_PTR(unescaped) + RSTRING_LEN(unescaped)),
01296                  reg->options, enc,
01297                  OnigDefaultSyntax, &einfo);
01298     if (r) {
01299         onig_error_code_to_str((UChar*)err, r, &einfo);
01300         rb_reg_raise(pattern, RREGEXP_SRC_LEN(re), err, re);
01301     }
01302 
01303     RB_GC_GUARD(unescaped);
01304     return reg;
01305 }
01306 
01307 long
01308 rb_reg_adjust_startpos(VALUE re, VALUE str, long pos, int reverse)
01309 {
01310     long range;
01311     rb_encoding *enc;
01312     UChar *p, *string;
01313 
01314     enc = rb_reg_prepare_enc(re, str, 0);
01315 
01316     if (reverse) {
01317         range = -pos;
01318     }
01319     else {
01320         range = RSTRING_LEN(str) - pos;
01321     }
01322 
01323     if (pos > 0 && ONIGENC_MBC_MAXLEN(enc) != 1 && pos < RSTRING_LEN(str)) {
01324          string = (UChar*)RSTRING_PTR(str);
01325 
01326          if (range > 0) {
01327               p = onigenc_get_right_adjust_char_head(enc, string, string + pos, string + RSTRING_LEN(str));
01328          }
01329          else {
01330               p = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, string, string + pos, string + RSTRING_LEN(str));
01331          }
01332          return p - string;
01333     }
01334 
01335     return pos;
01336 }
01337 
01338 long
01339 rb_reg_search(VALUE re, VALUE str, long pos, int reverse)
01340 {
01341     long result;
01342     VALUE match;
01343     struct re_registers regi, *regs = &regi;
01344     char *range = RSTRING_PTR(str);
01345     regex_t *reg;
01346     int tmpreg;
01347 
01348     if (pos > RSTRING_LEN(str) || pos < 0) {
01349         rb_backref_set(Qnil);
01350         return -1;
01351     }
01352 
01353     reg = rb_reg_prepare_re(re, str);
01354     tmpreg = reg != RREGEXP(re)->ptr;
01355     if (!tmpreg) RREGEXP(re)->usecnt++;
01356 
01357     match = rb_backref_get();
01358     if (!NIL_P(match)) {
01359         if (FL_TEST(match, MATCH_BUSY)) {
01360             match = Qnil;
01361         }
01362         else {
01363             regs = RMATCH_REGS(match);
01364         }
01365     }
01366     if (NIL_P(match)) {
01367         MEMZERO(regs, struct re_registers, 1);
01368     }
01369     if (!reverse) {
01370         range += RSTRING_LEN(str);
01371     }
01372     result = onig_search(reg,
01373                          (UChar*)(RSTRING_PTR(str)),
01374                          ((UChar*)(RSTRING_PTR(str)) + RSTRING_LEN(str)),
01375                          ((UChar*)(RSTRING_PTR(str)) + pos),
01376                          ((UChar*)range),
01377                          regs, ONIG_OPTION_NONE);
01378     if (!tmpreg) RREGEXP(re)->usecnt--;
01379     if (tmpreg) {
01380         if (RREGEXP(re)->usecnt) {
01381             onig_free(reg);
01382         }
01383         else {
01384             onig_free(RREGEXP(re)->ptr);
01385             RREGEXP(re)->ptr = reg;
01386         }
01387     }
01388     if (result < 0) {
01389         if (regs == &regi)
01390             onig_region_free(regs, 0);
01391         if (result == ONIG_MISMATCH) {
01392             rb_backref_set(Qnil);
01393             return result;
01394         }
01395         else {
01396             onig_errmsg_buffer err = "";
01397             onig_error_code_to_str((UChar*)err, (int)result);
01398             rb_reg_raise(RREGEXP_SRC_PTR(re), RREGEXP_SRC_LEN(re), err, 0);
01399         }
01400     }
01401 
01402     if (NIL_P(match)) {
01403         match = match_alloc(rb_cMatch);
01404         onig_region_copy(RMATCH_REGS(match), regs);
01405         onig_region_free(regs, 0);
01406     }
01407     else {
01408         if (rb_safe_level() >= 3)
01409             OBJ_TAINT(match);
01410         else
01411             FL_UNSET(match, FL_TAINT);
01412     }
01413 
01414     RMATCH(match)->str = rb_str_new4(str);
01415     RMATCH(match)->regexp = re;
01416     RMATCH(match)->rmatch->char_offset_updated = 0;
01417     rb_backref_set(match);
01418 
01419     OBJ_INFECT(match, re);
01420     OBJ_INFECT(match, str);
01421 
01422     return result;
01423 }
01424 
01425 VALUE
01426 rb_reg_nth_defined(int nth, VALUE match)
01427 {
01428     struct re_registers *regs;
01429     if (NIL_P(match)) return Qnil;
01430     match_check(match);
01431     regs = RMATCH_REGS(match);
01432     if (nth >= regs->num_regs) {
01433         return Qnil;
01434     }
01435     if (nth < 0) {
01436         nth += regs->num_regs;
01437         if (nth <= 0) return Qnil;
01438     }
01439     if (BEG(nth) == -1) return Qfalse;
01440     return Qtrue;
01441 }
01442 
01443 VALUE
01444 rb_reg_nth_match(int nth, VALUE match)
01445 {
01446     VALUE str;
01447     long start, end, len;
01448     struct re_registers *regs;
01449 
01450     if (NIL_P(match)) return Qnil;
01451     match_check(match);
01452     regs = RMATCH_REGS(match);
01453     if (nth >= regs->num_regs) {
01454         return Qnil;
01455     }
01456     if (nth < 0) {
01457         nth += regs->num_regs;
01458         if (nth <= 0) return Qnil;
01459     }
01460     start = BEG(nth);
01461     if (start == -1) return Qnil;
01462     end = END(nth);
01463     len = end - start;
01464     str = rb_str_subseq(RMATCH(match)->str, start, len);
01465     OBJ_INFECT(str, match);
01466     return str;
01467 }
01468 
01469 VALUE
01470 rb_reg_last_match(VALUE match)
01471 {
01472     return rb_reg_nth_match(0, match);
01473 }
01474 
01475 
01476 /*
01477  *  call-seq:
01478  *     mtch.pre_match   -> str
01479  *
01480  *  Returns the portion of the original string before the current match.
01481  *  Equivalent to the special variable <code>$`</code>.
01482  *
01483  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
01484  *     m.pre_match   #=> "T"
01485  */
01486 
01487 VALUE
01488 rb_reg_match_pre(VALUE match)
01489 {
01490     VALUE str;
01491     struct re_registers *regs;
01492 
01493     if (NIL_P(match)) return Qnil;
01494     match_check(match);
01495     regs = RMATCH_REGS(match);
01496     if (BEG(0) == -1) return Qnil;
01497     str = rb_str_subseq(RMATCH(match)->str, 0, BEG(0));
01498     if (OBJ_TAINTED(match)) OBJ_TAINT(str);
01499     return str;
01500 }
01501 
01502 
01503 /*
01504  *  call-seq:
01505  *     mtch.post_match   -> str
01506  *
01507  *  Returns the portion of the original string after the current match.
01508  *  Equivalent to the special variable <code>$'</code>.
01509  *
01510  *     m = /(.)(.)(\d+)(\d)/.match("THX1138: The Movie")
01511  *     m.post_match   #=> ": The Movie"
01512  */
01513 
01514 VALUE
01515 rb_reg_match_post(VALUE match)
01516 {
01517     VALUE str;
01518     long pos;
01519     struct re_registers *regs;
01520 
01521     if (NIL_P(match)) return Qnil;
01522     match_check(match);
01523     regs = RMATCH_REGS(match);
01524     if (BEG(0) == -1) return Qnil;
01525     str = RMATCH(match)->str;
01526     pos = END(0);
01527     str = rb_str_subseq(str, pos, RSTRING_LEN(str) - pos);
01528     if (OBJ_TAINTED(match)) OBJ_TAINT(str);
01529     return str;
01530 }
01531 
01532 VALUE
01533 rb_reg_match_last(VALUE match)
01534 {
01535     int i;
01536     struct re_registers *regs;
01537 
01538     if (NIL_P(match)) return Qnil;
01539     match_check(match);
01540     regs = RMATCH_REGS(match);
01541     if (BEG(0) == -1) return Qnil;
01542 
01543     for (i=regs->num_regs-1; BEG(i) == -1 && i > 0; i--)
01544         ;
01545     if (i == 0) return Qnil;
01546     return rb_reg_nth_match(i, match);
01547 }
01548 
01549 static VALUE
01550 last_match_getter(void)
01551 {
01552     return rb_reg_last_match(rb_backref_get());
01553 }
01554 
01555 static VALUE
01556 prematch_getter(void)
01557 {
01558     return rb_reg_match_pre(rb_backref_get());
01559 }
01560 
01561 static VALUE
01562 postmatch_getter(void)
01563 {
01564     return rb_reg_match_post(rb_backref_get());
01565 }
01566 
01567 static VALUE
01568 last_paren_match_getter(void)
01569 {
01570     return rb_reg_match_last(rb_backref_get());
01571 }
01572 
01573 static VALUE
01574 match_array(VALUE match, int start)
01575 {
01576     struct re_registers *regs;
01577     VALUE ary;
01578     VALUE target;
01579     int i;
01580     int taint = OBJ_TAINTED(match);
01581 
01582     match_check(match);
01583     regs = RMATCH_REGS(match);
01584     ary = rb_ary_new2(regs->num_regs);
01585     target = RMATCH(match)->str;
01586 
01587     for (i=start; i<regs->num_regs; i++) {
01588         if (regs->beg[i] == -1) {
01589             rb_ary_push(ary, Qnil);
01590         }
01591         else {
01592             VALUE str = rb_str_subseq(target, regs->beg[i], regs->end[i]-regs->beg[i]);
01593             if (taint) OBJ_TAINT(str);
01594             rb_ary_push(ary, str);
01595         }
01596     }
01597     return ary;
01598 }
01599 
01600 
01601 /* [MG]:FIXME: I put parens around the /.../.match() in the first line of the
01602    second example to prevent the '*' followed by a '/' from ending the
01603    comment. */
01604 
01605 /*
01606  *  call-seq:
01607  *     mtch.to_a   -> anArray
01608  *
01609  *  Returns the array of matches.
01610  *
01611  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
01612  *     m.to_a   #=> ["HX1138", "H", "X", "113", "8"]
01613  *
01614  *  Because <code>to_a</code> is called when expanding
01615  *  <code>*</code><em>variable</em>, there's a useful assignment
01616  *  shortcut for extracting matched fields. This is slightly slower than
01617  *  accessing the fields directly (as an intermediate array is
01618  *  generated).
01619  *
01620  *     all,f1,f2,f3 = *(/(.)(.)(\d+)(\d)/.match("THX1138."))
01621  *     all   #=> "HX1138"
01622  *     f1    #=> "H"
01623  *     f2    #=> "X"
01624  *     f3    #=> "113"
01625  */
01626 
01627 static VALUE
01628 match_to_a(VALUE match)
01629 {
01630     return match_array(match, 0);
01631 }
01632 
01633 
01634 /*
01635  *  call-seq:
01636  *     mtch.captures   -> array
01637  *
01638  *  Returns the array of captures; equivalent to <code>mtch.to_a[1..-1]</code>.
01639  *
01640  *     f1,f2,f3,f4 = /(.)(.)(\d+)(\d)/.match("THX1138.").captures
01641  *     f1    #=> "H"
01642  *     f2    #=> "X"
01643  *     f3    #=> "113"
01644  *     f4    #=> "8"
01645  */
01646 static VALUE
01647 match_captures(VALUE match)
01648 {
01649     return match_array(match, 1);
01650 }
01651 
01652 static int
01653 name_to_backref_number(struct re_registers *regs, VALUE regexp, const char* name, const char* name_end)
01654 {
01655     int num;
01656 
01657     num = onig_name_to_backref_number(RREGEXP(regexp)->ptr,
01658         (const unsigned char* )name, (const unsigned char* )name_end, regs);
01659     if (num >= 1) {
01660         return num;
01661     }
01662     else {
01663         VALUE s = rb_str_new(name, (long )(name_end - name));
01664         rb_raise(rb_eIndexError, "undefined group name reference: %s",
01665                                  StringValuePtr(s));
01666     }
01667 }
01668 
01669 /*
01670  *  call-seq:
01671  *     mtch[i]               -> str or nil
01672  *     mtch[start, length]   -> array
01673  *     mtch[range]           -> array
01674  *     mtch[name]            -> str or nil
01675  *
01676  *  Match Reference---<code>MatchData</code> acts as an array, and may be
01677  *  accessed using the normal array indexing techniques.  <i>mtch</i>[0] is
01678  *  equivalent to the special variable <code>$&</code>, and returns the entire
01679  *  matched string.  <i>mtch</i>[1], <i>mtch</i>[2], and so on return the values
01680  *  of the matched backreferences (portions of the pattern between parentheses).
01681  *
01682  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
01683  *     m          #=> #<MatchData "HX1138" 1:"H" 2:"X" 3:"113" 4:"8">
01684  *     m[0]       #=> "HX1138"
01685  *     m[1, 2]    #=> ["H", "X"]
01686  *     m[1..3]    #=> ["H", "X", "113"]
01687  *     m[-3, 2]   #=> ["X", "113"]
01688  *
01689  *     m = /(?<foo>a+)b/.match("ccaaab")
01690  *     m          #=> #<MatchData "aaab" foo:"aaa">
01691  *     m["foo"]   #=> "aaa"
01692  *     m[:foo]    #=> "aaa"
01693  */
01694 
01695 static VALUE
01696 match_aref(int argc, VALUE *argv, VALUE match)
01697 {
01698     VALUE idx, rest;
01699 
01700     match_check(match);
01701     rb_scan_args(argc, argv, "11", &idx, &rest);
01702 
01703     if (NIL_P(rest)) {
01704         if (FIXNUM_P(idx)) {
01705             if (FIX2INT(idx) >= 0) {
01706                 return rb_reg_nth_match(FIX2INT(idx), match);
01707             }
01708         }
01709         else {
01710             const char *p;
01711             int num;
01712 
01713             switch (TYPE(idx)) {
01714               case T_SYMBOL:
01715                 p = rb_id2name(SYM2ID(idx));
01716                 goto name_to_backref;
01717                 break;
01718               case T_STRING:
01719                 p = StringValuePtr(idx);
01720 
01721               name_to_backref:
01722                 num = name_to_backref_number(RMATCH_REGS(match),
01723                                              RMATCH(match)->regexp, p, p + strlen(p));
01724                 return rb_reg_nth_match(num, match);
01725                 break;
01726 
01727               default:
01728                 break;
01729             }
01730         }
01731     }
01732 
01733     return rb_ary_aref(argc, argv, match_to_a(match));
01734 }
01735 
01736 static VALUE
01737 match_entry(VALUE match, long n)
01738 {
01739     /* n should not exceed num_regs */
01740     return rb_reg_nth_match((int)n, match);
01741 }
01742 
01743 
01744 /*
01745  *  call-seq:
01746  *
01747  *     mtch.values_at([index]*)   -> array
01748  *
01749  *  Uses each <i>index</i> to access the matching values, returning an array of
01750  *  the corresponding matches.
01751  *
01752  *     m = /(.)(.)(\d+)(\d)/.match("THX1138: The Movie")
01753  *     m.to_a               #=> ["HX1138", "H", "X", "113", "8"]
01754  *     m.values_at(0, 2, -2)   #=> ["HX1138", "X", "113"]
01755  */
01756 
01757 static VALUE
01758 match_values_at(int argc, VALUE *argv, VALUE match)
01759 {
01760     struct re_registers *regs;
01761 
01762     match_check(match);
01763     regs = RMATCH_REGS(match);
01764     return rb_get_values_at(match, regs->num_regs, argc, argv, match_entry);
01765 }
01766 
01767 
01768 /*
01769  *  call-seq:
01770  *     mtch.to_s   -> str
01771  *
01772  *  Returns the entire matched string.
01773  *
01774  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
01775  *     m.to_s   #=> "HX1138"
01776  */
01777 
01778 static VALUE
01779 match_to_s(VALUE match)
01780 {
01781     VALUE str = rb_reg_last_match(match);
01782 
01783     match_check(match);
01784     if (NIL_P(str)) str = rb_str_new(0,0);
01785     if (OBJ_TAINTED(match)) OBJ_TAINT(str);
01786     if (OBJ_TAINTED(RMATCH(match)->str)) OBJ_TAINT(str);
01787     return str;
01788 }
01789 
01790 
01791 /*
01792  *  call-seq:
01793  *     mtch.string   -> str
01794  *
01795  *  Returns a frozen copy of the string passed in to <code>match</code>.
01796  *
01797  *     m = /(.)(.)(\d+)(\d)/.match("THX1138.")
01798  *     m.string   #=> "THX1138."
01799  */
01800 
01801 static VALUE
01802 match_string(VALUE match)
01803 {
01804     match_check(match);
01805     return RMATCH(match)->str;  /* str is frozen */
01806 }
01807 
01808 struct backref_name_tag {
01809     const UChar *name;
01810     long len;
01811 };
01812 
01813 static int
01814 match_inspect_name_iter(const OnigUChar *name, const OnigUChar *name_end,
01815           int back_num, int *back_refs, OnigRegex regex, void *arg0)
01816 {
01817     struct backref_name_tag *arg = (struct backref_name_tag *)arg0;
01818     int i;
01819 
01820     for (i = 0; i < back_num; i++) {
01821         arg[back_refs[i]].name = name;
01822         arg[back_refs[i]].len = name_end - name;
01823     }
01824     return 0;
01825 }
01826 
01827 /*
01828  * call-seq:
01829  *    mtch.inspect   -> str
01830  *
01831  * Returns a printable version of <i>mtch</i>.
01832  *
01833  *     puts /.$/.match("foo").inspect
01834  *     #=> #<MatchData "o">
01835  *
01836  *     puts /(.)(.)(.)/.match("foo").inspect
01837  *     #=> #<MatchData "foo" 1:"f" 2:"o" 3:"o">
01838  *
01839  *     puts /(.)(.)?(.)/.match("fo").inspect
01840  *     #=> #<MatchData "fo" 1:"f" 2:nil 3:"o">
01841  *
01842  *     puts /(?<foo>.)(?<bar>.)(?<baz>.)/.match("hoge").inspect
01843  *     #=> #<MatchData "hog" foo:"h" bar:"o" baz:"g">
01844  *
01845  */
01846 
01847 static VALUE
01848 match_inspect(VALUE match)
01849 {
01850     const char *cname = rb_obj_classname(match);
01851     VALUE str;
01852     int i;
01853     struct re_registers *regs = RMATCH_REGS(match);
01854     int num_regs = regs->num_regs;
01855     struct backref_name_tag *names;
01856     VALUE regexp = RMATCH(match)->regexp;
01857 
01858     if (regexp == 0) {
01859         return rb_sprintf("#<%s:%p>", cname, (void*)match);
01860     }
01861 
01862     names = ALLOCA_N(struct backref_name_tag, num_regs);
01863     MEMZERO(names, struct backref_name_tag, num_regs);
01864 
01865     onig_foreach_name(RREGEXP(regexp)->ptr,
01866             match_inspect_name_iter, names);
01867 
01868     str = rb_str_buf_new2("#<");
01869     rb_str_buf_cat2(str, cname);
01870 
01871     for (i = 0; i < num_regs; i++) {
01872         VALUE v;
01873         rb_str_buf_cat2(str, " ");
01874         if (0 < i) {
01875             if (names[i].name)
01876                 rb_str_buf_cat(str, (const char *)names[i].name, names[i].len);
01877             else {
01878                 rb_str_catf(str, "%d", i);
01879             }
01880             rb_str_buf_cat2(str, ":");
01881         }
01882         v = rb_reg_nth_match(i, match);
01883         if (v == Qnil)
01884             rb_str_buf_cat2(str, "nil");
01885         else
01886             rb_str_buf_append(str, rb_str_inspect(v));
01887     }
01888     rb_str_buf_cat2(str, ">");
01889 
01890     return str;
01891 }
01892 
01893 VALUE rb_cRegexp;
01894 
01895 static int
01896 read_escaped_byte(const char **pp, const char *end, onig_errmsg_buffer err)
01897 {
01898     const char *p = *pp;
01899     int code;
01900     int meta_prefix = 0, ctrl_prefix = 0;
01901     size_t len;
01902     int retbyte;
01903 
01904     retbyte = -1;
01905     if (p == end || *p++ != '\\') {
01906         errcpy(err, "too short escaped multibyte character");
01907         return -1;
01908     }
01909 
01910 again:
01911     if (p == end) {
01912         errcpy(err, "too short escape sequence");
01913         return -1;
01914     }
01915     switch (*p++) {
01916       case '\\': code = '\\'; break;
01917       case 'n': code = '\n'; break;
01918       case 't': code = '\t'; break;
01919       case 'r': code = '\r'; break;
01920       case 'f': code = '\f'; break;
01921       case 'v': code = '\013'; break;
01922       case 'a': code = '\007'; break;
01923       case 'e': code = '\033'; break;
01924 
01925       /* \OOO */
01926       case '0': case '1': case '2': case '3':
01927       case '4': case '5': case '6': case '7':
01928         p--;
01929         code = scan_oct(p, end < p+3 ? end-p : 3, &len);
01930         p += len;
01931         break;
01932 
01933       case 'x': /* \xHH */
01934         code = scan_hex(p, end < p+2 ? end-p : 2, &len);
01935         if (len < 1) {
01936             errcpy(err, "invalid hex escape");
01937             return -1;
01938         }
01939         p += len;
01940         break;
01941 
01942       case 'M': /* \M-X, \M-\C-X, \M-\cX */
01943         if (meta_prefix) {
01944             errcpy(err, "duplicate meta escape");
01945             return -1;
01946         }
01947         meta_prefix = 1;
01948         if (p+1 < end && *p++ == '-' && (*p & 0x80) == 0) {
01949             if (*p == '\\') {
01950                 p++;
01951                 goto again;
01952             }
01953             else {
01954                 code = *p++;
01955                 break;
01956             }
01957         }
01958         errcpy(err, "too short meta escape");
01959         return -1;
01960 
01961       case 'C': /* \C-X, \C-\M-X */
01962         if (p == end || *p++ != '-') {
01963             errcpy(err, "too short control escape");
01964             return -1;
01965         }
01966       case 'c': /* \cX, \c\M-X */
01967         if (ctrl_prefix) {
01968             errcpy(err, "duplicate control escape");
01969             return -1;
01970         }
01971         ctrl_prefix = 1;
01972         if (p < end && (*p & 0x80) == 0) {
01973             if (*p == '\\') {
01974                 p++;
01975                 goto again;
01976             }
01977             else {
01978                 code = *p++;
01979                 break;
01980             }
01981         }
01982         errcpy(err, "too short control escape");
01983         return -1;
01984 
01985       default:
01986         errcpy(err, "unexpected escape sequence");
01987         return -1;
01988     }
01989     if (code < 0 || 0xff < code) {
01990         errcpy(err, "invalid escape code");
01991         return -1;
01992     }
01993 
01994     if (ctrl_prefix)
01995         code &= 0x1f;
01996     if (meta_prefix)
01997         code |= 0x80;
01998 
01999     *pp = p;
02000     return code;
02001 }
02002 
02003 static int
02004 unescape_escaped_nonascii(const char **pp, const char *end, rb_encoding *enc,
02005         VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
02006 {
02007     const char *p = *pp;
02008     int chmaxlen = rb_enc_mbmaxlen(enc);
02009     char *chbuf = ALLOCA_N(char, chmaxlen);
02010     int chlen = 0;
02011     int byte;
02012     int l;
02013 
02014     memset(chbuf, 0, chmaxlen);
02015 
02016     byte = read_escaped_byte(&p, end, err);
02017     if (byte == -1) {
02018         return -1;
02019     }
02020 
02021     chbuf[chlen++] = byte;
02022     while (chlen < chmaxlen &&
02023            MBCLEN_NEEDMORE_P(rb_enc_precise_mbclen(chbuf, chbuf+chlen, enc))) {
02024         byte = read_escaped_byte(&p, end, err);
02025         if (byte == -1) {
02026             return -1;
02027         }
02028         chbuf[chlen++] = byte;
02029     }
02030 
02031     l = rb_enc_precise_mbclen(chbuf, chbuf+chlen, enc);
02032     if (MBCLEN_INVALID_P(l)) {
02033         errcpy(err, "invalid multibyte escape");
02034         return -1;
02035     }
02036     if (1 < chlen || (chbuf[0] & 0x80)) {
02037         rb_str_buf_cat(buf, chbuf, chlen);
02038 
02039         if (*encp == 0)
02040             *encp = enc;
02041         else if (*encp != enc) {
02042             errcpy(err, "escaped non ASCII character in UTF-8 regexp");
02043             return -1;
02044         }
02045     }
02046     else {
02047         char escbuf[5];
02048         snprintf(escbuf, sizeof(escbuf), "\\x%02X", chbuf[0]&0xff);
02049         rb_str_buf_cat(buf, escbuf, 4);
02050     }
02051     *pp = p;
02052     return 0;
02053 }
02054 
02055 static int
02056 check_unicode_range(unsigned long code, onig_errmsg_buffer err)
02057 {
02058     if ((0xd800 <= code && code <= 0xdfff) || /* Surrogates */
02059         0x10ffff < code) {
02060         errcpy(err, "invalid Unicode range");
02061         return -1;
02062     }
02063     return 0;
02064 }
02065 
02066 static int
02067 append_utf8(unsigned long uv,
02068         VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
02069 {
02070     if (check_unicode_range(uv, err) != 0)
02071         return -1;
02072     if (uv < 0x80) {
02073         char escbuf[5];
02074         snprintf(escbuf, sizeof(escbuf), "\\x%02X", (int)uv);
02075         rb_str_buf_cat(buf, escbuf, 4);
02076     }
02077     else {
02078         int len;
02079         char utf8buf[6];
02080         len = rb_uv_to_utf8(utf8buf, uv);
02081         rb_str_buf_cat(buf, utf8buf, len);
02082 
02083         if (*encp == 0)
02084             *encp = rb_utf8_encoding();
02085         else if (*encp != rb_utf8_encoding()) {
02086             errcpy(err, "UTF-8 character in non UTF-8 regexp");
02087             return -1;
02088         }
02089     }
02090     return 0;
02091 }
02092 
02093 static int
02094 unescape_unicode_list(const char **pp, const char *end,
02095         VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
02096 {
02097     const char *p = *pp;
02098     int has_unicode = 0;
02099     unsigned long code;
02100     size_t len;
02101 
02102     while (p < end && ISSPACE(*p)) p++;
02103 
02104     while (1) {
02105         code = ruby_scan_hex(p, end-p, &len);
02106         if (len == 0)
02107             break;
02108         if (6 < len) { /* max 10FFFF */
02109             errcpy(err, "invalid Unicode range");
02110             return -1;
02111         }
02112         p += len;
02113         if (append_utf8(code, buf, encp, err) != 0)
02114             return -1;
02115         has_unicode = 1;
02116 
02117         while (p < end && ISSPACE(*p)) p++;
02118     }
02119 
02120     if (has_unicode == 0) {
02121         errcpy(err, "invalid Unicode list");
02122         return -1;
02123     }
02124 
02125     *pp = p;
02126 
02127     return 0;
02128 }
02129 
02130 static int
02131 unescape_unicode_bmp(const char **pp, const char *end,
02132         VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
02133 {
02134     const char *p = *pp;
02135     size_t len;
02136     unsigned long code;
02137 
02138     if (end < p+4) {
02139         errcpy(err, "invalid Unicode escape");
02140         return -1;
02141     }
02142     code = ruby_scan_hex(p, 4, &len);
02143     if (len != 4) {
02144         errcpy(err, "invalid Unicode escape");
02145         return -1;
02146     }
02147     if (append_utf8(code, buf, encp, err) != 0)
02148         return -1;
02149     *pp = p + 4;
02150     return 0;
02151 }
02152 
02153 static int
02154 unescape_nonascii(const char *p, const char *end, rb_encoding *enc,
02155         VALUE buf, rb_encoding **encp, int *has_property,
02156         onig_errmsg_buffer err)
02157 {
02158     char c;
02159     char smallbuf[2];
02160 
02161     while (p < end) {
02162         int chlen = rb_enc_precise_mbclen(p, end, enc);
02163         if (!MBCLEN_CHARFOUND_P(chlen)) {
02164             errcpy(err, "invalid multibyte character");
02165             return -1;
02166         }
02167         chlen = MBCLEN_CHARFOUND_LEN(chlen);
02168         if (1 < chlen || (*p & 0x80)) {
02169             rb_str_buf_cat(buf, p, chlen);
02170             p += chlen;
02171             if (*encp == 0)
02172                 *encp = enc;
02173             else if (*encp != enc) {
02174                 errcpy(err, "non ASCII character in UTF-8 regexp");
02175                 return -1;
02176             }
02177             continue;
02178         }
02179 
02180         switch (c = *p++) {
02181           case '\\':
02182             if (p == end) {
02183                 errcpy(err, "too short escape sequence");
02184                 return -1;
02185             }
02186             switch (c = *p++) {
02187               case '1': case '2': case '3':
02188               case '4': case '5': case '6': case '7': /* \O, \OO, \OOO or backref */
02189                 {
02190                     size_t octlen;
02191                     if (ruby_scan_oct(p-1, end-(p-1), &octlen) <= 0177) {
02192                         /* backref or 7bit octal.
02193                            no need to unescape anyway.
02194                            re-escaping may break backref */
02195                         goto escape_asis;
02196                     }
02197                 }
02198                 /* xxx: How about more than 199 subexpressions? */
02199 
02200               case '0': /* \0, \0O, \0OO */
02201 
02202               case 'x': /* \xHH */
02203               case 'c': /* \cX, \c\M-X */
02204               case 'C': /* \C-X, \C-\M-X */
02205               case 'M': /* \M-X, \M-\C-X, \M-\cX */
02206                 p = p-2;
02207                 if (unescape_escaped_nonascii(&p, end, enc, buf, encp, err) != 0)
02208                     return -1;
02209                 break;
02210 
02211               case 'u':
02212                 if (p == end) {
02213                     errcpy(err, "too short escape sequence");
02214                     return -1;
02215                 }
02216                 if (*p == '{') {
02217                     /* \u{H HH HHH HHHH HHHHH HHHHHH ...} */
02218                     p++;
02219                     if (unescape_unicode_list(&p, end, buf, encp, err) != 0)
02220                         return -1;
02221                     if (p == end || *p++ != '}') {
02222                         errcpy(err, "invalid Unicode list");
02223                         return -1;
02224                     }
02225                     break;
02226                 }
02227                 else {
02228                     /* \uHHHH */
02229                     if (unescape_unicode_bmp(&p, end, buf, encp, err) != 0)
02230                         return -1;
02231                     break;
02232                 }
02233 
02234               case 'p': /* \p{Hiragana} */
02235               case 'P':
02236                 if (!*encp) {
02237                     *has_property = 1;
02238                 }
02239                 goto escape_asis;
02240 
02241               default: /* \n, \\, \d, \9, etc. */
02242 escape_asis:
02243                 smallbuf[0] = '\\';
02244                 smallbuf[1] = c;
02245                 rb_str_buf_cat(buf, smallbuf, 2);
02246                 break;
02247             }
02248             break;
02249 
02250           default:
02251             rb_str_buf_cat(buf, &c, 1);
02252             break;
02253         }
02254     }
02255 
02256     return 0;
02257 }
02258 
02259 static VALUE
02260 rb_reg_preprocess(const char *p, const char *end, rb_encoding *enc,
02261         rb_encoding **fixed_enc, onig_errmsg_buffer err)
02262 {
02263     VALUE buf;
02264     int has_property = 0;
02265 
02266     buf = rb_str_buf_new(0);
02267 
02268     if (rb_enc_asciicompat(enc))
02269         *fixed_enc = 0;
02270     else {
02271         *fixed_enc = enc;
02272         rb_enc_associate(buf, enc);
02273     }
02274 
02275     if (unescape_nonascii(p, end, enc, buf, fixed_enc, &has_property, err) != 0)
02276         return Qnil;
02277 
02278     if (has_property && !*fixed_enc) {
02279         *fixed_enc = enc;
02280     }
02281 
02282     if (*fixed_enc) {
02283         rb_enc_associate(buf, *fixed_enc);
02284     }
02285 
02286     return buf;
02287 }
02288 
02289 VALUE
02290 rb_reg_check_preprocess(VALUE str)
02291 {
02292     rb_encoding *fixed_enc = 0;
02293     onig_errmsg_buffer err = "";
02294     VALUE buf;
02295     char *p, *end;
02296     rb_encoding *enc;
02297 
02298     StringValue(str);
02299     p = RSTRING_PTR(str);
02300     end = p + RSTRING_LEN(str);
02301     enc = rb_enc_get(str);
02302 
02303     buf = rb_reg_preprocess(p, end, enc, &fixed_enc, err);
02304     RB_GC_GUARD(str);
02305 
02306     if (buf == Qnil) {
02307         return rb_reg_error_desc(str, 0, err);
02308     }
02309     return Qnil;
02310 }
02311 
02312 static VALUE
02313 rb_reg_preprocess_dregexp(VALUE ary, int options)
02314 {
02315     rb_encoding *fixed_enc = 0;
02316     rb_encoding *regexp_enc = 0;
02317     onig_errmsg_buffer err = "";
02318     int i;
02319     VALUE result = 0;
02320     rb_encoding *ascii8bit = rb_ascii8bit_encoding();
02321 
02322     if (RARRAY_LEN(ary) == 0) {
02323         rb_raise(rb_eArgError, "no arguments given");
02324     }
02325 
02326     for (i = 0; i < RARRAY_LEN(ary); i++) {
02327         VALUE str = RARRAY_PTR(ary)[i];
02328         VALUE buf;
02329         char *p, *end;
02330         rb_encoding *src_enc;
02331 
02332         src_enc = rb_enc_get(str);
02333         if (options & ARG_ENCODING_NONE &&
02334                 src_enc != ascii8bit) {
02335             if (rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT)
02336                 rb_raise(rb_eRegexpError, "/.../n has a non escaped non ASCII character in non ASCII-8BIT script");
02337             else
02338                 src_enc = ascii8bit;
02339         }
02340 
02341         StringValue(str);
02342         p = RSTRING_PTR(str);
02343         end = p + RSTRING_LEN(str);
02344 
02345         buf = rb_reg_preprocess(p, end, src_enc, &fixed_enc, err);
02346 
02347         if (buf == Qnil)
02348             rb_raise(rb_eArgError, "%s", err);
02349 
02350         if (fixed_enc != 0) {
02351             if (regexp_enc != 0 && regexp_enc != fixed_enc) {
02352                 rb_raise(rb_eRegexpError, "encoding mismatch in dynamic regexp : %s and %s",
02353                          rb_enc_name(regexp_enc), rb_enc_name(fixed_enc));
02354             }
02355             regexp_enc = fixed_enc;
02356         }
02357 
02358         if (!result)
02359             result = rb_str_new3(str);
02360         else
02361             rb_str_buf_append(result, str);
02362     }
02363     if (regexp_enc) {
02364         rb_enc_associate(result, regexp_enc);
02365     }
02366 
02367     return result;
02368 }
02369 
02370 static int
02371 rb_reg_initialize(VALUE obj, const char *s, long len, rb_encoding *enc,
02372                   int options, onig_errmsg_buffer err,
02373                   const char *sourcefile, int sourceline)
02374 {
02375     struct RRegexp *re = RREGEXP(obj);
02376     VALUE unescaped;
02377     rb_encoding *fixed_enc = 0;
02378     rb_encoding *a_enc = rb_ascii8bit_encoding();
02379 
02380     if (!OBJ_UNTRUSTED(obj) && rb_safe_level() >= 4)
02381         rb_raise(rb_eSecurityError, "Insecure: can't modify regexp");
02382     rb_check_frozen(obj);
02383     if (FL_TEST(obj, REG_LITERAL))
02384         rb_raise(rb_eSecurityError, "can't modify literal regexp");
02385     if (re->ptr)
02386         rb_raise(rb_eTypeError, "already initialized regexp");
02387     re->ptr = 0;
02388 
02389     if (rb_enc_dummy_p(enc)) {
02390             errcpy(err, "can't make regexp with dummy encoding");
02391             return -1;
02392     }
02393 
02394     unescaped = rb_reg_preprocess(s, s+len, enc, &fixed_enc, err);
02395     if (unescaped == Qnil)
02396         return -1;
02397 
02398     if (fixed_enc) {
02399         if ((fixed_enc != enc && (options & ARG_ENCODING_FIXED)) ||
02400             (fixed_enc != a_enc && (options & ARG_ENCODING_NONE))) {
02401             errcpy(err, "incompatible character encoding");
02402             return -1;
02403         }
02404         if (fixed_enc != a_enc) {
02405             options |= ARG_ENCODING_FIXED;
02406             enc = fixed_enc;
02407         }
02408     }
02409     else if (!(options & ARG_ENCODING_FIXED)) {
02410        enc = rb_usascii_encoding();
02411     }
02412 
02413     rb_enc_associate((VALUE)re, enc);
02414     if ((options & ARG_ENCODING_FIXED) || fixed_enc) {
02415         re->basic.flags |= KCODE_FIXED;
02416     }
02417     if (options & ARG_ENCODING_NONE) {
02418         re->basic.flags |= REG_ENCODING_NONE;
02419     }
02420 
02421     re->ptr = make_regexp(RSTRING_PTR(unescaped), RSTRING_LEN(unescaped), enc,
02422                           options & ARG_REG_OPTION_MASK, err,
02423                           sourcefile, sourceline);
02424     if (!re->ptr) return -1;
02425     re->src = rb_enc_str_new(s, len, enc);
02426     OBJ_FREEZE(re->src);
02427     RB_GC_GUARD(unescaped);
02428     return 0;
02429 }
02430 
02431 static int
02432 rb_reg_initialize_str(VALUE obj, VALUE str, int options, onig_errmsg_buffer err,
02433         const char *sourcefile, int sourceline)
02434 {
02435     int ret;
02436     rb_encoding *enc = rb_enc_get(str);
02437     if (options & ARG_ENCODING_NONE) {
02438         rb_encoding *ascii8bit = rb_ascii8bit_encoding();
02439         if (enc != ascii8bit) {
02440             if (rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) {
02441                 errcpy(err, "/.../n has a non escaped non ASCII character in non ASCII-8BIT script");
02442                 return -1;
02443             }
02444             enc = ascii8bit;
02445         }
02446     }
02447     ret = rb_reg_initialize(obj, RSTRING_PTR(str), RSTRING_LEN(str), enc,
02448                             options, err, sourcefile, sourceline);
02449     RB_GC_GUARD(str);
02450     return ret;
02451 }
02452 
02453 static VALUE
02454 rb_reg_s_alloc(VALUE klass)
02455 {
02456     NEWOBJ(re, struct RRegexp);
02457     OBJSETUP(re, klass, T_REGEXP);
02458 
02459     re->ptr = 0;
02460     re->src = 0;
02461     re->usecnt = 0;
02462 
02463     return (VALUE)re;
02464 }
02465 
02466 VALUE
02467 rb_reg_alloc(void)
02468 {
02469     return rb_reg_s_alloc(rb_cRegexp);
02470 }
02471 
02472 VALUE
02473 rb_reg_new_str(VALUE s, int options)
02474 {
02475     return rb_reg_init_str(rb_reg_alloc(), s, options);
02476 }
02477 
02478 VALUE
02479 rb_reg_init_str(VALUE re, VALUE s, int options)
02480 {
02481     onig_errmsg_buffer err = "";
02482 
02483     if (rb_reg_initialize_str(re, s, options, err, NULL, 0) != 0) {
02484         rb_reg_raise_str(s, options, err);
02485     }
02486 
02487     return re;
02488 }
02489 
02490 VALUE
02491 rb_reg_new_ary(VALUE ary, int opt)
02492 {
02493     return rb_reg_new_str(rb_reg_preprocess_dregexp(ary, opt), opt);
02494 }
02495 
02496 VALUE
02497 rb_enc_reg_new(const char *s, long len, rb_encoding *enc, int options)
02498 {
02499     VALUE re = rb_reg_alloc();
02500     onig_errmsg_buffer err = "";
02501 
02502     if (rb_reg_initialize(re, s, len, enc, options, err, NULL, 0) != 0) {
02503         rb_enc_reg_raise(s, len, enc, options, err);
02504     }
02505 
02506     return re;
02507 }
02508 
02509 VALUE
02510 rb_reg_new(const char *s, long len, int options)
02511 {
02512     return rb_enc_reg_new(s, len, rb_ascii8bit_encoding(), options);
02513 }
02514 
02515 VALUE
02516 rb_reg_compile(VALUE str, int options, const char *sourcefile, int sourceline)
02517 {
02518     VALUE re = rb_reg_alloc();
02519     onig_errmsg_buffer err = "";
02520 
02521     if (!str) str = rb_str_new(0,0);
02522     if (rb_reg_initialize_str(re, str, options, err, sourcefile, sourceline) != 0) {
02523         rb_set_errinfo(rb_reg_error_desc(str, options, err));
02524         return Qnil;
02525     }
02526     FL_SET(re, REG_LITERAL);
02527     return re;
02528 }
02529 
02530 static VALUE reg_cache;
02531 
02532 VALUE
02533 rb_reg_regcomp(VALUE str)
02534 {
02535     volatile VALUE save_str = str;
02536     if (reg_cache && RREGEXP_SRC_LEN(reg_cache) == RSTRING_LEN(str)
02537         && ENCODING_GET(reg_cache) == ENCODING_GET(str)
02538         && memcmp(RREGEXP_SRC_PTR(reg_cache), RSTRING_PTR(str), RSTRING_LEN(str)) == 0)
02539         return reg_cache;
02540 
02541     return reg_cache = rb_reg_new_str(save_str, 0);
02542 }
02543 
02544 static st_index_t reg_hash(VALUE re);
02545 /*
02546  * call-seq:
02547  *   rxp.hash   -> fixnum
02548  *
02549  * Produce a hash based on the text and options of this regular expression.
02550  */
02551 
02552 static VALUE
02553 rb_reg_hash(VALUE re)
02554 {
02555     st_index_t hashval = reg_hash(re);
02556     return LONG2FIX(hashval);
02557 }
02558 
02559 static st_index_t
02560 reg_hash(VALUE re)
02561 {
02562     st_index_t hashval;
02563 
02564     rb_reg_check(re);
02565     hashval = RREGEXP(re)->ptr->options;
02566     hashval = rb_hash_uint(hashval, rb_memhash(RREGEXP_SRC_PTR(re), RREGEXP_SRC_LEN(re)));
02567     return rb_hash_end(hashval);
02568 }
02569 
02570 
02571 /*
02572  *  call-seq:
02573  *     rxp == other_rxp      -> true or false
02574  *     rxp.eql?(other_rxp)   -> true or false
02575  *
02576  *  Equality---Two regexps are equal if their patterns are identical, they have
02577  *  the same character set code, and their <code>casefold?</code> values are the
02578  *  same.
02579  *
02580  *     /abc/  == /abc/x   #=> false
02581  *     /abc/  == /abc/i   #=> false
02582  *     /abc/  == /abc/n   #=> false
02583  *     /abc/u == /abc/n   #=> false
02584  */
02585 
02586 static VALUE
02587 rb_reg_equal(VALUE re1, VALUE re2)
02588 {
02589     if (re1 == re2) return Qtrue;
02590     if (TYPE(re2) != T_REGEXP) return Qfalse;
02591     rb_reg_check(re1); rb_reg_check(re2);
02592     if (FL_TEST(re1, KCODE_FIXED) != FL_TEST(re2, KCODE_FIXED)) return Qfalse;
02593     if (RREGEXP(re1)->ptr->options != RREGEXP(re2)->ptr->options) return Qfalse;
02594     if (RREGEXP_SRC_LEN(re1) != RREGEXP_SRC_LEN(re2)) return Qfalse;
02595     if (ENCODING_GET(re1) != ENCODING_GET(re2)) return Qfalse;
02596     if (memcmp(RREGEXP_SRC_PTR(re1), RREGEXP_SRC_PTR(re2), RREGEXP_SRC_LEN(re1)) == 0) {
02597         return Qtrue;
02598     }
02599     return Qfalse;
02600 }
02601 
02602 /*
02603  * call-seq:
02604  *    mtch.hash   -> integer
02605  *
02606  * Produce a hash based on the target string, regexp and matched
02607  * positions of this matchdata.
02608  */
02609 
02610 static VALUE
02611 match_hash(VALUE match)
02612 {
02613     const struct re_registers *regs;
02614     st_index_t hashval = rb_hash_start(rb_str_hash(RMATCH(match)->str));
02615 
02616     rb_hash_uint(hashval, reg_hash(RMATCH(match)->regexp));
02617     regs = RMATCH_REGS(match);
02618     hashval = rb_hash_uint(hashval, regs->num_regs);
02619     hashval = rb_hash_uint(hashval, rb_memhash(regs->beg, regs->num_regs * sizeof(*regs->beg)));
02620     hashval = rb_hash_uint(hashval, rb_memhash(regs->end, regs->num_regs * sizeof(*regs->end)));
02621     hashval = rb_hash_end(hashval);
02622     return LONG2FIX(hashval);
02623 }
02624 
02625 /*
02626  * call-seq:
02627  *    mtch == mtch2   -> true or false
02628  *
02629  *  Equality---Two matchdata are equal if their target strings,
02630  *  patterns, and matched positions are identical.
02631  */
02632 
02633 static VALUE
02634 match_equal(VALUE match1, VALUE match2)
02635 {
02636     const struct re_registers *regs1, *regs2;
02637     if (match1 == match2) return Qtrue;
02638     if (TYPE(match2) != T_MATCH) return Qfalse;
02639     if (!rb_str_equal(RMATCH(match1)->str, RMATCH(match2)->str)) return Qfalse;
02640     if (!rb_reg_equal(RMATCH(match1)->regexp, RMATCH(match2)->regexp)) return Qfalse;
02641     regs1 = RMATCH_REGS(match1);
02642     regs2 = RMATCH_REGS(match2);
02643     if (regs1->num_regs != regs2->num_regs) return Qfalse;
02644     if (memcmp(regs1->beg, regs2->beg, regs1->num_regs * sizeof(*regs1->beg))) return Qfalse;
02645     if (memcmp(regs1->end, regs2->end, regs1->num_regs * sizeof(*regs1->end))) return Qfalse;
02646     return Qtrue;
02647 }
02648 
02649 static VALUE
02650 reg_operand(VALUE s, int check)
02651 {
02652     if (SYMBOL_P(s)) {
02653         return rb_sym_to_s(s);
02654     }
02655     else {
02656         VALUE tmp = rb_check_string_type(s);
02657         if (check && NIL_P(tmp)) {
02658             rb_raise(rb_eTypeError, "can't convert %s to String",
02659                      rb_obj_classname(s));
02660         }
02661         return tmp;
02662     }
02663 }
02664 
02665 static long
02666 reg_match_pos(VALUE re, VALUE *strp, long pos)
02667 {
02668     VALUE str = *strp;
02669 
02670     if (NIL_P(str)) {
02671         rb_backref_set(Qnil);
02672         return -1;
02673     }
02674     *strp = str = reg_operand(str, TRUE);
02675     if (pos != 0) {
02676         if (pos < 0) {
02677             VALUE l = rb_str_length(str);
02678             pos += NUM2INT(l);
02679             if (pos < 0) {
02680                 return pos;
02681             }
02682         }
02683         pos = rb_str_offset(str, pos);
02684     }
02685     return rb_reg_search(re, str, pos, 0);
02686 }
02687 
02688 /*
02689  *  call-seq:
02690  *     rxp =~ str    -> integer or nil
02691  *
02692  *  Match---Matches <i>rxp</i> against <i>str</i>.
02693  *
02694  *     /at/ =~ "input data"   #=> 7
02695  *     /ax/ =~ "input data"   #=> nil
02696  *
02697  *  If <code>=~</code> is used with a regexp literal with named captures,
02698  *  captured strings (or nil) is assigned to local variables named by
02699  *  the capture names.
02700  *
02701  *     /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/ =~ "  x = y  "
02702  *     p lhs    #=> "x"
02703  *     p rhs    #=> "y"
02704  *
02705  *  If it is not matched, nil is assigned for the variables.
02706  *
02707  *     /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/ =~ "  x = "
02708  *     p lhs    #=> nil
02709  *     p rhs    #=> nil
02710  *
02711  *  This assignment is implemented in the Ruby parser.
02712  *  The parser detects 'regexp-literal =~ expression' for the assignment.
02713  *  The regexp must be a literal without interpolation and placed at left hand side.
02714  *
02715  *  The assignment is not occur if the regexp is not a literal.
02716  *
02717  *     re = /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/
02718  *     re =~ "  x = y  "
02719  *     p lhs    # undefined local variable
02720  *     p rhs    # undefined local variable
02721  *
02722  *  A regexp interpolation, <code>#{}</code>, also disables
02723  *  the assignment.
02724  *
02725  *     rhs_pat = /(?<rhs>\w+)/
02726  *     /(?<lhs>\w+)\s*=\s*#{rhs_pat}/ =~ "x = y"
02727  *     p lhs    # undefined local variable
02728  *
02729  *  The assignment is not occur if the regexp is placed at right hand side.
02730  *
02731  *    "  x = y  " =~ /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/
02732  *    p lhs, rhs # undefined local variable
02733  *
02734  */
02735 
02736 VALUE
02737 rb_reg_match(VALUE re, VALUE str)
02738 {
02739     long pos = reg_match_pos(re, &str, 0);
02740     if (pos < 0) return Qnil;
02741     pos = rb_str_sublen(str, pos);
02742     return LONG2FIX(pos);
02743 }
02744 
02745 /*
02746  *  call-seq:
02747  *     rxp === str   -> true or false
02748  *
02749  *  Case Equality---Synonym for <code>Regexp#=~</code> used in case statements.
02750  *
02751  *     a = "HELLO"
02752  *     case a
02753  *     when /^[a-z]*$/; print "Lower case\n"
02754  *     when /^[A-Z]*$/; print "Upper case\n"
02755  *     else;            print "Mixed case\n"
02756  *     end
02757  *
02758  *  <em>produces:</em>
02759  *
02760  *     Upper case
02761  */
02762 
02763 VALUE
02764 rb_reg_eqq(VALUE re, VALUE str)
02765 {
02766     long start;
02767 
02768     str = reg_operand(str, FALSE);
02769     if (NIL_P(str)) {
02770         rb_backref_set(Qnil);
02771         return Qfalse;
02772     }
02773     start = rb_reg_search(re, str, 0, 0);
02774     if (start < 0) {
02775         return Qfalse;
02776     }
02777     return Qtrue;
02778 }
02779 
02780 
02781 /*
02782  *  call-seq:
02783  *     ~ rxp   -> integer or nil
02784  *
02785  *  Match---Matches <i>rxp</i> against the contents of <code>$_</code>.
02786  *  Equivalent to <code><i>rxp</i> =~ $_</code>.
02787  *
02788  *     $_ = "input data"
02789  *     ~ /at/   #=> 7
02790  */
02791 
02792 VALUE
02793 rb_reg_match2(VALUE re)
02794 {
02795     long start;
02796     VALUE line = rb_lastline_get();
02797 
02798     if (TYPE(line) != T_STRING) {
02799         rb_backref_set(Qnil);
02800         return Qnil;
02801     }
02802 
02803     start = rb_reg_search(re, line, 0, 0);
02804     if (start < 0) {
02805         return Qnil;
02806     }
02807     start = rb_str_sublen(line, start);
02808     return LONG2FIX(start);
02809 }
02810 
02811 
02812 /*
02813  *  call-seq:
02814  *     rxp.match(str)       -> matchdata or nil
02815  *     rxp.match(str,pos)   -> matchdata or nil
02816  *
02817  *  Returns a <code>MatchData</code> object describing the match, or
02818  *  <code>nil</code> if there was no match. This is equivalent to retrieving the
02819  *  value of the special variable <code>$~</code> following a normal match.
02820  *  If the second parameter is present, it specifies the position in the string
02821  *  to begin the search.
02822  *
02823  *     /(.)(.)(.)/.match("abc")[2]   #=> "b"
02824  *     /(.)(.)/.match("abc", 1)[2]   #=> "c"
02825  *
02826  *  If a block is given, invoke the block with MatchData if match succeed, so
02827  *  that you can write
02828  *
02829  *     pat.match(str) {|m| ...}
02830  *
02831  *  instead of
02832  *
02833  *     if m = pat.match(str)
02834  *       ...
02835  *     end
02836  *
02837  *  The return value is a value from block execution in this case.
02838  */
02839 
02840 static VALUE
02841 rb_reg_match_m(int argc, VALUE *argv, VALUE re)
02842 {
02843     VALUE result, str, initpos;
02844     long pos;
02845 
02846     if (rb_scan_args(argc, argv, "11", &str, &initpos) == 2) {
02847         pos = NUM2LONG(initpos);
02848     }
02849     else {
02850         pos = 0;
02851     }
02852 
02853     pos = reg_match_pos(re, &str, pos);
02854     if (pos < 0) {
02855         rb_backref_set(Qnil);
02856         return Qnil;
02857     }
02858     result = rb_backref_get();
02859     rb_match_busy(result);
02860     if (!NIL_P(result) && rb_block_given_p()) {
02861         return rb_yield(result);
02862     }
02863     return result;
02864 }
02865 
02866 /*
02867  * Document-method: compile
02868  *
02869  * Synonym for <code>Regexp.new</code>
02870  */
02871 
02872 
02873 /*
02874  *  call-seq:
02875  *     Regexp.new(string, [options [, lang]])        -> regexp
02876  *     Regexp.new(regexp)                            -> regexp
02877  *     Regexp.compile(string, [options [, lang]])    -> regexp
02878  *     Regexp.compile(regexp)                        -> regexp
02879  *
02880  *  Constructs a new regular expression from <i>pattern</i>, which can be either
02881  *  a <code>String</code> or a <code>Regexp</code> (in which case that regexp's
02882  *  options are propagated, and new options may not be specified (a change as of
02883  *  Ruby 1.8). If <i>options</i> is a <code>Fixnum</code>, it should be one or
02884  *  more of the constants <code>Regexp::EXTENDED</code>,
02885  *  <code>Regexp::IGNORECASE</code>, and <code>Regexp::MULTILINE</code>,
02886  *  <em>or</em>-ed together. Otherwise, if <i>options</i> is not
02887  *  <code>nil</code>, the regexp will be case insensitive.
02888  *  When the <i>lang</i> parameter is `n' or `N' sets the regexp no encoding.
02889  *
02890  *     r1 = Regexp.new('^a-z+:\\s+\w+')           #=> /^a-z+:\s+\w+/
02891  *     r2 = Regexp.new('cat', true)               #=> /cat/i
02892  *     r3 = Regexp.new('dog', Regexp::EXTENDED)   #=> /dog/x
02893  *     r4 = Regexp.new(r2)                        #=> /cat/i
02894  */
02895 
02896 static VALUE
02897 rb_reg_initialize_m(int argc, VALUE *argv, VALUE self)
02898 {
02899     onig_errmsg_buffer err = "";
02900     int flags = 0;
02901     VALUE str;
02902     rb_encoding *enc;
02903     const char *ptr;
02904     long len;
02905 
02906     if (argc == 0 || argc > 3) {
02907         rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..3)", argc);
02908     }
02909     if (TYPE(argv[0]) == T_REGEXP) {
02910         VALUE re = argv[0];
02911 
02912         if (argc > 1) {
02913             rb_warn("flags ignored");
02914         }
02915         rb_reg_check(re);
02916         flags = rb_reg_options(re);
02917         ptr = RREGEXP_SRC_PTR(re);
02918         len = RREGEXP_SRC_LEN(re);
02919         enc = rb_enc_get(re);
02920         if (rb_reg_initialize(self, ptr, len, enc, flags, err, NULL, 0)) {
02921             str = rb_enc_str_new(ptr, len, enc);
02922             rb_reg_raise_str(str, flags, err);
02923         }
02924     }
02925     else {
02926         if (argc >= 2) {
02927             if (FIXNUM_P(argv[1])) flags = FIX2INT(argv[1]);
02928             else if (RTEST(argv[1])) flags = ONIG_OPTION_IGNORECASE;
02929         }
02930         enc = 0;
02931         if (argc == 3 && !NIL_P(argv[2])) {
02932             char *kcode = StringValuePtr(argv[2]);
02933             if (kcode[0] == 'n' || kcode[0] == 'N') {
02934                 enc = rb_ascii8bit_encoding();
02935                 flags |= ARG_ENCODING_NONE;
02936             }
02937             else {
02938                 rb_warn("encoding option is ignored - %s", kcode);
02939             }
02940         }
02941         str = argv[0];
02942         ptr = StringValuePtr(str);
02943         if (enc
02944             ? rb_reg_initialize(self, ptr, RSTRING_LEN(str), enc, flags, err, NULL, 0)
02945             : rb_reg_initialize_str(self, str, flags, err, NULL, 0)) {
02946             rb_reg_raise_str(str, flags, err);
02947         }
02948     }
02949     return self;
02950 }
02951 
/*
 * Returns a copy of +str+ in which every regexp metacharacter is escaped
 * with a backslash.
 *
 * A first pass looks for any character that needs escaping; when none is
 * found the result is rb_str_new3(str).  Otherwise a buffer of twice the
 * byte length of +str+ is filled by a second pass and then resized to the
 * number of bytes actually written.  In both cases the result is associated
 * with US-ASCII when rb_enc_str_asciionly_p(str) is true.
 *
 * Escaping rules of the second pass: the punctuation metacharacters get a
 * leading backslash; a space becomes backslash + space; tab, newline,
 * carriage return, form feed and vertical tab become the two-character
 * sequences backslash + 't', 'n', 'r', 'f' and 'v' respectively.
 */
02952 VALUE
02953 rb_reg_quote(VALUE str)
02954 {
02955     rb_encoding *enc = rb_enc_get(str);
02956     char *s, *send, *t;
02957     VALUE tmp;
02958     int c, clen;
02959     int ascii_only = rb_enc_str_asciionly_p(str);
02960 
02961     s = RSTRING_PTR(str);
02962     send = s + RSTRING_LEN(str);
          /* pass 1: find the first character that has to be escaped */
02963     while (s < send) {
02964         c = rb_enc_ascget(s, send, &clen, enc);
02965         if (c == -1) {
                  /* no ASCII code obtained: step over the character via mbclen() */
02966             s += mbclen(s, send, enc);
02967             continue;
02968         }
02969         switch (c) {
02970           case '[': case ']': case '{': case '}':
02971           case '(': case ')': case '|': case '-':
02972           case '*': case '.': case '\\':
02973           case '?': case '+': case '^': case '$':
02974           case ' ': case '#':
02975           case '\t': case '\f': case '\v': case '\n': case '\r':
02976             goto meta_found;
02977         }
02978         s += clen;
02979     }
          /* nothing to escape */
02980     tmp = rb_str_new3(str);
02981     if (ascii_only) {
02982         rb_enc_associate(tmp, rb_usascii_encoding());
02983     }
02984     return tmp;
02985 
02986   meta_found:
          /* each input character produces at most two output characters */
02987     tmp = rb_str_new(0, RSTRING_LEN(str)*2);
02988     if (ascii_only) {
02989         rb_enc_associate(tmp, rb_usascii_encoding());
02990     }
02991     else {
02992         rb_enc_copy(tmp, str);
02993     }
02994     t = RSTRING_PTR(tmp);
02995     /* copy up to the first metacharacter (s still points at it) */
02996     memcpy(t, RSTRING_PTR(str), s - RSTRING_PTR(str));
02997     t += s - RSTRING_PTR(str);
02998 
          /* pass 2: copy the remainder, inserting escapes */
02999     while (s < send) {
03000         c = rb_enc_ascget(s, send, &clen, enc);
03001         if (c == -1) {
03002             int n = mbclen(s, send, enc);
03003 
                  /* copy the n bytes of this character unchanged */
03004             while (n--)
03005                 *t++ = *s++;
03006             continue;
03007         }
03008         s += clen;
03009         switch (c) {
                /* write only the backslash here; the character itself is
                 * written by the rb_enc_mbcput() call after the switch */
03010           case '[': case ']': case '{': case '}':
03011           case '(': case ')': case '|': case '-':
03012           case '*': case '.': case '\\':
03013           case '?': case '+': case '^': case '$':
03014           case '#':
03015             t += rb_enc_mbcput('\\', t, enc);
03016             break;
                /* the whitespace cases write both output characters themselves
                 * and "continue" so the call after the switch is skipped */
03017           case ' ':
03018             t += rb_enc_mbcput('\\', t, enc);
03019             t += rb_enc_mbcput(' ', t, enc);
03020             continue;
03021           case '\t':
03022             t += rb_enc_mbcput('\\', t, enc);
03023             t += rb_enc_mbcput('t', t, enc);
03024             continue;
03025           case '\n':
03026             t += rb_enc_mbcput('\\', t, enc);
03027             t += rb_enc_mbcput('n', t, enc);
03028             continue;
03029           case '\r':
03030             t += rb_enc_mbcput('\\', t, enc);
03031             t += rb_enc_mbcput('r', t, enc);
03032             continue;
03033           case '\f':
03034             t += rb_enc_mbcput('\\', t, enc);
03035             t += rb_enc_mbcput('f', t, enc);
03036             continue;
03037           case '\v':
03038             t += rb_enc_mbcput('\\', t, enc);
03039             t += rb_enc_mbcput('v', t, enc);
03040             continue;
03041         }
03042         t += rb_enc_mbcput(c, t, enc);
03043     }
          /* shrink the buffer to the bytes actually written */
03044     rb_str_resize(tmp, t - RSTRING_PTR(tmp));
03045     OBJ_INFECT(tmp, str);
03046     return tmp;
03047 }
03048 
03049 
03050 /*
03051  *  call-seq:
03052  *     Regexp.escape(str)   -> string
03053  *     Regexp.quote(str)    -> string
03054  *
03055  *  Escapes any characters that would have special meaning in a regular
03056  *  expression. Returns a new escaped string, or self if no characters are
03057  *  escaped.  For any string,
03058  *  <code>Regexp.new(Regexp.escape(<i>str</i>))=~<i>str</i></code> will be true.
03059  *
03060  *     Regexp.escape('\*?{}.')   #=> \\\*\?\{\}\.
03061  *
03062  */
03063 
03064 static VALUE
03065 rb_reg_s_quote(VALUE c, VALUE str)
03066 {
03067     return rb_reg_quote(reg_operand(str, TRUE));
03068 }
03069 
03070 int
03071 rb_reg_options(VALUE re)
03072 {
03073     int options;
03074 
03075     rb_reg_check(re);
03076     options = RREGEXP(re)->ptr->options & ARG_REG_OPTION_MASK;
03077     if (RBASIC(re)->flags & KCODE_FIXED) options |= ARG_ENCODING_FIXED;
03078     if (RBASIC(re)->flags & REG_ENCODING_NONE) options |= ARG_ENCODING_NONE;
03079     return options;
03080 }
03081 
03082 VALUE
03083 rb_check_regexp_type(VALUE re)
03084 {
03085     return rb_check_convert_type(re, T_REGEXP, "Regexp", "to_regexp");
03086 }
03087 
03088 /*
03089  *  call-seq:
03090  *     Regexp.try_convert(obj) -> re or nil
03091  *
03092  *  Tries to convert <i>obj</i> into a Regexp, using its to_regexp method.
03093  *  Returns the converted regexp, or nil if <i>obj</i> cannot be converted
03094  *  for any reason.
03095  *
03096  *     Regexp.try_convert(/re/)         #=> /re/
03097  *     Regexp.try_convert("re")         #=> nil
03098  *
03099  *     o = Object.new
03100  *     Regexp.try_convert(o)            #=> nil
03101  *     def o.to_regexp() /foo/ end
03102  *     Regexp.try_convert(o)            #=> /foo/
03103  *
03104  */
03105 static VALUE
03106 rb_reg_s_try_convert(VALUE dummy, VALUE re)
03107 {
03108     return rb_check_regexp_type(re);
03109 }
03110 
03111 static VALUE
03112 rb_reg_s_union(VALUE self, VALUE args0)
03113 {
03114     long argc = RARRAY_LEN(args0);
03115 
03116     if (argc == 0) {
03117         VALUE args[1];
03118         args[0] = rb_str_new2("(?!)");
03119         return rb_class_new_instance(1, args, rb_cRegexp);
03120     }
03121     else if (argc == 1) {
03122         VALUE arg = rb_ary_entry(args0, 0);
03123         VALUE re = rb_check_regexp_type(arg);
03124         if (!NIL_P(re))
03125             return re;
03126         else {
03127             VALUE quoted;
03128             quoted = rb_reg_s_quote(Qnil, arg);
03129             return rb_reg_new_str(quoted, 0);
03130         }
03131     }
03132     else {
03133         int i;
03134         VALUE source = rb_str_buf_new(0);
03135         rb_encoding *result_enc;
03136 
03137         int has_asciionly = 0;
03138         rb_encoding *has_ascii_compat_fixed = 0;
03139         rb_encoding *has_ascii_incompat = 0;
03140 
03141         for (i = 0; i < argc; i++) {
03142             volatile VALUE v;
03143             VALUE e = rb_ary_entry(args0, i);
03144 
03145             if (0 < i)
03146                 rb_str_buf_cat_ascii(source, "|");
03147 
03148             v = rb_check_regexp_type(e);
03149             if (!NIL_P(v)) {
03150                 rb_encoding *enc = rb_enc_get(v);
03151                 if (!rb_enc_asciicompat(enc)) {
03152                     if (!has_ascii_incompat)
03153                         has_ascii_incompat = enc;
03154                     else if (has_ascii_incompat != enc)
03155                         rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
03156                             rb_enc_name(has_ascii_incompat), rb_enc_name(enc));
03157                 }
03158                 else if (rb_reg_fixed_encoding_p(v)) {
03159                     if (!has_ascii_compat_fixed)
03160                         has_ascii_compat_fixed = enc;
03161                     else if (has_ascii_compat_fixed != enc)
03162                         rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
03163                             rb_enc_name(has_ascii_compat_fixed), rb_enc_name(enc));
03164                 }
03165                 else {
03166                     has_asciionly = 1;
03167                 }
03168                 v = rb_reg_to_s(v);
03169             }
03170             else {
03171                 rb_encoding *enc;
03172                 StringValue(e);
03173                 enc = rb_enc_get(e);
03174                 if (!rb_enc_str_asciicompat_p(e)) {
03175                     if (!has_ascii_incompat)
03176                         has_ascii_incompat = enc;
03177                     else if (has_ascii_incompat != enc)
03178                         rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
03179                             rb_enc_name(has_ascii_incompat), rb_enc_name(enc));
03180                 }
03181                 else if (rb_enc_str_asciionly_p(e)) {
03182                     has_asciionly = 1;
03183                 }
03184                 else {
03185                     if (!has_ascii_compat_fixed)
03186                         has_ascii_compat_fixed = enc;
03187                     else if (has_ascii_compat_fixed != enc)
03188                         rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
03189                             rb_enc_name(has_ascii_compat_fixed), rb_enc_name(enc));
03190                 }
03191                 v = rb_reg_s_quote(Qnil, e);
03192             }
03193             if (has_ascii_incompat) {
03194                 if (has_asciionly) {
03195                     rb_raise(rb_eArgError, "ASCII incompatible encoding: %s",
03196                         rb_enc_name(has_ascii_incompat));
03197                 }
03198                 if (has_ascii_compat_fixed) {
03199                     rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
03200                         rb_enc_name(has_ascii_incompat), rb_enc_name(has_ascii_compat_fixed));
03201                 }
03202             }
03203 
03204             if (i == 0) {
03205                 rb_enc_copy(source, v);
03206             }
03207             rb_str_append(source, v);
03208         }
03209 
03210         if (has_ascii_incompat) {
03211             result_enc = has_ascii_incompat;
03212         }
03213         else if (has_ascii_compat_fixed) {
03214             result_enc = has_ascii_compat_fixed;
03215         }
03216         else {
03217             result_enc = rb_ascii8bit_encoding();
03218         }
03219 
03220         rb_enc_associate(source, result_enc);
03221         return rb_class_new_instance(1, &source, rb_cRegexp);
03222     }
03223 }
03224 
03225 /*
03226  *  call-seq:
03227  *     Regexp.union(pat1, pat2, ...)            -> new_regexp
03228  *     Regexp.union(pats_ary)                   -> new_regexp
03229  *
03230  *  Return a <code>Regexp</code> object that is the union of the given
03231  *  <em>pattern</em>s, i.e., will match any of its parts. The <em>pattern</em>s
03232  *  can be Regexp objects, in which case their options will be preserved, or
03233  *  Strings. If no patterns are given, returns <code>/(?!)/</code>.
03234  *  The behavior is unspecified if any given <em>pattern</em> contains capture.
03235  *
03236  *     Regexp.union                         #=> /(?!)/
03237  *     Regexp.union("penzance")             #=> /penzance/
03238  *     Regexp.union("a+b*c")                #=> /a\+b\*c/
03239  *     Regexp.union("skiing", "sledding")   #=> /skiing|sledding/
03240  *     Regexp.union(["skiing", "sledding"]) #=> /skiing|sledding/
03241  *     Regexp.union(/dogs/, /cats/i)        #=> /(?-mix:dogs)|(?i-mx:cats)/
03242  */
03243 static VALUE
03244 rb_reg_s_union_m(VALUE self, VALUE args)
03245 {
03246     VALUE v;
03247     if (RARRAY_LEN(args) == 1 &&
03248         !NIL_P(v = rb_check_array_type(rb_ary_entry(args, 0)))) {
03249         return rb_reg_s_union(self, v);
03250     }
03251     return rb_reg_s_union(self, args);
03252 }
03253 
03254 /* :nodoc: */
03255 static VALUE
03256 rb_reg_init_copy(VALUE copy, VALUE re)
03257 {
03258     onig_errmsg_buffer err = "";
03259     const char *s;
03260     long len;
03261 
03262     if (copy == re) return copy;
03263     rb_check_frozen(copy);
03264     /* need better argument type check */
03265     if (!rb_obj_is_instance_of(re, rb_obj_class(copy))) {
03266         rb_raise(rb_eTypeError, "wrong argument type");
03267     }
03268     rb_reg_check(re);
03269     s = RREGEXP_SRC_PTR(re);
03270     len = RREGEXP_SRC_LEN(re);
03271     if (rb_reg_initialize(copy, s, len, rb_enc_get(re), rb_reg_options(re),
03272                 err, NULL, 0) != 0) {
03273         rb_reg_raise(s, len, err, re);
03274     }
03275     return copy;
03276 }
03277 
/*
 * Expands the backslash sequences of the replacement string +str+ using
 * the match described by +regs+ over the subject string +src+.
 *
 *   str    - replacement template
 *   src    - string the match positions in +regs+ refer to
 *   regs   - match registers (beg/end byte offsets, -1 beg = group unset)
 *   regexp - regexp used for the numbered-capture check and name lookup
 *
 * Recognized sequences (a backslash followed by):
 *   1..9     numbered group, only if onig_noname_group_capture_is_active()
 *            is true for +regexp+; otherwise the sequence is dropped
 *   k<name>  named group, resolved by name_to_backref_number()
 *   0, &     the whole match
 *   `        the part of +src+ before the match
 *   '        the part of +src+ after the match
 *   +        the highest-numbered group whose beg is not -1
 *   a second backslash yields a single backslash
 * Any other sequence is copied unchanged, backslash included.  A backslash
 * that is the last character of +str+ is left as is.
 *
 * Returns +str+ itself when no sequence was processed, otherwise a newly
 * built string.  Raises RuntimeError for "\k<" without a closing '>'.
 */
03278 VALUE
03279 rb_reg_regsub(VALUE str, VALUE src, struct re_registers *regs, VALUE regexp)
03280 {
03281     VALUE val = 0;
03282     char *p, *s, *e;
03283     int no, clen;
03284     rb_encoding *str_enc = rb_enc_get(str);
03285     rb_encoding *src_enc = rb_enc_get(src);
03286     int acompat = rb_enc_asciicompat(str_enc);
      /* ASCGET: for an ASCII-compatible str_enc look at a single byte (length 1,
       * value -1 if the byte is not ASCII); otherwise defer to rb_enc_ascget() */
03287 #define ASCGET(s,e,cl) (acompat ? (*cl=1,ISASCII(s[0])?s[0]:-1) : rb_enc_ascget(s, e, cl, str_enc))
03288 
          /* p: start of the literal text not yet appended to val; s: read cursor */
03289     p = s = RSTRING_PTR(str);
03290     e = s + RSTRING_LEN(str);
03291 
03292     while (s < e) {
03293         int c = ASCGET(s, e, &clen);
03294         char *ss;
03295 
03296         if (c == -1) {
03297             s += mbclen(s, e, str_enc);
03298             continue;
03299         }
              /* ss remembers where the (possible) backslash starts */
03300         ss = s;
03301         s += clen;
03302 
              /* not a backslash, or a backslash with nothing after it */
03303         if (c != '\\' || s == e) continue;
03304 
              /* the result string is created on the first sequence only */
03305         if (!val) {
03306             val = rb_str_buf_new(ss-p);
03307         }
              /* flush the literal text preceding the backslash */
03308         rb_enc_str_buf_cat(val, p, ss-p, str_enc);
03309 
03310         c = ASCGET(s, e, &clen);
03311         if (c == -1) {
                  /* backslash followed by a non-ASCII character: keep both */
03312             s += mbclen(s, e, str_enc);
03313             rb_enc_str_buf_cat(val, ss, s-ss, str_enc);
03314             p = s;
03315             continue;
03316         }
03317         s += clen;
03318 
              /* by default the two-character sequence is consumed */
03319         p = s;
03320         switch (c) {
03321           case '1': case '2': case '3': case '4':
03322           case '5': case '6': case '7': case '8': case '9':
03323             if (onig_noname_group_capture_is_active(RREGEXP(regexp)->ptr)) {
03324                 no = c - '0';
03325             }
03326             else {
                      /* nothing is appended for this sequence */
03327                 continue;
03328             }
03329             break;
03330 
03331           case 'k':
03332             if (s < e && ASCGET(s, e, &clen) == '<') {
03333                 char *name, *name_end;
03334 
                      /* scan for the closing '>' of the group name */
03335                 name_end = name = s + clen;
03336                 while (name_end < e) {
03337                     c = ASCGET(name_end, e, &clen);
03338                     if (c == '>') break;
03339                     name_end += c == -1 ? mbclen(name_end, e, str_enc) : clen;
03340                 }
03341                 if (name_end < e) {
03342                     no = name_to_backref_number(regs, regexp, name, name_end);
                          /* continue reading after the '>' */
03343                     p = s = name_end + clen;
03344                     break;
03345                 }
03346                 else {
03347                     rb_raise(rb_eRuntimeError, "invalid group name reference format");
03348                 }
03349             }
03350 
                  /* "\k" not followed by '<': copied unchanged */
03351             rb_enc_str_buf_cat(val, ss, s-ss, str_enc);
03352             continue;
03353 
03354           case '0':
03355           case '&':
03356             no = 0;
03357             break;
03358 
03359           case '`':
                  /* bytes of src before the start of the match */
03360             rb_enc_str_buf_cat(val, RSTRING_PTR(src), BEG(0), src_enc);
03361             continue;
03362 
03363           case '\'':
                  /* bytes of src from the end of the match to the end of src */
03364             rb_enc_str_buf_cat(val, RSTRING_PTR(src)+END(0), RSTRING_LEN(src)-END(0), src_enc);
03365             continue;
03366 
03367           case '+':
                  /* walk down from the last register to one whose beg is set */
03368             no = regs->num_regs-1;
03369             while (BEG(no) == -1 && no > 0) no--;
                  /* only register 0 left: nothing is appended */
03370             if (no == 0) continue;
03371             break;
03372 
03373           case '\\':
                  /* append the character just read, i.e. one backslash */
03374             rb_enc_str_buf_cat(val, s-clen, clen, str_enc);
03375             continue;
03376 
03377           default:
                  /* unknown sequence: copied unchanged, backslash included */
03378             rb_enc_str_buf_cat(val, ss, s-ss, str_enc);
03379             continue;
03380         }
03381 
              /* append the text of group "no" if it exists and took part in the match */
03382         if (no >= 0) {
03383             if (no >= regs->num_regs) continue;
03384             if (BEG(no) == -1) continue;
03385             rb_enc_str_buf_cat(val, RSTRING_PTR(src)+BEG(no), END(no)-BEG(no), src_enc);
03386         }
03387     }
03388 
          /* val was never created: str is returned as is */
03389     if (!val) return str;
          /* append the literal text after the last sequence */
03390     if (p < e) {
03391         rb_enc_str_buf_cat(val, p, e-p, str_enc);
03392     }
03393 
03394     return val;
03395 }
03396 
03397 static VALUE
03398 kcode_getter(void)
03399 {
03400     rb_warn("variable $KCODE is no longer effective");
03401     return Qnil;
03402 }
03403 
03404 static void
03405 kcode_setter(VALUE val, ID id)
03406 {
03407     rb_warn("variable $KCODE is no longer effective; ignored");
03408 }
03409 
03410 static VALUE
03411 ignorecase_getter(void)
03412 {
03413     rb_warn("variable $= is no longer effective");
03414     return Qfalse;
03415 }
03416 
03417 static void
03418 ignorecase_setter(VALUE val, ID id)
03419 {
03420     rb_warn("variable $= is no longer effective; ignored");
03421 }
03422 
03423 static VALUE
03424 match_getter(void)
03425 {
03426     VALUE match = rb_backref_get();
03427 
03428     if (NIL_P(match)) return Qnil;
03429     rb_match_busy(match);
03430     return match;
03431 }
03432 
03433 static void
03434 match_setter(VALUE val)
03435 {
03436     if (!NIL_P(val)) {
03437         Check_Type(val, T_MATCH);
03438     }
03439     rb_backref_set(val);
03440 }
03441 
03442 /*
03443  *  call-seq:
03444  *     Regexp.last_match           -> matchdata
03445  *     Regexp.last_match(n)        -> str
03446  *
03447  *  The first form returns the <code>MatchData</code> object generated by the
03448  *  last successful pattern match. Equivalent to reading the global variable
03449  *  <code>$~</code>. The second form returns the <i>n</i>th field in this
03450  *  <code>MatchData</code> object.
03451  *  <em>n</em> can be a string or symbol to reference a named capture.
03452  *
03453  *  Note that the <code>last_match</code> is local to the thread and method scope
03454  *  of the method that did the pattern match.
03455  *
03456  *     /c(.)t/ =~ 'cat'        #=> 0
03457  *     Regexp.last_match       #=> #<MatchData "cat" 1:"a">
03458  *     Regexp.last_match(0)    #=> "cat"
03459  *     Regexp.last_match(1)    #=> "a"
03460  *     Regexp.last_match(2)    #=> nil
03461  *
03462  *     /(?<lhs>\w+)\s*=\s*(?<rhs>\w+)/ =~ "var = val"
03463  *     Regexp.last_match       #=> #<MatchData "var = val" lhs:"var" rhs:"val">
03464  *     Regexp.last_match(:lhs) #=> "var"
03465  *     Regexp.last_match(:rhs) #=> "val"
03466  */
03467 
03468 static VALUE
03469 rb_reg_s_last_match(int argc, VALUE *argv)
03470 {
03471     VALUE nth;
03472 
03473     if (argc > 0 && rb_scan_args(argc, argv, "01", &nth) == 1) {
03474         VALUE match = rb_backref_get();
03475         int n;
03476         if (NIL_P(match)) return Qnil;
03477         n = match_backref_number(match, nth);
03478         return rb_reg_nth_match(n, match);
03479     }
03480     return match_getter();
03481 }
03482 
/*
 * Warning callback handed to the regexp engine in Init_Regexp().  The
 * message is passed as a "%s" argument, never as the format string itself.
 */
static void
re_warn(const char *s)
{
    rb_warn("%s", s);
}
03488 
03489 /*
03490  *  Document-class: RegexpError
03491  *
03492  *  Raised when given an invalid regexp expression.
03493  *
03494  *     Regexp.new("?")
03495  *
03496  *  <em>raises the exception:</em>
03497  *
03498  *     RegexpError: target of repeat operator is not specified: /?/
03499  */
03500 
03501 /*
03502  *  Document-class: Regexp
03503  *
03504  *  A <code>Regexp</code> holds a regular expression, used to match a pattern
03505  *  against strings. Regexps are created using the <code>/.../</code> and
03506  *  <code>%r{...}</code> literals, and by the <code>Regexp::new</code>
03507  *  constructor.
03508  *
03509  *  :include: doc/re.rdoc
03510  */
03511 
/*
 * Sets up everything this file exposes to Ruby: the RegexpError class, the
 * regexp engine defaults and warning callbacks, the match-related and
 * obsolete virtual global variables, and the Regexp and MatchData classes
 * with their methods and constants.
 */
03512 void
03513 Init_Regexp(void)
03514 {
03515     rb_eRegexpError = rb_define_class("RegexpError", rb_eStandardError);
03516 
          /* regexp engine configuration: case table, default encoding, warning hooks */
03517     onigenc_set_default_caseconv_table((UChar*)casetable);
03518     onigenc_set_default_encoding(ONIG_ENCODING_ASCII);
03519     onig_set_warn_func(re_warn);
03520     onig_set_verb_warn_func(re_warn);
03521 
          /* match variables; only "$~" is given a setter function here */
03522     rb_define_virtual_variable("$~", match_getter, match_setter);
03523     rb_define_virtual_variable("$&", last_match_getter, 0);
03524     rb_define_virtual_variable("$`", prematch_getter, 0);
03525     rb_define_virtual_variable("$'", postmatch_getter, 0);
03526     rb_define_virtual_variable("$+", last_paren_match_getter, 0);
03527 
          /* variables whose accessors only emit a "no longer effective" warning */
03528     rb_define_virtual_variable("$=", ignorecase_getter, ignorecase_setter);
03529     rb_define_virtual_variable("$KCODE", kcode_getter, kcode_setter);
03530     rb_define_virtual_variable("$-K", kcode_getter, kcode_setter);
03531 
          /* class Regexp: allocator and singleton methods */
03532     rb_cRegexp = rb_define_class("Regexp", rb_cObject);
03533     rb_define_alloc_func(rb_cRegexp, rb_reg_s_alloc);
03534     rb_define_singleton_method(rb_cRegexp, "compile", rb_class_new_instance, -1);
03535     rb_define_singleton_method(rb_cRegexp, "quote", rb_reg_s_quote, 1);
03536     rb_define_singleton_method(rb_cRegexp, "escape", rb_reg_s_quote, 1);
03537     rb_define_singleton_method(rb_cRegexp, "union", rb_reg_s_union_m, -2);
03538     rb_define_singleton_method(rb_cRegexp, "last_match", rb_reg_s_last_match, -1);
03539     rb_define_singleton_method(rb_cRegexp, "try_convert", rb_reg_s_try_convert, 1);
03540 
          /* class Regexp: instance methods */
03541     rb_define_method(rb_cRegexp, "initialize", rb_reg_initialize_m, -1);
03542     rb_define_method(rb_cRegexp, "initialize_copy", rb_reg_init_copy, 1);
03543     rb_define_method(rb_cRegexp, "hash", rb_reg_hash, 0);
03544     rb_define_method(rb_cRegexp, "eql?", rb_reg_equal, 1);
03545     rb_define_method(rb_cRegexp, "==", rb_reg_equal, 1);
03546     rb_define_method(rb_cRegexp, "=~", rb_reg_match, 1);
03547     rb_define_method(rb_cRegexp, "===", rb_reg_eqq, 1);
03548     rb_define_method(rb_cRegexp, "~", rb_reg_match2, 0);
03549     rb_define_method(rb_cRegexp, "match", rb_reg_match_m, -1);
03550     rb_define_method(rb_cRegexp, "to_s", rb_reg_to_s, 0);
03551     rb_define_method(rb_cRegexp, "inspect", rb_reg_inspect, 0);
03552     rb_define_method(rb_cRegexp, "source", rb_reg_source, 0);
03553     rb_define_method(rb_cRegexp, "casefold?", rb_reg_casefold_p, 0);
03554     rb_define_method(rb_cRegexp, "options", rb_reg_options_m, 0);
03555     rb_define_method(rb_cRegexp, "encoding", rb_obj_encoding, 0); /* in encoding.c */
03556     rb_define_method(rb_cRegexp, "fixed_encoding?", rb_reg_fixed_encoding_p, 0);
03557     rb_define_method(rb_cRegexp, "names", rb_reg_names, 0);
03558     rb_define_method(rb_cRegexp, "named_captures", rb_reg_named_captures, 0);
03559 
          /* option constants exposed on Regexp */
03560     rb_define_const(rb_cRegexp, "IGNORECASE", INT2FIX(ONIG_OPTION_IGNORECASE));
03561     rb_define_const(rb_cRegexp, "EXTENDED", INT2FIX(ONIG_OPTION_EXTEND));
03562     rb_define_const(rb_cRegexp, "MULTILINE", INT2FIX(ONIG_OPTION_MULTILINE));
03563     rb_define_const(rb_cRegexp, "FIXEDENCODING", INT2FIX(ARG_ENCODING_FIXED));
03564 
03565     rb_global_variable(&reg_cache);
03566 
          /* class MatchData: allocator, then "new" is undefined on its class */
03567     rb_cMatch  = rb_define_class("MatchData", rb_cObject);
03568     rb_define_alloc_func(rb_cMatch, match_alloc);
03569     rb_undef_method(CLASS_OF(rb_cMatch), "new");
03570 
          /* class MatchData: instance methods */
03571     rb_define_method(rb_cMatch, "initialize_copy", match_init_copy, 1);
03572     rb_define_method(rb_cMatch, "regexp", match_regexp, 0);
03573     rb_define_method(rb_cMatch, "names", match_names, 0);
03574     rb_define_method(rb_cMatch, "size", match_size, 0);
03575     rb_define_method(rb_cMatch, "length", match_size, 0);
03576     rb_define_method(rb_cMatch, "offset", match_offset, 1);
03577     rb_define_method(rb_cMatch, "begin", match_begin, 1);
03578     rb_define_method(rb_cMatch, "end", match_end, 1);
03579     rb_define_method(rb_cMatch, "to_a", match_to_a, 0);
03580     rb_define_method(rb_cMatch, "[]", match_aref, -1);
03581     rb_define_method(rb_cMatch, "captures", match_captures, 0);
03582     rb_define_method(rb_cMatch, "values_at", match_values_at, -1);
03583     rb_define_method(rb_cMatch, "pre_match", rb_reg_match_pre, 0);
03584     rb_define_method(rb_cMatch, "post_match", rb_reg_match_post, 0);
03585     rb_define_method(rb_cMatch, "to_s", match_to_s, 0);
03586     rb_define_method(rb_cMatch, "inspect", match_inspect, 0);
03587     rb_define_method(rb_cMatch, "string", match_string, 0);
03588     rb_define_method(rb_cMatch, "hash", match_hash, 0);
03589     rb_define_method(rb_cMatch, "eql?", match_equal, 1);
03590     rb_define_method(rb_cMatch, "==", match_equal, 1);
03591 }
03592 

Generated on Wed Sep 8 2010 21:55:12 for Ruby by  doxygen 1.7.1