00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #include "ruby/ruby.h"
00013 #include "ruby/re.h"
00014 #include "ruby/encoding.h"
00015 #include "ruby/util.h"
00016 #include "regint.h"
00017 #include <ctype.h>
00018
00019 VALUE rb_eRegexpError;
00020
00021 typedef char onig_errmsg_buffer[ONIG_MAX_ERROR_MESSAGE_LEN];
00022 #define errcpy(err, msg) strlcpy((err), (msg), ONIG_MAX_ERROR_MESSAGE_LEN)
00023
/*
 * Start / end offset stored for register `no`.  Both macros expect a
 * pointer named `regs` (with `beg` and `end` arrays) to be in scope.
 */
#define BEG(no) (regs->beg[(no)])
#define END(no) (regs->end[(no)])
00026
#if 'a' == 97
/*
 * Case-folding table indexed by byte value: 'A'..'Z' map to 'a'..'z',
 * every other byte maps to itself.
 */
static const char casetable[] = {
    /* 0x00 - 0x1F */
    '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
    '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
    '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
    '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
    /* 0x20 - 0x3F */
    '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
    '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
    '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
    '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
    /* 0x40 - 0x5F: the upper-case letters are replaced by lower-case */
    '\100', 'a', 'b', 'c', 'd', 'e', 'f', 'g',
    'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
    'x', 'y', 'z', '\133', '\134', '\135', '\136', '\137',
    /* 0x60 - 0x7F */
    '\140', 'a', 'b', 'c', 'd', 'e', 'f', 'g',
    'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
    'x', 'y', 'z', '\173', '\174', '\175', '\176', '\177',
    /* 0x80 - 0xFF: identity */
    '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
    '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
    '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
    '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
    '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
    '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
    '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
    '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
    '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
    '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
    '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
    '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
    '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
    '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
    '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
    '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
};
#else
# error >>> "You lose. You will need a translation table for your character set." <<<
#endif
00077
00078 int
00079 rb_memcicmp(const void *x, const void *y, long len)
00080 {
00081 const unsigned char *p1 = x, *p2 = y;
00082 int tmp;
00083
00084 while (len--) {
00085 if ((tmp = casetable[(unsigned)*p1++] - casetable[(unsigned)*p2++]))
00086 return tmp;
00087 }
00088 return 0;
00089 }
00090
#undef rb_memcmp

/* Compare len bytes at p1 and p2; forwards to memcmp() and returns its result. */
int
rb_memcmp(const void *p1, const void *p2, long len)
{
    const size_t nbytes = (size_t)len;

    return memcmp(p1, p2, nbytes);
}
00098
00099 static inline long
00100 rb_memsearch_ss(const unsigned char *xs, long m, const unsigned char *ys, long n)
00101 {
00102 const unsigned char *x = xs, *xe = xs + m;
00103 const unsigned char *y = ys, *ye = ys + n;
00104 #ifndef VALUE_MAX
00105 # if SIZEOF_VALUE == 8
00106 # define VALUE_MAX 0xFFFFFFFFFFFFFFFFULL
00107 # elif SIZEOF_VALUE == 4
00108 # define VALUE_MAX 0xFFFFFFFFUL
00109 # endif
00110 #endif
00111 VALUE hx, hy, mask = VALUE_MAX >> ((SIZEOF_VALUE - m) * CHAR_BIT);
00112
00113 if (m > SIZEOF_VALUE)
00114 rb_bug("!!too long pattern string!!");
00115
00116
00117 for (hx = *x++, hy = *y++; x < xe; ++x, ++y) {
00118 hx <<= CHAR_BIT;
00119 hy <<= CHAR_BIT;
00120 hx |= *x;
00121 hy |= *y;
00122 }
00123
00124 while (hx != hy) {
00125 if (y == ye)
00126 return -1;
00127 hy <<= CHAR_BIT;
00128 hy |= *y;
00129 hy &= mask;
00130 y++;
00131 }
00132 return y - ys - m;
00133 }
00134
/*
 * Search for the m-byte pattern xs inside the n-byte buffer ys using a
 * per-byte shift table: after each failed window the byte just behind the
 * window decides how far the window may advance.
 *
 * Returns the byte offset of the first match, or -1 when there is none.
 * Expects m >= 1 (the first pattern byte is always read).
 */
static inline long
rb_memsearch_qs(const unsigned char *xs, long m, const unsigned char *ys, long n)
{
    long shift[256];
    long i, pos;
    const long last = n - m;    /* highest offset at which xs still fits */

    /* Bytes absent from the pattern let the window skip past them. */
    for (i = 0; i < 256; ++i)
        shift[i] = m + 1;
    /* Later occurrences overwrite earlier ones, keeping the smallest shift. */
    for (i = 0; i < m; ++i)
        shift[xs[i]] = m - i;

    for (pos = 0; pos <= last; pos += shift[ys[pos + m]]) {
        if (xs[0] == ys[pos] && memcmp(xs, ys + pos, (size_t)m) == 0)
            return pos;
        /* At the final window ys[pos + m] would be ys[n], which lies one
         * past the buffer; stop without reading it. */
        if (pos == last)
            break;
    }
    return -1;
}
00154
/*
 * Map the byte sequence starting at x to a shift-table slot.
 *
 * A first byte below 0xC0 or at/above 0xF5 yields that byte plus 256.
 * Otherwise the first byte announces 1, 2 or 3 following bytes
 * (0xC0-0xDF, 0xE0-0xEF, 0xF0-0xF4); these are mixed in and the low
 * 8 bits of the result are returned.  The following bytes are read
 * without any bounds check.
 */
static inline unsigned int
rb_memsearch_qs_utf8_hash(const unsigned char *x)
{
    const unsigned int mix = 8353;
    unsigned int h = x[0];
    int trail, k;

    if (h < 0xC0 || h >= 0xF5)
        return h + 256;

    trail = (h < 0xE0) ? 1 : (h < 0xF0) ? 2 : 3;
    for (k = 1; k <= trail; k++)
        h = h * mix + x[k];

    return (unsigned char)h;
}
00186
/*
 * Variant of the shift-table search whose table is indexed by
 * rb_memsearch_qs_utf8_hash() instead of by a single byte.  The hash
 * returns values in 0..511, hence the 512-entry table.
 *
 * Returns the byte offset of the first match of the m-byte pattern xs in
 * the n-byte buffer ys, or -1 when there is none.  Expects m >= 1.
 */
static inline long
rb_memsearch_qs_utf8(const unsigned char *xs, long m, const unsigned char *ys, long n)
{
    long shift[512];
    long i, pos;
    const long last = n - m;    /* highest offset at which xs still fits */

    for (i = 0; i < 512; ++i)
        shift[i] = m + 1;
    /* Later pattern positions overwrite earlier ones with a smaller shift. */
    for (i = 0; i < m; ++i)
        shift[rb_memsearch_qs_utf8_hash(xs + i)] = m - i;

    for (pos = 0; pos <= last;
         pos += shift[rb_memsearch_qs_utf8_hash(ys + pos + m)]) {
        if (xs[0] == ys[pos] && memcmp(xs, ys + pos, (size_t)m) == 0)
            return pos;
        /* At the final window ys + pos + m equals ys + n; hashing it would
         * read past the buffer, so stop here instead. */
        if (pos == last)
            break;
    }
    return -1;
}
00208
00209 long
00210 rb_memsearch(const void *x0, long m, const void *y0, long n, rb_encoding *enc)
00211 {
00212 const unsigned char *x = x0, *y = y0;
00213
00214 if (m > n) return -1;
00215 else if (m == n) {
00216 return memcmp(x0, y0, m) == 0 ? 0 : -1;
00217 }
00218 else if (m < 1) {
00219 return 0;
00220 }
00221 else if (m == 1) {
00222 const unsigned char *ys = y, *ye = ys + n;
00223 for (; y < ye; ++y) {
00224 if (*x == *y)
00225 return y - ys;
00226 }
00227 return -1;
00228 }
00229 else if (m <= SIZEOF_VALUE) {
00230 return rb_memsearch_ss(x0, m, y0, n);
00231 }
00232 else if (enc == rb_utf8_encoding()){
00233 return rb_memsearch_qs_utf8(x0, m, y0, n);
00234 }
00235 else {
00236 return rb_memsearch_qs(x0, m, y0, n);
00237 }
00238 }
00239
/* Flag bits, expressed as FL_USERn values. */
#define REG_LITERAL       FL_USER5
#define REG_ENCODING_NONE FL_USER6
#define KCODE_FIXED       FL_USER4

/* The three ONIG option bits combined into one mask. */
#define ARG_REG_OPTION_MASK \
    (ONIG_OPTION_IGNORECASE | ONIG_OPTION_MULTILINE | ONIG_OPTION_EXTEND)
/* Encoding markers reported through the option argument. */
#define ARG_ENCODING_FIXED (1 << 4)
#define ARG_ENCODING_NONE  (1 << 5)
00249
00250 static int
00251 char_to_option(int c)
00252 {
00253 int val;
00254
00255 switch (c) {
00256 case 'i':
00257 val = ONIG_OPTION_IGNORECASE;
00258 break;
00259 case 'x':
00260 val = ONIG_OPTION_EXTEND;
00261 break;
00262 case 'm':
00263 val = ONIG_OPTION_MULTILINE;
00264 break;
00265 default:
00266 val = 0;
00267 break;
00268 }
00269 return val;
00270 }
00271
00272 static char *
00273 option_to_str(char str[4], int options)
00274 {
00275 char *p = str;
00276 if (options & ONIG_OPTION_MULTILINE) *p++ = 'm';
00277 if (options & ONIG_OPTION_IGNORECASE) *p++ = 'i';
00278 if (options & ONIG_OPTION_EXTEND) *p++ = 'x';
00279 *p = 0;
00280 return str;
00281 }
00282
00283 extern int
00284 rb_char_to_option_kcode(int c, int *option, int *kcode)
00285 {
00286 *option = 0;
00287
00288 switch (c) {
00289 case 'n':
00290 *kcode = rb_ascii8bit_encindex();
00291 return (*option = ARG_ENCODING_NONE);
00292 case 'e':
00293 *kcode = rb_enc_find_index("EUC-JP");
00294 break;
00295 case 's':
00296 *kcode = rb_enc_find_index("Windows-31J");
00297 break;
00298 case 'u':
00299 *kcode = rb_utf8_encindex();
00300 break;
00301 default:
00302 *kcode = -1;
00303 return (*option = char_to_option(c));
00304 }
00305 *option = ARG_ENCODING_FIXED;
00306 return 1;
00307 }
00308
00309 static void
00310 rb_reg_check(VALUE re)
00311 {
00312 if (!RREGEXP(re)->ptr || !RREGEXP_SRC(re) || !RREGEXP_SRC_PTR(re)) {
00313 rb_raise(rb_eTypeError, "uninitialized Regexp");
00314 }
00315 }
00316
00317 int rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p);
00318
00319 static void
00320 rb_reg_expr_str(VALUE str, const char *s, long len,
00321 rb_encoding *enc, rb_encoding *resenc)
00322 {
00323 const char *p, *pend;
00324 int need_escape = 0;
00325 int c, clen;
00326
00327 p = s; pend = p + len;
00328 if (rb_enc_asciicompat(enc)) {
00329 while (p < pend) {
00330 c = rb_enc_ascget(p, pend, &clen, enc);
00331 if (c == -1) {
00332 if (enc == resenc) {
00333 p += mbclen(p, pend, enc);
00334 }
00335 else {
00336 need_escape = 1;
00337 break;
00338 }
00339 }
00340 else if (c != '/' && rb_enc_isprint(c, enc)) {
00341 p += clen;
00342 }
00343 else {
00344 need_escape = 1;
00345 break;
00346 }
00347 }
00348 }
00349 else {
00350 need_escape = 1;
00351 }
00352
00353 if (!need_escape) {
00354 rb_str_buf_cat(str, s, len);
00355 }
00356 else {
00357 int unicode_p = rb_enc_unicode_p(enc);
00358 p = s;
00359 while (p<pend) {
00360 c = rb_enc_ascget(p, pend, &clen, enc);
00361 if (c == '\\' && p+clen < pend) {
00362 int n = clen + mbclen(p+clen, pend, enc);
00363 rb_str_buf_cat(str, p, n);
00364 p += n;
00365 continue;
00366 }
00367 else if (c == '/') {
00368 char c = '\\';
00369 rb_str_buf_cat(str, &c, 1);
00370 rb_str_buf_cat(str, p, clen);
00371 }
00372 else if (c == -1) {
00373 clen = rb_enc_precise_mbclen(p, pend, enc);
00374 if (!MBCLEN_CHARFOUND_P(clen)) {
00375 c = (unsigned char)*p;
00376 clen = 1;
00377 goto hex;
00378 }
00379 if (resenc) {
00380 unsigned int c = rb_enc_mbc_to_codepoint(p, pend, enc);
00381 rb_str_buf_cat_escaped_char(str, c, unicode_p);
00382 }
00383 else {
00384 clen = MBCLEN_CHARFOUND_LEN(clen);
00385 rb_str_buf_cat(str, p, clen);
00386 }
00387 }
00388 else if (rb_enc_isprint(c, enc)) {
00389 rb_str_buf_cat(str, p, clen);
00390 }
00391 else if (!rb_enc_isspace(c, enc)) {
00392 char b[8];
00393
00394 hex:
00395 snprintf(b, sizeof(b), "\\x%02X", c);
00396 rb_str_buf_cat(str, b, 4);
00397 }
00398 else {
00399 rb_str_buf_cat(str, p, clen);
00400 }
00401 p += clen;
00402 }
00403 }
00404 }
00405
00406 static VALUE
00407 rb_reg_desc(const char *s, long len, VALUE re)
00408 {
00409 rb_encoding *enc = rb_enc_get(re);
00410 VALUE str = rb_str_buf_new2("/");
00411 rb_encoding *resenc = rb_default_internal_encoding();
00412 if (resenc == NULL) resenc = rb_default_external_encoding();
00413
00414 if (re && rb_enc_asciicompat(enc)) {
00415 rb_enc_copy(str, re);
00416 }
00417 else {
00418 rb_enc_associate(str, rb_usascii_encoding());
00419 }
00420 rb_reg_expr_str(str, s, len, enc, resenc);
00421 rb_str_buf_cat2(str, "/");
00422 if (re) {
00423 char opts[4];
00424 rb_reg_check(re);
00425 if (*option_to_str(opts, RREGEXP(re)->ptr->options))
00426 rb_str_buf_cat2(str, opts);
00427 if (RBASIC(re)->flags & REG_ENCODING_NONE)
00428 rb_str_buf_cat2(str, "n");
00429 }
00430 OBJ_INFECT(str, re);
00431 return str;
00432 }
00433
00434
00435
00436
00437
00438
00439
00440
00441
00442
00443
00444
00445
00446
00447
00448
00449 static VALUE
00450 rb_reg_source(VALUE re)
00451 {
00452 VALUE str;
00453
00454 rb_reg_check(re);
00455 str = rb_enc_str_new(RREGEXP_SRC_PTR(re),RREGEXP_SRC_LEN(re), rb_enc_get(re));
00456 if (OBJ_TAINTED(re)) OBJ_TAINT(str);
00457 return str;
00458 }
00459
00460
00461
00462
00463
00464
00465
00466
00467
00468
00469
00470
00471
00472 static VALUE
00473 rb_reg_inspect(VALUE re)
00474 {
00475 if (!RREGEXP(re)->ptr || !RREGEXP_SRC(re) || !RREGEXP_SRC_PTR(re)) {
00476 return rb_any_to_s(re);
00477 }
00478 return rb_reg_desc(RREGEXP_SRC_PTR(re), RREGEXP_SRC_LEN(re), re);
00479 }
00480
00481
00482
00483
00484
00485
00486
00487
00488
00489
00490
00491
00492
00493
00494
00495
00496
00497
00498
00499
00500
00501
00502 static VALUE
00503 rb_reg_to_s(VALUE re)
00504 {
00505 int options, opt;
00506 const int embeddable = ONIG_OPTION_MULTILINE|ONIG_OPTION_IGNORECASE|ONIG_OPTION_EXTEND;
00507 long len;
00508 const UChar* ptr;
00509 VALUE str = rb_str_buf_new2("(?");
00510 char optbuf[5];
00511 rb_encoding *enc = rb_enc_get(re);
00512
00513 rb_reg_check(re);
00514
00515 rb_enc_copy(str, re);
00516 options = RREGEXP(re)->ptr->options;
00517 ptr = (UChar*)RREGEXP_SRC_PTR(re);
00518 len = RREGEXP_SRC_LEN(re);
00519 again:
00520 if (len >= 4 && ptr[0] == '(' && ptr[1] == '?') {
00521 int err = 1;
00522 ptr += 2;
00523 if ((len -= 2) > 0) {
00524 do {
00525 opt = char_to_option((int )*ptr);
00526 if (opt != 0) {
00527 options |= opt;
00528 }
00529 else {
00530 break;
00531 }
00532 ++ptr;
00533 } while (--len > 0);
00534 }
00535 if (len > 1 && *ptr == '-') {
00536 ++ptr;
00537 --len;
00538 do {
00539 opt = char_to_option((int )*ptr);
00540 if (opt != 0) {
00541 options &= ~opt;
00542 }
00543 else {
00544 break;
00545 }
00546 ++ptr;
00547 } while (--len > 0);
00548 }
00549 if (*ptr == ')') {
00550 --len;
00551 ++ptr;
00552 goto again;
00553 }
00554 if (*ptr == ':' && ptr[len-1] == ')') {
00555 Regexp *rp;
00556
00557 ++ptr;
00558 len -= 2;
00559 err = onig_new(&rp, ptr, ptr + len, ONIG_OPTION_DEFAULT,
00560 enc, OnigDefaultSyntax, NULL);
00561 onig_free(rp);
00562 }
00563 if (err) {
00564 options = RREGEXP(re)->ptr->options;
00565 ptr = (UChar*)RREGEXP_SRC_PTR(re);
00566 len = RREGEXP_SRC_LEN(re);
00567 }
00568 }
00569
00570 if (*option_to_str(optbuf, options)) rb_str_buf_cat2(str, optbuf);
00571
00572 if ((options & embeddable) != embeddable) {
00573 optbuf[0] = '-';
00574 option_to_str(optbuf + 1, ~options);
00575 rb_str_buf_cat2(str, optbuf);
00576 }
00577
00578 rb_str_buf_cat2(str, ":");
00579 rb_reg_expr_str(str, (char*)ptr, len, enc, NULL);
00580 rb_str_buf_cat2(str, ")");
00581 rb_enc_copy(str, re);
00582
00583 OBJ_INFECT(str, re);
00584 return str;
00585 }
00586
00587 static void
00588 rb_reg_raise(const char *s, long len, const char *err, VALUE re)
00589 {
00590 VALUE desc = rb_reg_desc(s, len, re);
00591
00592 rb_raise(rb_eRegexpError, "%s: %s", err, RSTRING_PTR(desc));
00593 }
00594
00595 static VALUE
00596 rb_enc_reg_error_desc(const char *s, long len, rb_encoding *enc, int options, const char *err)
00597 {
00598 char opts[6];
00599 VALUE desc = rb_str_buf_new2(err);
00600 rb_encoding *resenc = rb_default_internal_encoding();
00601 if (resenc == NULL) resenc = rb_default_external_encoding();
00602
00603 rb_enc_associate(desc, enc);
00604 rb_str_buf_cat2(desc, ": /");
00605 rb_reg_expr_str(desc, s, len, enc, resenc);
00606 opts[0] = '/';
00607 option_to_str(opts + 1, options);
00608 rb_str_buf_cat2(desc, opts);
00609 return rb_exc_new3(rb_eRegexpError, desc);
00610 }
00611
00612 static void
00613 rb_enc_reg_raise(const char *s, long len, rb_encoding *enc, int options, const char *err)
00614 {
00615 rb_exc_raise(rb_enc_reg_error_desc(s, len, enc, options, err));
00616 }
00617
00618 static VALUE
00619 rb_reg_error_desc(VALUE str, int options, const char *err)
00620 {
00621 return rb_enc_reg_error_desc(RSTRING_PTR(str), RSTRING_LEN(str),
00622 rb_enc_get(str), options, err);
00623 }
00624
00625 static void
00626 rb_reg_raise_str(VALUE str, int options, const char *err)
00627 {
00628 rb_exc_raise(rb_reg_error_desc(str, options, err));
00629 }
00630
00631
00632
00633
00634
00635
00636
00637
00638
00639
00640
00641
00642
00643 static VALUE
00644 rb_reg_casefold_p(VALUE re)
00645 {
00646 rb_reg_check(re);
00647 if (RREGEXP(re)->ptr->options & ONIG_OPTION_IGNORECASE) return Qtrue;
00648 return Qfalse;
00649 }
00650
00651
00652
00653
00654
00655
00656
00657
00658
00659
00660
00661
00662
00663
00664
00665
00666
00667
00668
00669
00670
00671
00672
00673
00674
00675 static VALUE
00676 rb_reg_options_m(VALUE re)
00677 {
00678 int options = rb_reg_options(re);
00679 return INT2NUM(options);
00680 }
00681
00682 static int
00683 reg_names_iter(const OnigUChar *name, const OnigUChar *name_end,
00684 int back_num, int *back_refs, OnigRegex regex, void *arg)
00685 {
00686 VALUE ary = (VALUE)arg;
00687 rb_ary_push(ary, rb_str_new((const char *)name, name_end-name));
00688 return 0;
00689 }
00690
00691
00692
00693
00694
00695
00696
00697
00698
00699
00700
00701
00702
00703
00704
00705
00706
00707 static VALUE
00708 rb_reg_names(VALUE re)
00709 {
00710 VALUE ary = rb_ary_new();
00711 rb_reg_check(re);
00712 onig_foreach_name(RREGEXP(re)->ptr, reg_names_iter, (void*)ary);
00713 return ary;
00714 }
00715
00716 static int
00717 reg_named_captures_iter(const OnigUChar *name, const OnigUChar *name_end,
00718 int back_num, int *back_refs, OnigRegex regex, void *arg)
00719 {
00720 VALUE hash = (VALUE)arg;
00721 VALUE ary = rb_ary_new2(back_num);
00722 int i;
00723
00724 for(i = 0; i < back_num; i++)
00725 rb_ary_store(ary, i, INT2NUM(back_refs[i]));
00726
00727 rb_hash_aset(hash, rb_str_new((const char*)name, name_end-name),ary);
00728
00729 return 0;
00730 }
00731
00732
00733
00734
00735
00736
00737
00738
00739
00740
00741
00742
00743
00744
00745
00746
00747
00748
00749
00750
00751
00752
00753
00754 static VALUE
00755 rb_reg_named_captures(VALUE re)
00756 {
00757 VALUE hash = rb_hash_new();
00758 rb_reg_check(re);
00759 onig_foreach_name(RREGEXP(re)->ptr, reg_named_captures_iter, (void*)hash);
00760 return hash;
00761 }
00762
00763 static int
00764 onig_new_with_source(regex_t** reg, const UChar* pattern, const UChar* pattern_end,
00765 OnigOptionType option, OnigEncoding enc, const OnigSyntaxType* syntax,
00766 OnigErrorInfo* einfo, const char *sourcefile, int sourceline)
00767 {
00768 int r;
00769
00770 *reg = (regex_t* )xmalloc(sizeof(regex_t));
00771 if (IS_NULL(*reg)) return ONIGERR_MEMORY;
00772
00773 r = onig_reg_init(*reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax);
00774 if (r) goto err;
00775
00776 r = onig_compile(*reg, pattern, pattern_end, einfo, sourcefile, sourceline);
00777 if (r) {
00778 err:
00779 onig_free(*reg);
00780 *reg = NULL;
00781 }
00782 return r;
00783 }
00784
00785 static Regexp*
00786 make_regexp(const char *s, long len, rb_encoding *enc, int flags, onig_errmsg_buffer err,
00787 const char *sourcefile, int sourceline)
00788 {
00789 Regexp *rp;
00790 int r;
00791 OnigErrorInfo einfo;
00792
00793
00794
00795
00796
00797
00798
00799
00800 r = onig_new_with_source(&rp, (UChar*)s, (UChar*)(s + len), flags,
00801 enc, OnigDefaultSyntax, &einfo, sourcefile, sourceline);
00802 if (r) {
00803 onig_error_code_to_str((UChar*)err, r, &einfo);
00804 return 0;
00805 }
00806 return rp;
00807 }
00808
00809
00810
00811
00812
00813
00814
00815
00816
00817
00818
00819
00820
00821
00822 VALUE rb_cMatch;
00823
00824 static VALUE
00825 match_alloc(VALUE klass)
00826 {
00827 NEWOBJ(match, struct RMatch);
00828 OBJSETUP(match, klass, T_MATCH);
00829
00830 match->str = 0;
00831 match->rmatch = 0;
00832 match->regexp = 0;
00833 match->rmatch = ALLOC(struct rmatch);
00834 MEMZERO(match->rmatch, struct rmatch, 1);
00835
00836 return (VALUE)match;
00837 }
00838
/* A byte offset together with the character offset it corresponds to. */
typedef struct {
    long byte_pos;
    long char_pos;
} pair_t;

/*
 * qsort()/bsearch() comparator ordering pair_t elements by byte_pos.
 * Returns -1, 0 or 1.
 *
 * The result is derived from two comparisons instead of a subtraction,
 * so it needs no SIZEOF_LONG/SIZEOF_INT configuration macros and never
 * narrows a long difference to int.
 */
static int
pair_byte_cmp(const void *pair1, const void *pair2)
{
    const long lhs = ((const pair_t *)pair1)->byte_pos;
    const long rhs = ((const pair_t *)pair2)->byte_pos;

    return (lhs > rhs) - (lhs < rhs);
}
00854
00855 static void
00856 update_char_offset(VALUE match)
00857 {
00858 struct rmatch *rm = RMATCH(match)->rmatch;
00859 struct re_registers *regs;
00860 int i, num_regs, num_pos;
00861 long c;
00862 char *s, *p, *q, *e;
00863 rb_encoding *enc;
00864 pair_t *pairs;
00865
00866 if (rm->char_offset_updated)
00867 return;
00868
00869 regs = &rm->regs;
00870 num_regs = rm->regs.num_regs;
00871
00872 if (rm->char_offset_num_allocated < num_regs) {
00873 REALLOC_N(rm->char_offset, struct rmatch_offset, num_regs);
00874 rm->char_offset_num_allocated = num_regs;
00875 }
00876
00877 enc = rb_enc_get(RMATCH(match)->str);
00878 if (rb_enc_mbmaxlen(enc) == 1) {
00879 for (i = 0; i < num_regs; i++) {
00880 rm->char_offset[i].beg = BEG(i);
00881 rm->char_offset[i].end = END(i);
00882 }
00883 rm->char_offset_updated = 1;
00884 return;
00885 }
00886
00887 pairs = ALLOCA_N(pair_t, num_regs*2);
00888 num_pos = 0;
00889 for (i = 0; i < num_regs; i++) {
00890 if (BEG(i) < 0)
00891 continue;
00892 pairs[num_pos++].byte_pos = BEG(i);
00893 pairs[num_pos++].byte_pos = END(i);
00894 }
00895 qsort(pairs, num_pos, sizeof(pair_t), pair_byte_cmp);
00896
00897 s = p = RSTRING_PTR(RMATCH(match)->str);
00898 e = s + RSTRING_LEN(RMATCH(match)->str);
00899 c = 0;
00900 for (i = 0; i < num_pos; i++) {
00901 q = s + pairs[i].byte_pos;
00902 c += rb_enc_strlen(p, q, enc);
00903 pairs[i].char_pos = c;
00904 p = q;
00905 }
00906
00907 for (i = 0; i < num_regs; i++) {
00908 pair_t key, *found;
00909 if (BEG(i) < 0) {
00910 rm->char_offset[i].beg = -1;
00911 rm->char_offset[i].end = -1;
00912 continue;
00913 }
00914
00915 key.byte_pos = BEG(i);
00916 found = bsearch(&key, pairs, num_pos, sizeof(pair_t), pair_byte_cmp);
00917 rm->char_offset[i].beg = found->char_pos;
00918
00919 key.byte_pos = END(i);
00920 found = bsearch(&key, pairs, num_pos, sizeof(pair_t), pair_byte_cmp);
00921 rm->char_offset[i].end = found->char_pos;
00922 }
00923
00924 rm->char_offset_updated = 1;
00925 }
00926
00927 static void
00928 match_check(VALUE match)
00929 {
00930 if (!RMATCH(match)->regexp) {
00931 rb_raise(rb_eTypeError, "uninitialized Match");
00932 }
00933 }
00934
00935
00936 static VALUE
00937 match_init_copy(VALUE obj, VALUE orig)
00938 {
00939 struct rmatch *rm;
00940
00941 if (obj == orig) return obj;
00942
00943 if (!rb_obj_is_instance_of(orig, rb_obj_class(obj))) {
00944 rb_raise(rb_eTypeError, "wrong argument class");
00945 }
00946 RMATCH(obj)->str = RMATCH(orig)->str;
00947 RMATCH(obj)->regexp = RMATCH(orig)->regexp;
00948
00949 rm = RMATCH(obj)->rmatch;
00950 onig_region_copy(&rm->regs, RMATCH_REGS(orig));
00951
00952 if (!RMATCH(orig)->rmatch->char_offset_updated) {
00953 rm->char_offset_updated = 0;
00954 }
00955 else {
00956 if (rm->char_offset_num_allocated < rm->regs.num_regs) {
00957 REALLOC_N(rm->char_offset, struct rmatch_offset, rm->regs.num_regs);
00958 rm->char_offset_num_allocated = rm->regs.num_regs;
00959 }
00960 MEMCPY(rm->char_offset, RMATCH(orig)->rmatch->char_offset,
00961 struct rmatch_offset, rm->regs.num_regs);
00962 rm->char_offset_updated = 1;
00963 }
00964
00965 return obj;
00966 }
00967
00968
00969
00970
00971
00972
00973
00974
00975
00976
00977
00978
00979 static VALUE
00980 match_regexp(VALUE match)
00981 {
00982 match_check(match);
00983 return RMATCH(match)->regexp;
00984 }
00985
00986
00987
00988
00989
00990
00991
00992
00993
00994
00995
00996
00997
00998
00999
01000 static VALUE
01001 match_names(VALUE match)
01002 {
01003 match_check(match);
01004 return rb_reg_names(RMATCH(match)->regexp);
01005 }
01006
01007
01008
01009
01010
01011
01012
01013
01014
01015
01016
01017
01018
01019 static VALUE
01020 match_size(VALUE match)
01021 {
01022 match_check(match);
01023 return INT2FIX(RMATCH_REGS(match)->num_regs);
01024 }
01025
01026 static int
01027 match_backref_number(VALUE match, VALUE backref)
01028 {
01029 const char *name;
01030 int num;
01031
01032 struct re_registers *regs = RMATCH_REGS(match);
01033 VALUE regexp = RMATCH(match)->regexp;
01034
01035 match_check(match);
01036 switch(TYPE(backref)) {
01037 default:
01038 return NUM2INT(backref);
01039
01040 case T_SYMBOL:
01041 name = rb_id2name(SYM2ID(backref));
01042 break;
01043
01044 case T_STRING:
01045 name = StringValueCStr(backref);
01046 break;
01047 }
01048
01049 num = onig_name_to_backref_number(RREGEXP(regexp)->ptr,
01050 (const unsigned char*)name,
01051 (const unsigned char*)name + strlen(name),
01052 regs);
01053
01054 if (num < 1) {
01055 rb_raise(rb_eIndexError, "undefined group name reference: %s", name);
01056 }
01057
01058 return num;
01059 }
01060
01061 int
01062 rb_reg_backref_number(VALUE match, VALUE backref)
01063 {
01064 return match_backref_number(match, backref);
01065 }
01066
01067
01068
01069
01070
01071
01072
01073
01074
01075
01076
01077
01078
01079
01080
01081
01082
01083
01084
01085 static VALUE
01086 match_offset(VALUE match, VALUE n)
01087 {
01088 int i = match_backref_number(match, n);
01089 struct re_registers *regs = RMATCH_REGS(match);
01090
01091 match_check(match);
01092 if (i < 0 || regs->num_regs <= i)
01093 rb_raise(rb_eIndexError, "index %d out of matches", i);
01094
01095 if (BEG(i) < 0)
01096 return rb_assoc_new(Qnil, Qnil);
01097
01098 update_char_offset(match);
01099 return rb_assoc_new(INT2FIX(RMATCH(match)->rmatch->char_offset[i].beg),
01100 INT2FIX(RMATCH(match)->rmatch->char_offset[i].end));
01101 }
01102
01103
01104
01105
01106
01107
01108
01109
01110
01111
01112
01113
01114
01115
01116
01117
01118
01119
01120
01121 static VALUE
01122 match_begin(VALUE match, VALUE n)
01123 {
01124 int i = match_backref_number(match, n);
01125 struct re_registers *regs = RMATCH_REGS(match);
01126
01127 match_check(match);
01128 if (i < 0 || regs->num_regs <= i)
01129 rb_raise(rb_eIndexError, "index %d out of matches", i);
01130
01131 if (BEG(i) < 0)
01132 return Qnil;
01133
01134 update_char_offset(match);
01135 return INT2FIX(RMATCH(match)->rmatch->char_offset[i].beg);
01136 }
01137
01138
01139
01140
01141
01142
01143
01144
01145
01146
01147
01148
01149
01150
01151
01152
01153
01154
01155
01156 static VALUE
01157 match_end(VALUE match, VALUE n)
01158 {
01159 int i = match_backref_number(match, n);
01160 struct re_registers *regs = RMATCH_REGS(match);
01161
01162 match_check(match);
01163 if (i < 0 || regs->num_regs <= i)
01164 rb_raise(rb_eIndexError, "index %d out of matches", i);
01165
01166 if (BEG(i) < 0)
01167 return Qnil;
01168
01169 update_char_offset(match);
01170 return INT2FIX(RMATCH(match)->rmatch->char_offset[i].end);
01171 }
01172
01173 #define MATCH_BUSY FL_USER2
01174
01175 void
01176 rb_match_busy(VALUE match)
01177 {
01178 FL_SET(match, MATCH_BUSY);
01179 }
01180
01181
01182
01183
01184
01185
01186
01187
01188
01189
01190
01191
01192
01193
01194
01195
01196
01197
01198
01199
01200
01201
01202
01203
01204
01205
01206
01207
01208
01209
01210 static VALUE
01211 rb_reg_fixed_encoding_p(VALUE re)
01212 {
01213 if (FL_TEST(re, KCODE_FIXED))
01214 return Qtrue;
01215 else
01216 return Qfalse;
01217 }
01218
01219 static VALUE
01220 rb_reg_preprocess(const char *p, const char *end, rb_encoding *enc,
01221 rb_encoding **fixed_enc, onig_errmsg_buffer err);
01222
01223
01224 static void
01225 reg_enc_error(VALUE re, VALUE str)
01226 {
01227 rb_raise(rb_eEncCompatError,
01228 "incompatible encoding regexp match (%s regexp with %s string)",
01229 rb_enc_name(rb_enc_get(re)),
01230 rb_enc_name(rb_enc_get(str)));
01231 }
01232
01233 static rb_encoding*
01234 rb_reg_prepare_enc(VALUE re, VALUE str, int warn)
01235 {
01236 rb_encoding *enc = 0;
01237
01238 if (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN) {
01239 rb_raise(rb_eArgError,
01240 "invalid byte sequence in %s",
01241 rb_enc_name(rb_enc_get(str)));
01242 }
01243
01244 rb_reg_check(re);
01245 enc = rb_enc_get(str);
01246 if (!rb_enc_str_asciicompat_p(str)) {
01247 if (RREGEXP(re)->ptr->enc != enc) {
01248 reg_enc_error(re, str);
01249 }
01250 }
01251 else if (rb_reg_fixed_encoding_p(re)) {
01252 if (RREGEXP(re)->ptr->enc != enc &&
01253 (!rb_enc_asciicompat(RREGEXP(re)->ptr->enc) ||
01254 rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT)) {
01255 reg_enc_error(re, str);
01256 }
01257 enc = RREGEXP(re)->ptr->enc;
01258 }
01259 if (warn && (RBASIC(re)->flags & REG_ENCODING_NONE) &&
01260 enc != rb_ascii8bit_encoding() &&
01261 rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) {
01262 rb_warn("regexp match /.../n against to %s string",
01263 rb_enc_name(enc));
01264 }
01265 return enc;
01266 }
01267
01268 regex_t *
01269 rb_reg_prepare_re(VALUE re, VALUE str)
01270 {
01271 regex_t *reg = RREGEXP(re)->ptr;
01272 onig_errmsg_buffer err = "";
01273 int r;
01274 OnigErrorInfo einfo;
01275 const char *pattern;
01276 VALUE unescaped;
01277 rb_encoding *fixed_enc = 0;
01278 rb_encoding *enc = rb_reg_prepare_enc(re, str, 1);
01279
01280 if (reg->enc == enc) return reg;
01281
01282 rb_reg_check(re);
01283 reg = RREGEXP(re)->ptr;
01284 pattern = RREGEXP_SRC_PTR(re);
01285
01286 unescaped = rb_reg_preprocess(
01287 pattern, pattern + RREGEXP_SRC_LEN(re), enc,
01288 &fixed_enc, err);
01289
01290 if (unescaped == Qnil) {
01291 rb_raise(rb_eArgError, "regexp preprocess failed: %s", err);
01292 }
01293
01294 r = onig_new(®, (UChar* )RSTRING_PTR(unescaped),
01295 (UChar* )(RSTRING_PTR(unescaped) + RSTRING_LEN(unescaped)),
01296 reg->options, enc,
01297 OnigDefaultSyntax, &einfo);
01298 if (r) {
01299 onig_error_code_to_str((UChar*)err, r, &einfo);
01300 rb_reg_raise(pattern, RREGEXP_SRC_LEN(re), err, re);
01301 }
01302
01303 RB_GC_GUARD(unescaped);
01304 return reg;
01305 }
01306
01307 long
01308 rb_reg_adjust_startpos(VALUE re, VALUE str, long pos, int reverse)
01309 {
01310 long range;
01311 rb_encoding *enc;
01312 UChar *p, *string;
01313
01314 enc = rb_reg_prepare_enc(re, str, 0);
01315
01316 if (reverse) {
01317 range = -pos;
01318 }
01319 else {
01320 range = RSTRING_LEN(str) - pos;
01321 }
01322
01323 if (pos > 0 && ONIGENC_MBC_MAXLEN(enc) != 1 && pos < RSTRING_LEN(str)) {
01324 string = (UChar*)RSTRING_PTR(str);
01325
01326 if (range > 0) {
01327 p = onigenc_get_right_adjust_char_head(enc, string, string + pos, string + RSTRING_LEN(str));
01328 }
01329 else {
01330 p = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, string, string + pos, string + RSTRING_LEN(str));
01331 }
01332 return p - string;
01333 }
01334
01335 return pos;
01336 }
01337
01338 long
01339 rb_reg_search(VALUE re, VALUE str, long pos, int reverse)
01340 {
01341 long result;
01342 VALUE match;
01343 struct re_registers regi, *regs = ®i;
01344 char *range = RSTRING_PTR(str);
01345 regex_t *reg;
01346 int tmpreg;
01347
01348 if (pos > RSTRING_LEN(str) || pos < 0) {
01349 rb_backref_set(Qnil);
01350 return -1;
01351 }
01352
01353 reg = rb_reg_prepare_re(re, str);
01354 tmpreg = reg != RREGEXP(re)->ptr;
01355 if (!tmpreg) RREGEXP(re)->usecnt++;
01356
01357 match = rb_backref_get();
01358 if (!NIL_P(match)) {
01359 if (FL_TEST(match, MATCH_BUSY)) {
01360 match = Qnil;
01361 }
01362 else {
01363 regs = RMATCH_REGS(match);
01364 }
01365 }
01366 if (NIL_P(match)) {
01367 MEMZERO(regs, struct re_registers, 1);
01368 }
01369 if (!reverse) {
01370 range += RSTRING_LEN(str);
01371 }
01372 result = onig_search(reg,
01373 (UChar*)(RSTRING_PTR(str)),
01374 ((UChar*)(RSTRING_PTR(str)) + RSTRING_LEN(str)),
01375 ((UChar*)(RSTRING_PTR(str)) + pos),
01376 ((UChar*)range),
01377 regs, ONIG_OPTION_NONE);
01378 if (!tmpreg) RREGEXP(re)->usecnt--;
01379 if (tmpreg) {
01380 if (RREGEXP(re)->usecnt) {
01381 onig_free(reg);
01382 }
01383 else {
01384 onig_free(RREGEXP(re)->ptr);
01385 RREGEXP(re)->ptr = reg;
01386 }
01387 }
01388 if (result < 0) {
01389 if (regs == ®i)
01390 onig_region_free(regs, 0);
01391 if (result == ONIG_MISMATCH) {
01392 rb_backref_set(Qnil);
01393 return result;
01394 }
01395 else {
01396 onig_errmsg_buffer err = "";
01397 onig_error_code_to_str((UChar*)err, (int)result);
01398 rb_reg_raise(RREGEXP_SRC_PTR(re), RREGEXP_SRC_LEN(re), err, 0);
01399 }
01400 }
01401
01402 if (NIL_P(match)) {
01403 match = match_alloc(rb_cMatch);
01404 onig_region_copy(RMATCH_REGS(match), regs);
01405 onig_region_free(regs, 0);
01406 }
01407 else {
01408 if (rb_safe_level() >= 3)
01409 OBJ_TAINT(match);
01410 else
01411 FL_UNSET(match, FL_TAINT);
01412 }
01413
01414 RMATCH(match)->str = rb_str_new4(str);
01415 RMATCH(match)->regexp = re;
01416 RMATCH(match)->rmatch->char_offset_updated = 0;
01417 rb_backref_set(match);
01418
01419 OBJ_INFECT(match, re);
01420 OBJ_INFECT(match, str);
01421
01422 return result;
01423 }
01424
01425 VALUE
01426 rb_reg_nth_defined(int nth, VALUE match)
01427 {
01428 struct re_registers *regs;
01429 if (NIL_P(match)) return Qnil;
01430 match_check(match);
01431 regs = RMATCH_REGS(match);
01432 if (nth >= regs->num_regs) {
01433 return Qnil;
01434 }
01435 if (nth < 0) {
01436 nth += regs->num_regs;
01437 if (nth <= 0) return Qnil;
01438 }
01439 if (BEG(nth) == -1) return Qfalse;
01440 return Qtrue;
01441 }
01442
01443 VALUE
01444 rb_reg_nth_match(int nth, VALUE match)
01445 {
01446 VALUE str;
01447 long start, end, len;
01448 struct re_registers *regs;
01449
01450 if (NIL_P(match)) return Qnil;
01451 match_check(match);
01452 regs = RMATCH_REGS(match);
01453 if (nth >= regs->num_regs) {
01454 return Qnil;
01455 }
01456 if (nth < 0) {
01457 nth += regs->num_regs;
01458 if (nth <= 0) return Qnil;
01459 }
01460 start = BEG(nth);
01461 if (start == -1) return Qnil;
01462 end = END(nth);
01463 len = end - start;
01464 str = rb_str_subseq(RMATCH(match)->str, start, len);
01465 OBJ_INFECT(str, match);
01466 return str;
01467 }
01468
01469 VALUE
01470 rb_reg_last_match(VALUE match)
01471 {
01472 return rb_reg_nth_match(0, match);
01473 }
01474
01475
01476
01477
01478
01479
01480
01481
01482
01483
01484
01485
01486
01487 VALUE
01488 rb_reg_match_pre(VALUE match)
01489 {
01490 VALUE str;
01491 struct re_registers *regs;
01492
01493 if (NIL_P(match)) return Qnil;
01494 match_check(match);
01495 regs = RMATCH_REGS(match);
01496 if (BEG(0) == -1) return Qnil;
01497 str = rb_str_subseq(RMATCH(match)->str, 0, BEG(0));
01498 if (OBJ_TAINTED(match)) OBJ_TAINT(str);
01499 return str;
01500 }
01501
01502
01503
01504
01505
01506
01507
01508
01509
01510
01511
01512
01513
01514 VALUE
01515 rb_reg_match_post(VALUE match)
01516 {
01517 VALUE str;
01518 long pos;
01519 struct re_registers *regs;
01520
01521 if (NIL_P(match)) return Qnil;
01522 match_check(match);
01523 regs = RMATCH_REGS(match);
01524 if (BEG(0) == -1) return Qnil;
01525 str = RMATCH(match)->str;
01526 pos = END(0);
01527 str = rb_str_subseq(str, pos, RSTRING_LEN(str) - pos);
01528 if (OBJ_TAINTED(match)) OBJ_TAINT(str);
01529 return str;
01530 }
01531
01532 VALUE
01533 rb_reg_match_last(VALUE match)
01534 {
01535 int i;
01536 struct re_registers *regs;
01537
01538 if (NIL_P(match)) return Qnil;
01539 match_check(match);
01540 regs = RMATCH_REGS(match);
01541 if (BEG(0) == -1) return Qnil;
01542
01543 for (i=regs->num_regs-1; BEG(i) == -1 && i > 0; i--)
01544 ;
01545 if (i == 0) return Qnil;
01546 return rb_reg_nth_match(i, match);
01547 }
01548
01549 static VALUE
01550 last_match_getter(void)
01551 {
01552 return rb_reg_last_match(rb_backref_get());
01553 }
01554
01555 static VALUE
01556 prematch_getter(void)
01557 {
01558 return rb_reg_match_pre(rb_backref_get());
01559 }
01560
01561 static VALUE
01562 postmatch_getter(void)
01563 {
01564 return rb_reg_match_post(rb_backref_get());
01565 }
01566
01567 static VALUE
01568 last_paren_match_getter(void)
01569 {
01570 return rb_reg_match_last(rb_backref_get());
01571 }
01572
01573 static VALUE
01574 match_array(VALUE match, int start)
01575 {
01576 struct re_registers *regs;
01577 VALUE ary;
01578 VALUE target;
01579 int i;
01580 int taint = OBJ_TAINTED(match);
01581
01582 match_check(match);
01583 regs = RMATCH_REGS(match);
01584 ary = rb_ary_new2(regs->num_regs);
01585 target = RMATCH(match)->str;
01586
01587 for (i=start; i<regs->num_regs; i++) {
01588 if (regs->beg[i] == -1) {
01589 rb_ary_push(ary, Qnil);
01590 }
01591 else {
01592 VALUE str = rb_str_subseq(target, regs->beg[i], regs->end[i]-regs->beg[i]);
01593 if (taint) OBJ_TAINT(str);
01594 rb_ary_push(ary, str);
01595 }
01596 }
01597 return ary;
01598 }
01599
01600
01601
01602
01603
01604
01605
01606
01607
01608
01609
01610
01611
01612
01613
01614
01615
01616
01617
01618
01619
01620
01621
01622
01623
01624
01625
01626
01627 static VALUE
01628 match_to_a(VALUE match)
01629 {
01630 return match_array(match, 0);
01631 }
01632
01633
01634
01635
01636
01637
01638
01639
01640
01641
01642
01643
01644
01645
01646 static VALUE
01647 match_captures(VALUE match)
01648 {
01649 return match_array(match, 1);
01650 }
01651
01652 static int
01653 name_to_backref_number(struct re_registers *regs, VALUE regexp, const char* name, const char* name_end)
01654 {
01655 int num;
01656
01657 num = onig_name_to_backref_number(RREGEXP(regexp)->ptr,
01658 (const unsigned char* )name, (const unsigned char* )name_end, regs);
01659 if (num >= 1) {
01660 return num;
01661 }
01662 else {
01663 VALUE s = rb_str_new(name, (long )(name_end - name));
01664 rb_raise(rb_eIndexError, "undefined group name reference: %s",
01665 StringValuePtr(s));
01666 }
01667 }
01668
01669
01670
01671
01672
01673
01674
01675
01676
01677
01678
01679
01680
01681
01682
01683
01684
01685
01686
01687
01688
01689
01690
01691
01692
01693
01694
01695 static VALUE
01696 match_aref(int argc, VALUE *argv, VALUE match)
01697 {
01698 VALUE idx, rest;
01699
01700 match_check(match);
01701 rb_scan_args(argc, argv, "11", &idx, &rest);
01702
01703 if (NIL_P(rest)) {
01704 if (FIXNUM_P(idx)) {
01705 if (FIX2INT(idx) >= 0) {
01706 return rb_reg_nth_match(FIX2INT(idx), match);
01707 }
01708 }
01709 else {
01710 const char *p;
01711 int num;
01712
01713 switch (TYPE(idx)) {
01714 case T_SYMBOL:
01715 p = rb_id2name(SYM2ID(idx));
01716 goto name_to_backref;
01717 break;
01718 case T_STRING:
01719 p = StringValuePtr(idx);
01720
01721 name_to_backref:
01722 num = name_to_backref_number(RMATCH_REGS(match),
01723 RMATCH(match)->regexp, p, p + strlen(p));
01724 return rb_reg_nth_match(num, match);
01725 break;
01726
01727 default:
01728 break;
01729 }
01730 }
01731 }
01732
01733 return rb_ary_aref(argc, argv, match_to_a(match));
01734 }
01735
01736 static VALUE
01737 match_entry(VALUE match, long n)
01738 {
01739
01740 return rb_reg_nth_match((int)n, match);
01741 }
01742
01743
01744
01745
01746
01747
01748
01749
01750
01751
01752
01753
01754
01755
01756
01757 static VALUE
01758 match_values_at(int argc, VALUE *argv, VALUE match)
01759 {
01760 struct re_registers *regs;
01761
01762 match_check(match);
01763 regs = RMATCH_REGS(match);
01764 return rb_get_values_at(match, regs->num_regs, argc, argv, match_entry);
01765 }
01766
01767
01768
01769
01770
01771
01772
01773
01774
01775
01776
01777
01778 static VALUE
01779 match_to_s(VALUE match)
01780 {
01781 VALUE str = rb_reg_last_match(match);
01782
01783 match_check(match);
01784 if (NIL_P(str)) str = rb_str_new(0,0);
01785 if (OBJ_TAINTED(match)) OBJ_TAINT(str);
01786 if (OBJ_TAINTED(RMATCH(match)->str)) OBJ_TAINT(str);
01787 return str;
01788 }
01789
01790
01791
01792
01793
01794
01795
01796
01797
01798
01799
01800
01801 static VALUE
01802 match_string(VALUE match)
01803 {
01804 match_check(match);
01805 return RMATCH(match)->str;
01806 }
01807
01808 struct backref_name_tag {
01809 const UChar *name;
01810 long len;
01811 };
01812
01813 static int
01814 match_inspect_name_iter(const OnigUChar *name, const OnigUChar *name_end,
01815 int back_num, int *back_refs, OnigRegex regex, void *arg0)
01816 {
01817 struct backref_name_tag *arg = (struct backref_name_tag *)arg0;
01818 int i;
01819
01820 for (i = 0; i < back_num; i++) {
01821 arg[back_refs[i]].name = name;
01822 arg[back_refs[i]].len = name_end - name;
01823 }
01824 return 0;
01825 }
01826
01827
01828
01829
01830
01831
01832
01833
01834
01835
01836
01837
01838
01839
01840
01841
01842
01843
01844
01845
01846
01847 static VALUE
01848 match_inspect(VALUE match)
01849 {
01850 const char *cname = rb_obj_classname(match);
01851 VALUE str;
01852 int i;
01853 struct re_registers *regs = RMATCH_REGS(match);
01854 int num_regs = regs->num_regs;
01855 struct backref_name_tag *names;
01856 VALUE regexp = RMATCH(match)->regexp;
01857
01858 if (regexp == 0) {
01859 return rb_sprintf("#<%s:%p>", cname, (void*)match);
01860 }
01861
01862 names = ALLOCA_N(struct backref_name_tag, num_regs);
01863 MEMZERO(names, struct backref_name_tag, num_regs);
01864
01865 onig_foreach_name(RREGEXP(regexp)->ptr,
01866 match_inspect_name_iter, names);
01867
01868 str = rb_str_buf_new2("#<");
01869 rb_str_buf_cat2(str, cname);
01870
01871 for (i = 0; i < num_regs; i++) {
01872 VALUE v;
01873 rb_str_buf_cat2(str, " ");
01874 if (0 < i) {
01875 if (names[i].name)
01876 rb_str_buf_cat(str, (const char *)names[i].name, names[i].len);
01877 else {
01878 rb_str_catf(str, "%d", i);
01879 }
01880 rb_str_buf_cat2(str, ":");
01881 }
01882 v = rb_reg_nth_match(i, match);
01883 if (v == Qnil)
01884 rb_str_buf_cat2(str, "nil");
01885 else
01886 rb_str_buf_append(str, rb_str_inspect(v));
01887 }
01888 rb_str_buf_cat2(str, ">");
01889
01890 return str;
01891 }
01892
01893 VALUE rb_cRegexp;
01894
01895 static int
01896 read_escaped_byte(const char **pp, const char *end, onig_errmsg_buffer err)
01897 {
01898 const char *p = *pp;
01899 int code;
01900 int meta_prefix = 0, ctrl_prefix = 0;
01901 size_t len;
01902 int retbyte;
01903
01904 retbyte = -1;
01905 if (p == end || *p++ != '\\') {
01906 errcpy(err, "too short escaped multibyte character");
01907 return -1;
01908 }
01909
01910 again:
01911 if (p == end) {
01912 errcpy(err, "too short escape sequence");
01913 return -1;
01914 }
01915 switch (*p++) {
01916 case '\\': code = '\\'; break;
01917 case 'n': code = '\n'; break;
01918 case 't': code = '\t'; break;
01919 case 'r': code = '\r'; break;
01920 case 'f': code = '\f'; break;
01921 case 'v': code = '\013'; break;
01922 case 'a': code = '\007'; break;
01923 case 'e': code = '\033'; break;
01924
01925
01926 case '0': case '1': case '2': case '3':
01927 case '4': case '5': case '6': case '7':
01928 p--;
01929 code = scan_oct(p, end < p+3 ? end-p : 3, &len);
01930 p += len;
01931 break;
01932
01933 case 'x':
01934 code = scan_hex(p, end < p+2 ? end-p : 2, &len);
01935 if (len < 1) {
01936 errcpy(err, "invalid hex escape");
01937 return -1;
01938 }
01939 p += len;
01940 break;
01941
01942 case 'M':
01943 if (meta_prefix) {
01944 errcpy(err, "duplicate meta escape");
01945 return -1;
01946 }
01947 meta_prefix = 1;
01948 if (p+1 < end && *p++ == '-' && (*p & 0x80) == 0) {
01949 if (*p == '\\') {
01950 p++;
01951 goto again;
01952 }
01953 else {
01954 code = *p++;
01955 break;
01956 }
01957 }
01958 errcpy(err, "too short meta escape");
01959 return -1;
01960
01961 case 'C':
01962 if (p == end || *p++ != '-') {
01963 errcpy(err, "too short control escape");
01964 return -1;
01965 }
01966 case 'c':
01967 if (ctrl_prefix) {
01968 errcpy(err, "duplicate control escape");
01969 return -1;
01970 }
01971 ctrl_prefix = 1;
01972 if (p < end && (*p & 0x80) == 0) {
01973 if (*p == '\\') {
01974 p++;
01975 goto again;
01976 }
01977 else {
01978 code = *p++;
01979 break;
01980 }
01981 }
01982 errcpy(err, "too short control escape");
01983 return -1;
01984
01985 default:
01986 errcpy(err, "unexpected escape sequence");
01987 return -1;
01988 }
01989 if (code < 0 || 0xff < code) {
01990 errcpy(err, "invalid escape code");
01991 return -1;
01992 }
01993
01994 if (ctrl_prefix)
01995 code &= 0x1f;
01996 if (meta_prefix)
01997 code |= 0x80;
01998
01999 *pp = p;
02000 return code;
02001 }
02002
02003 static int
02004 unescape_escaped_nonascii(const char **pp, const char *end, rb_encoding *enc,
02005 VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
02006 {
02007 const char *p = *pp;
02008 int chmaxlen = rb_enc_mbmaxlen(enc);
02009 char *chbuf = ALLOCA_N(char, chmaxlen);
02010 int chlen = 0;
02011 int byte;
02012 int l;
02013
02014 memset(chbuf, 0, chmaxlen);
02015
02016 byte = read_escaped_byte(&p, end, err);
02017 if (byte == -1) {
02018 return -1;
02019 }
02020
02021 chbuf[chlen++] = byte;
02022 while (chlen < chmaxlen &&
02023 MBCLEN_NEEDMORE_P(rb_enc_precise_mbclen(chbuf, chbuf+chlen, enc))) {
02024 byte = read_escaped_byte(&p, end, err);
02025 if (byte == -1) {
02026 return -1;
02027 }
02028 chbuf[chlen++] = byte;
02029 }
02030
02031 l = rb_enc_precise_mbclen(chbuf, chbuf+chlen, enc);
02032 if (MBCLEN_INVALID_P(l)) {
02033 errcpy(err, "invalid multibyte escape");
02034 return -1;
02035 }
02036 if (1 < chlen || (chbuf[0] & 0x80)) {
02037 rb_str_buf_cat(buf, chbuf, chlen);
02038
02039 if (*encp == 0)
02040 *encp = enc;
02041 else if (*encp != enc) {
02042 errcpy(err, "escaped non ASCII character in UTF-8 regexp");
02043 return -1;
02044 }
02045 }
02046 else {
02047 char escbuf[5];
02048 snprintf(escbuf, sizeof(escbuf), "\\x%02X", chbuf[0]&0xff);
02049 rb_str_buf_cat(buf, escbuf, 4);
02050 }
02051 *pp = p;
02052 return 0;
02053 }
02054
02055 static int
02056 check_unicode_range(unsigned long code, onig_errmsg_buffer err)
02057 {
02058 if ((0xd800 <= code && code <= 0xdfff) ||
02059 0x10ffff < code) {
02060 errcpy(err, "invalid Unicode range");
02061 return -1;
02062 }
02063 return 0;
02064 }
02065
02066 static int
02067 append_utf8(unsigned long uv,
02068 VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
02069 {
02070 if (check_unicode_range(uv, err) != 0)
02071 return -1;
02072 if (uv < 0x80) {
02073 char escbuf[5];
02074 snprintf(escbuf, sizeof(escbuf), "\\x%02X", (int)uv);
02075 rb_str_buf_cat(buf, escbuf, 4);
02076 }
02077 else {
02078 int len;
02079 char utf8buf[6];
02080 len = rb_uv_to_utf8(utf8buf, uv);
02081 rb_str_buf_cat(buf, utf8buf, len);
02082
02083 if (*encp == 0)
02084 *encp = rb_utf8_encoding();
02085 else if (*encp != rb_utf8_encoding()) {
02086 errcpy(err, "UTF-8 character in non UTF-8 regexp");
02087 return -1;
02088 }
02089 }
02090 return 0;
02091 }
02092
02093 static int
02094 unescape_unicode_list(const char **pp, const char *end,
02095 VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
02096 {
02097 const char *p = *pp;
02098 int has_unicode = 0;
02099 unsigned long code;
02100 size_t len;
02101
02102 while (p < end && ISSPACE(*p)) p++;
02103
02104 while (1) {
02105 code = ruby_scan_hex(p, end-p, &len);
02106 if (len == 0)
02107 break;
02108 if (6 < len) {
02109 errcpy(err, "invalid Unicode range");
02110 return -1;
02111 }
02112 p += len;
02113 if (append_utf8(code, buf, encp, err) != 0)
02114 return -1;
02115 has_unicode = 1;
02116
02117 while (p < end && ISSPACE(*p)) p++;
02118 }
02119
02120 if (has_unicode == 0) {
02121 errcpy(err, "invalid Unicode list");
02122 return -1;
02123 }
02124
02125 *pp = p;
02126
02127 return 0;
02128 }
02129
02130 static int
02131 unescape_unicode_bmp(const char **pp, const char *end,
02132 VALUE buf, rb_encoding **encp, onig_errmsg_buffer err)
02133 {
02134 const char *p = *pp;
02135 size_t len;
02136 unsigned long code;
02137
02138 if (end < p+4) {
02139 errcpy(err, "invalid Unicode escape");
02140 return -1;
02141 }
02142 code = ruby_scan_hex(p, 4, &len);
02143 if (len != 4) {
02144 errcpy(err, "invalid Unicode escape");
02145 return -1;
02146 }
02147 if (append_utf8(code, buf, encp, err) != 0)
02148 return -1;
02149 *pp = p + 4;
02150 return 0;
02151 }
02152
02153 static int
02154 unescape_nonascii(const char *p, const char *end, rb_encoding *enc,
02155 VALUE buf, rb_encoding **encp, int *has_property,
02156 onig_errmsg_buffer err)
02157 {
02158 char c;
02159 char smallbuf[2];
02160
02161 while (p < end) {
02162 int chlen = rb_enc_precise_mbclen(p, end, enc);
02163 if (!MBCLEN_CHARFOUND_P(chlen)) {
02164 errcpy(err, "invalid multibyte character");
02165 return -1;
02166 }
02167 chlen = MBCLEN_CHARFOUND_LEN(chlen);
02168 if (1 < chlen || (*p & 0x80)) {
02169 rb_str_buf_cat(buf, p, chlen);
02170 p += chlen;
02171 if (*encp == 0)
02172 *encp = enc;
02173 else if (*encp != enc) {
02174 errcpy(err, "non ASCII character in UTF-8 regexp");
02175 return -1;
02176 }
02177 continue;
02178 }
02179
02180 switch (c = *p++) {
02181 case '\\':
02182 if (p == end) {
02183 errcpy(err, "too short escape sequence");
02184 return -1;
02185 }
02186 switch (c = *p++) {
02187 case '1': case '2': case '3':
02188 case '4': case '5': case '6': case '7':
02189 {
02190 size_t octlen;
02191 if (ruby_scan_oct(p-1, end-(p-1), &octlen) <= 0177) {
02192
02193
02194
02195 goto escape_asis;
02196 }
02197 }
02198
02199
02200 case '0':
02201
02202 case 'x':
02203 case 'c':
02204 case 'C':
02205 case 'M':
02206 p = p-2;
02207 if (unescape_escaped_nonascii(&p, end, enc, buf, encp, err) != 0)
02208 return -1;
02209 break;
02210
02211 case 'u':
02212 if (p == end) {
02213 errcpy(err, "too short escape sequence");
02214 return -1;
02215 }
02216 if (*p == '{') {
02217
02218 p++;
02219 if (unescape_unicode_list(&p, end, buf, encp, err) != 0)
02220 return -1;
02221 if (p == end || *p++ != '}') {
02222 errcpy(err, "invalid Unicode list");
02223 return -1;
02224 }
02225 break;
02226 }
02227 else {
02228
02229 if (unescape_unicode_bmp(&p, end, buf, encp, err) != 0)
02230 return -1;
02231 break;
02232 }
02233
02234 case 'p':
02235 case 'P':
02236 if (!*encp) {
02237 *has_property = 1;
02238 }
02239 goto escape_asis;
02240
02241 default:
02242 escape_asis:
02243 smallbuf[0] = '\\';
02244 smallbuf[1] = c;
02245 rb_str_buf_cat(buf, smallbuf, 2);
02246 break;
02247 }
02248 break;
02249
02250 default:
02251 rb_str_buf_cat(buf, &c, 1);
02252 break;
02253 }
02254 }
02255
02256 return 0;
02257 }
02258
02259 static VALUE
02260 rb_reg_preprocess(const char *p, const char *end, rb_encoding *enc,
02261 rb_encoding **fixed_enc, onig_errmsg_buffer err)
02262 {
02263 VALUE buf;
02264 int has_property = 0;
02265
02266 buf = rb_str_buf_new(0);
02267
02268 if (rb_enc_asciicompat(enc))
02269 *fixed_enc = 0;
02270 else {
02271 *fixed_enc = enc;
02272 rb_enc_associate(buf, enc);
02273 }
02274
02275 if (unescape_nonascii(p, end, enc, buf, fixed_enc, &has_property, err) != 0)
02276 return Qnil;
02277
02278 if (has_property && !*fixed_enc) {
02279 *fixed_enc = enc;
02280 }
02281
02282 if (*fixed_enc) {
02283 rb_enc_associate(buf, *fixed_enc);
02284 }
02285
02286 return buf;
02287 }
02288
02289 VALUE
02290 rb_reg_check_preprocess(VALUE str)
02291 {
02292 rb_encoding *fixed_enc = 0;
02293 onig_errmsg_buffer err = "";
02294 VALUE buf;
02295 char *p, *end;
02296 rb_encoding *enc;
02297
02298 StringValue(str);
02299 p = RSTRING_PTR(str);
02300 end = p + RSTRING_LEN(str);
02301 enc = rb_enc_get(str);
02302
02303 buf = rb_reg_preprocess(p, end, enc, &fixed_enc, err);
02304 RB_GC_GUARD(str);
02305
02306 if (buf == Qnil) {
02307 return rb_reg_error_desc(str, 0, err);
02308 }
02309 return Qnil;
02310 }
02311
02312 static VALUE
02313 rb_reg_preprocess_dregexp(VALUE ary, int options)
02314 {
02315 rb_encoding *fixed_enc = 0;
02316 rb_encoding *regexp_enc = 0;
02317 onig_errmsg_buffer err = "";
02318 int i;
02319 VALUE result = 0;
02320 rb_encoding *ascii8bit = rb_ascii8bit_encoding();
02321
02322 if (RARRAY_LEN(ary) == 0) {
02323 rb_raise(rb_eArgError, "no arguments given");
02324 }
02325
02326 for (i = 0; i < RARRAY_LEN(ary); i++) {
02327 VALUE str = RARRAY_PTR(ary)[i];
02328 VALUE buf;
02329 char *p, *end;
02330 rb_encoding *src_enc;
02331
02332 src_enc = rb_enc_get(str);
02333 if (options & ARG_ENCODING_NONE &&
02334 src_enc != ascii8bit) {
02335 if (rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT)
02336 rb_raise(rb_eRegexpError, "/.../n has a non escaped non ASCII character in non ASCII-8BIT script");
02337 else
02338 src_enc = ascii8bit;
02339 }
02340
02341 StringValue(str);
02342 p = RSTRING_PTR(str);
02343 end = p + RSTRING_LEN(str);
02344
02345 buf = rb_reg_preprocess(p, end, src_enc, &fixed_enc, err);
02346
02347 if (buf == Qnil)
02348 rb_raise(rb_eArgError, "%s", err);
02349
02350 if (fixed_enc != 0) {
02351 if (regexp_enc != 0 && regexp_enc != fixed_enc) {
02352 rb_raise(rb_eRegexpError, "encoding mismatch in dynamic regexp : %s and %s",
02353 rb_enc_name(regexp_enc), rb_enc_name(fixed_enc));
02354 }
02355 regexp_enc = fixed_enc;
02356 }
02357
02358 if (!result)
02359 result = rb_str_new3(str);
02360 else
02361 rb_str_buf_append(result, str);
02362 }
02363 if (regexp_enc) {
02364 rb_enc_associate(result, regexp_enc);
02365 }
02366
02367 return result;
02368 }
02369
02370 static int
02371 rb_reg_initialize(VALUE obj, const char *s, long len, rb_encoding *enc,
02372 int options, onig_errmsg_buffer err,
02373 const char *sourcefile, int sourceline)
02374 {
02375 struct RRegexp *re = RREGEXP(obj);
02376 VALUE unescaped;
02377 rb_encoding *fixed_enc = 0;
02378 rb_encoding *a_enc = rb_ascii8bit_encoding();
02379
02380 if (!OBJ_UNTRUSTED(obj) && rb_safe_level() >= 4)
02381 rb_raise(rb_eSecurityError, "Insecure: can't modify regexp");
02382 rb_check_frozen(obj);
02383 if (FL_TEST(obj, REG_LITERAL))
02384 rb_raise(rb_eSecurityError, "can't modify literal regexp");
02385 if (re->ptr)
02386 rb_raise(rb_eTypeError, "already initialized regexp");
02387 re->ptr = 0;
02388
02389 if (rb_enc_dummy_p(enc)) {
02390 errcpy(err, "can't make regexp with dummy encoding");
02391 return -1;
02392 }
02393
02394 unescaped = rb_reg_preprocess(s, s+len, enc, &fixed_enc, err);
02395 if (unescaped == Qnil)
02396 return -1;
02397
02398 if (fixed_enc) {
02399 if ((fixed_enc != enc && (options & ARG_ENCODING_FIXED)) ||
02400 (fixed_enc != a_enc && (options & ARG_ENCODING_NONE))) {
02401 errcpy(err, "incompatible character encoding");
02402 return -1;
02403 }
02404 if (fixed_enc != a_enc) {
02405 options |= ARG_ENCODING_FIXED;
02406 enc = fixed_enc;
02407 }
02408 }
02409 else if (!(options & ARG_ENCODING_FIXED)) {
02410 enc = rb_usascii_encoding();
02411 }
02412
02413 rb_enc_associate((VALUE)re, enc);
02414 if ((options & ARG_ENCODING_FIXED) || fixed_enc) {
02415 re->basic.flags |= KCODE_FIXED;
02416 }
02417 if (options & ARG_ENCODING_NONE) {
02418 re->basic.flags |= REG_ENCODING_NONE;
02419 }
02420
02421 re->ptr = make_regexp(RSTRING_PTR(unescaped), RSTRING_LEN(unescaped), enc,
02422 options & ARG_REG_OPTION_MASK, err,
02423 sourcefile, sourceline);
02424 if (!re->ptr) return -1;
02425 re->src = rb_enc_str_new(s, len, enc);
02426 OBJ_FREEZE(re->src);
02427 RB_GC_GUARD(unescaped);
02428 return 0;
02429 }
02430
02431 static int
02432 rb_reg_initialize_str(VALUE obj, VALUE str, int options, onig_errmsg_buffer err,
02433 const char *sourcefile, int sourceline)
02434 {
02435 int ret;
02436 rb_encoding *enc = rb_enc_get(str);
02437 if (options & ARG_ENCODING_NONE) {
02438 rb_encoding *ascii8bit = rb_ascii8bit_encoding();
02439 if (enc != ascii8bit) {
02440 if (rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) {
02441 errcpy(err, "/.../n has a non escaped non ASCII character in non ASCII-8BIT script");
02442 return -1;
02443 }
02444 enc = ascii8bit;
02445 }
02446 }
02447 ret = rb_reg_initialize(obj, RSTRING_PTR(str), RSTRING_LEN(str), enc,
02448 options, err, sourcefile, sourceline);
02449 RB_GC_GUARD(str);
02450 return ret;
02451 }
02452
02453 static VALUE
02454 rb_reg_s_alloc(VALUE klass)
02455 {
02456 NEWOBJ(re, struct RRegexp);
02457 OBJSETUP(re, klass, T_REGEXP);
02458
02459 re->ptr = 0;
02460 re->src = 0;
02461 re->usecnt = 0;
02462
02463 return (VALUE)re;
02464 }
02465
02466 VALUE
02467 rb_reg_alloc(void)
02468 {
02469 return rb_reg_s_alloc(rb_cRegexp);
02470 }
02471
02472 VALUE
02473 rb_reg_new_str(VALUE s, int options)
02474 {
02475 return rb_reg_init_str(rb_reg_alloc(), s, options);
02476 }
02477
02478 VALUE
02479 rb_reg_init_str(VALUE re, VALUE s, int options)
02480 {
02481 onig_errmsg_buffer err = "";
02482
02483 if (rb_reg_initialize_str(re, s, options, err, NULL, 0) != 0) {
02484 rb_reg_raise_str(s, options, err);
02485 }
02486
02487 return re;
02488 }
02489
02490 VALUE
02491 rb_reg_new_ary(VALUE ary, int opt)
02492 {
02493 return rb_reg_new_str(rb_reg_preprocess_dregexp(ary, opt), opt);
02494 }
02495
02496 VALUE
02497 rb_enc_reg_new(const char *s, long len, rb_encoding *enc, int options)
02498 {
02499 VALUE re = rb_reg_alloc();
02500 onig_errmsg_buffer err = "";
02501
02502 if (rb_reg_initialize(re, s, len, enc, options, err, NULL, 0) != 0) {
02503 rb_enc_reg_raise(s, len, enc, options, err);
02504 }
02505
02506 return re;
02507 }
02508
02509 VALUE
02510 rb_reg_new(const char *s, long len, int options)
02511 {
02512 return rb_enc_reg_new(s, len, rb_ascii8bit_encoding(), options);
02513 }
02514
02515 VALUE
02516 rb_reg_compile(VALUE str, int options, const char *sourcefile, int sourceline)
02517 {
02518 VALUE re = rb_reg_alloc();
02519 onig_errmsg_buffer err = "";
02520
02521 if (!str) str = rb_str_new(0,0);
02522 if (rb_reg_initialize_str(re, str, options, err, sourcefile, sourceline) != 0) {
02523 rb_set_errinfo(rb_reg_error_desc(str, options, err));
02524 return Qnil;
02525 }
02526 FL_SET(re, REG_LITERAL);
02527 return re;
02528 }
02529
02530 static VALUE reg_cache;
02531
02532 VALUE
02533 rb_reg_regcomp(VALUE str)
02534 {
02535 volatile VALUE save_str = str;
02536 if (reg_cache && RREGEXP_SRC_LEN(reg_cache) == RSTRING_LEN(str)
02537 && ENCODING_GET(reg_cache) == ENCODING_GET(str)
02538 && memcmp(RREGEXP_SRC_PTR(reg_cache), RSTRING_PTR(str), RSTRING_LEN(str)) == 0)
02539 return reg_cache;
02540
02541 return reg_cache = rb_reg_new_str(save_str, 0);
02542 }
02543
02544 static st_index_t reg_hash(VALUE re);
02545
02546
02547
02548
02549
02550
02551
02552 static VALUE
02553 rb_reg_hash(VALUE re)
02554 {
02555 st_index_t hashval = reg_hash(re);
02556 return LONG2FIX(hashval);
02557 }
02558
02559 static st_index_t
02560 reg_hash(VALUE re)
02561 {
02562 st_index_t hashval;
02563
02564 rb_reg_check(re);
02565 hashval = RREGEXP(re)->ptr->options;
02566 hashval = rb_hash_uint(hashval, rb_memhash(RREGEXP_SRC_PTR(re), RREGEXP_SRC_LEN(re)));
02567 return rb_hash_end(hashval);
02568 }
02569
02570
02571
02572
02573
02574
02575
02576
02577
02578
02579
02580
02581
02582
02583
02584
02585
02586 static VALUE
02587 rb_reg_equal(VALUE re1, VALUE re2)
02588 {
02589 if (re1 == re2) return Qtrue;
02590 if (TYPE(re2) != T_REGEXP) return Qfalse;
02591 rb_reg_check(re1); rb_reg_check(re2);
02592 if (FL_TEST(re1, KCODE_FIXED) != FL_TEST(re2, KCODE_FIXED)) return Qfalse;
02593 if (RREGEXP(re1)->ptr->options != RREGEXP(re2)->ptr->options) return Qfalse;
02594 if (RREGEXP_SRC_LEN(re1) != RREGEXP_SRC_LEN(re2)) return Qfalse;
02595 if (ENCODING_GET(re1) != ENCODING_GET(re2)) return Qfalse;
02596 if (memcmp(RREGEXP_SRC_PTR(re1), RREGEXP_SRC_PTR(re2), RREGEXP_SRC_LEN(re1)) == 0) {
02597 return Qtrue;
02598 }
02599 return Qfalse;
02600 }
02601
02602
02603
02604
02605
02606
02607
02608
02609
02610 static VALUE
02611 match_hash(VALUE match)
02612 {
02613 const struct re_registers *regs;
02614 st_index_t hashval = rb_hash_start(rb_str_hash(RMATCH(match)->str));
02615
02616 rb_hash_uint(hashval, reg_hash(RMATCH(match)->regexp));
02617 regs = RMATCH_REGS(match);
02618 hashval = rb_hash_uint(hashval, regs->num_regs);
02619 hashval = rb_hash_uint(hashval, rb_memhash(regs->beg, regs->num_regs * sizeof(*regs->beg)));
02620 hashval = rb_hash_uint(hashval, rb_memhash(regs->end, regs->num_regs * sizeof(*regs->end)));
02621 hashval = rb_hash_end(hashval);
02622 return LONG2FIX(hashval);
02623 }
02624
02625
02626
02627
02628
02629
02630
02631
02632
02633 static VALUE
02634 match_equal(VALUE match1, VALUE match2)
02635 {
02636 const struct re_registers *regs1, *regs2;
02637 if (match1 == match2) return Qtrue;
02638 if (TYPE(match2) != T_MATCH) return Qfalse;
02639 if (!rb_str_equal(RMATCH(match1)->str, RMATCH(match2)->str)) return Qfalse;
02640 if (!rb_reg_equal(RMATCH(match1)->regexp, RMATCH(match2)->regexp)) return Qfalse;
02641 regs1 = RMATCH_REGS(match1);
02642 regs2 = RMATCH_REGS(match2);
02643 if (regs1->num_regs != regs2->num_regs) return Qfalse;
02644 if (memcmp(regs1->beg, regs2->beg, regs1->num_regs * sizeof(*regs1->beg))) return Qfalse;
02645 if (memcmp(regs1->end, regs2->end, regs1->num_regs * sizeof(*regs1->end))) return Qfalse;
02646 return Qtrue;
02647 }
02648
02649 static VALUE
02650 reg_operand(VALUE s, int check)
02651 {
02652 if (SYMBOL_P(s)) {
02653 return rb_sym_to_s(s);
02654 }
02655 else {
02656 VALUE tmp = rb_check_string_type(s);
02657 if (check && NIL_P(tmp)) {
02658 rb_raise(rb_eTypeError, "can't convert %s to String",
02659 rb_obj_classname(s));
02660 }
02661 return tmp;
02662 }
02663 }
02664
02665 static long
02666 reg_match_pos(VALUE re, VALUE *strp, long pos)
02667 {
02668 VALUE str = *strp;
02669
02670 if (NIL_P(str)) {
02671 rb_backref_set(Qnil);
02672 return -1;
02673 }
02674 *strp = str = reg_operand(str, TRUE);
02675 if (pos != 0) {
02676 if (pos < 0) {
02677 VALUE l = rb_str_length(str);
02678 pos += NUM2INT(l);
02679 if (pos < 0) {
02680 return pos;
02681 }
02682 }
02683 pos = rb_str_offset(str, pos);
02684 }
02685 return rb_reg_search(re, str, pos, 0);
02686 }
02687
02688
02689
02690
02691
02692
02693
02694
02695
02696
02697
02698
02699
02700
02701
02702
02703
02704
02705
02706
02707
02708
02709
02710
02711
02712
02713
02714
02715
02716
02717
02718
02719
02720
02721
02722
02723
02724
02725
02726
02727
02728
02729
02730
02731
02732
02733
02734
02735
02736 VALUE
02737 rb_reg_match(VALUE re, VALUE str)
02738 {
02739 long pos = reg_match_pos(re, &str, 0);
02740 if (pos < 0) return Qnil;
02741 pos = rb_str_sublen(str, pos);
02742 return LONG2FIX(pos);
02743 }
02744
02745
02746
02747
02748
02749
02750
02751
02752
02753
02754
02755
02756
02757
02758
02759
02760
02761
02762
02763 VALUE
02764 rb_reg_eqq(VALUE re, VALUE str)
02765 {
02766 long start;
02767
02768 str = reg_operand(str, FALSE);
02769 if (NIL_P(str)) {
02770 rb_backref_set(Qnil);
02771 return Qfalse;
02772 }
02773 start = rb_reg_search(re, str, 0, 0);
02774 if (start < 0) {
02775 return Qfalse;
02776 }
02777 return Qtrue;
02778 }
02779
02780
02781
02782
02783
02784
02785
02786
02787
02788
02789
02790
02791
02792 VALUE
02793 rb_reg_match2(VALUE re)
02794 {
02795 long start;
02796 VALUE line = rb_lastline_get();
02797
02798 if (TYPE(line) != T_STRING) {
02799 rb_backref_set(Qnil);
02800 return Qnil;
02801 }
02802
02803 start = rb_reg_search(re, line, 0, 0);
02804 if (start < 0) {
02805 return Qnil;
02806 }
02807 start = rb_str_sublen(line, start);
02808 return LONG2FIX(start);
02809 }
02810
02811
02812
02813
02814
02815
02816
02817
02818
02819
02820
02821
02822
02823
02824
02825
02826
02827
02828
02829
02830
02831
02832
02833
02834
02835
02836
02837
02838
02839
02840 static VALUE
02841 rb_reg_match_m(int argc, VALUE *argv, VALUE re)
02842 {
02843 VALUE result, str, initpos;
02844 long pos;
02845
02846 if (rb_scan_args(argc, argv, "11", &str, &initpos) == 2) {
02847 pos = NUM2LONG(initpos);
02848 }
02849 else {
02850 pos = 0;
02851 }
02852
02853 pos = reg_match_pos(re, &str, pos);
02854 if (pos < 0) {
02855 rb_backref_set(Qnil);
02856 return Qnil;
02857 }
02858 result = rb_backref_get();
02859 rb_match_busy(result);
02860 if (!NIL_P(result) && rb_block_given_p()) {
02861 return rb_yield(result);
02862 }
02863 return result;
02864 }
02865
02866
02867
02868
02869
02870
02871
02872
02873
02874
02875
02876
02877
02878
02879
02880
02881
02882
02883
02884
02885
02886
02887
02888
02889
02890
02891
02892
02893
02894
02895
02896 static VALUE
02897 rb_reg_initialize_m(int argc, VALUE *argv, VALUE self)
02898 {
02899 onig_errmsg_buffer err = "";
02900 int flags = 0;
02901 VALUE str;
02902 rb_encoding *enc;
02903 const char *ptr;
02904 long len;
02905
02906 if (argc == 0 || argc > 3) {
02907 rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..3)", argc);
02908 }
02909 if (TYPE(argv[0]) == T_REGEXP) {
02910 VALUE re = argv[0];
02911
02912 if (argc > 1) {
02913 rb_warn("flags ignored");
02914 }
02915 rb_reg_check(re);
02916 flags = rb_reg_options(re);
02917 ptr = RREGEXP_SRC_PTR(re);
02918 len = RREGEXP_SRC_LEN(re);
02919 enc = rb_enc_get(re);
02920 if (rb_reg_initialize(self, ptr, len, enc, flags, err, NULL, 0)) {
02921 str = rb_enc_str_new(ptr, len, enc);
02922 rb_reg_raise_str(str, flags, err);
02923 }
02924 }
02925 else {
02926 if (argc >= 2) {
02927 if (FIXNUM_P(argv[1])) flags = FIX2INT(argv[1]);
02928 else if (RTEST(argv[1])) flags = ONIG_OPTION_IGNORECASE;
02929 }
02930 enc = 0;
02931 if (argc == 3 && !NIL_P(argv[2])) {
02932 char *kcode = StringValuePtr(argv[2]);
02933 if (kcode[0] == 'n' || kcode[0] == 'N') {
02934 enc = rb_ascii8bit_encoding();
02935 flags |= ARG_ENCODING_NONE;
02936 }
02937 else {
02938 rb_warn("encoding option is ignored - %s", kcode);
02939 }
02940 }
02941 str = argv[0];
02942 ptr = StringValuePtr(str);
02943 if (enc
02944 ? rb_reg_initialize(self, ptr, RSTRING_LEN(str), enc, flags, err, NULL, 0)
02945 : rb_reg_initialize_str(self, str, flags, err, NULL, 0)) {
02946 rb_reg_raise_str(str, flags, err);
02947 }
02948 }
02949 return self;
02950 }
02951
02952 VALUE
02953 rb_reg_quote(VALUE str)
02954 {
02955 rb_encoding *enc = rb_enc_get(str);
02956 char *s, *send, *t;
02957 VALUE tmp;
02958 int c, clen;
02959 int ascii_only = rb_enc_str_asciionly_p(str);
02960
02961 s = RSTRING_PTR(str);
02962 send = s + RSTRING_LEN(str);
02963 while (s < send) {
02964 c = rb_enc_ascget(s, send, &clen, enc);
02965 if (c == -1) {
02966 s += mbclen(s, send, enc);
02967 continue;
02968 }
02969 switch (c) {
02970 case '[': case ']': case '{': case '}':
02971 case '(': case ')': case '|': case '-':
02972 case '*': case '.': case '\\':
02973 case '?': case '+': case '^': case '$':
02974 case ' ': case '#':
02975 case '\t': case '\f': case '\v': case '\n': case '\r':
02976 goto meta_found;
02977 }
02978 s += clen;
02979 }
02980 tmp = rb_str_new3(str);
02981 if (ascii_only) {
02982 rb_enc_associate(tmp, rb_usascii_encoding());
02983 }
02984 return tmp;
02985
02986 meta_found:
02987 tmp = rb_str_new(0, RSTRING_LEN(str)*2);
02988 if (ascii_only) {
02989 rb_enc_associate(tmp, rb_usascii_encoding());
02990 }
02991 else {
02992 rb_enc_copy(tmp, str);
02993 }
02994 t = RSTRING_PTR(tmp);
02995
02996 memcpy(t, RSTRING_PTR(str), s - RSTRING_PTR(str));
02997 t += s - RSTRING_PTR(str);
02998
02999 while (s < send) {
03000 c = rb_enc_ascget(s, send, &clen, enc);
03001 if (c == -1) {
03002 int n = mbclen(s, send, enc);
03003
03004 while (n--)
03005 *t++ = *s++;
03006 continue;
03007 }
03008 s += clen;
03009 switch (c) {
03010 case '[': case ']': case '{': case '}':
03011 case '(': case ')': case '|': case '-':
03012 case '*': case '.': case '\\':
03013 case '?': case '+': case '^': case '$':
03014 case '#':
03015 t += rb_enc_mbcput('\\', t, enc);
03016 break;
03017 case ' ':
03018 t += rb_enc_mbcput('\\', t, enc);
03019 t += rb_enc_mbcput(' ', t, enc);
03020 continue;
03021 case '\t':
03022 t += rb_enc_mbcput('\\', t, enc);
03023 t += rb_enc_mbcput('t', t, enc);
03024 continue;
03025 case '\n':
03026 t += rb_enc_mbcput('\\', t, enc);
03027 t += rb_enc_mbcput('n', t, enc);
03028 continue;
03029 case '\r':
03030 t += rb_enc_mbcput('\\', t, enc);
03031 t += rb_enc_mbcput('r', t, enc);
03032 continue;
03033 case '\f':
03034 t += rb_enc_mbcput('\\', t, enc);
03035 t += rb_enc_mbcput('f', t, enc);
03036 continue;
03037 case '\v':
03038 t += rb_enc_mbcput('\\', t, enc);
03039 t += rb_enc_mbcput('v', t, enc);
03040 continue;
03041 }
03042 t += rb_enc_mbcput(c, t, enc);
03043 }
03044 rb_str_resize(tmp, t - RSTRING_PTR(tmp));
03045 OBJ_INFECT(tmp, str);
03046 return tmp;
03047 }
03048
03049
03050
03051
03052
03053
03054
03055
03056
03057
03058
03059
03060
03061
03062
03063
03064 static VALUE
03065 rb_reg_s_quote(VALUE c, VALUE str)
03066 {
03067 return rb_reg_quote(reg_operand(str, TRUE));
03068 }
03069
03070 int
03071 rb_reg_options(VALUE re)
03072 {
03073 int options;
03074
03075 rb_reg_check(re);
03076 options = RREGEXP(re)->ptr->options & ARG_REG_OPTION_MASK;
03077 if (RBASIC(re)->flags & KCODE_FIXED) options |= ARG_ENCODING_FIXED;
03078 if (RBASIC(re)->flags & REG_ENCODING_NONE) options |= ARG_ENCODING_NONE;
03079 return options;
03080 }
03081
03082 VALUE
03083 rb_check_regexp_type(VALUE re)
03084 {
03085 return rb_check_convert_type(re, T_REGEXP, "Regexp", "to_regexp");
03086 }
03087
03088
03089
03090
03091
03092
03093
03094
03095
03096
03097
03098
03099
03100
03101
03102
03103
03104
03105 static VALUE
03106 rb_reg_s_try_convert(VALUE dummy, VALUE re)
03107 {
03108 return rb_check_regexp_type(re);
03109 }
03110
03111 static VALUE
03112 rb_reg_s_union(VALUE self, VALUE args0)
03113 {
03114 long argc = RARRAY_LEN(args0);
03115
03116 if (argc == 0) {
03117 VALUE args[1];
03118 args[0] = rb_str_new2("(?!)");
03119 return rb_class_new_instance(1, args, rb_cRegexp);
03120 }
03121 else if (argc == 1) {
03122 VALUE arg = rb_ary_entry(args0, 0);
03123 VALUE re = rb_check_regexp_type(arg);
03124 if (!NIL_P(re))
03125 return re;
03126 else {
03127 VALUE quoted;
03128 quoted = rb_reg_s_quote(Qnil, arg);
03129 return rb_reg_new_str(quoted, 0);
03130 }
03131 }
03132 else {
03133 int i;
03134 VALUE source = rb_str_buf_new(0);
03135 rb_encoding *result_enc;
03136
03137 int has_asciionly = 0;
03138 rb_encoding *has_ascii_compat_fixed = 0;
03139 rb_encoding *has_ascii_incompat = 0;
03140
03141 for (i = 0; i < argc; i++) {
03142 volatile VALUE v;
03143 VALUE e = rb_ary_entry(args0, i);
03144
03145 if (0 < i)
03146 rb_str_buf_cat_ascii(source, "|");
03147
03148 v = rb_check_regexp_type(e);
03149 if (!NIL_P(v)) {
03150 rb_encoding *enc = rb_enc_get(v);
03151 if (!rb_enc_asciicompat(enc)) {
03152 if (!has_ascii_incompat)
03153 has_ascii_incompat = enc;
03154 else if (has_ascii_incompat != enc)
03155 rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
03156 rb_enc_name(has_ascii_incompat), rb_enc_name(enc));
03157 }
03158 else if (rb_reg_fixed_encoding_p(v)) {
03159 if (!has_ascii_compat_fixed)
03160 has_ascii_compat_fixed = enc;
03161 else if (has_ascii_compat_fixed != enc)
03162 rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
03163 rb_enc_name(has_ascii_compat_fixed), rb_enc_name(enc));
03164 }
03165 else {
03166 has_asciionly = 1;
03167 }
03168 v = rb_reg_to_s(v);
03169 }
03170 else {
03171 rb_encoding *enc;
03172 StringValue(e);
03173 enc = rb_enc_get(e);
03174 if (!rb_enc_str_asciicompat_p(e)) {
03175 if (!has_ascii_incompat)
03176 has_ascii_incompat = enc;
03177 else if (has_ascii_incompat != enc)
03178 rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
03179 rb_enc_name(has_ascii_incompat), rb_enc_name(enc));
03180 }
03181 else if (rb_enc_str_asciionly_p(e)) {
03182 has_asciionly = 1;
03183 }
03184 else {
03185 if (!has_ascii_compat_fixed)
03186 has_ascii_compat_fixed = enc;
03187 else if (has_ascii_compat_fixed != enc)
03188 rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
03189 rb_enc_name(has_ascii_compat_fixed), rb_enc_name(enc));
03190 }
03191 v = rb_reg_s_quote(Qnil, e);
03192 }
03193 if (has_ascii_incompat) {
03194 if (has_asciionly) {
03195 rb_raise(rb_eArgError, "ASCII incompatible encoding: %s",
03196 rb_enc_name(has_ascii_incompat));
03197 }
03198 if (has_ascii_compat_fixed) {
03199 rb_raise(rb_eArgError, "incompatible encodings: %s and %s",
03200 rb_enc_name(has_ascii_incompat), rb_enc_name(has_ascii_compat_fixed));
03201 }
03202 }
03203
03204 if (i == 0) {
03205 rb_enc_copy(source, v);
03206 }
03207 rb_str_append(source, v);
03208 }
03209
03210 if (has_ascii_incompat) {
03211 result_enc = has_ascii_incompat;
03212 }
03213 else if (has_ascii_compat_fixed) {
03214 result_enc = has_ascii_compat_fixed;
03215 }
03216 else {
03217 result_enc = rb_ascii8bit_encoding();
03218 }
03219
03220 rb_enc_associate(source, result_enc);
03221 return rb_class_new_instance(1, &source, rb_cRegexp);
03222 }
03223 }
03224
03225
03226
03227
03228
03229
03230
03231
03232
03233
03234
03235
03236
03237
03238
03239
03240
03241
03242
03243 static VALUE
03244 rb_reg_s_union_m(VALUE self, VALUE args)
03245 {
03246 VALUE v;
03247 if (RARRAY_LEN(args) == 1 &&
03248 !NIL_P(v = rb_check_array_type(rb_ary_entry(args, 0)))) {
03249 return rb_reg_s_union(self, v);
03250 }
03251 return rb_reg_s_union(self, args);
03252 }
03253
03254
03255 static VALUE
03256 rb_reg_init_copy(VALUE copy, VALUE re)
03257 {
03258 onig_errmsg_buffer err = "";
03259 const char *s;
03260 long len;
03261
03262 if (copy == re) return copy;
03263 rb_check_frozen(copy);
03264
03265 if (!rb_obj_is_instance_of(re, rb_obj_class(copy))) {
03266 rb_raise(rb_eTypeError, "wrong argument type");
03267 }
03268 rb_reg_check(re);
03269 s = RREGEXP_SRC_PTR(re);
03270 len = RREGEXP_SRC_LEN(re);
03271 if (rb_reg_initialize(copy, s, len, rb_enc_get(re), rb_reg_options(re),
03272 err, NULL, 0) != 0) {
03273 rb_reg_raise(s, len, err, re);
03274 }
03275 return copy;
03276 }
03277
03278 VALUE
03279 rb_reg_regsub(VALUE str, VALUE src, struct re_registers *regs, VALUE regexp)
03280 {
03281 VALUE val = 0;
03282 char *p, *s, *e;
03283 int no, clen;
03284 rb_encoding *str_enc = rb_enc_get(str);
03285 rb_encoding *src_enc = rb_enc_get(src);
03286 int acompat = rb_enc_asciicompat(str_enc);
03287 #define ASCGET(s,e,cl) (acompat ? (*cl=1,ISASCII(s[0])?s[0]:-1) : rb_enc_ascget(s, e, cl, str_enc))
03288
03289 p = s = RSTRING_PTR(str);
03290 e = s + RSTRING_LEN(str);
03291
03292 while (s < e) {
03293 int c = ASCGET(s, e, &clen);
03294 char *ss;
03295
03296 if (c == -1) {
03297 s += mbclen(s, e, str_enc);
03298 continue;
03299 }
03300 ss = s;
03301 s += clen;
03302
03303 if (c != '\\' || s == e) continue;
03304
03305 if (!val) {
03306 val = rb_str_buf_new(ss-p);
03307 }
03308 rb_enc_str_buf_cat(val, p, ss-p, str_enc);
03309
03310 c = ASCGET(s, e, &clen);
03311 if (c == -1) {
03312 s += mbclen(s, e, str_enc);
03313 rb_enc_str_buf_cat(val, ss, s-ss, str_enc);
03314 p = s;
03315 continue;
03316 }
03317 s += clen;
03318
03319 p = s;
03320 switch (c) {
03321 case '1': case '2': case '3': case '4':
03322 case '5': case '6': case '7': case '8': case '9':
03323 if (onig_noname_group_capture_is_active(RREGEXP(regexp)->ptr)) {
03324 no = c - '0';
03325 }
03326 else {
03327 continue;
03328 }
03329 break;
03330
03331 case 'k':
03332 if (s < e && ASCGET(s, e, &clen) == '<') {
03333 char *name, *name_end;
03334
03335 name_end = name = s + clen;
03336 while (name_end < e) {
03337 c = ASCGET(name_end, e, &clen);
03338 if (c == '>') break;
03339 name_end += c == -1 ? mbclen(name_end, e, str_enc) : clen;
03340 }
03341 if (name_end < e) {
03342 no = name_to_backref_number(regs, regexp, name, name_end);
03343 p = s = name_end + clen;
03344 break;
03345 }
03346 else {
03347 rb_raise(rb_eRuntimeError, "invalid group name reference format");
03348 }
03349 }
03350
03351 rb_enc_str_buf_cat(val, ss, s-ss, str_enc);
03352 continue;
03353
03354 case '0':
03355 case '&':
03356 no = 0;
03357 break;
03358
03359 case '`':
03360 rb_enc_str_buf_cat(val, RSTRING_PTR(src), BEG(0), src_enc);
03361 continue;
03362
03363 case '\'':
03364 rb_enc_str_buf_cat(val, RSTRING_PTR(src)+END(0), RSTRING_LEN(src)-END(0), src_enc);
03365 continue;
03366
03367 case '+':
03368 no = regs->num_regs-1;
03369 while (BEG(no) == -1 && no > 0) no--;
03370 if (no == 0) continue;
03371 break;
03372
03373 case '\\':
03374 rb_enc_str_buf_cat(val, s-clen, clen, str_enc);
03375 continue;
03376
03377 default:
03378 rb_enc_str_buf_cat(val, ss, s-ss, str_enc);
03379 continue;
03380 }
03381
03382 if (no >= 0) {
03383 if (no >= regs->num_regs) continue;
03384 if (BEG(no) == -1) continue;
03385 rb_enc_str_buf_cat(val, RSTRING_PTR(src)+BEG(no), END(no)-BEG(no), src_enc);
03386 }
03387 }
03388
03389 if (!val) return str;
03390 if (p < e) {
03391 rb_enc_str_buf_cat(val, p, e-p, str_enc);
03392 }
03393
03394 return val;
03395 }
03396
03397 static VALUE
03398 kcode_getter(void)
03399 {
03400 rb_warn("variable $KCODE is no longer effective");
03401 return Qnil;
03402 }
03403
03404 static void
03405 kcode_setter(VALUE val, ID id)
03406 {
03407 rb_warn("variable $KCODE is no longer effective; ignored");
03408 }
03409
03410 static VALUE
03411 ignorecase_getter(void)
03412 {
03413 rb_warn("variable $= is no longer effective");
03414 return Qfalse;
03415 }
03416
03417 static void
03418 ignorecase_setter(VALUE val, ID id)
03419 {
03420 rb_warn("variable $= is no longer effective; ignored");
03421 }
03422
03423 static VALUE
03424 match_getter(void)
03425 {
03426 VALUE match = rb_backref_get();
03427
03428 if (NIL_P(match)) return Qnil;
03429 rb_match_busy(match);
03430 return match;
03431 }
03432
03433 static void
03434 match_setter(VALUE val)
03435 {
03436 if (!NIL_P(val)) {
03437 Check_Type(val, T_MATCH);
03438 }
03439 rb_backref_set(val);
03440 }
03441
03442
03443
03444
03445
03446
03447
03448
03449
03450
03451
03452
03453
03454
03455
03456
03457
03458
03459
03460
03461
03462
03463
03464
03465
03466
03467
03468 static VALUE
03469 rb_reg_s_last_match(int argc, VALUE *argv)
03470 {
03471 VALUE nth;
03472
03473 if (argc > 0 && rb_scan_args(argc, argv, "01", &nth) == 1) {
03474 VALUE match = rb_backref_get();
03475 int n;
03476 if (NIL_P(match)) return Qnil;
03477 n = match_backref_number(match, nth);
03478 return rb_reg_nth_match(n, match);
03479 }
03480 return match_getter();
03481 }
03482
/*
 * Warning callback handed to onig_set_warn_func() and
 * onig_set_verb_warn_func() in Init_Regexp.
 *
 * The message is passed as an argument to a fixed "%s" format rather than
 * as the format itself, so any '%' it contains is printed literally.
 */
static void
re_warn(const char *s)
{
    rb_warn("%s", s);
}
03488
03489
03490
03491
03492
03493
03494
03495
03496
03497
03498
03499
03500
03501
03502
03503
03504
03505
03506
03507
03508
03509
03510
03511
03512 void
03513 Init_Regexp(void)
03514 {
03515 rb_eRegexpError = rb_define_class("RegexpError", rb_eStandardError);
03516
03517 onigenc_set_default_caseconv_table((UChar*)casetable);
03518 onigenc_set_default_encoding(ONIG_ENCODING_ASCII);
03519 onig_set_warn_func(re_warn);
03520 onig_set_verb_warn_func(re_warn);
03521
03522 rb_define_virtual_variable("$~", match_getter, match_setter);
03523 rb_define_virtual_variable("$&", last_match_getter, 0);
03524 rb_define_virtual_variable("$`", prematch_getter, 0);
03525 rb_define_virtual_variable("$'", postmatch_getter, 0);
03526 rb_define_virtual_variable("$+", last_paren_match_getter, 0);
03527
03528 rb_define_virtual_variable("$=", ignorecase_getter, ignorecase_setter);
03529 rb_define_virtual_variable("$KCODE", kcode_getter, kcode_setter);
03530 rb_define_virtual_variable("$-K", kcode_getter, kcode_setter);
03531
03532 rb_cRegexp = rb_define_class("Regexp", rb_cObject);
03533 rb_define_alloc_func(rb_cRegexp, rb_reg_s_alloc);
03534 rb_define_singleton_method(rb_cRegexp, "compile", rb_class_new_instance, -1);
03535 rb_define_singleton_method(rb_cRegexp, "quote", rb_reg_s_quote, 1);
03536 rb_define_singleton_method(rb_cRegexp, "escape", rb_reg_s_quote, 1);
03537 rb_define_singleton_method(rb_cRegexp, "union", rb_reg_s_union_m, -2);
03538 rb_define_singleton_method(rb_cRegexp, "last_match", rb_reg_s_last_match, -1);
03539 rb_define_singleton_method(rb_cRegexp, "try_convert", rb_reg_s_try_convert, 1);
03540
03541 rb_define_method(rb_cRegexp, "initialize", rb_reg_initialize_m, -1);
03542 rb_define_method(rb_cRegexp, "initialize_copy", rb_reg_init_copy, 1);
03543 rb_define_method(rb_cRegexp, "hash", rb_reg_hash, 0);
03544 rb_define_method(rb_cRegexp, "eql?", rb_reg_equal, 1);
03545 rb_define_method(rb_cRegexp, "==", rb_reg_equal, 1);
03546 rb_define_method(rb_cRegexp, "=~", rb_reg_match, 1);
03547 rb_define_method(rb_cRegexp, "===", rb_reg_eqq, 1);
03548 rb_define_method(rb_cRegexp, "~", rb_reg_match2, 0);
03549 rb_define_method(rb_cRegexp, "match", rb_reg_match_m, -1);
03550 rb_define_method(rb_cRegexp, "to_s", rb_reg_to_s, 0);
03551 rb_define_method(rb_cRegexp, "inspect", rb_reg_inspect, 0);
03552 rb_define_method(rb_cRegexp, "source", rb_reg_source, 0);
03553 rb_define_method(rb_cRegexp, "casefold?", rb_reg_casefold_p, 0);
03554 rb_define_method(rb_cRegexp, "options", rb_reg_options_m, 0);
03555 rb_define_method(rb_cRegexp, "encoding", rb_obj_encoding, 0);
03556 rb_define_method(rb_cRegexp, "fixed_encoding?", rb_reg_fixed_encoding_p, 0);
03557 rb_define_method(rb_cRegexp, "names", rb_reg_names, 0);
03558 rb_define_method(rb_cRegexp, "named_captures", rb_reg_named_captures, 0);
03559
03560 rb_define_const(rb_cRegexp, "IGNORECASE", INT2FIX(ONIG_OPTION_IGNORECASE));
03561 rb_define_const(rb_cRegexp, "EXTENDED", INT2FIX(ONIG_OPTION_EXTEND));
03562 rb_define_const(rb_cRegexp, "MULTILINE", INT2FIX(ONIG_OPTION_MULTILINE));
03563 rb_define_const(rb_cRegexp, "FIXEDENCODING", INT2FIX(ARG_ENCODING_FIXED));
03564
03565 rb_global_variable(®_cache);
03566
03567 rb_cMatch = rb_define_class("MatchData", rb_cObject);
03568 rb_define_alloc_func(rb_cMatch, match_alloc);
03569 rb_undef_method(CLASS_OF(rb_cMatch), "new");
03570
03571 rb_define_method(rb_cMatch, "initialize_copy", match_init_copy, 1);
03572 rb_define_method(rb_cMatch, "regexp", match_regexp, 0);
03573 rb_define_method(rb_cMatch, "names", match_names, 0);
03574 rb_define_method(rb_cMatch, "size", match_size, 0);
03575 rb_define_method(rb_cMatch, "length", match_size, 0);
03576 rb_define_method(rb_cMatch, "offset", match_offset, 1);
03577 rb_define_method(rb_cMatch, "begin", match_begin, 1);
03578 rb_define_method(rb_cMatch, "end", match_end, 1);
03579 rb_define_method(rb_cMatch, "to_a", match_to_a, 0);
03580 rb_define_method(rb_cMatch, "[]", match_aref, -1);
03581 rb_define_method(rb_cMatch, "captures", match_captures, 0);
03582 rb_define_method(rb_cMatch, "values_at", match_values_at, -1);
03583 rb_define_method(rb_cMatch, "pre_match", rb_reg_match_pre, 0);
03584 rb_define_method(rb_cMatch, "post_match", rb_reg_match_post, 0);
03585 rb_define_method(rb_cMatch, "to_s", match_to_s, 0);
03586 rb_define_method(rb_cMatch, "inspect", match_inspect, 0);
03587 rb_define_method(rb_cMatch, "string", match_string, 0);
03588 rb_define_method(rb_cMatch, "hash", match_hash, 0);
03589 rb_define_method(rb_cMatch, "eql?", match_equal, 1);
03590 rb_define_method(rb_cMatch, "==", match_equal, 1);
03591 }
03592