ureg - portable BRE/ERE regex api
#include "ureg.h"
//main func
ureg_t*
ureg_new(const char*
reg
[,
int reglen,
int syn]
);
ureg_t*
ureg_new_raw(const char*
reg
[,
int reglen,
int syn]
);
int ureg_search(ureg_t*
robj,
const
char*
s [,
int slen]
);
int ureg_search_head(ureg_t*
robj,
const char*
s [,
int slen]
);
void ureg_free(ureg_t*
robj);
//main object info
typedef struct ureg_tag {
void* data; //opaque regobj
char* fastmap; //fastmap char[255]. holds 1st byte hit or not
char* lc; //save regex locale
int rawflg; //locale flag
const char* p; //result buff
int sz;
const char* pbk[10];
int szbk[10]; //buff
} ureg_t;
//support func
ureg_t ureg_easy(const char*
reg,
const char*
s
[,
int reglen,
int slen,
int
syn]
) ;
int ureg_ere_syn(void);
int ureg_bre_syn(void);
void*
ureg_iconv(const char*
fromenc,
const char*
toenc,
char**
s [,
int
slen]
);
#include "ureg.h"
int main() {
ureg_t res = ureg_easy("a[bc]", "123abc"); //dfl:ERE
printf("%.*s \n", res.sz, res.p); // == "ab"
return 0;
}
//~$ cc src.c libureg.a
#include "ureg.h"
int main() {
//simple
ureg_t res = ureg_easy("a[bc]", "123abc");
//res.p="123abc"+3, res.sz=2
res = ureg_easy("a(b|c)", "123abc");
//res.pbk[1]="123abc"+3,res.szbk[1]=1
printf("%.*s", res.szbk[0], res.pbk[0]); //>> "ab"
//complex
ureg_t* obj = ureg_new("a(b|c)"); //rtn NULL: err
const char* s = "123abc";
int rc = ureg_search(obj, s); //rc== hit:>=0 nohit/err:<0
//rc=3, obj->p=s+3(rc), obj->sz=2
//obj->pbk[0]= s+3, obj->szbk[0]=2 //p=pbk[0]: all the match str
//obj->pbk[1]= s+4, obj->szbk[1]=1 //bkref: (b|c) hits 123a(b)c
//obj->pbk[2]= NULL,obj->szbk[2]=-1 //set NULL/-1 if fail
rc = ureg_search_head(obj, s); //regex hits only string top
// rc= -1, obj->p= NULL, obj->sz= -1 //nohit
rc = ureg_search_head(obj, s+3); // rc=0, obj->p= s+3, obj->sz= 2
ureg_free(obj);
setlocale(LC_CTYPE, "UTF32"); // ~$ iconv -l
obj = ureg_new_raw("a(b|c)");
setlocale(LC_CTYPE, "C"); // reset
rc = ureg_search(obj, s);
ureg_free(obj);
//ureg_iconv(): not regex. conv encode UTF-8, EBCDIC etc
const char* s0 = "hwd"; //"hwd"=="\150\167\144" in C charset
const char* s1 = "\210\246\204"; //"hwd" in EBCDIC-BR
char* sp = (char*)s0; //copy of srcptr. ureg_iconv() moves it
int* p = ureg_iconv("C","EBCDIC-BR",&sp,2); //(from,to,s0_adrs,len)
int binsz = p[0]; //==2, conv result(EBCDIC-BR) bytesz
char* pp = (char*)(p+1);
// pp[0]=='\210', pp[1]=='\246', pp[2]=='\0'
// srcptr sp is changed to "hwd"+2, "(hw)d" pos, sp[0]=='d'
// | binsz | c | c | 0 |
// int char char \0
free(p); // ureg_iconv() rtns sz+bin malloc()ed ptr.
return 0;
}
//~$ cc src.c libureg.a
ureg uses posix-ERE with the following extensions by default. some items are
compromises made for practical reasons
- [] and .(dot) include \n and \0. posix doesnt allow byte \0.
- [] and .(dot) ignore bytes that are not a valid character in the locale. (should be err. gave up)
eg) [\100\377] >> [\100] (ascii locale)
eg) [Σ\377]==[\u03a3] : 1char, multibyte locale
eg) [Σ\377]==[\316\243\377] : 3 char, C locale
- allow bytedata except [] and .(dot) eg) "\377z[0-9]" >> valid
- normal char esc \@ is treated as @ (should be err. gave up)
ureg_t ureg_easy(r, s [, rlen, slen, syn]);
ureg_easy() is for oneshot search. pass
-1 if rlen/slen/syn isnt set.
the search result is returned. if nohit, res.p==NULL and res.sz<0.
res = ureg_easy("a[bc]", "123abc"); //hit,
res.p="..."+3, res.sz=2
res = ureg_easy("a[bc]", "123abc", -1,-1,-1); //same
param
r : regex str
s : search target string/binary
[rlen]: r size. use strlen(r) if set -1/noset
[slen]: s size
[syn] : change regex syntax. see below complex mode
return
res.p : matched ptr. res.p[0]=='a' in above sample. "123(a)bc"
res.sz: match string byte size
others: see below (complex mode)
ureg_easy() has the following restrictions
- locale rawmode isnt supported. always uses the sys locale(UTF-8 etc)
- executes malloc()/free() every time
sample:
#include "ureg.h"
int main(int argc, char** argv){
ureg_t res = ureg_easy("a[bc]", "123abc");
printf("%.*s \n", res.sz, res.p); //>> disp "ab"
return 0; //no memleak
}
// ~$ gcc src.c libureg.a
divided to 3 parts, compile >> search >> free
1. compile regex
ureg_t* robj = ureg_new(r [,rlen, syn] );
ureg_t* robj = ureg_new_raw(r [,rlen, syn] );
int syn = ureg_ere_syn();
int syn = ureg_bre_syn();
C-lang uses "C" locale in default setting and all 8bit chars are valid.
(www.gnu.org/software/grep/manual/html_node/Character-Encoding.html)
eg) int main(){ puts(setlocale(LC_CTYPE, NULL)); } //>>
"C"
and regex semantics differ between locale "C" and
"UTF-8".
C : "[Σ]" == "[\316\243]" ==
"(\316|\243)" //binary \316 or \243
u8: "[Σ]" == "\316\243" //binary sequence is
1 character
- ureg_new()
- use OS locale, setlocale(LC_CTYPE, "") (see ~$ echo $LANG) and
use its locale when search.
- ureg_new_raw()
- uses current locale and never changes locale setting, so adjusting locale
is the user's duty. if new/search locale isnt the same, you may get strange
results. robj->lc saves setlocale(LC_CTYPE,NULL) str.
sample:
#include "ureg.h"
int main() {
ureg_t* obj = ureg_new("[Σ]");
puts(obj->lc); // "XX.UTF-8" etc
ureg_search(obj, "xΣyz"); // match len: obj->sz=2
ureg_free(obj);
obj = ureg_new_raw("[Σ]"); //== "(\316|\243)" at locale "C"
puts(obj->lc); // "C" etc
ureg_search(obj, "xΣyz"); // obj->sz=1
ureg_free(obj);
setlocale(LC_CTYPE, "C.UTF-8");
obj = ureg_new_raw("[Σ]"); //== "\316\243" at UTF-8
setlocale(LC_CTYPE, "C");
ureg_search(obj, "xΣyz"); //obj->sz=1, use current locale
puts(obj->lc); //>> "C.UTF-8"
ureg_free(obj);
return 0;
}
//~$ cc src.c libureg.a
malloc()/free() isnt executed every time, so you can avoid the overhead. but locale
setting is very confusing so I recommend you to use ureg_new() except
rare/special usage.
param
- r
- regex str. mb class/range works only under the mb locale env.
[Σ-Τ] == "(\u03a3|\u03a4)" //"XXX.UTF-8"
[Σ-Τ] == [\316\243-\316\244]== [\243-\316]
//"C"
- [rlen]
- r size. use strlen(r) if set -1/noset
- [syn]
- use ERE if -1/noset. ureg_ere_syn()/ureg_bre_syn() returns ERE/BRE flag.
int syn = ureg_bre_syn(); // or ureg_ere_syn();
robj = ureg_new("a\\(b\\)c", -1, syn);
rc = ureg_search(robj, "00abc");//hit rc=2
return
- robj->data
- ptr to compiled regex data
- robj->fastmap
- char arr[255]. if r="[ab]c", arr['a'], ['b'] is 1. others is
0.
- robj->lc
- copy of setlocale(LC_CTYPE, NULL) str when regex compiled.
- robj->rawflg
- 0/1 == ureg_new()/ureg_new_raw()
- others
- result buffer
2. search
int rc = ureg_search(robj, s [, slen] );
int rc = ureg_search_head(robj, s [, slen] );
- ureg_search()
- finds match data from s. use strlen(s) if slen= -1/noset. if you used
ureg_new(), use robj->lc locale setting automatically.
- ureg_search_head()
- check only BOS like lex scanner
param
- robj
- search result is set to this member. set NULL / -1 if nohit.
- char* robj->p
- full matching ptrpos
- int robj->sz
- byte size of matching str.
- char* robj->pbk[10]
- back reference ptrpos. robj->p == robj->pbk[0]
- int robj->szbk[10]
- byte size of back references. robj->sz==robj->szbk[0]. posix-ERE
syntax doesnt allow backref \1-9, but holding backref data is allowed.
posix regcomp(), REG_NOSUB also holds its data.
- s
- search target string ptr
- [slen]
- use strlen(s) if -1/noset
return
- int rc
- funcs rtns >=0 or <0 == hit/nohit. srcptr+rc will be hittop
ptr.
sample:
r = "a[bc]";
obj = ureg_new(r);
s = "123abc";
rc = ureg_search(obj, s); //rc=3, hit: "...(ab)c", s[3]='a'
rc = ureg_search_head(obj, s); //rc<0, nohit
rc = ureg_search_head(obj, s+3); //rc=0, hit "(ab)c", (s+3)[rc]='a'
search funcs dont support partial match. you need the full text.
r = "a(bc)";
s = "123abc";
rc = ureg_search(obj, s, 4); //"123a", reg hits 'a' but fail. rc<0
// if supports rc= -5(partial/morestr) etc, you may use fgetc()
back reference result \1-\9 is set to pbk/szbk[1-9]. [0] has fullmatch data.
index 1-9 follows the order of the open paren "(". the same as posix.
reg: ( a ( b | c ) )
bk : \1 \2
back reference sample:
r = "a[bc]";
s = "123abc";
robj = ureg_new(r);
rc = ureg_search(robj, s);
//robj->p=s+3, robj->sz=2
//pbk[0]=p,szbk[0]=sz, pbk[1]=NULL, szbk[1]=-1
r = "\\(a\\)\\(bc\\)";
s = "123abc";
robj = ureg_new(r, -1, ureg_bre_syn() );
rc = ureg_search(robj, s); //robj->p, pbk is as below
// str: "123abc", reg: "(a)(bc)"
// p : ...ooo., robj->sz=3, rc=3, robj->p==s+rc
// p0 : (p0==p)
// p1 : ...o..., robj->szbk[1]== 1
// p2 : ....oo., robj->szbk[2]== 2
// p3 : NULL, robj->szbk[3]== -1
3. free
void ureg_free(robj);
robj is malloc()ed. free memory when you finish regex search.
void* p = ureg_iconv(fenc, tenc, &s [, slen]);
ureg_iconv() is not a regex function. it converts data between different encodings. the posix iconv()
api/manual is very awkward, so this wrapper is provided.
param
fenc : from enc type str. "EBCDIC-BR" etc. see ~$ iconv -l
tenc : to enc
&s : srcptr-ref. this arg is changed by the func, so pass a copy of the pointer.
[slen]: src byte size. use strlen(*&s) if set -1/noset.
return
- p
- suc/fail == notNULL/NULL. converted binary stream with bytesize. use type
punning (int), (char*) to get sz and binptr. byte sequence is as follows.
| binsz | c | c | 0 |
int char char \0 (if binsz=2)
...
char* s = (char*)"hw";
int* p = ureg_iconv("ASCII", "UTF32", &s); //p is malloc()ed
int binsz = p[0]; //==12, conv bytesz. BOM(4b) + h(4b) + w(4b)
char* bin = (char*)(p+1);
printf("%.*s \n", binsz, bin);
free(p);
if allsrc is converted to other enc, s is set as s == src+slen. if detect
invalid byte sequences, s is set to its address.
all suc: "oooooooo"
........s(maybe \0 or other)
bad seq: "ooooox.."
.....s.. (s[0]=='\377' etc. from BOS to (s-1) is valid)
sample:
#include <string.h>
#include <stdlib.h>
#include "ureg.h"
int main(){
const char* s = "ab\377c"; // "ab(NG)c", ascii
char* sp = (char*)s;
int* p = ureg_iconv("ASCII", "UTF32", &sp); // ~$ iconv -l
printf("%p %p\n", s, sp); // sp-s == 2, sp[0]='\377'
int sz = p[0]; // ==12, ascii:7bit, UTF32:32bit + BOM 32bit
char* bin = (char*)&p[1];
printf("%d\n", sz); //12, bin[0]..bin[12-1] is UTF32 str
if(s+strlen(s) != sp){ puts("bad ascii byte"); }
free(p);
return 0;
}
//~$ cc src.c libureg.a
ureg_erebin_syn_ = 0
// | RE_BACKSLASH_ESCAPE_IN_LISTS //"\" works as esc in [] //posix:off
// | RE_ICASE // ignore case, aa==Aa on:a==A off:a!=A
| RE_INTERVALS // use {} op on:use off:normal chars
// | RE_LIMITED_OPS // +*?| are normal chars on:yes off:special
| RE_CHAR_CLASSES // use [:alnum:] etc on:yes off:no
| RE_CONTEXT_INDEP_ANCHORS //$^ works except []/esc (a^c etc)
| RE_CONTEXT_INDEP_OPS // *+? raise err in badpos, "+a" etc
| RE_CONTEXT_INVALID_DUP // "{2}a" raise err off:[{][2][}]a
| RE_CONTEXT_INVALID_OPS // +*? raise err, "?a" on:yes off:[+]a
| RE_UNMATCHED_RIGHT_PAREN_ORD //")a" ISNT err on:[)]a off:err
// | RE_INVALID_INTERVAL_ORD // "}abc" ISNT err on:[}] off:err
| RE_DOT_NEWLINE // .(dot) includes '\n' on:inc off:exc
// | RE_DOT_NOT_NULL // .(dot) excludes '\0' on:exc off:inc
// | RE_HAT_LISTS_NOT_NEWLINE //[^a] excludes '\n' on:exc off:inc
// | RE_NEWLINE_ALT // a(\n)b works as a|b, on:\n==| off:| only
| RE_NO_BK_BRACES // on:{} off:\{\} //needs RE_INTERVALS
| RE_NO_BK_PARENS // on:() off:\(\)
| RE_NO_BK_VBAR // on:| off:\| //needs RE_LIMITED_OPS=off
// | RE_NO_BK_REFS // use \1,\2 etc on:nouse off:use
// | RE_BK_PLUS_QM // repeat "a+" or "a\+" on:\+ off:+
| RE_DEBUG // holds dbginfo on:yes off:no //emsg etc
| RE_NO_EMPTY_RANGES // [z-a] is err on:yes off:use as empty
| RE_NO_GNU_OPS //out of standard op, \< etc on:nouse off:use
// | RE_NO_POSIX_BACKTRACKING //shortest match on:yes off:longest
// | RE_NO_SUB //drop backref data, (ab)c \1=ab etc. on:yes off:hold
;
// https://www.gnu.org/software/gnulib/manual/html_node/Syntax-Bits.html
// --posix-ERE
// --undefined
// /()/
// /+aa|*{/
// /\@/
// /|g/
// [z-a]
// [a-c-e]
// --valid
// )ab >>uneven parlen treat as ordinary char
// []]
// [\]] >>2char '\' or ']'
// [-a] >>2char '-' or 'a'
// [ac-]
// /a^b/, /a$b/ >> valid, but never matched
ureg_brebin_syn_= 0
| RE_CHAR_CLASSES
| RE_DOT_NEWLINE
// | RE_DOT_NOT_NULL //>>for support binary
| RE_INTERVALS
| RE_NO_EMPTY_RANGES
;
..the gnu-regex manual uses inconsistent expressions (DO/NOT DO/NO/LIMIT) and has low
readability. refer to the above and the mit manual.
( http://web.mit.edu/gnu/doc/html/regex_2.html#SEC3 )
important differences between BRE and ERE syntax are
backref,
alternation
op |,
anchor behavior and
repeat op:
BRE ERE eg
backref \1-\9 nothing B: \(a\)\1 >>aa, E: (a)\1 >> undefined
alter nothing | B: - E: (aa|bb)
anchor BOS/EOS anywhere B: \(^a^b\)>>a^b E: (^a^b) >> never match
repeat context anywhere B: *ab >> *ab E: *ab >> undefined
I recommend you to use ERE in general. you should use BRE only when you need
back-references or a command that doesnt support ERE (sed, grep etc). additionally,
quote/esc the special characters to avoid
context dependent regex. eg)
ERE: )abc >> [)]abc
(https://stackoverflow.com/questions/40455975/why-regular-expressions-with-backreferences-are-not-regular-expressions)
(https://swtch.com/~rsc/regexp/regexp1.html)
BRE doesnt have '|' op and equivalent expression seems impossible
ERE: (ab|cd)* >> ( (ab){0,1} (cd){0,1} ){0,}
ERE: (ab|cd)+ >> ???
ERE doesnt have back-reference \1-\9. close expression is possible but not
perfect. catch _ab_, __ab__, ___ab___ ... is:
BRE: \(_*\)[^_]*\1
ERE: (_[^_]*_) | (__[^_]*__) | (___[^_]*___) ...
--- benchmark: loop(10*1000){ ureg_easy("4.6", "1234567890"); } etc
compile FAST: -- >>> onig(1) >>> ureg(50) :SLOW
search FAST: strstr(1) >>> onig(100) >>> ureg(300) :SLOW
- ureg
real 948.292 ms: ./ureg.tmp.c 881: t_bm0_sub(): msg:easy:10*1000
real 813.072 ms: ./ureg.tmp.c 891: t_bm1_sub(): msg:new-free:10*1000
real 24.156 ms : ./ureg.tmp.c 898: t_bm1_sub(): msg:search:10*1000
real 787.625 ms: ./ureg.tmp.c 911: t_bm2_sub(): msg:new-free@raw:10*1000
real 10.868 ms : ./ureg.tmp.c 918: t_bm2_sub(): msg:search@raw:10*1000
- oniguruma
real 17.268 ms : ./ureg.ts.c 264: t_bm4_sub(): msg:new-free@onig:10*1000
real 4.473 ms : ./ureg.ts.c 276: t_bm4_sub(): msg:search@onig:10*1000
- strstr()
real 0.050 ms : ./ureg.ts.c 240: t_bm3_sub(): msg:strstr(): 10*1000
--- concept
- avoid complex api
- avoid non-standard regexp/operator like PCRE
- support binary input
posix-2001+
2021-08-08 v3.0.3
Copyright 2021 momi-g, GPLv3+
https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html
https://swtch.com/~rsc/regexp/regexp1.html
http://web.mit.edu/gnu/doc/html/regex_7.html
http://www.kt.rim.or.jp/~kbk/regex/re_7.html
https://regular-expressions.mobi/refunicode.html?wlr=1
https://stackoverflow.com/questions/2359811/working-with-gnu-regex-functions-in-c-or-c
https://stackoverflow.com/questions/8727795/does-posix-regex-h-provide-unicode-or-basically-non-ascii-characters
https://stackoverflow.com/questions/48743106/whats-ansi-x3-4-1968-encoding
https://www.iana.org/assignments/character-sets/character-sets.xhtml