NAME

ureg - portable BRE/ERE regex api

SYNOPSIS

#include "ureg.h"

//main func

ureg_t* ureg_new(const char* reg [,int reglen, int syn] );

ureg_t* ureg_new_raw(const char* reg [, int reglen, int syn] );

int ureg_search(ureg_t* robj, const char* s [,int slen] );

int ureg_search_head(ureg_t* robj, const char* s [,int slen] );

void ureg_free(ureg_t* robj);

//main object info

typedef struct ureg_tag {

void* data; //opaque regobj

char* fastmap; //fastmap char[255]. holds 1st byte hit or not

char* lc; //save regex locale

int rawflg; //locale flag

const char* p; //result buff

int sz;

const char* pbk[10];

int szbk[10]; //buff

} ureg_t;

//support func

ureg_t ureg_easy(const char* reg,const char* s

[, int reglen,int slen,int syn]) ;

int ureg_ere_syn(void);

int ureg_bre_syn(void);

void* ureg_iconv(const char* fromenc,

const char* toenc, char** s [,int slen]);

TL;DR

#include "ureg.h"

int main() {
  ureg_t res = ureg_easy("a[bc]", "123abc");	//dfl:ERE
  printf("%.*s \n", res.sz, res.p);	// == "ab"
  return 0;
}
//~$ cc src.c libureg.a

EXAMPLE

 #include "ureg.h"
	
 int main() {
	//simple
	ureg_t res = ureg_easy("a[bc]", "123abc");
		//res.p="123abc"+3, res.sz=2
	res = ureg_easy("a(b|c)", "123abc");
		//res.pbk[1]="123abc"+4,res.szbk[1]=1
	printf("%.*s", res.szbk[0], res.pbk[0]);	//>> "ab"
	
	//complex
	ureg_t* obj = ureg_new("a(b|c)");	//rtn NULL: err
	const char* s = "123abc";
	int rc = ureg_search(obj, s);	//rc== hit:>=0 nohit/err:<0
	  //rc=3, obj->p=s+3(rc), obj->sz=2 
	  //obj->pbk[0]= s+3, obj->szbk[0]=2 //p=pbk[0]: all the match str
	  //obj->pbk[1]= s+4, obj->szbk[1]=1 //bkref: (b|c) hits 123a(b)c
	  //obj->pbk[2]= NULL,obj->szbk[2]=-1 //set NULL/-1 if fail
	
	rc = ureg_search_head(obj, s);	//regex hits only string top
		// rc= -1, obj->p= NULL, obj->sz= -1	//nohit
	rc = ureg_search_head(obj, s+3);  // rc=0, obj->p= s+3, obj->sz= 2
	ureg_free(obj);
	
	setlocale(LC_CTYPE, "UTF32");	// ~$ iconv -l
	obj = ureg_new_raw("a(b|c)");
	setlocale(LC_CTYPE, "C");	// reset
	rc = ureg_search(obj, s);
	ureg_free(obj);
	
	//ureg_iconv(): not regex. conv encode UTF-8, EBCDIC etc
	const char* s0 = "hwd";  //"hwd"=="\150\167\144" in C charset
	const char* s1 = "\210\246\204";	//"hwd" in EBCDIC-BR
	char* sp = (char*)s0;	//working copy: ureg_iconv() advances it
	int* p = ureg_iconv("C","EBCDIC-BR",&sp,2); //(from,to,&srcptr,len)
	int binsz = p[0];	//==2, conv result(EBCDIC-BR) bytesz
	char* pp = (char*)(p+1);
		// pp[0]=='\210', pp[1]=='\246', pp[2]=='\0'
		// srcptr sp is changed to "hwd"+2, "(hw)d" pos, sp[0]=='d'
		// | binsz |  c  |  c  | 0 |
		//    int    char  char  \0
	free(p);	// ureg_iconv() rtns sz+bin malloc()ed ptr.
	return 0;
  }
  //~$ cc src.c libureg.a

DESCRIPTION

ureg uses POSIX ERE with the following extensions by default. Some items are compromises made for practical reasons.

- [] and .(dot) includes \n and \0. posix doesnt allow byte \0.
- [] and .(dot) ignore bytes that do not form a valid character. (should be err. gave up)
    eg) [\100\377] >> [\100] (ascii locale)
    eg) [Σ\377]==[\u03a3] : 1char, multibyte locale
    eg) [Σ\377]==[\316\243\377] : 3 char, C locale
- allow bytedata except [] and .(dot)   eg) "\377z[0-9]"  >> valid
- an escaped ordinary char such as \@ is treated as @ (should be err. gave up)

easy mode

ureg_t ureg_easy(r, s [, rlen, slen, syn]);

ureg_easy() is for one-shot searches. Pass -1 if rlen/slen/syn is not set. The search result is returned by value. On no hit, res.p==NULL and res.sz<0.

res = ureg_easy("a[bc]", "123abc"); //hit, res.p="..."+3, res.sz=2

res = ureg_easy("a[bc]", "123abc", -1,-1,-1); //same

param

	r     : regex str
	s     : search target string/binary
	[rlen]: r size. use strlen(r) if set -1/noset
	[slen]: s size
	[syn] : change regex syntax. see below  complex mode

return

    res.p : matched ptr. res.p[0]=='a' in above sample. "123(a)bc"
    res.sz: match string byte size
    others: see below (complex mode)

ureg_easy() has the following restrictions

	- locale rawmode isn't supported. always uses the sys locale (UTF-8 etc)
	- execute malloc()/free() every time

sample:

	#include "ureg.h"
	int main(int argc, char** argv){
		ureg_t res = ureg_easy("a[bc]", "123abc");
		printf("%.*s \n", res.sz, res.p);	//>> disp "ab"
		return 0;	//no memleak
	}
	// ~$ gcc src.c libureg.a

complex mode

divided to 3 parts, compile >> search >> free

1. compile regex

	ureg_t* robj = ureg_new(r [,rlen, syn] );
	ureg_t* robj = ureg_new_raw(r [,rlen, syn] );
	int syn = ureg_ere_syn();
	int syn = ureg_bre_syn();

C programs start in the "C" locale by default, and in it every 8-bit char is valid.

(www.gnu.org/software/grep/manual/html_node/Character-Encoding.html)

eg) int main(){ puts(setlocale(LC_CTYPE, NULL)); } //>> "C"

Regex semantics differ between the "C" locale and a "UTF-8" locale.

C : "[Σ]" == "[\316\243]" == "(\316|\243)" //binary \316 or \243

u8: "[Σ]" == "\316\243" //binary sequence is 1 character

ureg_new(): compiles under the OS locale, i.e. setlocale(LC_CTYPE, "") (see ~$ echo $LANG), and uses that same locale when searching.

ureg_new_raw(): uses the current locale and never changes the locale setting, so adjusting the locale is the caller's responsibility. If the locale at compile time and at search time are not the same, you may get strange results. robj->lc saves the setlocale(LC_CTYPE,NULL) string.

sample:

	#include "ureg.h"
	int main() {
		ureg_t* obj = ureg_new("[Σ]");
		puts(obj->lc);	// "XX.UTF-8" etc
		ureg_search(obj, "xΣyz");	// match len: obj->sz=2
		ureg_free(obj);
		obj = ureg_new_raw("[Σ]");  //== "(\316|\243)" at locale "C"
		puts(obj->lc);	// "C" etc
		ureg_search(obj, "xΣyz");  // obj->sz=1
		ureg_free(obj);
			setlocale(LC_CTYPE, "C.UTF-8");
		obj = ureg_new_raw("[Σ]");	//== "\316\243" at UTF-8
			setlocale(LC_CTYPE, "C");
		ureg_search(obj, "xΣyz"); //obj->sz=1, use current locale
			puts(obj->lc);	//>> "C.UTF-8"
		ureg_free(obj);
		return 0;
	}
	//~$ cc src.c  libureg.a

malloc()/free() are not executed every time, so you can avoid that overhead. However, locale handling is very confusing, so I recommend using ureg_new() except for rare/special uses.

param

r: regex str. mb class/range works only under the mb locale env.

[Σ-Τ] == "(\u03a3|\u03a4)" //"XXX.UTF-8"

[Σ-Τ] == [\316\243-\316\244]== [\243-\316] //"C"

[rlen]: r size. use strlen(r) if set -1/noset

[syn]: use ERE if -1/noset. ureg_ere_syn()/ureg_bre_syn() returns ERE/BRE flag.

int syn = ureg_bre_syn(); // or ureg_ere_syn();

robj = ureg_new("a\\(b\\)c", -1, syn);

rc = ureg_search(robj, "00abc");//hit rc=2

return

robj->data: ptr to compiled regex data

robj->fastmap: char arr[255]. if r="[ab]c", arr['a'], ['b'] is 1. others is 0.

robj->lc: copy of setlocale(LC_CTYPE, NULL) str when regex compiled.

robj->rawflg: 0/1 == ureg_new()/ureg_new_raw()

others: result buffer

2. search

	int rc = ureg_search(robj, s [, slen] );
	int rc = ureg_search_head(robj, s [, slen] );

ureg_search(): finds matching data in s. Uses strlen(s) if slen is -1/not set. If robj was created with ureg_new(), the robj->lc locale setting is applied automatically.

ureg_search_head(): matches only at the beginning of s (BOS), like a lex scanner.

param

robj: search result is set to this member. set NULL / -1 if nohit.

char* robj->p: full matching ptrpos

int robj->sz: byte size of the matching str.

char* robj->pbk[10]: back reference ptrpos. robj->p == robj->pbk[0]

int robj->szbk[10]: byte sizes of the back references. robj->sz==robj->szbk[0]. POSIX ERE syntax doesn't allow back-references \1-\9 in the pattern, but holding the sub-match data is legitimate. posix regcomp(), REG_NOSUB also holds its data.

s: search target string ptr

[slen]: use strlen(s) if -1/noset

return

int rc: >=0 on hit, <0 on nohit/err. srcptr+rc points to the start of the match.

sample:

	r = "a[bc]";
	obj = ureg_new(r);
	s = "123abc";
	rc = ureg_search(obj, s);	//rc=3, hit: "...(ab)c", s[3]='a'
	rc = ureg_search_head(obj, s); //rc<0, nohit
	rc = ureg_search_head(obj, s+3); //rc=0, hit "(ab)c", (s+3)[rc]='a'

The search funcs don't support partial matching; you need to pass the full text.

	r = "a(bc)";
	s = "123abc";
	rc = ureg_search(obj, s, 4); //"123a", reg hits 'a' but fail. rc<0
	// if supports rc= -5(partial/morestr) etc, you may use fgetc()

Back-reference results \1-\9 are stored in pbk/szbk[1-9]; [0] holds the full-match data. Indexes 1-9 follow the order of the opening parens "(", the same as POSIX.

reg: ( a ( b | c ) )

bk : \1 \2

back reference sample:

	r = "a[bc]";
	s = "123abc";
	robj = ureg_new(r);
	rc = ureg_search(robj, s);
	  //robj->p=s+3, robj->sz=2
	  //pbk[0]=p,szbk[0]=sz, pbk[1]=NULL, szbk[1]=-1
 	r = "\\(a\\)\\(bc\\)";
 	s = "123abc";
	robj = ureg_new(r, -1, ureg_bre_syn() );
	rc = ureg_search(robj, s);	//robj->p, pbk is as below
		//	str: "123abc",	reg: "(a)(bc)"
		//	p  :  ...oo..,  robj->sz=2, rc=3, robj->p==s+rc
		//	p0 :  (p0==p) 
		//	p1 :  ...o...,  robj->szbk[1]== 1
		//	p2 :  ....o..,  robj->szbk[2]== 1
		//	p3 :  NULL,     robj->szbk[3]== -1

3. free

	void ureg_free(robj);

robj is malloc()ed. free memory when you finish regex search.

other func

	void* p = ureg_iconv(fenc, tenc, &s [, slen]);

ureg_iconv() has nothing to do with regex; it converts data between encodings. The POSIX iconv() api/manual is very awkward, so this wrapper is provided.

param

	fenc  : from enc type str. "EBCDIC-BR" etc. see ~$ iconv -l
	tenc  : to enc
	&s    : srcptr-ref. this arg is modified by the func, so pass a copy of the ptr.
	[slen]: src byte size. use strlen(*&s) if set -1/noset.

return

p: non-NULL on success, NULL on failure. It points to the converted byte stream prefixed with its byte size. Read the size through an (int*) view and the bytes through a (char*) view. The byte layout is as follows.

	| binsz |  c  |  c  | 0 |
	   int    char  char  \0	(if binsz=2)
	...
	char* s = (char*)"hw";
	int* p = ureg_iconv("ASCII", "UTF32", &s); //p is malloc()ed
	int binsz = p[0];	//==12, conv bytesz. BOM(4b) + h(4b) + w(4b)
	char* bin = (char*)(p+1);
	printf("%.*s \n", binsz, bin);
	free(p);

If the whole source is converted to the other encoding, s is set to src+slen. If an invalid byte sequence is detected, s is set to the address of that sequence.

	all suc: "oooooooo"
	          ........s(maybe \0 or other)
	bad seq: "ooooox.."
	          .....s.. (s[0]=='\377' etc. from BOS to (s-1) is valid)

sample:

	#include <string.h>
	#include <stdlib.h>
	#include "ureg.h"
	
	int main(){
		const char* s = "ab\377c";	// "ab(NG)c", ascii
		char* sp = (char*)s;
		int* p = ureg_iconv("ASCII", "UTF32", &sp);	// ~$ iconv -l
			printf("%p %p\n", s, sp);	// sp-s == 2, sp[0]='\377'
		int sz = p[0];	// ==12, ascii:7bit, UTF32:32bit + BOM 32bit
		char* bin = (char*)&p[1];
			printf("%d\n", sz);	//12, bin[0]..bin[12-1] is UTF32 str
		if(s+strlen(s) != sp){ puts("bad ascii byte"); }
		free(p);
		return 0;
	}
	//~$ cc src.c libureg.a

NOTES

 ureg_erebin_syn_ = 0
 //	| RE_BACKSLASH_ESCAPE_IN_LISTS //"\" works as esc in [] //posix:off
 //	| RE_ICASE			// ignore case, aa==Aa	 on:a==A off:a!=A
	| RE_INTERVALS		// use {} op	on:use off:normal chars
 //	| RE_LIMITED_OPS	// +*?| are normal chars	on:yes off:special
	| RE_CHAR_CLASSES	// use [:alnum:] etc	on:yes off:no
 	| RE_CONTEXT_INDEP_ANCHORS	//$^ works except []/esc (a^c etc)
	| RE_CONTEXT_INDEP_OPS	// *+? raise err in badpos, "+a" etc
	| RE_CONTEXT_INVALID_DUP  // "{2}a" raise err   off:[{][2][}]a
	| RE_CONTEXT_INVALID_OPS  // +*? raise err, "?a"   on:yes off:[+]a
 	| RE_UNMATCHED_RIGHT_PAREN_ORD //")a" ISNT err	on:[)]a off:err
 //	| RE_INVALID_INTERVAL_ORD	// "}abc" ISNT err	on:[}] off:err
	| RE_DOT_NEWLINE	// .(dot) includes '\n'	on:inc off:exc
 //	| RE_DOT_NOT_NULL	// .(dot) excludes '\0'	on:exc off:inc
 //	| RE_HAT_LISTS_NOT_NEWLINE	//[^a] excludes '\n'  on:exc off:inc
 //	| RE_NEWLINE_ALT	// a(\n)b works as a|b, on:\n==| off:| only
	| RE_NO_BK_BRACES	// on:{} off:\{\} //needs RE_INTERVALS
	| RE_NO_BK_PARENS	// on:() off:\(\)
	| RE_NO_BK_VBAR	// on:| off:\|	//needs RE_LIMITED_OPS=off
 //	| RE_NO_BK_REFS	// use \1,\2 etc  on:nouse off:use
 //	| RE_BK_PLUS_QM	// repeat "a+" or "a\+" 	on:\+ off:+
	| RE_DEBUG		// holds dbginfo	on:yes off:no	//emsg etc
	| RE_NO_EMPTY_RANGES	// [z-a] is err  on:yes off:use as empty
	| RE_NO_GNU_OPS	//out of standard op, \< etc  on:nouse off:use
 //	| RE_NO_POSIX_BACKTRACKING	//shortest match  on:yes off:longest
 //	| RE_NO_SUB  //drop backref data, (ab)c \1=ab etc. on:yes off:hold
 ;
// https://www.gnu.org/software/gnulib/manual/html_node/Syntax-Bits.html
 
 // --posix-ERE
 //	--undefined
 //	/()/
 //	/+aa|*{/
 //	/\@/
 //	/|g/
 //	[z-a]
 //	[a-c-e]
 // --valid
 //	)ab		>>uneven parlen treat as ordinary char
 //	[]]
 //	[\]]	>>2char '\' or ']'
 //	[-a]	>>2char '-' or 'a'
 //	[ac-]
 //	/a^b/, /a$b/ >> valid, but never matched
 ureg_brebin_syn_= 0
	| RE_CHAR_CLASSES
	| RE_DOT_NEWLINE
 //	| RE_DOT_NOT_NULL	//>>for support binary
	| RE_INTERVALS
	| RE_NO_EMPTY_RANGES
	;

The gnu-regex manual uses inconsistent wording (DO/NOT DO/NO/LIMIT) and is hard to read. Refer to the list above and the MIT manual.

( http://web.mit.edu/gnu/doc/html/regex_2.html#SEC3 )

The important differences between BRE and ERE syntax are back-references, the alternation op |, anchor behavior and the repeat ops:

         BRE	  ERE		eg
backref  \1-\9   nothing  B: \(a\)\1 >>aa,  E: (a)\1 >> undefined
alter   nothing    |      B:      -         E: (aa|bb)
anchor  BOS/EOS anywhere  B: \(^a^b\)>>a^b  E: (^a^b) >> never match
repeat  context anywhere  B: *ab >> *ab     E: *ab >> undefined

I recommend using ERE in general. Use BRE only when you need back-references or have to work with commands that lack ERE support (sed, grep etc). Additionally, quote/escape the special characters to avoid context-dependent regexes. eg) ERE: )abc >> [)]abc

(https://stackoverflow.com/questions/40455975/why-regular-expressions-with-backreferences-are-not-regular-expressions)

(https://swtch.com/~rsc/regexp/regexp1.html)

BRE doesnt have '|' op and equivalent expression seems impossible

ERE: (ab|cd)* >> ( (ab){0,1} (cd){0,1} ){0,}

ERE: (ab|cd)+ >> ???

ERE doesnt have back-reference \1-\9. close expression is possible but not perfect. catch _ab_, __ab__, ___ab___ ... is:

BRE: \(_*\)[^_]*\1

ERE: (_[^_]*_) | (__[^_]*__) | (___[^_]*___) ...

--- benchmark:	loop(10*1000){ ureg_easy("4.6", "1234567890"); } etc
compile FAST:    --     >>> onig(1)   >>> ureg(50)  :SLOW
search  FAST: strstr(1) >>> onig(100) >>> ureg(300) :SLOW

- ureg
real 948.292 ms: ./ureg.tmp.c 881: t_bm0_sub(): msg:easy:10*1000
real 813.072 ms: ./ureg.tmp.c 891: t_bm1_sub(): msg:new-free:10*1000
real 24.156 ms : ./ureg.tmp.c 898: t_bm1_sub(): msg:search:10*1000
real 787.625 ms: ./ureg.tmp.c 911: t_bm2_sub(): msg:new-free@raw:10*1000
real 10.868 ms : ./ureg.tmp.c 918: t_bm2_sub(): msg:search@raw:10*1000

- oniguruma
real 17.268 ms : ./ureg.ts.c 264: t_bm4_sub(): msg:new-free@onig:10*1000
real 4.473 ms  : ./ureg.ts.c 276: t_bm4_sub(): msg:search@onig:10*1000

- strstr()
real 0.050 ms  : ./ureg.ts.c 240: t_bm3_sub(): msg:strstr(): 10*1000


--- concept
- avoid complex api
- avoid non-standard regexp/operator like PCRE
- support binary input

CONFORMING_TO

posix-2001+

VERSION

2021-08-08 v3.0.3

COPYRIGHT

SEE_ALSO

https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html
https://swtch.com/~rsc/regexp/regexp1.html
http://web.mit.edu/gnu/doc/html/regex_7.html
http://www.kt.rim.or.jp/~kbk/regex/re_7.html
https://regular-expressions.mobi/refunicode.html?wlr=1
https://stackoverflow.com/questions/2359811/working-with-gnu-regex-functions-in-c-or-c
https://stackoverflow.com/questions/8727795/does-posix-regex-h-provide-unicode-or-basically-non-ascii-characters
https://stackoverflow.com/questions/48743106/whats-ansi-x3-4-1968-encoding
https://www.iana.org/assignments/character-sets/character-sets.xhtml