xref: /OK3568_Linux_fs/u-boot/lib/slre.c (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1*4882a593Smuzhiyun /*
2*4882a593Smuzhiyun  * Copyright (c) 2004-2005 Sergey Lyubka <valenok@gmail.com>
3*4882a593Smuzhiyun  * All rights reserved
4*4882a593Smuzhiyun  *
5*4882a593Smuzhiyun  * "THE BEER-WARE LICENSE" (Revision 42):
6*4882a593Smuzhiyun  * Sergey Lyubka wrote this file.  As long as you retain this notice you
7*4882a593Smuzhiyun  * can do whatever you want with this stuff. If we meet some day, and you think
8*4882a593Smuzhiyun  * this stuff is worth it, you can buy me a beer in return.
9*4882a593Smuzhiyun  */
10*4882a593Smuzhiyun 
11*4882a593Smuzhiyun /*
12*4882a593Smuzhiyun  * Downloaded Sat Nov  5 17:43:06 CET 2011 at
13*4882a593Smuzhiyun  * http://slre.sourceforge.net/1.0/slre.c
14*4882a593Smuzhiyun  */
15*4882a593Smuzhiyun 
16*4882a593Smuzhiyun #ifdef SLRE_TEST
17*4882a593Smuzhiyun #include <stdio.h>
18*4882a593Smuzhiyun #include <assert.h>
19*4882a593Smuzhiyun #include <ctype.h>
20*4882a593Smuzhiyun #include <stdlib.h>
21*4882a593Smuzhiyun #include <string.h>
22*4882a593Smuzhiyun #else
23*4882a593Smuzhiyun #include <common.h>
24*4882a593Smuzhiyun #include <linux/ctype.h>
25*4882a593Smuzhiyun #endif /* SLRE_TEST */
26*4882a593Smuzhiyun 
27*4882a593Smuzhiyun #include <errno.h>
28*4882a593Smuzhiyun 
29*4882a593Smuzhiyun #include <slre.h>
30*4882a593Smuzhiyun 
31*4882a593Smuzhiyun enum {END, BRANCH, ANY, EXACT, ANYOF, ANYBUT, OPEN, CLOSE, BOL, EOL,
32*4882a593Smuzhiyun 	STAR, PLUS, STARQ, PLUSQ, QUEST, SPACE, NONSPACE, DIGIT};
33*4882a593Smuzhiyun 
34*4882a593Smuzhiyun #ifdef SLRE_TEST
35*4882a593Smuzhiyun static struct {
36*4882a593Smuzhiyun 	const char	*name;
37*4882a593Smuzhiyun 	int		narg;
38*4882a593Smuzhiyun 	const char	*flags;
39*4882a593Smuzhiyun } opcodes[] = {
40*4882a593Smuzhiyun 	{"END",		0, ""},		/* End of code block or program	*/
41*4882a593Smuzhiyun 	{"BRANCH",	2, "oo"},	/* Alternative operator, "|"	*/
42*4882a593Smuzhiyun 	{"ANY",		0, ""},		/* Match any character, "."	*/
43*4882a593Smuzhiyun 	{"EXACT",	2, "d"},	/* Match exact string		*/
44*4882a593Smuzhiyun 	{"ANYOF",	2, "D"},	/* Match any from set, "[]"	*/
45*4882a593Smuzhiyun 	{"ANYBUT",	2, "D"},	/* Match any but from set, "[^]"*/
46*4882a593Smuzhiyun 	{"OPEN ",	1, "i"},	/* Capture start, "("		*/
47*4882a593Smuzhiyun 	{"CLOSE",	1, "i"},	/* Capture end, ")"		*/
48*4882a593Smuzhiyun 	{"BOL",		0, ""},		/* Beginning of string, "^"	*/
49*4882a593Smuzhiyun 	{"EOL",		0, ""},		/* End of string, "$"		*/
50*4882a593Smuzhiyun 	{"STAR",	1, "o"},	/* Match zero or more times "*"	*/
51*4882a593Smuzhiyun 	{"PLUS",	1, "o"},	/* Match one or more times, "+"	*/
52*4882a593Smuzhiyun 	{"STARQ",	1, "o"},	/* Non-greedy STAR,  "*?"	*/
53*4882a593Smuzhiyun 	{"PLUSQ",	1, "o"},	/* Non-greedy PLUS, "+?"	*/
54*4882a593Smuzhiyun 	{"QUEST",	1, "o"},	/* Match zero or one time, "?"	*/
55*4882a593Smuzhiyun 	{"SPACE",	0, ""},		/* Match whitespace, "\s"	*/
56*4882a593Smuzhiyun 	{"NONSPACE",	0, ""},		/* Match non-space, "\S"	*/
57*4882a593Smuzhiyun 	{"DIGIT",	0, ""}		/* Match digit, "\d"		*/
58*4882a593Smuzhiyun };
59*4882a593Smuzhiyun #endif /* SLRE_TEST */
60*4882a593Smuzhiyun 
61*4882a593Smuzhiyun /*
62*4882a593Smuzhiyun  * Commands and operands are all unsigned char (1 byte long). All code offsets
63*4882a593Smuzhiyun  * are relative to current address, and positive (always point forward). Data
64*4882a593Smuzhiyun  * offsets are absolute. Commands with operands:
65*4882a593Smuzhiyun  *
66*4882a593Smuzhiyun  * BRANCH offset1 offset2
67*4882a593Smuzhiyun  *	Try to match the code block that follows the BRANCH instruction
68*4882a593Smuzhiyun  *	(code block ends with END). If no match, try to match code block that
69*4882a593Smuzhiyun  *	starts at offset1. If either of these match, jump to offset2.
70*4882a593Smuzhiyun  *
71*4882a593Smuzhiyun  * EXACT data_offset data_length
72*4882a593Smuzhiyun  *	Try to match exact string. String is recorded in data section from
73*4882a593Smuzhiyun  *	data_offset, and has length data_length.
74*4882a593Smuzhiyun  *
75*4882a593Smuzhiyun  * OPEN capture_number
76*4882a593Smuzhiyun  * CLOSE capture_number
77*4882a593Smuzhiyun  *	If the user have passed 'struct cap' array for captures, OPEN
78*4882a593Smuzhiyun  *	records the beginning of the matched substring (cap->ptr), CLOSE
79*4882a593Smuzhiyun  *	sets the length (cap->len) for respective capture_number.
80*4882a593Smuzhiyun  *
81*4882a593Smuzhiyun  * STAR code_offset
82*4882a593Smuzhiyun  * PLUS code_offset
83*4882a593Smuzhiyun  * QUEST code_offset
84*4882a593Smuzhiyun  *	*, +, ?, respectively. Try to gobble as much as possible from the
85*4882a593Smuzhiyun  *	matched buffer, until code block that follows these instructions
86*4882a593Smuzhiyun  *	matches. When the longest possible string is matched,
87*4882a593Smuzhiyun  *	jump to code_offset
88*4882a593Smuzhiyun  *
89*4882a593Smuzhiyun  * STARQ, PLUSQ are non-greedy versions of STAR and PLUS.
90*4882a593Smuzhiyun  */
91*4882a593Smuzhiyun 
92*4882a593Smuzhiyun static const char *meta_chars = "|.^$*+?()[\\";
93*4882a593Smuzhiyun 
94*4882a593Smuzhiyun #ifdef SLRE_TEST
95*4882a593Smuzhiyun 
96*4882a593Smuzhiyun static void
print_character_set(FILE * fp,const unsigned char * p,int len)97*4882a593Smuzhiyun print_character_set(FILE *fp, const unsigned char *p, int len)
98*4882a593Smuzhiyun {
99*4882a593Smuzhiyun 	int	i;
100*4882a593Smuzhiyun 
101*4882a593Smuzhiyun 	for (i = 0; i < len; i++) {
102*4882a593Smuzhiyun 		if (i > 0)
103*4882a593Smuzhiyun 			(void) fputc(',', fp);
104*4882a593Smuzhiyun 		if (p[i] == 0) {
105*4882a593Smuzhiyun 			i++;
106*4882a593Smuzhiyun 			if (p[i] == 0)
107*4882a593Smuzhiyun 				(void) fprintf(fp, "\\x%02x", p[i]);
108*4882a593Smuzhiyun 			else
109*4882a593Smuzhiyun 				(void) fprintf(fp, "%s", opcodes[p[i]].name);
110*4882a593Smuzhiyun 		} else if (isprint(p[i])) {
111*4882a593Smuzhiyun 			(void) fputc(p[i], fp);
112*4882a593Smuzhiyun 		} else {
113*4882a593Smuzhiyun 			(void) fprintf(fp, "\\x%02x", p[i]);
114*4882a593Smuzhiyun 		}
115*4882a593Smuzhiyun 	}
116*4882a593Smuzhiyun }
117*4882a593Smuzhiyun 
118*4882a593Smuzhiyun void
slre_dump(const struct slre * r,FILE * fp)119*4882a593Smuzhiyun slre_dump(const struct slre *r, FILE *fp)
120*4882a593Smuzhiyun {
121*4882a593Smuzhiyun 	int	i, j, ch, op, pc;
122*4882a593Smuzhiyun 
123*4882a593Smuzhiyun 	for (pc = 0; pc < r->code_size; pc++) {
124*4882a593Smuzhiyun 
125*4882a593Smuzhiyun 		op = r->code[pc];
126*4882a593Smuzhiyun 		(void) fprintf(fp, "%3d %s ", pc, opcodes[op].name);
127*4882a593Smuzhiyun 
128*4882a593Smuzhiyun 		for (i = 0; opcodes[op].flags[i] != '\0'; i++)
129*4882a593Smuzhiyun 			switch (opcodes[op].flags[i]) {
130*4882a593Smuzhiyun 			case 'i':
131*4882a593Smuzhiyun 				(void) fprintf(fp, "%d ", r->code[pc + 1]);
132*4882a593Smuzhiyun 				pc++;
133*4882a593Smuzhiyun 				break;
134*4882a593Smuzhiyun 			case 'o':
135*4882a593Smuzhiyun 				(void) fprintf(fp, "%d ",
136*4882a593Smuzhiyun 				    pc + r->code[pc + 1] - i);
137*4882a593Smuzhiyun 				pc++;
138*4882a593Smuzhiyun 				break;
139*4882a593Smuzhiyun 			case 'D':
140*4882a593Smuzhiyun 				print_character_set(fp, r->data +
141*4882a593Smuzhiyun 				    r->code[pc + 1], r->code[pc + 2]);
142*4882a593Smuzhiyun 				pc += 2;
143*4882a593Smuzhiyun 				break;
144*4882a593Smuzhiyun 			case 'd':
145*4882a593Smuzhiyun 				(void) fputc('"', fp);
146*4882a593Smuzhiyun 				for (j = 0; j < r->code[pc + 2]; j++) {
147*4882a593Smuzhiyun 					ch = r->data[r->code[pc + 1] + j];
148*4882a593Smuzhiyun 					if (isprint(ch)) {
149*4882a593Smuzhiyun 						(void) fputc(ch, fp);
150*4882a593Smuzhiyun 					} else {
151*4882a593Smuzhiyun 						(void) fprintf(fp,
152*4882a593Smuzhiyun 							"\\x%02x", ch);
153*4882a593Smuzhiyun 					}
154*4882a593Smuzhiyun 				}
155*4882a593Smuzhiyun 				(void) fputc('"', fp);
156*4882a593Smuzhiyun 				pc += 2;
157*4882a593Smuzhiyun 				break;
158*4882a593Smuzhiyun 			}
159*4882a593Smuzhiyun 
160*4882a593Smuzhiyun 		(void) fputc('\n', fp);
161*4882a593Smuzhiyun 	}
162*4882a593Smuzhiyun }
163*4882a593Smuzhiyun #endif /* SLRE_TEST */
164*4882a593Smuzhiyun 
165*4882a593Smuzhiyun static void
set_jump_offset(struct slre * r,int pc,int offset)166*4882a593Smuzhiyun set_jump_offset(struct slre *r, int pc, int offset)
167*4882a593Smuzhiyun {
168*4882a593Smuzhiyun 	assert(offset < r->code_size);
169*4882a593Smuzhiyun 
170*4882a593Smuzhiyun 	if (r->code_size - offset > 0xff)
171*4882a593Smuzhiyun 		r->err_str = "Jump offset is too big";
172*4882a593Smuzhiyun 	else
173*4882a593Smuzhiyun 		r->code[pc] = (unsigned char) (r->code_size - offset);
174*4882a593Smuzhiyun }
175*4882a593Smuzhiyun 
176*4882a593Smuzhiyun static void
emit(struct slre * r,int code)177*4882a593Smuzhiyun emit(struct slre *r, int code)
178*4882a593Smuzhiyun {
179*4882a593Smuzhiyun 	if (r->code_size >= (int) (sizeof(r->code) / sizeof(r->code[0])))
180*4882a593Smuzhiyun 		r->err_str = "RE is too long (code overflow)";
181*4882a593Smuzhiyun 	else
182*4882a593Smuzhiyun 		r->code[r->code_size++] = (unsigned char) code;
183*4882a593Smuzhiyun }
184*4882a593Smuzhiyun 
185*4882a593Smuzhiyun static void
store_char_in_data(struct slre * r,int ch)186*4882a593Smuzhiyun store_char_in_data(struct slre *r, int ch)
187*4882a593Smuzhiyun {
188*4882a593Smuzhiyun 	if (r->data_size >= (int) sizeof(r->data))
189*4882a593Smuzhiyun 		r->err_str = "RE is too long (data overflow)";
190*4882a593Smuzhiyun 	else
191*4882a593Smuzhiyun 		r->data[r->data_size++] = ch;
192*4882a593Smuzhiyun }
193*4882a593Smuzhiyun 
194*4882a593Smuzhiyun static void
exact(struct slre * r,const char ** re)195*4882a593Smuzhiyun exact(struct slre *r, const char **re)
196*4882a593Smuzhiyun {
197*4882a593Smuzhiyun 	int	old_data_size = r->data_size;
198*4882a593Smuzhiyun 
199*4882a593Smuzhiyun 	while (**re != '\0' && (strchr(meta_chars, **re)) == NULL)
200*4882a593Smuzhiyun 		store_char_in_data(r, *(*re)++);
201*4882a593Smuzhiyun 
202*4882a593Smuzhiyun 	emit(r, EXACT);
203*4882a593Smuzhiyun 	emit(r, old_data_size);
204*4882a593Smuzhiyun 	emit(r, r->data_size - old_data_size);
205*4882a593Smuzhiyun }
206*4882a593Smuzhiyun 
207*4882a593Smuzhiyun static int
get_escape_char(const char ** re)208*4882a593Smuzhiyun get_escape_char(const char **re)
209*4882a593Smuzhiyun {
210*4882a593Smuzhiyun 	int	res;
211*4882a593Smuzhiyun 
212*4882a593Smuzhiyun 	switch (*(*re)++) {
213*4882a593Smuzhiyun 	case 'n':
214*4882a593Smuzhiyun 		res = '\n';
215*4882a593Smuzhiyun 		break;
216*4882a593Smuzhiyun 	case 'r':
217*4882a593Smuzhiyun 		res = '\r';
218*4882a593Smuzhiyun 		break;
219*4882a593Smuzhiyun 	case 't':
220*4882a593Smuzhiyun 		res = '\t';
221*4882a593Smuzhiyun 		break;
222*4882a593Smuzhiyun 	case '0':
223*4882a593Smuzhiyun 		res = 0;
224*4882a593Smuzhiyun 		break;
225*4882a593Smuzhiyun 	case 'S':
226*4882a593Smuzhiyun 		res = NONSPACE << 8;
227*4882a593Smuzhiyun 		break;
228*4882a593Smuzhiyun 	case 's':
229*4882a593Smuzhiyun 		res = SPACE << 8;
230*4882a593Smuzhiyun 		break;
231*4882a593Smuzhiyun 	case 'd':
232*4882a593Smuzhiyun 		res = DIGIT << 8;
233*4882a593Smuzhiyun 		break;
234*4882a593Smuzhiyun 	default:
235*4882a593Smuzhiyun 		res = (*re)[-1];
236*4882a593Smuzhiyun 		break;
237*4882a593Smuzhiyun 	}
238*4882a593Smuzhiyun 
239*4882a593Smuzhiyun 	return res;
240*4882a593Smuzhiyun }
241*4882a593Smuzhiyun 
242*4882a593Smuzhiyun static void
anyof(struct slre * r,const char ** re)243*4882a593Smuzhiyun anyof(struct slre *r, const char **re)
244*4882a593Smuzhiyun {
245*4882a593Smuzhiyun 	int	esc, old_data_size = r->data_size, op = ANYOF;
246*4882a593Smuzhiyun 
247*4882a593Smuzhiyun 	if (**re == '^') {
248*4882a593Smuzhiyun 		op = ANYBUT;
249*4882a593Smuzhiyun 		(*re)++;
250*4882a593Smuzhiyun 	}
251*4882a593Smuzhiyun 
252*4882a593Smuzhiyun 	while (**re != '\0')
253*4882a593Smuzhiyun 
254*4882a593Smuzhiyun 		switch (*(*re)++) {
255*4882a593Smuzhiyun 		case ']':
256*4882a593Smuzhiyun 			emit(r, op);
257*4882a593Smuzhiyun 			emit(r, old_data_size);
258*4882a593Smuzhiyun 			emit(r, r->data_size - old_data_size);
259*4882a593Smuzhiyun 			return;
260*4882a593Smuzhiyun 			/* NOTREACHED */
261*4882a593Smuzhiyun 			break;
262*4882a593Smuzhiyun 		case '\\':
263*4882a593Smuzhiyun 			esc = get_escape_char(re);
264*4882a593Smuzhiyun 			if ((esc & 0xff) == 0) {
265*4882a593Smuzhiyun 				store_char_in_data(r, 0);
266*4882a593Smuzhiyun 				store_char_in_data(r, esc >> 8);
267*4882a593Smuzhiyun 			} else {
268*4882a593Smuzhiyun 				store_char_in_data(r, esc);
269*4882a593Smuzhiyun 			}
270*4882a593Smuzhiyun 			break;
271*4882a593Smuzhiyun 		default:
272*4882a593Smuzhiyun 			store_char_in_data(r, (*re)[-1]);
273*4882a593Smuzhiyun 			break;
274*4882a593Smuzhiyun 		}
275*4882a593Smuzhiyun 
276*4882a593Smuzhiyun 	r->err_str = "No closing ']' bracket";
277*4882a593Smuzhiyun }
278*4882a593Smuzhiyun 
279*4882a593Smuzhiyun static void
relocate(struct slre * r,int begin,int shift)280*4882a593Smuzhiyun relocate(struct slre *r, int begin, int shift)
281*4882a593Smuzhiyun {
282*4882a593Smuzhiyun 	emit(r, END);
283*4882a593Smuzhiyun 	memmove(r->code + begin + shift, r->code + begin, r->code_size - begin);
284*4882a593Smuzhiyun 	r->code_size += shift;
285*4882a593Smuzhiyun }
286*4882a593Smuzhiyun 
287*4882a593Smuzhiyun static void
quantifier(struct slre * r,int prev,int op)288*4882a593Smuzhiyun quantifier(struct slre *r, int prev, int op)
289*4882a593Smuzhiyun {
290*4882a593Smuzhiyun 	if (r->code[prev] == EXACT && r->code[prev + 2] > 1) {
291*4882a593Smuzhiyun 		r->code[prev + 2]--;
292*4882a593Smuzhiyun 		emit(r, EXACT);
293*4882a593Smuzhiyun 		emit(r, r->code[prev + 1] + r->code[prev + 2]);
294*4882a593Smuzhiyun 		emit(r, 1);
295*4882a593Smuzhiyun 		prev = r->code_size - 3;
296*4882a593Smuzhiyun 	}
297*4882a593Smuzhiyun 	relocate(r, prev, 2);
298*4882a593Smuzhiyun 	r->code[prev] = op;
299*4882a593Smuzhiyun 	set_jump_offset(r, prev + 1, prev);
300*4882a593Smuzhiyun }
301*4882a593Smuzhiyun 
302*4882a593Smuzhiyun static void
exact_one_char(struct slre * r,int ch)303*4882a593Smuzhiyun exact_one_char(struct slre *r, int ch)
304*4882a593Smuzhiyun {
305*4882a593Smuzhiyun 	emit(r, EXACT);
306*4882a593Smuzhiyun 	emit(r, r->data_size);
307*4882a593Smuzhiyun 	emit(r, 1);
308*4882a593Smuzhiyun 	store_char_in_data(r, ch);
309*4882a593Smuzhiyun }
310*4882a593Smuzhiyun 
311*4882a593Smuzhiyun static void
fixup_branch(struct slre * r,int fixup)312*4882a593Smuzhiyun fixup_branch(struct slre *r, int fixup)
313*4882a593Smuzhiyun {
314*4882a593Smuzhiyun 	if (fixup > 0) {
315*4882a593Smuzhiyun 		emit(r, END);
316*4882a593Smuzhiyun 		set_jump_offset(r, fixup, fixup - 2);
317*4882a593Smuzhiyun 	}
318*4882a593Smuzhiyun }
319*4882a593Smuzhiyun 
320*4882a593Smuzhiyun static void
compile(struct slre * r,const char ** re)321*4882a593Smuzhiyun compile(struct slre *r, const char **re)
322*4882a593Smuzhiyun {
323*4882a593Smuzhiyun 	int	op, esc, branch_start, last_op, fixup, cap_no, level;
324*4882a593Smuzhiyun 
325*4882a593Smuzhiyun 	fixup = 0;
326*4882a593Smuzhiyun 	level = r->num_caps;
327*4882a593Smuzhiyun 	branch_start = last_op = r->code_size;
328*4882a593Smuzhiyun 
329*4882a593Smuzhiyun 	for (;;)
330*4882a593Smuzhiyun 		switch (*(*re)++) {
331*4882a593Smuzhiyun 		case '\0':
332*4882a593Smuzhiyun 			(*re)--;
333*4882a593Smuzhiyun 			return;
334*4882a593Smuzhiyun 			/* NOTREACHED */
335*4882a593Smuzhiyun 			break;
336*4882a593Smuzhiyun 		case '^':
337*4882a593Smuzhiyun 			emit(r, BOL);
338*4882a593Smuzhiyun 			break;
339*4882a593Smuzhiyun 		case '$':
340*4882a593Smuzhiyun 			emit(r, EOL);
341*4882a593Smuzhiyun 			break;
342*4882a593Smuzhiyun 		case '.':
343*4882a593Smuzhiyun 			last_op = r->code_size;
344*4882a593Smuzhiyun 			emit(r, ANY);
345*4882a593Smuzhiyun 			break;
346*4882a593Smuzhiyun 		case '[':
347*4882a593Smuzhiyun 			last_op = r->code_size;
348*4882a593Smuzhiyun 			anyof(r, re);
349*4882a593Smuzhiyun 			break;
350*4882a593Smuzhiyun 		case '\\':
351*4882a593Smuzhiyun 			last_op = r->code_size;
352*4882a593Smuzhiyun 			esc = get_escape_char(re);
353*4882a593Smuzhiyun 			if (esc & 0xff00)
354*4882a593Smuzhiyun 				emit(r, esc >> 8);
355*4882a593Smuzhiyun 			else
356*4882a593Smuzhiyun 				exact_one_char(r, esc);
357*4882a593Smuzhiyun 			break;
358*4882a593Smuzhiyun 		case '(':
359*4882a593Smuzhiyun 			last_op = r->code_size;
360*4882a593Smuzhiyun 			cap_no = ++r->num_caps;
361*4882a593Smuzhiyun 			emit(r, OPEN);
362*4882a593Smuzhiyun 			emit(r, cap_no);
363*4882a593Smuzhiyun 
364*4882a593Smuzhiyun 			compile(r, re);
365*4882a593Smuzhiyun 			if (*(*re)++ != ')') {
366*4882a593Smuzhiyun 				r->err_str = "No closing bracket";
367*4882a593Smuzhiyun 				return;
368*4882a593Smuzhiyun 			}
369*4882a593Smuzhiyun 
370*4882a593Smuzhiyun 			emit(r, CLOSE);
371*4882a593Smuzhiyun 			emit(r, cap_no);
372*4882a593Smuzhiyun 			break;
373*4882a593Smuzhiyun 		case ')':
374*4882a593Smuzhiyun 			(*re)--;
375*4882a593Smuzhiyun 			fixup_branch(r, fixup);
376*4882a593Smuzhiyun 			if (level == 0) {
377*4882a593Smuzhiyun 				r->err_str = "Unbalanced brackets";
378*4882a593Smuzhiyun 				return;
379*4882a593Smuzhiyun 			}
380*4882a593Smuzhiyun 			return;
381*4882a593Smuzhiyun 			/* NOTREACHED */
382*4882a593Smuzhiyun 			break;
383*4882a593Smuzhiyun 		case '+':
384*4882a593Smuzhiyun 		case '*':
385*4882a593Smuzhiyun 			op = (*re)[-1] == '*' ? STAR : PLUS;
386*4882a593Smuzhiyun 			if (**re == '?') {
387*4882a593Smuzhiyun 				(*re)++;
388*4882a593Smuzhiyun 				op = op == STAR ? STARQ : PLUSQ;
389*4882a593Smuzhiyun 			}
390*4882a593Smuzhiyun 			quantifier(r, last_op, op);
391*4882a593Smuzhiyun 			break;
392*4882a593Smuzhiyun 		case '?':
393*4882a593Smuzhiyun 			quantifier(r, last_op, QUEST);
394*4882a593Smuzhiyun 			break;
395*4882a593Smuzhiyun 		case '|':
396*4882a593Smuzhiyun 			fixup_branch(r, fixup);
397*4882a593Smuzhiyun 			relocate(r, branch_start, 3);
398*4882a593Smuzhiyun 			r->code[branch_start] = BRANCH;
399*4882a593Smuzhiyun 			set_jump_offset(r, branch_start + 1, branch_start);
400*4882a593Smuzhiyun 			fixup = branch_start + 2;
401*4882a593Smuzhiyun 			r->code[fixup] = 0xff;
402*4882a593Smuzhiyun 			break;
403*4882a593Smuzhiyun 		default:
404*4882a593Smuzhiyun 			(*re)--;
405*4882a593Smuzhiyun 			last_op = r->code_size;
406*4882a593Smuzhiyun 			exact(r, re);
407*4882a593Smuzhiyun 			break;
408*4882a593Smuzhiyun 		}
409*4882a593Smuzhiyun }
410*4882a593Smuzhiyun 
411*4882a593Smuzhiyun int
slre_compile(struct slre * r,const char * re)412*4882a593Smuzhiyun slre_compile(struct slre *r, const char *re)
413*4882a593Smuzhiyun {
414*4882a593Smuzhiyun 	r->err_str = NULL;
415*4882a593Smuzhiyun 	r->code_size = r->data_size = r->num_caps = r->anchored = 0;
416*4882a593Smuzhiyun 
417*4882a593Smuzhiyun 	if (*re == '^')
418*4882a593Smuzhiyun 		r->anchored++;
419*4882a593Smuzhiyun 
420*4882a593Smuzhiyun 	emit(r, OPEN);	/* This will capture what matches full RE */
421*4882a593Smuzhiyun 	emit(r, 0);
422*4882a593Smuzhiyun 
423*4882a593Smuzhiyun 	while (*re != '\0')
424*4882a593Smuzhiyun 		compile(r, &re);
425*4882a593Smuzhiyun 
426*4882a593Smuzhiyun 	if (r->code[2] == BRANCH)
427*4882a593Smuzhiyun 		fixup_branch(r, 4);
428*4882a593Smuzhiyun 
429*4882a593Smuzhiyun 	emit(r, CLOSE);
430*4882a593Smuzhiyun 	emit(r, 0);
431*4882a593Smuzhiyun 	emit(r, END);
432*4882a593Smuzhiyun 
433*4882a593Smuzhiyun 	return (r->err_str == NULL ? 1 : 0);
434*4882a593Smuzhiyun }
435*4882a593Smuzhiyun 
436*4882a593Smuzhiyun static int match(const struct slre *, int,
437*4882a593Smuzhiyun 		const char *, int, int *, struct cap *);
438*4882a593Smuzhiyun 
439*4882a593Smuzhiyun static void
loop_greedy(const struct slre * r,int pc,const char * s,int len,int * ofs)440*4882a593Smuzhiyun loop_greedy(const struct slre *r, int pc, const char *s, int len, int *ofs)
441*4882a593Smuzhiyun {
442*4882a593Smuzhiyun 	int	saved_offset, matched_offset;
443*4882a593Smuzhiyun 
444*4882a593Smuzhiyun 	matched_offset = *ofs;
445*4882a593Smuzhiyun 
446*4882a593Smuzhiyun 	while (match(r, pc + 2, s, len, ofs, NULL)) {
447*4882a593Smuzhiyun 		saved_offset = *ofs;
448*4882a593Smuzhiyun 		if (match(r, pc + r->code[pc + 1], s, len, ofs, NULL))
449*4882a593Smuzhiyun 			matched_offset = saved_offset;
450*4882a593Smuzhiyun 		*ofs = saved_offset;
451*4882a593Smuzhiyun 	}
452*4882a593Smuzhiyun 
453*4882a593Smuzhiyun 	*ofs = matched_offset;
454*4882a593Smuzhiyun }
455*4882a593Smuzhiyun 
456*4882a593Smuzhiyun static void
loop_non_greedy(const struct slre * r,int pc,const char * s,int len,int * ofs)457*4882a593Smuzhiyun loop_non_greedy(const struct slre *r, int pc, const char *s, int len, int *ofs)
458*4882a593Smuzhiyun {
459*4882a593Smuzhiyun 	int	saved_offset = *ofs;
460*4882a593Smuzhiyun 
461*4882a593Smuzhiyun 	while (match(r, pc + 2, s, len, ofs, NULL)) {
462*4882a593Smuzhiyun 		saved_offset = *ofs;
463*4882a593Smuzhiyun 		if (match(r, pc + r->code[pc + 1], s, len, ofs, NULL))
464*4882a593Smuzhiyun 			break;
465*4882a593Smuzhiyun 	}
466*4882a593Smuzhiyun 
467*4882a593Smuzhiyun 	*ofs = saved_offset;
468*4882a593Smuzhiyun }
469*4882a593Smuzhiyun 
470*4882a593Smuzhiyun static int
is_any_of(const unsigned char * p,int len,const char * s,int * ofs)471*4882a593Smuzhiyun is_any_of(const unsigned char *p, int len, const char *s, int *ofs)
472*4882a593Smuzhiyun {
473*4882a593Smuzhiyun 	int	i, ch;
474*4882a593Smuzhiyun 
475*4882a593Smuzhiyun 	ch = s[*ofs];
476*4882a593Smuzhiyun 
477*4882a593Smuzhiyun 	for (i = 0; i < len; i++)
478*4882a593Smuzhiyun 		if (p[i] == ch) {
479*4882a593Smuzhiyun 			(*ofs)++;
480*4882a593Smuzhiyun 			return 1;
481*4882a593Smuzhiyun 		}
482*4882a593Smuzhiyun 
483*4882a593Smuzhiyun 	return 0;
484*4882a593Smuzhiyun }
485*4882a593Smuzhiyun 
486*4882a593Smuzhiyun static int
is_any_but(const unsigned char * p,int len,const char * s,int * ofs)487*4882a593Smuzhiyun is_any_but(const unsigned char *p, int len, const char *s, int *ofs)
488*4882a593Smuzhiyun {
489*4882a593Smuzhiyun 	int	i, ch;
490*4882a593Smuzhiyun 
491*4882a593Smuzhiyun 	ch = s[*ofs];
492*4882a593Smuzhiyun 
493*4882a593Smuzhiyun 	for (i = 0; i < len; i++) {
494*4882a593Smuzhiyun 		if (p[i] == ch)
495*4882a593Smuzhiyun 			return 0;
496*4882a593Smuzhiyun 	}
497*4882a593Smuzhiyun 
498*4882a593Smuzhiyun 	(*ofs)++;
499*4882a593Smuzhiyun 	return 1;
500*4882a593Smuzhiyun }
501*4882a593Smuzhiyun 
502*4882a593Smuzhiyun static int
match(const struct slre * r,int pc,const char * s,int len,int * ofs,struct cap * caps)503*4882a593Smuzhiyun match(const struct slre *r, int pc, const char *s, int len,
504*4882a593Smuzhiyun 		int *ofs, struct cap *caps)
505*4882a593Smuzhiyun {
506*4882a593Smuzhiyun 	int	n, saved_offset, res = 1;
507*4882a593Smuzhiyun 
508*4882a593Smuzhiyun 	while (res && r->code[pc] != END) {
509*4882a593Smuzhiyun 
510*4882a593Smuzhiyun 		assert(pc < r->code_size);
511*4882a593Smuzhiyun 		assert(pc < (int) (sizeof(r->code) / sizeof(r->code[0])));
512*4882a593Smuzhiyun 
513*4882a593Smuzhiyun 		switch (r->code[pc]) {
514*4882a593Smuzhiyun 		case BRANCH:
515*4882a593Smuzhiyun 			saved_offset = *ofs;
516*4882a593Smuzhiyun 			res = match(r, pc + 3, s, len, ofs, caps);
517*4882a593Smuzhiyun 			if (res == 0) {
518*4882a593Smuzhiyun 				*ofs = saved_offset;
519*4882a593Smuzhiyun 				res = match(r, pc + r->code[pc + 1],
520*4882a593Smuzhiyun 				    s, len, ofs, caps);
521*4882a593Smuzhiyun 			}
522*4882a593Smuzhiyun 			pc += r->code[pc + 2];
523*4882a593Smuzhiyun 			break;
524*4882a593Smuzhiyun 		case EXACT:
525*4882a593Smuzhiyun 			res = 0;
526*4882a593Smuzhiyun 			n = r->code[pc + 2];	/* String length */
527*4882a593Smuzhiyun 			if (n <= len - *ofs && !memcmp(s + *ofs, r->data +
528*4882a593Smuzhiyun 			    r->code[pc + 1], n)) {
529*4882a593Smuzhiyun 				(*ofs) += n;
530*4882a593Smuzhiyun 				res = 1;
531*4882a593Smuzhiyun 			}
532*4882a593Smuzhiyun 			pc += 3;
533*4882a593Smuzhiyun 			break;
534*4882a593Smuzhiyun 		case QUEST:
535*4882a593Smuzhiyun 			res = 1;
536*4882a593Smuzhiyun 			saved_offset = *ofs;
537*4882a593Smuzhiyun 			if (!match(r, pc + 2, s, len, ofs, caps))
538*4882a593Smuzhiyun 				*ofs = saved_offset;
539*4882a593Smuzhiyun 			pc += r->code[pc + 1];
540*4882a593Smuzhiyun 			break;
541*4882a593Smuzhiyun 		case STAR:
542*4882a593Smuzhiyun 			res = 1;
543*4882a593Smuzhiyun 			loop_greedy(r, pc, s, len, ofs);
544*4882a593Smuzhiyun 			pc += r->code[pc + 1];
545*4882a593Smuzhiyun 			break;
546*4882a593Smuzhiyun 		case STARQ:
547*4882a593Smuzhiyun 			res = 1;
548*4882a593Smuzhiyun 			loop_non_greedy(r, pc, s, len, ofs);
549*4882a593Smuzhiyun 			pc += r->code[pc + 1];
550*4882a593Smuzhiyun 			break;
551*4882a593Smuzhiyun 		case PLUS:
552*4882a593Smuzhiyun 			res = match(r, pc + 2, s, len, ofs, caps);
553*4882a593Smuzhiyun 			if (res == 0)
554*4882a593Smuzhiyun 				break;
555*4882a593Smuzhiyun 
556*4882a593Smuzhiyun 			loop_greedy(r, pc, s, len, ofs);
557*4882a593Smuzhiyun 			pc += r->code[pc + 1];
558*4882a593Smuzhiyun 			break;
559*4882a593Smuzhiyun 		case PLUSQ:
560*4882a593Smuzhiyun 			res = match(r, pc + 2, s, len, ofs, caps);
561*4882a593Smuzhiyun 			if (res == 0)
562*4882a593Smuzhiyun 				break;
563*4882a593Smuzhiyun 
564*4882a593Smuzhiyun 			loop_non_greedy(r, pc, s, len, ofs);
565*4882a593Smuzhiyun 			pc += r->code[pc + 1];
566*4882a593Smuzhiyun 			break;
567*4882a593Smuzhiyun 		case SPACE:
568*4882a593Smuzhiyun 			res = 0;
569*4882a593Smuzhiyun 			if (*ofs < len && isspace(((unsigned char *)s)[*ofs])) {
570*4882a593Smuzhiyun 				(*ofs)++;
571*4882a593Smuzhiyun 				res = 1;
572*4882a593Smuzhiyun 			}
573*4882a593Smuzhiyun 			pc++;
574*4882a593Smuzhiyun 			break;
575*4882a593Smuzhiyun 		case NONSPACE:
576*4882a593Smuzhiyun 			res = 0;
577*4882a593Smuzhiyun 			if (*ofs < len &&
578*4882a593Smuzhiyun 					!isspace(((unsigned char *)s)[*ofs])) {
579*4882a593Smuzhiyun 				(*ofs)++;
580*4882a593Smuzhiyun 				res = 1;
581*4882a593Smuzhiyun 			}
582*4882a593Smuzhiyun 			pc++;
583*4882a593Smuzhiyun 			break;
584*4882a593Smuzhiyun 		case DIGIT:
585*4882a593Smuzhiyun 			res = 0;
586*4882a593Smuzhiyun 			if (*ofs < len && isdigit(((unsigned char *)s)[*ofs])) {
587*4882a593Smuzhiyun 				(*ofs)++;
588*4882a593Smuzhiyun 				res = 1;
589*4882a593Smuzhiyun 			}
590*4882a593Smuzhiyun 			pc++;
591*4882a593Smuzhiyun 			break;
592*4882a593Smuzhiyun 		case ANY:
593*4882a593Smuzhiyun 			res = 0;
594*4882a593Smuzhiyun 			if (*ofs < len) {
595*4882a593Smuzhiyun 				(*ofs)++;
596*4882a593Smuzhiyun 				res = 1;
597*4882a593Smuzhiyun 			}
598*4882a593Smuzhiyun 			pc++;
599*4882a593Smuzhiyun 			break;
600*4882a593Smuzhiyun 		case ANYOF:
601*4882a593Smuzhiyun 			res = 0;
602*4882a593Smuzhiyun 			if (*ofs < len)
603*4882a593Smuzhiyun 				res = is_any_of(r->data + r->code[pc + 1],
604*4882a593Smuzhiyun 					r->code[pc + 2], s, ofs);
605*4882a593Smuzhiyun 			pc += 3;
606*4882a593Smuzhiyun 			break;
607*4882a593Smuzhiyun 		case ANYBUT:
608*4882a593Smuzhiyun 			res = 0;
609*4882a593Smuzhiyun 			if (*ofs < len)
610*4882a593Smuzhiyun 				res = is_any_but(r->data + r->code[pc + 1],
611*4882a593Smuzhiyun 					r->code[pc + 2], s, ofs);
612*4882a593Smuzhiyun 			pc += 3;
613*4882a593Smuzhiyun 			break;
614*4882a593Smuzhiyun 		case BOL:
615*4882a593Smuzhiyun 			res = *ofs == 0 ? 1 : 0;
616*4882a593Smuzhiyun 			pc++;
617*4882a593Smuzhiyun 			break;
618*4882a593Smuzhiyun 		case EOL:
619*4882a593Smuzhiyun 			res = *ofs == len ? 1 : 0;
620*4882a593Smuzhiyun 			pc++;
621*4882a593Smuzhiyun 			break;
622*4882a593Smuzhiyun 		case OPEN:
623*4882a593Smuzhiyun 			if (caps != NULL)
624*4882a593Smuzhiyun 				caps[r->code[pc + 1]].ptr = s + *ofs;
625*4882a593Smuzhiyun 			pc += 2;
626*4882a593Smuzhiyun 			break;
627*4882a593Smuzhiyun 		case CLOSE:
628*4882a593Smuzhiyun 			if (caps != NULL)
629*4882a593Smuzhiyun 				caps[r->code[pc + 1]].len = (s + *ofs) -
630*4882a593Smuzhiyun 				    caps[r->code[pc + 1]].ptr;
631*4882a593Smuzhiyun 			pc += 2;
632*4882a593Smuzhiyun 			break;
633*4882a593Smuzhiyun 		case END:
634*4882a593Smuzhiyun 			pc++;
635*4882a593Smuzhiyun 			break;
636*4882a593Smuzhiyun 		default:
637*4882a593Smuzhiyun 			printf("unknown cmd (%d) at %d\n", r->code[pc], pc);
638*4882a593Smuzhiyun 			assert(0);
639*4882a593Smuzhiyun 			break;
640*4882a593Smuzhiyun 		}
641*4882a593Smuzhiyun 	}
642*4882a593Smuzhiyun 
643*4882a593Smuzhiyun 	return res;
644*4882a593Smuzhiyun }
645*4882a593Smuzhiyun 
646*4882a593Smuzhiyun int
slre_match(const struct slre * r,const char * buf,int len,struct cap * caps)647*4882a593Smuzhiyun slre_match(const struct slre *r, const char *buf, int len,
648*4882a593Smuzhiyun 		struct cap *caps)
649*4882a593Smuzhiyun {
650*4882a593Smuzhiyun 	int	i, ofs = 0, res = 0;
651*4882a593Smuzhiyun 
652*4882a593Smuzhiyun 	if (r->anchored) {
653*4882a593Smuzhiyun 		res = match(r, 0, buf, len, &ofs, caps);
654*4882a593Smuzhiyun 	} else {
655*4882a593Smuzhiyun 		for (i = 0; i < len && res == 0; i++) {
656*4882a593Smuzhiyun 			ofs = i;
657*4882a593Smuzhiyun 			res = match(r, 0, buf, len, &ofs, caps);
658*4882a593Smuzhiyun 		}
659*4882a593Smuzhiyun 	}
660*4882a593Smuzhiyun 
661*4882a593Smuzhiyun 	return res;
662*4882a593Smuzhiyun }
663*4882a593Smuzhiyun 
664*4882a593Smuzhiyun #ifdef SLRE_TEST
665*4882a593Smuzhiyun #define N_CAPS	5
666*4882a593Smuzhiyun 
main(int argc,char * argv[])667*4882a593Smuzhiyun int main(int argc, char *argv[])
668*4882a593Smuzhiyun {
669*4882a593Smuzhiyun 	struct slre	slre;
670*4882a593Smuzhiyun 	struct cap	caps[N_CAPS];
671*4882a593Smuzhiyun 	unsigned char	data[1 * 1024 * 1024];
672*4882a593Smuzhiyun 	FILE		*fp;
673*4882a593Smuzhiyun 	int		i, res, len;
674*4882a593Smuzhiyun 
675*4882a593Smuzhiyun 	if (argc < 2) {
676*4882a593Smuzhiyun 		fprintf(stderr, "Usage: %s 'slre' <file>\n", argv[0]);
677*4882a593Smuzhiyun 		return 1;
678*4882a593Smuzhiyun 	}
679*4882a593Smuzhiyun 
680*4882a593Smuzhiyun 	fp = fopen(argv[2], "rb");
681*4882a593Smuzhiyun 	if (fp == NULL) {
682*4882a593Smuzhiyun 		fprintf(stderr, "Error: cannot open %s:%s\n",
683*4882a593Smuzhiyun 			argv[2], strerror(errno));
684*4882a593Smuzhiyun 		return 1;
685*4882a593Smuzhiyun 	}
686*4882a593Smuzhiyun 
687*4882a593Smuzhiyun 	if (!slre_compile(&slre, argv[1])) {
688*4882a593Smuzhiyun 		fprintf(stderr, "Error compiling slre: %s\n", slre.err_str);
689*4882a593Smuzhiyun 		return 1;
690*4882a593Smuzhiyun 	}
691*4882a593Smuzhiyun 
692*4882a593Smuzhiyun 	slre_dump(&slre, stderr);
693*4882a593Smuzhiyun 
694*4882a593Smuzhiyun 	while (fgets(data, sizeof(data), fp) != NULL) {
695*4882a593Smuzhiyun 		len = strlen(data);
696*4882a593Smuzhiyun 
697*4882a593Smuzhiyun 		if ((len > 0) && (data[len-1] == '\n')) {
698*4882a593Smuzhiyun 			data[len-1] = '\0';
699*4882a593Smuzhiyun 			--len;
700*4882a593Smuzhiyun 		}
701*4882a593Smuzhiyun 
702*4882a593Smuzhiyun 		printf("Data = \"%s\"\n", data);
703*4882a593Smuzhiyun 
704*4882a593Smuzhiyun 		(void) memset(caps, 0, sizeof(caps));
705*4882a593Smuzhiyun 
706*4882a593Smuzhiyun 		res = 0;
707*4882a593Smuzhiyun 
708*4882a593Smuzhiyun 		res = slre_match(&slre, data, len, caps);
709*4882a593Smuzhiyun 		printf("Result [%d]: %d\n", i, res);
710*4882a593Smuzhiyun 
711*4882a593Smuzhiyun 		for (i = 0; i < N_CAPS; i++) {
712*4882a593Smuzhiyun 			if (caps[i].len > 0) {
713*4882a593Smuzhiyun 				printf("Substring %d: len=%d  [%.*s]\n", i,
714*4882a593Smuzhiyun 					caps[i].len,
715*4882a593Smuzhiyun 					caps[i].len, caps[i].ptr);
716*4882a593Smuzhiyun 			}
717*4882a593Smuzhiyun 		}
718*4882a593Smuzhiyun 		printf("----------------------------------------------------\n");
719*4882a593Smuzhiyun 	}
720*4882a593Smuzhiyun 	(void) fclose(fp);
721*4882a593Smuzhiyun 
722*4882a593Smuzhiyun 	return 0;
723*4882a593Smuzhiyun }
724*4882a593Smuzhiyun #endif /* SLRE_TEST */
725