1*4882a593Smuzhiyun /*
2*4882a593Smuzhiyun * Copyright (c) 2004-2005 Sergey Lyubka <valenok@gmail.com>
3*4882a593Smuzhiyun * All rights reserved
4*4882a593Smuzhiyun *
5*4882a593Smuzhiyun * "THE BEER-WARE LICENSE" (Revision 42):
6*4882a593Smuzhiyun * Sergey Lyubka wrote this file. As long as you retain this notice you
7*4882a593Smuzhiyun * can do whatever you want with this stuff. If we meet some day, and you think
8*4882a593Smuzhiyun * this stuff is worth it, you can buy me a beer in return.
9*4882a593Smuzhiyun */
10*4882a593Smuzhiyun
11*4882a593Smuzhiyun /*
12*4882a593Smuzhiyun * Downloaded Sat Nov 5 17:43:06 CET 2011 at
13*4882a593Smuzhiyun * http://slre.sourceforge.net/1.0/slre.c
14*4882a593Smuzhiyun */
15*4882a593Smuzhiyun
16*4882a593Smuzhiyun #ifdef SLRE_TEST
17*4882a593Smuzhiyun #include <stdio.h>
18*4882a593Smuzhiyun #include <assert.h>
19*4882a593Smuzhiyun #include <ctype.h>
20*4882a593Smuzhiyun #include <stdlib.h>
21*4882a593Smuzhiyun #include <string.h>
22*4882a593Smuzhiyun #else
23*4882a593Smuzhiyun #include <common.h>
24*4882a593Smuzhiyun #include <linux/ctype.h>
25*4882a593Smuzhiyun #endif /* SLRE_TEST */
26*4882a593Smuzhiyun
27*4882a593Smuzhiyun #include <errno.h>
28*4882a593Smuzhiyun
29*4882a593Smuzhiyun #include <slre.h>
30*4882a593Smuzhiyun
31*4882a593Smuzhiyun enum {END, BRANCH, ANY, EXACT, ANYOF, ANYBUT, OPEN, CLOSE, BOL, EOL,
32*4882a593Smuzhiyun STAR, PLUS, STARQ, PLUSQ, QUEST, SPACE, NONSPACE, DIGIT};
33*4882a593Smuzhiyun
34*4882a593Smuzhiyun #ifdef SLRE_TEST
35*4882a593Smuzhiyun static struct {
36*4882a593Smuzhiyun const char *name;
37*4882a593Smuzhiyun int narg;
38*4882a593Smuzhiyun const char *flags;
39*4882a593Smuzhiyun } opcodes[] = {
40*4882a593Smuzhiyun {"END", 0, ""}, /* End of code block or program */
41*4882a593Smuzhiyun {"BRANCH", 2, "oo"}, /* Alternative operator, "|" */
42*4882a593Smuzhiyun {"ANY", 0, ""}, /* Match any character, "." */
43*4882a593Smuzhiyun {"EXACT", 2, "d"}, /* Match exact string */
44*4882a593Smuzhiyun {"ANYOF", 2, "D"}, /* Match any from set, "[]" */
45*4882a593Smuzhiyun {"ANYBUT", 2, "D"}, /* Match any but from set, "[^]"*/
46*4882a593Smuzhiyun {"OPEN ", 1, "i"}, /* Capture start, "(" */
47*4882a593Smuzhiyun {"CLOSE", 1, "i"}, /* Capture end, ")" */
48*4882a593Smuzhiyun {"BOL", 0, ""}, /* Beginning of string, "^" */
49*4882a593Smuzhiyun {"EOL", 0, ""}, /* End of string, "$" */
50*4882a593Smuzhiyun {"STAR", 1, "o"}, /* Match zero or more times "*" */
51*4882a593Smuzhiyun {"PLUS", 1, "o"}, /* Match one or more times, "+" */
52*4882a593Smuzhiyun {"STARQ", 1, "o"}, /* Non-greedy STAR, "*?" */
53*4882a593Smuzhiyun {"PLUSQ", 1, "o"}, /* Non-greedy PLUS, "+?" */
54*4882a593Smuzhiyun {"QUEST", 1, "o"}, /* Match zero or one time, "?" */
55*4882a593Smuzhiyun {"SPACE", 0, ""}, /* Match whitespace, "\s" */
56*4882a593Smuzhiyun {"NONSPACE", 0, ""}, /* Match non-space, "\S" */
57*4882a593Smuzhiyun {"DIGIT", 0, ""} /* Match digit, "\d" */
58*4882a593Smuzhiyun };
59*4882a593Smuzhiyun #endif /* SLRE_TEST */
60*4882a593Smuzhiyun
61*4882a593Smuzhiyun /*
62*4882a593Smuzhiyun * Commands and operands are all unsigned char (1 byte long). All code offsets
63*4882a593Smuzhiyun * are relative to current address, and positive (always point forward). Data
64*4882a593Smuzhiyun * offsets are absolute. Commands with operands:
65*4882a593Smuzhiyun *
66*4882a593Smuzhiyun * BRANCH offset1 offset2
67*4882a593Smuzhiyun * Try to match the code block that follows the BRANCH instruction
68*4882a593Smuzhiyun * (code block ends with END). If no match, try to match code block that
69*4882a593Smuzhiyun * starts at offset1. If either of these match, jump to offset2.
70*4882a593Smuzhiyun *
71*4882a593Smuzhiyun * EXACT data_offset data_length
72*4882a593Smuzhiyun * Try to match exact string. String is recorded in data section from
73*4882a593Smuzhiyun * data_offset, and has length data_length.
74*4882a593Smuzhiyun *
75*4882a593Smuzhiyun * OPEN capture_number
76*4882a593Smuzhiyun * CLOSE capture_number
77*4882a593Smuzhiyun * If the user have passed 'struct cap' array for captures, OPEN
78*4882a593Smuzhiyun * records the beginning of the matched substring (cap->ptr), CLOSE
79*4882a593Smuzhiyun * sets the length (cap->len) for respective capture_number.
80*4882a593Smuzhiyun *
81*4882a593Smuzhiyun * STAR code_offset
82*4882a593Smuzhiyun * PLUS code_offset
83*4882a593Smuzhiyun * QUEST code_offset
84*4882a593Smuzhiyun * *, +, ?, respectively. Try to gobble as much as possible from the
85*4882a593Smuzhiyun * matched buffer, until code block that follows these instructions
86*4882a593Smuzhiyun * matches. When the longest possible string is matched,
87*4882a593Smuzhiyun * jump to code_offset
88*4882a593Smuzhiyun *
89*4882a593Smuzhiyun * STARQ, PLUSQ are non-greedy versions of STAR and PLUS.
90*4882a593Smuzhiyun */
91*4882a593Smuzhiyun
92*4882a593Smuzhiyun static const char *meta_chars = "|.^$*+?()[\\";
93*4882a593Smuzhiyun
94*4882a593Smuzhiyun #ifdef SLRE_TEST
95*4882a593Smuzhiyun
96*4882a593Smuzhiyun static void
print_character_set(FILE * fp,const unsigned char * p,int len)97*4882a593Smuzhiyun print_character_set(FILE *fp, const unsigned char *p, int len)
98*4882a593Smuzhiyun {
99*4882a593Smuzhiyun int i;
100*4882a593Smuzhiyun
101*4882a593Smuzhiyun for (i = 0; i < len; i++) {
102*4882a593Smuzhiyun if (i > 0)
103*4882a593Smuzhiyun (void) fputc(',', fp);
104*4882a593Smuzhiyun if (p[i] == 0) {
105*4882a593Smuzhiyun i++;
106*4882a593Smuzhiyun if (p[i] == 0)
107*4882a593Smuzhiyun (void) fprintf(fp, "\\x%02x", p[i]);
108*4882a593Smuzhiyun else
109*4882a593Smuzhiyun (void) fprintf(fp, "%s", opcodes[p[i]].name);
110*4882a593Smuzhiyun } else if (isprint(p[i])) {
111*4882a593Smuzhiyun (void) fputc(p[i], fp);
112*4882a593Smuzhiyun } else {
113*4882a593Smuzhiyun (void) fprintf(fp, "\\x%02x", p[i]);
114*4882a593Smuzhiyun }
115*4882a593Smuzhiyun }
116*4882a593Smuzhiyun }
117*4882a593Smuzhiyun
118*4882a593Smuzhiyun void
slre_dump(const struct slre * r,FILE * fp)119*4882a593Smuzhiyun slre_dump(const struct slre *r, FILE *fp)
120*4882a593Smuzhiyun {
121*4882a593Smuzhiyun int i, j, ch, op, pc;
122*4882a593Smuzhiyun
123*4882a593Smuzhiyun for (pc = 0; pc < r->code_size; pc++) {
124*4882a593Smuzhiyun
125*4882a593Smuzhiyun op = r->code[pc];
126*4882a593Smuzhiyun (void) fprintf(fp, "%3d %s ", pc, opcodes[op].name);
127*4882a593Smuzhiyun
128*4882a593Smuzhiyun for (i = 0; opcodes[op].flags[i] != '\0'; i++)
129*4882a593Smuzhiyun switch (opcodes[op].flags[i]) {
130*4882a593Smuzhiyun case 'i':
131*4882a593Smuzhiyun (void) fprintf(fp, "%d ", r->code[pc + 1]);
132*4882a593Smuzhiyun pc++;
133*4882a593Smuzhiyun break;
134*4882a593Smuzhiyun case 'o':
135*4882a593Smuzhiyun (void) fprintf(fp, "%d ",
136*4882a593Smuzhiyun pc + r->code[pc + 1] - i);
137*4882a593Smuzhiyun pc++;
138*4882a593Smuzhiyun break;
139*4882a593Smuzhiyun case 'D':
140*4882a593Smuzhiyun print_character_set(fp, r->data +
141*4882a593Smuzhiyun r->code[pc + 1], r->code[pc + 2]);
142*4882a593Smuzhiyun pc += 2;
143*4882a593Smuzhiyun break;
144*4882a593Smuzhiyun case 'd':
145*4882a593Smuzhiyun (void) fputc('"', fp);
146*4882a593Smuzhiyun for (j = 0; j < r->code[pc + 2]; j++) {
147*4882a593Smuzhiyun ch = r->data[r->code[pc + 1] + j];
148*4882a593Smuzhiyun if (isprint(ch)) {
149*4882a593Smuzhiyun (void) fputc(ch, fp);
150*4882a593Smuzhiyun } else {
151*4882a593Smuzhiyun (void) fprintf(fp,
152*4882a593Smuzhiyun "\\x%02x", ch);
153*4882a593Smuzhiyun }
154*4882a593Smuzhiyun }
155*4882a593Smuzhiyun (void) fputc('"', fp);
156*4882a593Smuzhiyun pc += 2;
157*4882a593Smuzhiyun break;
158*4882a593Smuzhiyun }
159*4882a593Smuzhiyun
160*4882a593Smuzhiyun (void) fputc('\n', fp);
161*4882a593Smuzhiyun }
162*4882a593Smuzhiyun }
163*4882a593Smuzhiyun #endif /* SLRE_TEST */
164*4882a593Smuzhiyun
165*4882a593Smuzhiyun static void
set_jump_offset(struct slre * r,int pc,int offset)166*4882a593Smuzhiyun set_jump_offset(struct slre *r, int pc, int offset)
167*4882a593Smuzhiyun {
168*4882a593Smuzhiyun assert(offset < r->code_size);
169*4882a593Smuzhiyun
170*4882a593Smuzhiyun if (r->code_size - offset > 0xff)
171*4882a593Smuzhiyun r->err_str = "Jump offset is too big";
172*4882a593Smuzhiyun else
173*4882a593Smuzhiyun r->code[pc] = (unsigned char) (r->code_size - offset);
174*4882a593Smuzhiyun }
175*4882a593Smuzhiyun
176*4882a593Smuzhiyun static void
emit(struct slre * r,int code)177*4882a593Smuzhiyun emit(struct slre *r, int code)
178*4882a593Smuzhiyun {
179*4882a593Smuzhiyun if (r->code_size >= (int) (sizeof(r->code) / sizeof(r->code[0])))
180*4882a593Smuzhiyun r->err_str = "RE is too long (code overflow)";
181*4882a593Smuzhiyun else
182*4882a593Smuzhiyun r->code[r->code_size++] = (unsigned char) code;
183*4882a593Smuzhiyun }
184*4882a593Smuzhiyun
185*4882a593Smuzhiyun static void
store_char_in_data(struct slre * r,int ch)186*4882a593Smuzhiyun store_char_in_data(struct slre *r, int ch)
187*4882a593Smuzhiyun {
188*4882a593Smuzhiyun if (r->data_size >= (int) sizeof(r->data))
189*4882a593Smuzhiyun r->err_str = "RE is too long (data overflow)";
190*4882a593Smuzhiyun else
191*4882a593Smuzhiyun r->data[r->data_size++] = ch;
192*4882a593Smuzhiyun }
193*4882a593Smuzhiyun
194*4882a593Smuzhiyun static void
exact(struct slre * r,const char ** re)195*4882a593Smuzhiyun exact(struct slre *r, const char **re)
196*4882a593Smuzhiyun {
197*4882a593Smuzhiyun int old_data_size = r->data_size;
198*4882a593Smuzhiyun
199*4882a593Smuzhiyun while (**re != '\0' && (strchr(meta_chars, **re)) == NULL)
200*4882a593Smuzhiyun store_char_in_data(r, *(*re)++);
201*4882a593Smuzhiyun
202*4882a593Smuzhiyun emit(r, EXACT);
203*4882a593Smuzhiyun emit(r, old_data_size);
204*4882a593Smuzhiyun emit(r, r->data_size - old_data_size);
205*4882a593Smuzhiyun }
206*4882a593Smuzhiyun
207*4882a593Smuzhiyun static int
get_escape_char(const char ** re)208*4882a593Smuzhiyun get_escape_char(const char **re)
209*4882a593Smuzhiyun {
210*4882a593Smuzhiyun int res;
211*4882a593Smuzhiyun
212*4882a593Smuzhiyun switch (*(*re)++) {
213*4882a593Smuzhiyun case 'n':
214*4882a593Smuzhiyun res = '\n';
215*4882a593Smuzhiyun break;
216*4882a593Smuzhiyun case 'r':
217*4882a593Smuzhiyun res = '\r';
218*4882a593Smuzhiyun break;
219*4882a593Smuzhiyun case 't':
220*4882a593Smuzhiyun res = '\t';
221*4882a593Smuzhiyun break;
222*4882a593Smuzhiyun case '0':
223*4882a593Smuzhiyun res = 0;
224*4882a593Smuzhiyun break;
225*4882a593Smuzhiyun case 'S':
226*4882a593Smuzhiyun res = NONSPACE << 8;
227*4882a593Smuzhiyun break;
228*4882a593Smuzhiyun case 's':
229*4882a593Smuzhiyun res = SPACE << 8;
230*4882a593Smuzhiyun break;
231*4882a593Smuzhiyun case 'd':
232*4882a593Smuzhiyun res = DIGIT << 8;
233*4882a593Smuzhiyun break;
234*4882a593Smuzhiyun default:
235*4882a593Smuzhiyun res = (*re)[-1];
236*4882a593Smuzhiyun break;
237*4882a593Smuzhiyun }
238*4882a593Smuzhiyun
239*4882a593Smuzhiyun return res;
240*4882a593Smuzhiyun }
241*4882a593Smuzhiyun
242*4882a593Smuzhiyun static void
anyof(struct slre * r,const char ** re)243*4882a593Smuzhiyun anyof(struct slre *r, const char **re)
244*4882a593Smuzhiyun {
245*4882a593Smuzhiyun int esc, old_data_size = r->data_size, op = ANYOF;
246*4882a593Smuzhiyun
247*4882a593Smuzhiyun if (**re == '^') {
248*4882a593Smuzhiyun op = ANYBUT;
249*4882a593Smuzhiyun (*re)++;
250*4882a593Smuzhiyun }
251*4882a593Smuzhiyun
252*4882a593Smuzhiyun while (**re != '\0')
253*4882a593Smuzhiyun
254*4882a593Smuzhiyun switch (*(*re)++) {
255*4882a593Smuzhiyun case ']':
256*4882a593Smuzhiyun emit(r, op);
257*4882a593Smuzhiyun emit(r, old_data_size);
258*4882a593Smuzhiyun emit(r, r->data_size - old_data_size);
259*4882a593Smuzhiyun return;
260*4882a593Smuzhiyun /* NOTREACHED */
261*4882a593Smuzhiyun break;
262*4882a593Smuzhiyun case '\\':
263*4882a593Smuzhiyun esc = get_escape_char(re);
264*4882a593Smuzhiyun if ((esc & 0xff) == 0) {
265*4882a593Smuzhiyun store_char_in_data(r, 0);
266*4882a593Smuzhiyun store_char_in_data(r, esc >> 8);
267*4882a593Smuzhiyun } else {
268*4882a593Smuzhiyun store_char_in_data(r, esc);
269*4882a593Smuzhiyun }
270*4882a593Smuzhiyun break;
271*4882a593Smuzhiyun default:
272*4882a593Smuzhiyun store_char_in_data(r, (*re)[-1]);
273*4882a593Smuzhiyun break;
274*4882a593Smuzhiyun }
275*4882a593Smuzhiyun
276*4882a593Smuzhiyun r->err_str = "No closing ']' bracket";
277*4882a593Smuzhiyun }
278*4882a593Smuzhiyun
279*4882a593Smuzhiyun static void
relocate(struct slre * r,int begin,int shift)280*4882a593Smuzhiyun relocate(struct slre *r, int begin, int shift)
281*4882a593Smuzhiyun {
282*4882a593Smuzhiyun emit(r, END);
283*4882a593Smuzhiyun memmove(r->code + begin + shift, r->code + begin, r->code_size - begin);
284*4882a593Smuzhiyun r->code_size += shift;
285*4882a593Smuzhiyun }
286*4882a593Smuzhiyun
287*4882a593Smuzhiyun static void
quantifier(struct slre * r,int prev,int op)288*4882a593Smuzhiyun quantifier(struct slre *r, int prev, int op)
289*4882a593Smuzhiyun {
290*4882a593Smuzhiyun if (r->code[prev] == EXACT && r->code[prev + 2] > 1) {
291*4882a593Smuzhiyun r->code[prev + 2]--;
292*4882a593Smuzhiyun emit(r, EXACT);
293*4882a593Smuzhiyun emit(r, r->code[prev + 1] + r->code[prev + 2]);
294*4882a593Smuzhiyun emit(r, 1);
295*4882a593Smuzhiyun prev = r->code_size - 3;
296*4882a593Smuzhiyun }
297*4882a593Smuzhiyun relocate(r, prev, 2);
298*4882a593Smuzhiyun r->code[prev] = op;
299*4882a593Smuzhiyun set_jump_offset(r, prev + 1, prev);
300*4882a593Smuzhiyun }
301*4882a593Smuzhiyun
302*4882a593Smuzhiyun static void
exact_one_char(struct slre * r,int ch)303*4882a593Smuzhiyun exact_one_char(struct slre *r, int ch)
304*4882a593Smuzhiyun {
305*4882a593Smuzhiyun emit(r, EXACT);
306*4882a593Smuzhiyun emit(r, r->data_size);
307*4882a593Smuzhiyun emit(r, 1);
308*4882a593Smuzhiyun store_char_in_data(r, ch);
309*4882a593Smuzhiyun }
310*4882a593Smuzhiyun
311*4882a593Smuzhiyun static void
fixup_branch(struct slre * r,int fixup)312*4882a593Smuzhiyun fixup_branch(struct slre *r, int fixup)
313*4882a593Smuzhiyun {
314*4882a593Smuzhiyun if (fixup > 0) {
315*4882a593Smuzhiyun emit(r, END);
316*4882a593Smuzhiyun set_jump_offset(r, fixup, fixup - 2);
317*4882a593Smuzhiyun }
318*4882a593Smuzhiyun }
319*4882a593Smuzhiyun
320*4882a593Smuzhiyun static void
compile(struct slre * r,const char ** re)321*4882a593Smuzhiyun compile(struct slre *r, const char **re)
322*4882a593Smuzhiyun {
323*4882a593Smuzhiyun int op, esc, branch_start, last_op, fixup, cap_no, level;
324*4882a593Smuzhiyun
325*4882a593Smuzhiyun fixup = 0;
326*4882a593Smuzhiyun level = r->num_caps;
327*4882a593Smuzhiyun branch_start = last_op = r->code_size;
328*4882a593Smuzhiyun
329*4882a593Smuzhiyun for (;;)
330*4882a593Smuzhiyun switch (*(*re)++) {
331*4882a593Smuzhiyun case '\0':
332*4882a593Smuzhiyun (*re)--;
333*4882a593Smuzhiyun return;
334*4882a593Smuzhiyun /* NOTREACHED */
335*4882a593Smuzhiyun break;
336*4882a593Smuzhiyun case '^':
337*4882a593Smuzhiyun emit(r, BOL);
338*4882a593Smuzhiyun break;
339*4882a593Smuzhiyun case '$':
340*4882a593Smuzhiyun emit(r, EOL);
341*4882a593Smuzhiyun break;
342*4882a593Smuzhiyun case '.':
343*4882a593Smuzhiyun last_op = r->code_size;
344*4882a593Smuzhiyun emit(r, ANY);
345*4882a593Smuzhiyun break;
346*4882a593Smuzhiyun case '[':
347*4882a593Smuzhiyun last_op = r->code_size;
348*4882a593Smuzhiyun anyof(r, re);
349*4882a593Smuzhiyun break;
350*4882a593Smuzhiyun case '\\':
351*4882a593Smuzhiyun last_op = r->code_size;
352*4882a593Smuzhiyun esc = get_escape_char(re);
353*4882a593Smuzhiyun if (esc & 0xff00)
354*4882a593Smuzhiyun emit(r, esc >> 8);
355*4882a593Smuzhiyun else
356*4882a593Smuzhiyun exact_one_char(r, esc);
357*4882a593Smuzhiyun break;
358*4882a593Smuzhiyun case '(':
359*4882a593Smuzhiyun last_op = r->code_size;
360*4882a593Smuzhiyun cap_no = ++r->num_caps;
361*4882a593Smuzhiyun emit(r, OPEN);
362*4882a593Smuzhiyun emit(r, cap_no);
363*4882a593Smuzhiyun
364*4882a593Smuzhiyun compile(r, re);
365*4882a593Smuzhiyun if (*(*re)++ != ')') {
366*4882a593Smuzhiyun r->err_str = "No closing bracket";
367*4882a593Smuzhiyun return;
368*4882a593Smuzhiyun }
369*4882a593Smuzhiyun
370*4882a593Smuzhiyun emit(r, CLOSE);
371*4882a593Smuzhiyun emit(r, cap_no);
372*4882a593Smuzhiyun break;
373*4882a593Smuzhiyun case ')':
374*4882a593Smuzhiyun (*re)--;
375*4882a593Smuzhiyun fixup_branch(r, fixup);
376*4882a593Smuzhiyun if (level == 0) {
377*4882a593Smuzhiyun r->err_str = "Unbalanced brackets";
378*4882a593Smuzhiyun return;
379*4882a593Smuzhiyun }
380*4882a593Smuzhiyun return;
381*4882a593Smuzhiyun /* NOTREACHED */
382*4882a593Smuzhiyun break;
383*4882a593Smuzhiyun case '+':
384*4882a593Smuzhiyun case '*':
385*4882a593Smuzhiyun op = (*re)[-1] == '*' ? STAR : PLUS;
386*4882a593Smuzhiyun if (**re == '?') {
387*4882a593Smuzhiyun (*re)++;
388*4882a593Smuzhiyun op = op == STAR ? STARQ : PLUSQ;
389*4882a593Smuzhiyun }
390*4882a593Smuzhiyun quantifier(r, last_op, op);
391*4882a593Smuzhiyun break;
392*4882a593Smuzhiyun case '?':
393*4882a593Smuzhiyun quantifier(r, last_op, QUEST);
394*4882a593Smuzhiyun break;
395*4882a593Smuzhiyun case '|':
396*4882a593Smuzhiyun fixup_branch(r, fixup);
397*4882a593Smuzhiyun relocate(r, branch_start, 3);
398*4882a593Smuzhiyun r->code[branch_start] = BRANCH;
399*4882a593Smuzhiyun set_jump_offset(r, branch_start + 1, branch_start);
400*4882a593Smuzhiyun fixup = branch_start + 2;
401*4882a593Smuzhiyun r->code[fixup] = 0xff;
402*4882a593Smuzhiyun break;
403*4882a593Smuzhiyun default:
404*4882a593Smuzhiyun (*re)--;
405*4882a593Smuzhiyun last_op = r->code_size;
406*4882a593Smuzhiyun exact(r, re);
407*4882a593Smuzhiyun break;
408*4882a593Smuzhiyun }
409*4882a593Smuzhiyun }
410*4882a593Smuzhiyun
411*4882a593Smuzhiyun int
slre_compile(struct slre * r,const char * re)412*4882a593Smuzhiyun slre_compile(struct slre *r, const char *re)
413*4882a593Smuzhiyun {
414*4882a593Smuzhiyun r->err_str = NULL;
415*4882a593Smuzhiyun r->code_size = r->data_size = r->num_caps = r->anchored = 0;
416*4882a593Smuzhiyun
417*4882a593Smuzhiyun if (*re == '^')
418*4882a593Smuzhiyun r->anchored++;
419*4882a593Smuzhiyun
420*4882a593Smuzhiyun emit(r, OPEN); /* This will capture what matches full RE */
421*4882a593Smuzhiyun emit(r, 0);
422*4882a593Smuzhiyun
423*4882a593Smuzhiyun while (*re != '\0')
424*4882a593Smuzhiyun compile(r, &re);
425*4882a593Smuzhiyun
426*4882a593Smuzhiyun if (r->code[2] == BRANCH)
427*4882a593Smuzhiyun fixup_branch(r, 4);
428*4882a593Smuzhiyun
429*4882a593Smuzhiyun emit(r, CLOSE);
430*4882a593Smuzhiyun emit(r, 0);
431*4882a593Smuzhiyun emit(r, END);
432*4882a593Smuzhiyun
433*4882a593Smuzhiyun return (r->err_str == NULL ? 1 : 0);
434*4882a593Smuzhiyun }
435*4882a593Smuzhiyun
436*4882a593Smuzhiyun static int match(const struct slre *, int,
437*4882a593Smuzhiyun const char *, int, int *, struct cap *);
438*4882a593Smuzhiyun
439*4882a593Smuzhiyun static void
loop_greedy(const struct slre * r,int pc,const char * s,int len,int * ofs)440*4882a593Smuzhiyun loop_greedy(const struct slre *r, int pc, const char *s, int len, int *ofs)
441*4882a593Smuzhiyun {
442*4882a593Smuzhiyun int saved_offset, matched_offset;
443*4882a593Smuzhiyun
444*4882a593Smuzhiyun matched_offset = *ofs;
445*4882a593Smuzhiyun
446*4882a593Smuzhiyun while (match(r, pc + 2, s, len, ofs, NULL)) {
447*4882a593Smuzhiyun saved_offset = *ofs;
448*4882a593Smuzhiyun if (match(r, pc + r->code[pc + 1], s, len, ofs, NULL))
449*4882a593Smuzhiyun matched_offset = saved_offset;
450*4882a593Smuzhiyun *ofs = saved_offset;
451*4882a593Smuzhiyun }
452*4882a593Smuzhiyun
453*4882a593Smuzhiyun *ofs = matched_offset;
454*4882a593Smuzhiyun }
455*4882a593Smuzhiyun
456*4882a593Smuzhiyun static void
loop_non_greedy(const struct slre * r,int pc,const char * s,int len,int * ofs)457*4882a593Smuzhiyun loop_non_greedy(const struct slre *r, int pc, const char *s, int len, int *ofs)
458*4882a593Smuzhiyun {
459*4882a593Smuzhiyun int saved_offset = *ofs;
460*4882a593Smuzhiyun
461*4882a593Smuzhiyun while (match(r, pc + 2, s, len, ofs, NULL)) {
462*4882a593Smuzhiyun saved_offset = *ofs;
463*4882a593Smuzhiyun if (match(r, pc + r->code[pc + 1], s, len, ofs, NULL))
464*4882a593Smuzhiyun break;
465*4882a593Smuzhiyun }
466*4882a593Smuzhiyun
467*4882a593Smuzhiyun *ofs = saved_offset;
468*4882a593Smuzhiyun }
469*4882a593Smuzhiyun
470*4882a593Smuzhiyun static int
is_any_of(const unsigned char * p,int len,const char * s,int * ofs)471*4882a593Smuzhiyun is_any_of(const unsigned char *p, int len, const char *s, int *ofs)
472*4882a593Smuzhiyun {
473*4882a593Smuzhiyun int i, ch;
474*4882a593Smuzhiyun
475*4882a593Smuzhiyun ch = s[*ofs];
476*4882a593Smuzhiyun
477*4882a593Smuzhiyun for (i = 0; i < len; i++)
478*4882a593Smuzhiyun if (p[i] == ch) {
479*4882a593Smuzhiyun (*ofs)++;
480*4882a593Smuzhiyun return 1;
481*4882a593Smuzhiyun }
482*4882a593Smuzhiyun
483*4882a593Smuzhiyun return 0;
484*4882a593Smuzhiyun }
485*4882a593Smuzhiyun
486*4882a593Smuzhiyun static int
is_any_but(const unsigned char * p,int len,const char * s,int * ofs)487*4882a593Smuzhiyun is_any_but(const unsigned char *p, int len, const char *s, int *ofs)
488*4882a593Smuzhiyun {
489*4882a593Smuzhiyun int i, ch;
490*4882a593Smuzhiyun
491*4882a593Smuzhiyun ch = s[*ofs];
492*4882a593Smuzhiyun
493*4882a593Smuzhiyun for (i = 0; i < len; i++) {
494*4882a593Smuzhiyun if (p[i] == ch)
495*4882a593Smuzhiyun return 0;
496*4882a593Smuzhiyun }
497*4882a593Smuzhiyun
498*4882a593Smuzhiyun (*ofs)++;
499*4882a593Smuzhiyun return 1;
500*4882a593Smuzhiyun }
501*4882a593Smuzhiyun
502*4882a593Smuzhiyun static int
match(const struct slre * r,int pc,const char * s,int len,int * ofs,struct cap * caps)503*4882a593Smuzhiyun match(const struct slre *r, int pc, const char *s, int len,
504*4882a593Smuzhiyun int *ofs, struct cap *caps)
505*4882a593Smuzhiyun {
506*4882a593Smuzhiyun int n, saved_offset, res = 1;
507*4882a593Smuzhiyun
508*4882a593Smuzhiyun while (res && r->code[pc] != END) {
509*4882a593Smuzhiyun
510*4882a593Smuzhiyun assert(pc < r->code_size);
511*4882a593Smuzhiyun assert(pc < (int) (sizeof(r->code) / sizeof(r->code[0])));
512*4882a593Smuzhiyun
513*4882a593Smuzhiyun switch (r->code[pc]) {
514*4882a593Smuzhiyun case BRANCH:
515*4882a593Smuzhiyun saved_offset = *ofs;
516*4882a593Smuzhiyun res = match(r, pc + 3, s, len, ofs, caps);
517*4882a593Smuzhiyun if (res == 0) {
518*4882a593Smuzhiyun *ofs = saved_offset;
519*4882a593Smuzhiyun res = match(r, pc + r->code[pc + 1],
520*4882a593Smuzhiyun s, len, ofs, caps);
521*4882a593Smuzhiyun }
522*4882a593Smuzhiyun pc += r->code[pc + 2];
523*4882a593Smuzhiyun break;
524*4882a593Smuzhiyun case EXACT:
525*4882a593Smuzhiyun res = 0;
526*4882a593Smuzhiyun n = r->code[pc + 2]; /* String length */
527*4882a593Smuzhiyun if (n <= len - *ofs && !memcmp(s + *ofs, r->data +
528*4882a593Smuzhiyun r->code[pc + 1], n)) {
529*4882a593Smuzhiyun (*ofs) += n;
530*4882a593Smuzhiyun res = 1;
531*4882a593Smuzhiyun }
532*4882a593Smuzhiyun pc += 3;
533*4882a593Smuzhiyun break;
534*4882a593Smuzhiyun case QUEST:
535*4882a593Smuzhiyun res = 1;
536*4882a593Smuzhiyun saved_offset = *ofs;
537*4882a593Smuzhiyun if (!match(r, pc + 2, s, len, ofs, caps))
538*4882a593Smuzhiyun *ofs = saved_offset;
539*4882a593Smuzhiyun pc += r->code[pc + 1];
540*4882a593Smuzhiyun break;
541*4882a593Smuzhiyun case STAR:
542*4882a593Smuzhiyun res = 1;
543*4882a593Smuzhiyun loop_greedy(r, pc, s, len, ofs);
544*4882a593Smuzhiyun pc += r->code[pc + 1];
545*4882a593Smuzhiyun break;
546*4882a593Smuzhiyun case STARQ:
547*4882a593Smuzhiyun res = 1;
548*4882a593Smuzhiyun loop_non_greedy(r, pc, s, len, ofs);
549*4882a593Smuzhiyun pc += r->code[pc + 1];
550*4882a593Smuzhiyun break;
551*4882a593Smuzhiyun case PLUS:
552*4882a593Smuzhiyun res = match(r, pc + 2, s, len, ofs, caps);
553*4882a593Smuzhiyun if (res == 0)
554*4882a593Smuzhiyun break;
555*4882a593Smuzhiyun
556*4882a593Smuzhiyun loop_greedy(r, pc, s, len, ofs);
557*4882a593Smuzhiyun pc += r->code[pc + 1];
558*4882a593Smuzhiyun break;
559*4882a593Smuzhiyun case PLUSQ:
560*4882a593Smuzhiyun res = match(r, pc + 2, s, len, ofs, caps);
561*4882a593Smuzhiyun if (res == 0)
562*4882a593Smuzhiyun break;
563*4882a593Smuzhiyun
564*4882a593Smuzhiyun loop_non_greedy(r, pc, s, len, ofs);
565*4882a593Smuzhiyun pc += r->code[pc + 1];
566*4882a593Smuzhiyun break;
567*4882a593Smuzhiyun case SPACE:
568*4882a593Smuzhiyun res = 0;
569*4882a593Smuzhiyun if (*ofs < len && isspace(((unsigned char *)s)[*ofs])) {
570*4882a593Smuzhiyun (*ofs)++;
571*4882a593Smuzhiyun res = 1;
572*4882a593Smuzhiyun }
573*4882a593Smuzhiyun pc++;
574*4882a593Smuzhiyun break;
575*4882a593Smuzhiyun case NONSPACE:
576*4882a593Smuzhiyun res = 0;
577*4882a593Smuzhiyun if (*ofs < len &&
578*4882a593Smuzhiyun !isspace(((unsigned char *)s)[*ofs])) {
579*4882a593Smuzhiyun (*ofs)++;
580*4882a593Smuzhiyun res = 1;
581*4882a593Smuzhiyun }
582*4882a593Smuzhiyun pc++;
583*4882a593Smuzhiyun break;
584*4882a593Smuzhiyun case DIGIT:
585*4882a593Smuzhiyun res = 0;
586*4882a593Smuzhiyun if (*ofs < len && isdigit(((unsigned char *)s)[*ofs])) {
587*4882a593Smuzhiyun (*ofs)++;
588*4882a593Smuzhiyun res = 1;
589*4882a593Smuzhiyun }
590*4882a593Smuzhiyun pc++;
591*4882a593Smuzhiyun break;
592*4882a593Smuzhiyun case ANY:
593*4882a593Smuzhiyun res = 0;
594*4882a593Smuzhiyun if (*ofs < len) {
595*4882a593Smuzhiyun (*ofs)++;
596*4882a593Smuzhiyun res = 1;
597*4882a593Smuzhiyun }
598*4882a593Smuzhiyun pc++;
599*4882a593Smuzhiyun break;
600*4882a593Smuzhiyun case ANYOF:
601*4882a593Smuzhiyun res = 0;
602*4882a593Smuzhiyun if (*ofs < len)
603*4882a593Smuzhiyun res = is_any_of(r->data + r->code[pc + 1],
604*4882a593Smuzhiyun r->code[pc + 2], s, ofs);
605*4882a593Smuzhiyun pc += 3;
606*4882a593Smuzhiyun break;
607*4882a593Smuzhiyun case ANYBUT:
608*4882a593Smuzhiyun res = 0;
609*4882a593Smuzhiyun if (*ofs < len)
610*4882a593Smuzhiyun res = is_any_but(r->data + r->code[pc + 1],
611*4882a593Smuzhiyun r->code[pc + 2], s, ofs);
612*4882a593Smuzhiyun pc += 3;
613*4882a593Smuzhiyun break;
614*4882a593Smuzhiyun case BOL:
615*4882a593Smuzhiyun res = *ofs == 0 ? 1 : 0;
616*4882a593Smuzhiyun pc++;
617*4882a593Smuzhiyun break;
618*4882a593Smuzhiyun case EOL:
619*4882a593Smuzhiyun res = *ofs == len ? 1 : 0;
620*4882a593Smuzhiyun pc++;
621*4882a593Smuzhiyun break;
622*4882a593Smuzhiyun case OPEN:
623*4882a593Smuzhiyun if (caps != NULL)
624*4882a593Smuzhiyun caps[r->code[pc + 1]].ptr = s + *ofs;
625*4882a593Smuzhiyun pc += 2;
626*4882a593Smuzhiyun break;
627*4882a593Smuzhiyun case CLOSE:
628*4882a593Smuzhiyun if (caps != NULL)
629*4882a593Smuzhiyun caps[r->code[pc + 1]].len = (s + *ofs) -
630*4882a593Smuzhiyun caps[r->code[pc + 1]].ptr;
631*4882a593Smuzhiyun pc += 2;
632*4882a593Smuzhiyun break;
633*4882a593Smuzhiyun case END:
634*4882a593Smuzhiyun pc++;
635*4882a593Smuzhiyun break;
636*4882a593Smuzhiyun default:
637*4882a593Smuzhiyun printf("unknown cmd (%d) at %d\n", r->code[pc], pc);
638*4882a593Smuzhiyun assert(0);
639*4882a593Smuzhiyun break;
640*4882a593Smuzhiyun }
641*4882a593Smuzhiyun }
642*4882a593Smuzhiyun
643*4882a593Smuzhiyun return res;
644*4882a593Smuzhiyun }
645*4882a593Smuzhiyun
646*4882a593Smuzhiyun int
slre_match(const struct slre * r,const char * buf,int len,struct cap * caps)647*4882a593Smuzhiyun slre_match(const struct slre *r, const char *buf, int len,
648*4882a593Smuzhiyun struct cap *caps)
649*4882a593Smuzhiyun {
650*4882a593Smuzhiyun int i, ofs = 0, res = 0;
651*4882a593Smuzhiyun
652*4882a593Smuzhiyun if (r->anchored) {
653*4882a593Smuzhiyun res = match(r, 0, buf, len, &ofs, caps);
654*4882a593Smuzhiyun } else {
655*4882a593Smuzhiyun for (i = 0; i < len && res == 0; i++) {
656*4882a593Smuzhiyun ofs = i;
657*4882a593Smuzhiyun res = match(r, 0, buf, len, &ofs, caps);
658*4882a593Smuzhiyun }
659*4882a593Smuzhiyun }
660*4882a593Smuzhiyun
661*4882a593Smuzhiyun return res;
662*4882a593Smuzhiyun }
663*4882a593Smuzhiyun
664*4882a593Smuzhiyun #ifdef SLRE_TEST
665*4882a593Smuzhiyun #define N_CAPS 5
666*4882a593Smuzhiyun
main(int argc,char * argv[])667*4882a593Smuzhiyun int main(int argc, char *argv[])
668*4882a593Smuzhiyun {
669*4882a593Smuzhiyun struct slre slre;
670*4882a593Smuzhiyun struct cap caps[N_CAPS];
671*4882a593Smuzhiyun unsigned char data[1 * 1024 * 1024];
672*4882a593Smuzhiyun FILE *fp;
673*4882a593Smuzhiyun int i, res, len;
674*4882a593Smuzhiyun
675*4882a593Smuzhiyun if (argc < 2) {
676*4882a593Smuzhiyun fprintf(stderr, "Usage: %s 'slre' <file>\n", argv[0]);
677*4882a593Smuzhiyun return 1;
678*4882a593Smuzhiyun }
679*4882a593Smuzhiyun
680*4882a593Smuzhiyun fp = fopen(argv[2], "rb");
681*4882a593Smuzhiyun if (fp == NULL) {
682*4882a593Smuzhiyun fprintf(stderr, "Error: cannot open %s:%s\n",
683*4882a593Smuzhiyun argv[2], strerror(errno));
684*4882a593Smuzhiyun return 1;
685*4882a593Smuzhiyun }
686*4882a593Smuzhiyun
687*4882a593Smuzhiyun if (!slre_compile(&slre, argv[1])) {
688*4882a593Smuzhiyun fprintf(stderr, "Error compiling slre: %s\n", slre.err_str);
689*4882a593Smuzhiyun return 1;
690*4882a593Smuzhiyun }
691*4882a593Smuzhiyun
692*4882a593Smuzhiyun slre_dump(&slre, stderr);
693*4882a593Smuzhiyun
694*4882a593Smuzhiyun while (fgets(data, sizeof(data), fp) != NULL) {
695*4882a593Smuzhiyun len = strlen(data);
696*4882a593Smuzhiyun
697*4882a593Smuzhiyun if ((len > 0) && (data[len-1] == '\n')) {
698*4882a593Smuzhiyun data[len-1] = '\0';
699*4882a593Smuzhiyun --len;
700*4882a593Smuzhiyun }
701*4882a593Smuzhiyun
702*4882a593Smuzhiyun printf("Data = \"%s\"\n", data);
703*4882a593Smuzhiyun
704*4882a593Smuzhiyun (void) memset(caps, 0, sizeof(caps));
705*4882a593Smuzhiyun
706*4882a593Smuzhiyun res = 0;
707*4882a593Smuzhiyun
708*4882a593Smuzhiyun res = slre_match(&slre, data, len, caps);
709*4882a593Smuzhiyun printf("Result [%d]: %d\n", i, res);
710*4882a593Smuzhiyun
711*4882a593Smuzhiyun for (i = 0; i < N_CAPS; i++) {
712*4882a593Smuzhiyun if (caps[i].len > 0) {
713*4882a593Smuzhiyun printf("Substring %d: len=%d [%.*s]\n", i,
714*4882a593Smuzhiyun caps[i].len,
715*4882a593Smuzhiyun caps[i].len, caps[i].ptr);
716*4882a593Smuzhiyun }
717*4882a593Smuzhiyun }
718*4882a593Smuzhiyun printf("----------------------------------------------------\n");
719*4882a593Smuzhiyun }
720*4882a593Smuzhiyun (void) fclose(fp);
721*4882a593Smuzhiyun
722*4882a593Smuzhiyun return 0;
723*4882a593Smuzhiyun }
724*4882a593Smuzhiyun #endif /* SLRE_TEST */
725