1117 lines
34 KiB
C++
1117 lines
34 KiB
C++
/* e_regex.cpp
|
|
*
|
|
* Copyright (c) 1994-1996, Marko Macek
|
|
*
|
|
* You may distribute under the terms of either the GNU General Public
|
|
* License or the Artistic License, as specified in the README file.
|
|
*
|
|
*/
|
|
|
|
#include <assert.h>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#include <ctype.h>
|
|
#include "e_regex.h"
|
|
|
|
//#define DEBUG
|
|
|
|
static int RegCount = 0;
|
|
|
|
#ifdef DEBUG
|
|
static void RxDump(int N, RxNode *n);
|
|
#endif
|
|
|
|
static RxNode *NewNode(int aWhat) {
|
|
RxNode *N = (RxNode *) malloc(sizeof(RxNode));
|
|
|
|
if (N) {
|
|
memset(N, 0, sizeof(RxNode));
|
|
N->fWhat = (short)aWhat;
|
|
}
|
|
return N;
|
|
}
|
|
|
|
static RxNode *NewChar(char Ch) {
|
|
RxNode *A = NewNode(RE_CHAR);
|
|
|
|
if (A) {
|
|
A->fChar = (char *)malloc(1);
|
|
A->fLen = 1;
|
|
A->fChar[0] = Ch;
|
|
}
|
|
return A;
|
|
}
|
|
|
|
static RxNode *NewEscape(const char **const Regexp) {
|
|
char Ch = **Regexp;
|
|
++*Regexp;
|
|
switch (Ch) {
|
|
case 0: return 0;
|
|
case 'a': Ch = '\a'; break;
|
|
case 'b': Ch = '\b'; break;
|
|
case 'f': Ch = '\f'; break;
|
|
case 'n': Ch = '\n'; break;
|
|
case 'r': Ch = '\r'; break;
|
|
case 't': Ch = '\t'; break;
|
|
case 'v': Ch = '\v'; break;
|
|
case 'e': Ch = 27; break;
|
|
case 's': return NewNode(RE_WSPACE);
|
|
case 'S': return NewNode(RE_NWSPACE);
|
|
case 'U': return NewNode(RE_UPPER);
|
|
case 'L': return NewNode(RE_LOWER);
|
|
case 'w': return NewNode(RE_WORD);
|
|
case 'W': return NewNode(RE_NWORD);
|
|
case 'd': return NewNode(RE_DIGIT);
|
|
case 'D': return NewNode(RE_NDIGIT);
|
|
case 'C': return NewNode(RE_CASE);
|
|
case 'c': return NewNode(RE_NCASE);
|
|
case 'N':
|
|
{
|
|
unsigned int N = 0;
|
|
unsigned int A = 0;
|
|
if (**Regexp == 0) return 0;
|
|
N = toupper(**Regexp) - 48; if (N > 9) return 0;
|
|
(*Regexp)++;
|
|
A = N * 100;
|
|
if (**Regexp == 0) return 0;
|
|
N = toupper(**Regexp) - 48; if (N > 9) return 0;
|
|
(*Regexp)++;
|
|
A = A + N * 10;
|
|
if (**Regexp == 0) return 0;
|
|
N = toupper(**Regexp) - 48; if (N > 9) return 0;
|
|
(*Regexp)++;
|
|
A = A + N;
|
|
Ch = (char)A;
|
|
}
|
|
break;
|
|
case 'o':
|
|
{
|
|
unsigned int N = 0;
|
|
unsigned int A = 0;
|
|
if (**Regexp == 0) return 0;
|
|
N = toupper(**Regexp) - 48; if (N > 7) return 0;
|
|
(*Regexp)++;
|
|
A = N * 64;
|
|
if (**Regexp == 0) return 0;
|
|
N = toupper(**Regexp) - 48; if (N > 7) return 0;
|
|
(*Regexp)++;
|
|
A = A + N * 8;
|
|
if (**Regexp == 0) return 0;
|
|
N = toupper(**Regexp) - 48; if (N > 7) return 0;
|
|
(*Regexp)++;
|
|
A = A + N;
|
|
Ch = (char)A;
|
|
}
|
|
break;
|
|
case 'x':
|
|
{
|
|
unsigned int N = 0;
|
|
unsigned int A = 0;
|
|
if (**Regexp == 0) return 0;
|
|
N = toupper(**Regexp) - 48; if (N > 9) N = N + 48 - 65 + 10; if (N > 15) return 0;
|
|
(*Regexp)++;
|
|
A = N << 4;
|
|
if (**Regexp == 0) return 0;
|
|
N = toupper(**Regexp) - 48; if (N > 9) N = N + 48 - 65 + 10; if (N > 15) return 0;
|
|
(*Regexp)++;
|
|
A = A + N;
|
|
Ch = (char)A;
|
|
}
|
|
break;
|
|
}
|
|
return NewChar(Ch);
|
|
}
|
|
|
|
|
|
#define NNN 32 // 8 * 32 = 256 (match set)
|
|
|
|
#define SETOP(set,n) \
|
|
do { \
|
|
set[(unsigned char)(n) >> 3] |= (unsigned char)(1 << ((unsigned char)(n) & 7)); \
|
|
} while (0)
|
|
|
|
static RxNode *NewSet(const char ** const Regexp) {
|
|
unsigned char set[NNN];
|
|
int s = 0;
|
|
int c = 0;
|
|
unsigned int i, xx;
|
|
unsigned char Ch, C1 = 0, C2 = 0;
|
|
int doset = 0;
|
|
|
|
memset(set, 0, sizeof(set));
|
|
s = 1;
|
|
if (**Regexp == '^') {
|
|
s = 0;
|
|
++*Regexp;
|
|
}
|
|
c = 0;
|
|
|
|
while (**Regexp) {
|
|
switch (Ch = *((*Regexp)++)) {
|
|
case ']':
|
|
if (doset == 1) return 0;
|
|
{
|
|
RxNode *N = NewNode(s?RE_INSET:RE_NOTINSET);
|
|
N->fChar = (char *) malloc(sizeof(set));
|
|
N->fLen = sizeof(set);
|
|
if (N->fChar == 0) return 0;
|
|
memcpy(N->fChar, (char *) set, sizeof(set));
|
|
return N;
|
|
}
|
|
case '\\':
|
|
switch (Ch = *((*Regexp)++)) {
|
|
case 0: return 0;
|
|
case 'a': Ch = '\a'; break;
|
|
case 'b': Ch = '\b'; break;
|
|
case 'f': Ch = '\f'; break;
|
|
case 'n': Ch = '\n'; break;
|
|
case 'r': Ch = '\r'; break;
|
|
case 't': Ch = '\t'; break;
|
|
case 'v': Ch = '\v'; break;
|
|
case 'e': Ch = 27; break;
|
|
case 'N':
|
|
{
|
|
unsigned int N = 0;
|
|
unsigned int A = 0;
|
|
if (**Regexp == 0) return 0;
|
|
N = toupper(**Regexp) - 48; if (N > 9) return 0;
|
|
(*Regexp)++;
|
|
A = N * 100;
|
|
if (**Regexp == 0) return 0;
|
|
N = toupper(**Regexp) - 48; if (N > 9) return 0;
|
|
(*Regexp)++;
|
|
A = A + N * 10;
|
|
if (**Regexp == 0) return 0;
|
|
N = toupper(**Regexp) - 48; if (N > 9) return 0;
|
|
(*Regexp)++;
|
|
A = A + N;
|
|
Ch = (unsigned char)A;
|
|
}
|
|
break;
|
|
case 'o':
|
|
{
|
|
unsigned int N = 0;
|
|
unsigned int A = 0;
|
|
if (**Regexp == 0) return 0;
|
|
N = toupper(**Regexp) - 48; if (N > 7) return 0;
|
|
(*Regexp)++;
|
|
A = N * 64;
|
|
if (**Regexp == 0) return 0;
|
|
N = toupper(**Regexp) - 48; if (N > 7) return 0;
|
|
(*Regexp)++;
|
|
A = A + N * 8;
|
|
if (**Regexp == 0) return 0;
|
|
N = toupper(**Regexp) - 48; if (N > 7) return 0;
|
|
(*Regexp)++;
|
|
A = A + N;
|
|
Ch = (unsigned char)A;
|
|
}
|
|
break;
|
|
case 'x':
|
|
{
|
|
unsigned int N = 0;
|
|
unsigned int A = 0;
|
|
if (**Regexp == 0) return 0;
|
|
N = toupper(**Regexp) - 48; if (N > 9) N = N + 48 - 65 + 10; if (N > 15) return 0;
|
|
(*Regexp)++;
|
|
A = N << 4;
|
|
if (**Regexp == 0) return 0;
|
|
N = toupper(**Regexp) - 48; if (N > 9) N = N + 48 - 65 + 10; if (N > 15) return 0;
|
|
(*Regexp)++;
|
|
A = A + N;
|
|
Ch = (unsigned char)A;
|
|
}
|
|
break;
|
|
case 's':
|
|
c += 4;
|
|
SETOP(set, '\n');
|
|
SETOP(set, '\t');
|
|
SETOP(set, ' ');
|
|
SETOP(set, '\r');
|
|
continue;
|
|
case 'S':
|
|
for (xx = 0; xx <= 255; xx++) {
|
|
if (xx != ' ' && xx != '\t' && xx != '\n' && xx != '\r') {
|
|
c++;
|
|
SETOP(set, xx);
|
|
}
|
|
}
|
|
continue;
|
|
case 'w':
|
|
for (xx = 0; xx <= 255; xx++) {
|
|
if (isalnum(xx)) {
|
|
c++;
|
|
SETOP(set, xx);
|
|
}
|
|
}
|
|
break;
|
|
case 'W':
|
|
for (xx = 0; xx <= 255; xx++) {
|
|
if (!isalnum(xx)) {
|
|
c++;
|
|
SETOP(set, xx);
|
|
}
|
|
}
|
|
break;
|
|
case 'd':
|
|
for (xx = 0; xx <= 255; xx++) {
|
|
if (isdigit(xx)) {
|
|
c++;
|
|
SETOP(set, xx);
|
|
}
|
|
}
|
|
break;
|
|
case 'D':
|
|
for (xx = 0; xx <= 255; xx++) {
|
|
if (!isdigit(xx)) {
|
|
c++;
|
|
SETOP(set, xx);
|
|
}
|
|
}
|
|
break;
|
|
case 'U':
|
|
for (xx = 'A'; xx <= 'Z'; xx++) {
|
|
c++;
|
|
SETOP(set, xx);
|
|
}
|
|
continue;
|
|
case 'L':
|
|
for (xx = 'a'; xx <= 'z'; xx++) {
|
|
c++;
|
|
SETOP(set, xx);
|
|
}
|
|
continue;
|
|
}
|
|
break;
|
|
}
|
|
if (doset == 0 && ((**Regexp) == '-')) {
|
|
doset = 1;
|
|
C1 = Ch;
|
|
++*Regexp;
|
|
continue;
|
|
} else if (doset == 1) {
|
|
C2 = Ch;
|
|
if (C2 < C1) return 0;
|
|
for(i = C1; i <= C2; i++) SETOP(set, i);
|
|
doset = 0;
|
|
continue;
|
|
}
|
|
c++;
|
|
SETOP(set, Ch);
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
static int AddNode(RxNode **F, RxNode **N, RxNode *A) {
|
|
if (A) {
|
|
if (*F) {
|
|
(*N)->fNext = A;
|
|
A->fPrev = (*N);
|
|
*N = A;
|
|
} else {
|
|
(*N) = (*F) = A;
|
|
A->fPrev = A->fNext = 0;
|
|
}
|
|
return 1;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
static int CountWidth(RxNode *N) {
|
|
int w = 0;
|
|
|
|
while (N) {
|
|
if (N->fWhat < 32) w += 0;
|
|
else if (N->fWhat >= 32 && N->fWhat < 64)
|
|
w += 1;
|
|
N = N->fNext;
|
|
}
|
|
return w;
|
|
}
|
|
|
|
static int MakeSub(RxNode **F, RxNode **N, char What) {
|
|
//printf("MakeSub: %c\n", What);
|
|
if (*N) {
|
|
RxNode *No;
|
|
RxNode *New;
|
|
RxNode *Jump, *Skip;
|
|
RxNode *Last = (*N);
|
|
|
|
if (Last->fWhat & RE_GROUP) {
|
|
RxNode *P = Last->fPrev;
|
|
int C = 1;
|
|
|
|
while ((C > 0) && P) {
|
|
//puts("backtracking...-----");
|
|
//RxDump(0, P);
|
|
if (P->fWhat & RE_GROUP) {
|
|
if (P->fWhat & RE_CLOSE) C++;
|
|
else C--;
|
|
}
|
|
Last = P;
|
|
if (C == 0) break;
|
|
P = P->fPrev;
|
|
}
|
|
//printf("P = %s, c = %d", P ? "ok":"null", C);
|
|
if (C != 0) return 0;
|
|
}
|
|
assert(Last);
|
|
if (What != '?' && What != '|')
|
|
if (CountWidth(Last) == 0) {
|
|
// puts("FAILED count");
|
|
return 0;
|
|
}
|
|
switch (What) {
|
|
case '?': /* BRANCH x NOTHING */
|
|
New = NewNode(RE_BRANCH | RE_GREEDY | What);
|
|
No = NewNode(RE_NOTHING);
|
|
if (!New || !No) return 0;
|
|
No->fPrev = *N;
|
|
if (*N)
|
|
(*N)->fNext = No;
|
|
New->fNext = Last;
|
|
New->fPrev = Last->fPrev;
|
|
Last->fPrev = New;
|
|
if (New->fPrev) {
|
|
New->fPrev->fNext = New;
|
|
} else {
|
|
*F = New;
|
|
}
|
|
New->fPtr = No;
|
|
No->fPtr = New;
|
|
*N = No;
|
|
//puts("BRANCH ?");
|
|
break;
|
|
|
|
case '*':
|
|
case '@':
|
|
New = NewNode(RE_BRANCH | What | ((What == '*') ? RE_GREEDY : 0));
|
|
Jump = NewNode(RE_JUMP);
|
|
No = NewNode(RE_NOTHING);
|
|
|
|
if (!New || !No || !Jump) return 0;
|
|
No->fPrev = Jump;
|
|
Jump->fNext = No;
|
|
Jump->fPrev = *N;
|
|
if (*N)
|
|
(*N)->fNext = Jump;
|
|
New->fNext = Last;
|
|
New->fPrev = Last->fPrev;
|
|
Last->fPrev = New;
|
|
if (New->fPrev) {
|
|
New->fPrev->fNext = New;
|
|
} else {
|
|
*F = New;
|
|
}
|
|
New->fPtr = No;
|
|
No->fPtr = New;
|
|
Jump->fPtr = New;
|
|
*N = No;
|
|
//puts("BRANCH *");
|
|
break;
|
|
|
|
case '#':
|
|
case '+':
|
|
New = NewNode(RE_BRANCH | What | ((What == '+') ? RE_GREEDY : 0));
|
|
Skip = NewNode(RE_JUMP);
|
|
Jump = NewNode(RE_JUMP);
|
|
No = NewNode(RE_NOTHING);
|
|
|
|
if (!New || !No || !Jump) return 0;
|
|
No->fPrev = Jump;
|
|
Jump->fPrev = *N;
|
|
Jump->fNext = No;
|
|
|
|
Skip->fNext = New;
|
|
New->fPrev = Skip;
|
|
if (*N)
|
|
(*N)->fNext = Jump;
|
|
New->fNext = Last;
|
|
Skip->fPrev = Last->fPrev;
|
|
Last->fPrev = New;
|
|
if (Skip->fPrev) {
|
|
Skip->fPrev->fNext = Skip;
|
|
} else {
|
|
*F = Skip;
|
|
}
|
|
New->fPtr = No;
|
|
No->fPtr = New;
|
|
Jump->fPtr = New;
|
|
Skip->fPtr = Last;
|
|
*N = No;
|
|
//puts("BRANCH +");
|
|
break;
|
|
case '|':
|
|
New = NewNode(RE_BRANCH | RE_GREEDY | What);
|
|
Jump = NewNode(RE_BREAK);
|
|
No = NewNode(RE_NOTHING);
|
|
|
|
if (!New || !No || !Jump) return 0;
|
|
No->fPrev = Jump;
|
|
Jump->fNext = No;
|
|
Jump->fPrev = *N;
|
|
if (*N)
|
|
(*N)->fNext = Jump;
|
|
New->fNext = Last;
|
|
New->fPrev = Last->fPrev;
|
|
Last->fPrev = New;
|
|
if (New->fPrev) {
|
|
New->fPrev->fNext = New;
|
|
} else {
|
|
*F = New;
|
|
}
|
|
New->fPtr = No;
|
|
No->fPtr = New;
|
|
Jump->fPtr = New;
|
|
*N = No;
|
|
//puts("BRANCH |");
|
|
break;
|
|
}
|
|
return 1;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
#define CHECK(n) do { if ((n) == 0) { return 0;} } while (0)
|
|
|
|
static RxNode *RxComp(const char **const Regexp) {
|
|
RxNode *F = 0;
|
|
RxNode *N = 0;
|
|
int C;
|
|
char Ch;
|
|
|
|
while (**Regexp) {
|
|
// puts(*Regexp);
|
|
switch (Ch = (*(*Regexp)++)) {
|
|
case '?':
|
|
case '*':
|
|
case '+':
|
|
case '@':
|
|
case '#':
|
|
case '|':
|
|
CHECK(MakeSub(&F, &N, Ch));
|
|
break;
|
|
case '}':
|
|
case ')':
|
|
return F;
|
|
case '{':
|
|
CHECK(AddNode(&F, &N, NewNode(RE_GROUP | RE_OPEN)));
|
|
CHECK(AddNode(&F, &N, RxComp(Regexp)));
|
|
while (N->fNext) N = N->fNext;
|
|
CHECK(AddNode(&F, &N, NewNode(RE_GROUP | RE_CLOSE)));
|
|
break;
|
|
case '(':
|
|
C = ++RegCount;
|
|
CHECK(AddNode(&F, &N, NewNode(RE_GROUP | RE_OPEN | RE_MEM | C)));
|
|
CHECK(AddNode(&F, &N, RxComp(Regexp)));
|
|
while (N->fNext) N = N->fNext;
|
|
CHECK(AddNode(&F, &N, NewNode(RE_GROUP | RE_CLOSE | RE_MEM | C)));
|
|
break;
|
|
case '\\':CHECK(AddNode(&F, &N, NewEscape(Regexp))); break;
|
|
case '[': CHECK(AddNode(&F, &N, NewSet(Regexp))); break;
|
|
case '^': CHECK(AddNode(&F, &N, NewNode(RE_ATBOL))); break;
|
|
case '$': CHECK(AddNode(&F, &N, NewNode(RE_ATEOL))); break;
|
|
case '.': CHECK(AddNode(&F, &N, NewNode(RE_ANY))); break;
|
|
case '<': CHECK(AddNode(&F, &N, NewNode(RE_ATBOW))); break;
|
|
case '>': CHECK(AddNode(&F, &N, NewNode(RE_ATEOW))); break;
|
|
default:
|
|
--*Regexp;
|
|
CHECK(AddNode(&F, &N, NewChar(**Regexp)));
|
|
++*Regexp;
|
|
break;
|
|
}
|
|
}
|
|
return F;
|
|
}
|
|
|
|
RxNode *RxOptimize(RxNode *rx) {
|
|
return rx;
|
|
}
|
|
|
|
RxNode *RxCompile(const char *Regexp) {
|
|
RxNode *n = 0, *x;
|
|
if (Regexp == 0) return 0;
|
|
RegCount = 0;
|
|
n = RxComp(&Regexp);
|
|
if (n == 0) return 0;
|
|
n = RxOptimize(n);
|
|
x = n;
|
|
while (x->fNext) x = x->fNext;
|
|
x->fNext = NewNode(RE_END);
|
|
return n;
|
|
}
|
|
|
|
void RxFree(RxNode *n) {
|
|
RxNode *p;
|
|
|
|
while (n) {
|
|
p = n;
|
|
n = n->fNext;
|
|
switch (p->fWhat) {
|
|
case RE_INSET:
|
|
case RE_NOTINSET:
|
|
case RE_CHAR:
|
|
free(p->fChar);
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
free(p);
|
|
}
|
|
}
|
|
|
|
#define ChClass(x) (((((x) >= 'A') && ((x) <= 'Z')) || (((x) >= 'a') && ((x) <= 'z')) || (((x) >= '0') && ((x) <= '9')))?1:0)
|
|
|
|
static RxMatchRes *match;
|
|
static const char *bop;
|
|
static const char *eop;
|
|
static int flags = RX_CASE;
|
|
static const char *rex;
|
|
|
|
int RxMatch(RxNode *rx) {
|
|
RxNode *n = rx;
|
|
|
|
//printf(">>");
|
|
while (n) {
|
|
//printf("%-50.50s\n", rex);
|
|
//RxDump(1, n);
|
|
switch (n->fWhat) {
|
|
case RE_NOTHING:
|
|
break;
|
|
case RE_CASE:
|
|
flags |= RX_CASE;
|
|
break;
|
|
case RE_NCASE:
|
|
flags &= ~RX_CASE;
|
|
break;
|
|
case RE_ATBOL:
|
|
if (rex != bop) return 0;
|
|
break;
|
|
case RE_ATEOL:
|
|
if (rex != eop) return 0;
|
|
break;
|
|
case RE_ANY:
|
|
if (rex == eop) return 0;
|
|
rex++;
|
|
break;
|
|
case RE_WSPACE:
|
|
if (rex == eop) return 0;
|
|
if (*rex != ' ' && *rex != '\n' && *rex != '\r' && *rex != '\t') return 0;
|
|
rex++;
|
|
break;
|
|
case RE_NWSPACE:
|
|
if (rex == eop) return 0;
|
|
if (*rex == ' ' || *rex == '\n' || *rex == '\r' || *rex == '\t') return 0;
|
|
rex++;
|
|
break;
|
|
case RE_WORD:
|
|
if (rex == eop) return 0;
|
|
if (!isalnum(*rex)) return 0;
|
|
rex++;
|
|
break;
|
|
case RE_NWORD:
|
|
if (rex == eop) return 0;
|
|
if (isalnum(*rex)) return 0;
|
|
rex++;
|
|
break;
|
|
case RE_DIGIT:
|
|
if (rex == eop) return 0;
|
|
if (!isdigit(*rex)) return 0;
|
|
rex++;
|
|
break;
|
|
case RE_NDIGIT:
|
|
if (rex == eop) return 0;
|
|
if (isdigit(*rex)) return 0;
|
|
rex++;
|
|
break;
|
|
case RE_UPPER:
|
|
if (rex == eop) return 0;
|
|
if (!isupper(*rex)) return 0;
|
|
rex++;
|
|
break;
|
|
case RE_LOWER:
|
|
if (rex == eop) return 0;
|
|
if (!islower(*rex)) return 0;
|
|
rex++;
|
|
break;
|
|
case RE_ATBOW:
|
|
if (rex >= eop) return 0;
|
|
if (rex > bop) {
|
|
if ((ChClass(*rex) != 1) || (ChClass(*(rex-1)) != 0)) return 0;
|
|
}
|
|
break;
|
|
case RE_ATEOW:
|
|
if (rex <= bop) return 0;
|
|
if (rex < eop) {
|
|
if ((ChClass(*rex) != 0) || (ChClass(*(rex-1)) != 1)) return 0;
|
|
}
|
|
break;
|
|
case RE_CHAR:
|
|
if (rex == eop) return 0;
|
|
if (flags & RX_CASE) {
|
|
if (*n->fChar != *rex) return 0;
|
|
if (memcmp(rex, n->fChar, n->fLen) != 0) return 0;
|
|
} else {
|
|
for (int i = 0; i < n->fLen; i++)
|
|
if (toupper(rex[i]) != toupper(n->fChar[i]))
|
|
return 0;
|
|
}
|
|
rex += n->fLen;
|
|
break;
|
|
case RE_INSET:
|
|
if (rex == eop) return 0;
|
|
if ((n->fChar[(unsigned char)(*rex) >> 3] & (1 << ((unsigned char)(*rex) & 7))) == 0) return 0;
|
|
rex++;
|
|
break;
|
|
case RE_NOTINSET:
|
|
if (rex == eop) return 0;
|
|
if (n->fChar[(unsigned char)(*rex) >> 3] & (1 << ((unsigned char)(*rex) & 7))) return 0;
|
|
rex++;
|
|
break;
|
|
case RE_JUMP:
|
|
n = n->fPtr;
|
|
continue;
|
|
case RE_END:
|
|
return 1;
|
|
case RE_BREAK:
|
|
n = n->fNext;
|
|
if (n->fNext == 0) break;
|
|
n = n->fNext;
|
|
if (n->fWhat & RE_BRANCH) {
|
|
while ((n->fWhat & RE_BRANCH) && n->fPtr && ((n->fWhat & 0xFF) == '|'))
|
|
n = n->fPtr->fNext;
|
|
}
|
|
if (n->fWhat & RE_GROUP) {
|
|
int C = 1;
|
|
n = n->fNext;
|
|
while ((C > 0) && n) {
|
|
if (n->fWhat & RE_GROUP) {
|
|
if (n->fWhat & RE_OPEN) C++;
|
|
else C--;
|
|
}
|
|
if (C == 0) break;
|
|
n = n->fNext;
|
|
}
|
|
}
|
|
break;
|
|
default:
|
|
if (n->fWhat & RE_GROUP) {
|
|
if (n->fWhat & RE_MEM) {
|
|
const char *save = rex;
|
|
int b = n->fWhat & 0xFF;
|
|
int fl = flags;
|
|
|
|
if (RxMatch(n->fNext) == 0) {
|
|
flags = fl;
|
|
if (n->fWhat & RE_OPEN)
|
|
match->Open[b] = -1;
|
|
else
|
|
match->Close[b] = -1;
|
|
return 0;
|
|
}
|
|
|
|
if (n->fWhat & RE_OPEN) {
|
|
// if (match->Open[b] == -1)
|
|
match->Open[b] = (int) (save - bop);
|
|
} else {
|
|
// if (match->Close[b] == -1)
|
|
match->Close[b] = (int) (save - bop);
|
|
}
|
|
return 1;
|
|
}
|
|
} else if (n->fWhat & RE_BRANCH) {
|
|
const char *save = rex;
|
|
int fl = flags;
|
|
|
|
if ((n->fWhat & RE_GREEDY) == 0) {
|
|
if (RxMatch(n->fPtr) == 1) return 1;
|
|
flags = fl;
|
|
rex = save;
|
|
} else {
|
|
if (RxMatch(n->fNext) == 1) return 1;
|
|
flags = fl;
|
|
rex = save;
|
|
n = n->fPtr;
|
|
continue;
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
n = n->fNext;
|
|
}
|
|
/* NOTREACHED */
|
|
assert(1 == 0 /* internal regexp error */);
|
|
return 0;
|
|
}
|
|
|
|
int RxTry(RxNode *rx, const char *s) {
|
|
int fl = flags;
|
|
rex = s;
|
|
for (int i = 0; i < NSEXPS; i++)
|
|
match->Open[i] = match->Close[i] = -1;
|
|
if (RxMatch(rx)) {
|
|
match->Open[0] = (int) (s - bop);
|
|
match->Close[0] = (int) (rex - bop);
|
|
return 1;
|
|
}
|
|
flags = fl;
|
|
return 0;
|
|
}
|
|
|
|
int RxExec(RxNode *Regexp, const char *Data, int Len, const char *Start, RxMatchRes *Match, unsigned int RxOpt) {
|
|
char Ch;
|
|
if (Regexp == 0) return 0;
|
|
|
|
match = Match;
|
|
bop = Data;
|
|
eop = Data + Len;
|
|
|
|
flags = RxOpt;
|
|
|
|
for (int i = 0; i < NSEXPS; i++) Match->Open[i] = Match->Close[i] = -1;
|
|
|
|
switch (Regexp->fWhat) { // this should be more clever
|
|
case RE_ATBOL: // match is anchored
|
|
return RxTry(Regexp, Start);
|
|
case RE_CHAR: // search for a character to match
|
|
Ch = Regexp->fChar[0];
|
|
if (Start == eop)
|
|
break;
|
|
if (flags & RX_CASE) {
|
|
while (1) {
|
|
while (Start < eop && *Start != Ch)
|
|
Start++;
|
|
if (Start == eop)
|
|
break;
|
|
if (RxTry(Regexp, Start))
|
|
return 1;
|
|
if (++Start == eop)
|
|
break;
|
|
}
|
|
} else {
|
|
Ch = toupper(Ch);
|
|
while (1) {
|
|
while (Start < eop && toupper(*Start) != Ch)
|
|
Start++;
|
|
if (Start == eop)
|
|
break;
|
|
if (RxTry(Regexp, Start))
|
|
return 1;
|
|
if (++Start == eop)
|
|
break;
|
|
}
|
|
}
|
|
break;
|
|
default: // (slow)
|
|
do {
|
|
if (RxTry(Regexp, Start)) return 1;
|
|
} while (Start++ < eop);
|
|
break;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
#define FLAG_UP_CASE 1
|
|
#define FLAG_DOWN_CASE 2
|
|
#define FLAG_UP_NEXT 4
|
|
#define FLAG_DOWN_NEXT 8
|
|
|
|
static int add(int *len, char **s, const char *a, int alen, int &flag) {
|
|
int NewLen = *len + alen;
|
|
int i;
|
|
|
|
NewLen = NewLen * 2;
|
|
|
|
if (alen == 0)
|
|
return 0;
|
|
|
|
if (*s) {
|
|
*s = (char *) realloc(*s, NewLen);
|
|
assert(*s);
|
|
memcpy(*s + *len, a, alen);
|
|
} else {
|
|
*s = (char *) malloc(NewLen);
|
|
assert(*s);
|
|
memcpy(*s, a, alen);
|
|
*len = 0;
|
|
}
|
|
if (flag & FLAG_UP_CASE) {
|
|
char *p = *s + *len;
|
|
|
|
for (i = 0; i < alen; i++) {
|
|
*p = (char)toupper(*p);
|
|
p++;
|
|
}
|
|
} else if (flag & FLAG_DOWN_CASE) {
|
|
char *p = *s + *len;
|
|
|
|
for (i = 0; i < alen; i++) {
|
|
*p = (char)tolower(*p);
|
|
p++;
|
|
}
|
|
}
|
|
if (flag & FLAG_UP_NEXT) {
|
|
char *p = *s + *len;
|
|
|
|
*p = (char)toupper(*p);
|
|
flag &= ~FLAG_UP_NEXT;
|
|
} else if (flag & FLAG_DOWN_NEXT) {
|
|
char *p = *s + *len;
|
|
|
|
*p = (char)tolower(*p);
|
|
flag &= ~FLAG_DOWN_NEXT;
|
|
}
|
|
*len += alen;
|
|
return 0;
|
|
}
|
|
|
|
int RxReplace(const char *rep, const char *Src, int /*len*/, RxMatchRes match, char **Dest, int *Dlen) {
|
|
int dlen = 0;
|
|
char *dest = 0;
|
|
char Ch;
|
|
int n;
|
|
int flag = 0;
|
|
|
|
*Dest = 0;
|
|
*Dlen = 0;
|
|
// add(&dlen, &dest, Src, match.Open[0]);
|
|
while (*rep) {
|
|
switch (Ch = *rep++) {
|
|
// case '&':
|
|
// add(&dlen, &dest, Src + match.Open[0], match.Close[0] - match.Open[0], flag);
|
|
// break;
|
|
case '\\':
|
|
switch (Ch = *rep++) {
|
|
case '0':
|
|
case '1': case '2': case '3':
|
|
case '4': case '5': case '6':
|
|
case '7': case '8': case '9':
|
|
n = Ch - 48;
|
|
|
|
if (match.Open[n] != -1 && match.Close[n] != -1) {
|
|
add(&dlen, &dest, Src + match.Open[n], match.Close[n] - match.Open[n], flag);
|
|
} else return -1;
|
|
break;
|
|
case 0:
|
|
if (dest) free(dest);
|
|
return -1; // error
|
|
case 'r': Ch = '\r'; add(&dlen, &dest, &Ch, 1, flag); break;
|
|
case 'n': Ch = '\n'; add(&dlen, &dest, &Ch, 1, flag); break;
|
|
case 'b': Ch = '\b'; add(&dlen, &dest, &Ch, 1, flag); break;
|
|
case 'a': Ch = '\a'; add(&dlen, &dest, &Ch, 1, flag); break;
|
|
case 't': Ch = '\t'; add(&dlen, &dest, &Ch, 1, flag); break;
|
|
case 'U': flag |= FLAG_UP_CASE; break;
|
|
case 'u': flag |= FLAG_UP_NEXT; break;
|
|
case 'L': flag |= FLAG_DOWN_CASE; break;
|
|
case 'l': flag |= FLAG_DOWN_NEXT; break;
|
|
case 'E':
|
|
case 'e': flag &= ~(FLAG_UP_CASE | FLAG_DOWN_CASE); break;
|
|
case 'x':
|
|
{
|
|
int N = 0;
|
|
int A = 0;
|
|
|
|
if (*rep == 0) { free(dest); return 0; }
|
|
N = toupper(*rep) - 48; if (N > 9) N = N + 48 - 65 + 10; if (N > 15) return 0;
|
|
rep++;
|
|
A = N << 4;
|
|
if (*rep == 0) { free(dest); return 0; }
|
|
N = toupper(*rep) - 48; if (N > 9) N = N + 48 - 65 + 10; if (N > 15) return 0;
|
|
rep++;
|
|
A = A + N;
|
|
Ch = (char)A;
|
|
}
|
|
add(&dlen, &dest, &Ch, 1, flag);
|
|
break;
|
|
case 'd':
|
|
{
|
|
int N = 0;
|
|
int A = 0;
|
|
|
|
if (*rep == 0) { free(dest); return 0; }
|
|
N = toupper(*rep) - 48; if (N > 9) { free(dest); return 0; }
|
|
rep++;
|
|
A = N * 100;
|
|
if (*rep == 0) { free(dest); return 0; }
|
|
N = toupper(*rep) - 48; if (N > 9) { free(dest); return 0; }
|
|
rep++;
|
|
A = N * 10;
|
|
if (*rep == 0) { free(dest); return 0; }
|
|
N = toupper(*rep) - 48; if (N > 9) { free(dest); return 0; }
|
|
rep++;
|
|
A = A + N;
|
|
Ch = (char)A;
|
|
}
|
|
add(&dlen, &dest, &Ch, 1, flag);
|
|
break;
|
|
case 'o':
|
|
{
|
|
int N = 0;
|
|
int A = 0;
|
|
|
|
if (*rep == 0) { free(dest); return 0; }
|
|
N = toupper(*rep) - 48; if (N > 7) { free(dest); return 0; }
|
|
rep++;
|
|
A = N * 64;
|
|
if (*rep == 0) { free(dest); return 0; }
|
|
N = toupper(*rep) - 48; if (N > 7) { free(dest); return 0; }
|
|
rep++;
|
|
A = N * 8;
|
|
if (*rep == 0) { free(dest); return 0; }
|
|
N = toupper(*rep) - 48; if (N > 7) { free(dest); return 0; }
|
|
rep++;
|
|
A = A + N;
|
|
Ch = (char)A;
|
|
}
|
|
add(&dlen, &dest, &Ch, 1, flag);
|
|
break;
|
|
default:
|
|
add(&dlen, &dest, &Ch, 1, flag);
|
|
break;
|
|
}
|
|
break;
|
|
default:
|
|
add(&dlen, &dest, &Ch, 1, flag);
|
|
break;
|
|
}
|
|
}
|
|
// add(&dlen, &dest, Src + match.Close[0], len - match.Close[0]);
|
|
*Dlen = dlen;
|
|
*Dest = dest;
|
|
return 0;
|
|
}
|
|
|
|
#ifdef DEBUG
|
|
|
|
static void RxDump(int N, RxNode *n) {
|
|
while (n) {
|
|
for (int i = 0; i < N; i++) printf(" ");
|
|
switch (n->fWhat) {
|
|
case RE_NOTHING: printf("NOTHING\n"); break;
|
|
case RE_CHAR: printf("CHAR '%.1s'\n", n->fChar); break;
|
|
case RE_ATBOL: printf("^\n"); break;
|
|
case RE_ATEOL: printf("$\n"); break;
|
|
case RE_ANY: printf(".\n"); break;
|
|
case RE_INSET: printf("[\n"/*, n->fChar*/); break;
|
|
case RE_NOTINSET: printf("[^\n"/*, n->fChar*/); break;
|
|
case RE_ATBOW: printf("<\n"); break;
|
|
case RE_ATEOW: printf(">\n"); break;
|
|
case RE_WSPACE: printf("WSPACE\n"); break;
|
|
case RE_NWSPACE: printf("NWSPACE\n"); break;
|
|
case RE_UPPER: printf("UPPER\n"); break;
|
|
case RE_LOWER: printf("LOWER\n"); break;
|
|
case RE_JUMP: printf("JUMP\n"); break;
|
|
case RE_BREAK: printf("BREAK\n"); break;
|
|
case RE_END: printf("END\n"); break;
|
|
default:
|
|
if (n->fWhat & RE_GROUP) {
|
|
if (n->fWhat & RE_MEM) {
|
|
if (n->fWhat & RE_OPEN) printf("( %d\n", n->fWhat & 0xFF);
|
|
if (n->fWhat & RE_CLOSE) printf(") %d\n", n->fWhat & 0xFF);
|
|
} else {
|
|
if (n->fWhat & RE_OPEN) printf("{\n");
|
|
if (n->fWhat & RE_CLOSE) printf("}\n");
|
|
}
|
|
} else if (n->fWhat & RE_BRANCH) {
|
|
if (n->fWhat & RE_GREEDY) {
|
|
printf("%c\n", n->fWhat & 0xFF);
|
|
} else {
|
|
printf("%c\n", n->fWhat & 0xFF);
|
|
}
|
|
} else {
|
|
printf("???????????????\n");
|
|
}
|
|
break;
|
|
}
|
|
n = n->fNext;
|
|
}
|
|
}
|
|
|
|
#define TEST(rc,rx,st) \
|
|
strcpy(line,st); \
|
|
assert((a = RxCompile(rx)) != 0); \
|
|
puts("\n--- " rx " -- " st " -- "); \
|
|
RxDump(0,a);\
|
|
assert(rc == RxExec(a, line, strlen(line), line, &b)); \
|
|
RxFree(a);
|
|
|
|
int main() {
|
|
RxNode *a;
|
|
RxMatchRes b;
|
|
char line[1024];
|
|
|
|
TEST(1, "a", "a");
|
|
TEST(0, "b", "a");
|
|
TEST(1, "aaaa", "aaaa");
|
|
TEST(0, "bbbb", "aaaa");
|
|
TEST(1, ".", "a");
|
|
TEST(0, ".", "");
|
|
TEST(1, "a..", "axx");
|
|
TEST(0, "a..", "b..");
|
|
TEST(1, "a?b", "ab");
|
|
TEST(1, "a?b", "xb");
|
|
TEST(0, "a?C", "xb");
|
|
TEST(1, "{aa}?b", "aab");
|
|
TEST(1, "{aa}?b", "xab");
|
|
TEST(0, "{aa}?C", "xxb");
|
|
TEST(1, "^aa", "aa");
|
|
TEST(0, "^aa", "baa");
|
|
TEST(1, "^aa$" ,"aa");
|
|
TEST(0, "^aa$", "baab");
|
|
TEST(1, "a*b", "aaab");
|
|
TEST(0, "a*b", "aaaa");
|
|
TEST(1, "{aa}*b", "aaab");
|
|
TEST(0, "{aa}*b", "aaaa");
|
|
TEST(1, "b+", "bb");
|
|
TEST(1, "b+", "b");
|
|
TEST(0, "b+", "a");
|
|
TEST(1, "^b+$", "b");
|
|
TEST(0, "^b+$", "aba");
|
|
TEST(1, "a|b", " a");
|
|
TEST(1, "a|b", " b");
|
|
TEST(0, "a|b", " c");
|
|
TEST(1, "a|b|c|d|e", " a ");
|
|
TEST(1, "a|b|c|d|e", " c ");
|
|
TEST(1, "a|b|c|d|e", " e ");
|
|
TEST(0, "a|b|c|d|e", " x ");
|
|
TEST(1, "{a}|{b}|{c}|{d}|{e}", " a ");
|
|
TEST(1, "{a}|{b}|{c}|{d}|{e}", " c ");
|
|
TEST(1, "{a}|{b}|{c}|{d}|{e}", " e ");
|
|
TEST(0, "{a}|{b}|{c}|{d}|{e}", " x ");
|
|
TEST(1, "^xx{alpha}|{beta}xx$", "xxalphaxx");
|
|
TEST(1, "^xx{alpha}|{beta}xx$", "xxbetaxx");
|
|
TEST(1, "[a-z]", "aaa");
|
|
TEST(1, "^{Error}|{Warning}", "Warning search.cpp 35: Conversion may lose significant digits in function AskReplace()");
|
|
TEST(1, "^{Error}|{Warning} (.+)", "Warning search.cpp 35: Conversion may lose significant digits in function AskReplace()");
|
|
TEST(1, "^{Error}|{Warning} ([a-z.]#) ([0-9]#)", "Warning search.cpp 35: Conversion may lose significant digits in function AskReplace()");
|
|
TEST(1, "^{Error}|{Warning} (.+) ([0-9]+): (.*)$", "Warning search.cpp 35: Conversion may lose significant digits in function AskReplace()");
|
|
TEST(1, "^{Error}|{Warning} (.+) ([0-9]+): (.*)$", "Error search.cpp 35: Conversion may lose significant digits in function AskReplace()");
|
|
TEST(1, "^([a-z]+ +)*\\(", "blabla bla bla bla (");
|
|
TEST(1, "^([a-z]+\\s+)+\\(", "blabla bla bla bla (");
|
|
TEST(1, "^([a-z]+\\s*)+\\(", "blabla bla bla bla(");
|
|
TEST(1, "^([a-z]+\\s+)+\\(", "blabla bla bla bla (");
|
|
TEST(1, "^([a-z]+\\s*)+\\(", "blabla bla bla bla(");
|
|
TEST(1, "^([a-z]# #)*\\(", "blabla bla bla bla (");
|
|
TEST(1, "^([a-z]+ @)@\\(", "blabla bla bla bla (");
|
|
TEST(1, "^[\\x20-\\xFF]+$", "blabla");
|
|
TEST(1, "{a{a{a{a|a}|{a|a}a}a}a|a}", "aaaaaaaaaaaaaaaaa");
|
|
|
|
while (1) {
|
|
printf("Regexp: "); fflush(stdout); gets(line);
|
|
if (!*line) break;
|
|
a = RxCompile(line); RxDump(0, a);
|
|
printf("String: "); fflush(stdout); gets(line);
|
|
printf("rc = %d\n", RxExec(a, line, strlen(line), line, &b));
|
|
for (int i = 0; i < NSEXPS; i++) {
|
|
if (b.Open[i] != -1) {
|
|
printf("%d: %d %d\n", i, b.Open[i], b.Close[i]);
|
|
}
|
|
}
|
|
RxFree(a);
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
#endif
|