struct state *, struct state *);
static struct subre *subre(struct vars *, int, int, struct state *, struct state *);
static void freesubre(struct vars *, struct subre *);
+static void freesubreandsiblings(struct vars *, struct subre *);
static void freesrnode(struct vars *, struct subre *);
static void optst(struct vars *, struct subre *);
static int numst(struct subre *, int);
* parse - parse an RE
*
* This is actually just the top level, which parses a bunch of branches
- * tied together with '|'. They appear in the tree as the left children
- * of a chain of '|' subres.
+ * tied together with '|'. If there's more than one, they appear in the
+ * tree as the children of a '|' subre.
*/
static struct subre *
parse(struct vars *v,
struct state *init, /* initial state */
struct state *final) /* final state */
{
- struct state *left; /* scaffolding for branch */
- struct state *right;
struct subre *branches; /* top level */
- struct subre *branch; /* current branch */
- struct subre *t; /* temporary */
- int firstbranch; /* is this the first branch? */
+ struct subre *lastbranch; /* latest branch */
assert(stopper == ')' || stopper == EOS);
branches = subre(v, '|', LONGER, init, final);
NOERRN();
- branch = branches;
- firstbranch = 1;
+ lastbranch = NULL;
do
{ /* a branch */
- if (!firstbranch)
- {
- /* need a place to hang it */
- branch->right = subre(v, '|', LONGER, init, final);
- NOERRN();
- branch = branch->right;
- }
- firstbranch = 0;
+ struct subre *branch;
+ struct state *left; /* scaffolding for branch */
+ struct state *right;
+
left = newstate(v->nfa);
right = newstate(v->nfa);
NOERRN();
EMPTYARC(init, left);
EMPTYARC(right, final);
NOERRN();
- branch->left = parsebranch(v, stopper, type, left, right, 0);
+ branch = parsebranch(v, stopper, type, left, right, 0);
NOERRN();
- branch->flags |= UP(branch->flags | branch->left->flags);
- if ((branch->flags & ~branches->flags) != 0) /* new flags */
- for (t = branches; t != branch; t = t->right)
- t->flags |= branch->flags;
+ if (lastbranch)
+ lastbranch->sibling = branch;
+ else
+ branches->child = branch;
+ branches->flags |= UP(branches->flags | branch->flags);
+ lastbranch = branch;
} while (EAT('|'));
assert(SEE(stopper) || SEE(EOS));
}
/* optimize out simple cases */
- if (branch == branches)
+ if (lastbranch == branches->child)
{ /* only one branch */
- assert(branch->right == NULL);
- t = branch->left;
- branch->left = NULL;
- freesubre(v, branches);
- branches = t;
+ assert(lastbranch->sibling == NULL);
+ freesrnode(v, branches);
+ branches = lastbranch;
}
else if (!MESSY(branches->flags))
{ /* no interesting innards */
- freesubre(v, branches->left);
- branches->left = NULL;
- freesubre(v, branches->right);
- branches->right = NULL;
+ freesubreandsiblings(v, branches->child);
+ branches->child = NULL;
branches->op = '=';
}
t = subre(v, '(', atom->flags | CAP, lp, rp);
NOERR();
t->subno = subno;
- t->left = atom;
+ t->child = atom;
atom = t;
}
/* postpone everything else pending possible {0} */
/* break remaining subRE into x{...} and what follows */
t = subre(v, '.', COMBINE(qprefer, atom->flags), lp, rp);
NOERR();
- t->left = atom;
- atomp = &t->left;
+ t->child = atom;
+ atomp = &t->child;
/*
- * Here we should recurse to fill t->right ... but we must postpone that
- * to the end.
+ * Here we should recurse to fill t->child->sibling ... but we must
+ * postpone that to the end. One reason is that t->child may be replaced
+ * below, and we don't want to worry about its sibling link.
*/
/*
- * Convert top node to a concatenation of the prefix (top->left, covering
+ * Convert top node to a concatenation of the prefix (top->child, covering
* whatever we parsed previously) and remaining (t). Note that the prefix
* could be empty, in which case this concatenation node is unnecessary.
* To keep things simple, we operate in a general way for now, and get rid
* of unnecessary subres below.
*/
- assert(top->op == '=' && top->left == NULL && top->right == NULL);
- top->left = subre(v, '=', top->flags, top->begin, lp);
+ assert(top->op == '=' && top->child == NULL);
+ top->child = subre(v, '=', top->flags, top->begin, lp);
NOERR();
top->op = '.';
- top->right = t;
+ top->child->sibling = t;
/* top->flags will get updated later */
/* if it's a backref, now is the time to replicate the subNFA */
f = COMBINE(qprefer, atom->flags);
t = subre(v, '.', f, s, atom->end); /* prefix and atom */
NOERR();
- t->left = subre(v, '=', PREF(f), s, atom->begin);
+ t->child = subre(v, '=', PREF(f), s, atom->begin);
NOERR();
- t->right = atom;
+ t->child->sibling = atom;
*atomp = t;
/* rest of branch can be strung starting from atom->end */
s2 = atom->end;
NOERR();
t->min = (short) m;
t->max = (short) n;
- t->left = atom;
+ t->child = atom;
*atomp = t;
/* rest of branch is to be strung from iteration's end state */
}
/* and finally, look after that postponed recursion */
- t = top->right;
+ t = top->child->sibling;
if (!(SEE('|') || SEE(stopper) || SEE(EOS)))
{
- /* parse all the rest of the branch, and insert in t->right */
- t->right = parsebranch(v, stopper, type, s2, rp, 1);
+ /* parse all the rest of the branch, and insert in t->child->sibling */
+ t->child->sibling = parsebranch(v, stopper, type, s2, rp, 1);
NOERR();
assert(SEE('|') || SEE(stopper) || SEE(EOS));
/* here's the promised update of the flags */
- t->flags |= COMBINE(t->flags, t->right->flags);
+ t->flags |= COMBINE(t->flags, t->child->sibling->flags);
top->flags |= COMBINE(top->flags, t->flags);
/*
* At this point both top and t are concatenation (op == '.') subres,
- * and we have top->left = prefix of branch, top->right = t, t->left =
- * messy atom (with quantification superstructure if needed), t->right
- * = rest of branch.
+ * and we have top->child = prefix of branch, top->child->sibling = t,
+ * t->child = messy atom (with quantification superstructure if
+ * needed), t->child->sibling = rest of branch.
*
- * If the messy atom was the first thing in the branch, then top->left
- * is vacuous and we can get rid of one level of concatenation. Since
- * the caller is holding a pointer to the top node, we can't remove
- * that node; but we're allowed to change its properties.
+ * If the messy atom was the first thing in the branch, then
+ * top->child is vacuous and we can get rid of one level of
+ * concatenation. Since the caller is holding a pointer to the top
+ * node, we can't remove that node; but we're allowed to change its
+ * properties.
*/
- assert(top->left->op == '=');
- if (top->left->begin == top->left->end)
+ assert(top->child->op == '=');
+ if (top->child->begin == top->child->end)
{
- assert(!MESSY(top->left->flags));
- freesubre(v, top->left);
- top->left = t->left;
- top->right = t->right;
- t->left = t->right = NULL;
- freesubre(v, t);
+ assert(!MESSY(top->child->flags));
+ freesubre(v, top->child);
+ top->child = t->child;
+ freesrnode(v, t);
}
}
else
* concatenation node 't'. Just link s2 straight to rp.
*/
EMPTYARC(s2, rp);
- top->right = t->left;
- top->flags |= COMBINE(top->flags, top->right->flags);
- t->left = t->right = NULL;
- freesubre(v, t);
+ top->child->sibling = t->child;
+ top->flags |= COMBINE(top->flags, top->child->sibling->flags);
+ freesrnode(v, t);
/*
- * Again, it could be that top->left is vacuous (if the messy atom was
- * in fact the only thing in the branch). In that case we need no
- * concatenation at all; just replace top with top->right.
+ * Again, it could be that top->child is vacuous (if the messy atom
+ * was in fact the only thing in the branch). In that case we need no
+ * concatenation at all; just replace top with top->child->sibling.
*/
- assert(top->left->op == '=');
- if (top->left->begin == top->left->end)
+ assert(top->child->op == '=');
+ if (top->child->begin == top->child->end)
{
- assert(!MESSY(top->left->flags));
- freesubre(v, top->left);
- t = top->right;
+ assert(!MESSY(top->child->flags));
+ t = top->child->sibling;
+ freesubre(v, top->child);
top->op = t->op;
top->flags = t->flags;
top->id = t->id;
top->subno = t->subno;
top->min = t->min;
top->max = t->max;
- top->left = t->left;
- top->right = t->right;
+ top->child = t->child;
top->begin = t->begin;
top->end = t->end;
- t->left = t->right = NULL;
- freesubre(v, t);
+ freesrnode(v, t);
}
}
}
}
if (ret != NULL)
- v->treefree = ret->left;
+ v->treefree = ret->child;
else
{
ret = (struct subre *) MALLOC(sizeof(struct subre));
ret->id = 0; /* will be assigned later */
ret->subno = 0;
ret->min = ret->max = 1;
- ret->left = NULL;
- ret->right = NULL;
+ ret->child = NULL;
+ ret->sibling = NULL;
ret->begin = begin;
ret->end = end;
ZAPCNFA(ret->cnfa);
/*
* freesubre - free a subRE subtree
+ *
+ * This frees child node(s) of the given subRE too,
+ * but not its siblings.
*/
static void
freesubre(struct vars *v, /* might be NULL */
if (sr == NULL)
return;
- if (sr->left != NULL)
- freesubre(v, sr->left);
- if (sr->right != NULL)
- freesubre(v, sr->right);
+ if (sr->child != NULL)
+ freesubreandsiblings(v, sr->child);
freesrnode(v, sr);
}
+/*
+ * freesubreandsiblings - free a subRE subtree
+ *
+ * This frees child node(s) of the given subRE too,
+ * as well as any following siblings.
+ */
+static void
+freesubreandsiblings(struct vars *v, /* might be NULL */
+ struct subre *sr)
+{
+ while (sr != NULL)
+ {
+ struct subre *next = sr->sibling;
+
+ freesubre(v, sr);
+ sr = next;
+ }
+}
+
/*
* freesrnode - free one node in a subRE subtree
*/
if (v != NULL && v->treechain != NULL)
{
/* we're still parsing, maybe we can reuse the subre */
- sr->left = v->treefree;
+ sr->child = v->treefree;
v->treefree = sr;
}
else
int start) /* starting point for subtree numbers */
{
int i;
+ struct subre *t2;
assert(t != NULL);
i = start;
t->id = (short) i++;
- if (t->left != NULL)
- i = numst(t->left, i);
- if (t->right != NULL)
- i = numst(t->right, i);
+ for (t2 = t->child; t2 != NULL; t2 = t2->sibling)
+ i = numst(t2, i);
return i;
}
static void
markst(struct subre *t)
{
+ struct subre *t2;
+
assert(t != NULL);
t->flags |= INUSE;
- if (t->left != NULL)
- markst(t->left);
- if (t->right != NULL)
- markst(t->right);
+ for (t2 = t->child; t2 != NULL; t2 = t2->sibling)
+ markst(t2);
}
/*
struct subre *t,
FILE *f) /* for debug output */
{
+ struct subre *t2;
+
assert(t != NULL && t->begin != NULL);
- if (t->left != NULL)
- (DISCARD) nfatree(v, t->left, f);
- if (t->right != NULL)
- (DISCARD) nfatree(v, t->right, f);
+ for (t2 = t->child; t2 != NULL; t2 = t2->sibling)
+ (DISCARD) nfatree(v, t2, f);
return nfanode(v, t, 0, f);
}
int nfapresent) /* is the original NFA still around? */
{
char idbuf[50];
+ struct subre *t2;
fprintf(f, "%s. `%c'", stid(t, idbuf, sizeof(idbuf)), t->op);
if (t->flags & LONGER)
}
if (nfapresent)
fprintf(f, " %ld-%ld", (long) t->begin->no, (long) t->end->no);
- if (t->left != NULL)
- fprintf(f, " L:%s", stid(t->left, idbuf, sizeof(idbuf)));
- if (t->right != NULL)
- fprintf(f, " R:%s", stid(t->right, idbuf, sizeof(idbuf)));
+ if (t->child != NULL)
+ fprintf(f, " C:%s", stid(t->child, idbuf, sizeof(idbuf)));
+ /* printing second child isn't necessary, but it is often helpful */
+ if (t->child != NULL && t->child->sibling != NULL)
+ fprintf(f, " C2:%s", stid(t->child->sibling, idbuf, sizeof(idbuf)));
+ if (t->sibling != NULL)
+ fprintf(f, " S:%s", stid(t->sibling, idbuf, sizeof(idbuf)));
if (!NULLCNFA(t->cnfa))
{
fprintf(f, "\n");
dumpcnfa(&t->cnfa, f);
}
fprintf(f, "\n");
- if (t->left != NULL)
- stdump(t->left, f, nfapresent);
- if (t->right != NULL)
- stdump(t->right, f, nfapresent);
+ for (t2 = t->child; t2 != NULL; t2 = t2->sibling)
+ stdump(t2, f, nfapresent);
}
/*
zaptreesubs(struct vars *v,
struct subre *t)
{
+ struct subre *t2;
+
if (t->op == '(')
{
int n = t->subno;
}
}
- if (t->left != NULL)
- zaptreesubs(v, t->left);
- if (t->right != NULL)
- zaptreesubs(v, t->right);
+ for (t2 = t->child; t2 != NULL; t2 = t2->sibling)
+ zaptreesubs(v, t2);
}
/*
switch (t->op)
{
case '=': /* terminal node */
- assert(t->left == NULL && t->right == NULL);
+ assert(t->child == NULL);
er = REG_OKAY; /* no action, parent did the work */
break;
case 'b': /* back reference */
- assert(t->left == NULL && t->right == NULL);
+ assert(t->child == NULL);
er = cbrdissect(v, t, begin, end);
break;
case '.': /* concatenation */
- assert(t->left != NULL && t->right != NULL);
- if (t->left->flags & SHORTER) /* reverse scan */
+ assert(t->child != NULL);
+ if (t->child->flags & SHORTER) /* reverse scan */
er = crevcondissect(v, t, begin, end);
else
er = ccondissect(v, t, begin, end);
break;
case '|': /* alternation */
- assert(t->left != NULL);
+ assert(t->child != NULL);
er = caltdissect(v, t, begin, end);
break;
case '*': /* iteration */
- assert(t->left != NULL);
- if (t->left->flags & SHORTER) /* reverse scan */
+ assert(t->child != NULL);
+ if (t->child->flags & SHORTER) /* reverse scan */
er = creviterdissect(v, t, begin, end);
else
er = citerdissect(v, t, begin, end);
break;
case '(': /* capturing */
- assert(t->left != NULL && t->right == NULL);
+ assert(t->child != NULL);
assert(t->subno > 0);
- er = cdissect(v, t->left, begin, end);
+ er = cdissect(v, t->child, begin, end);
if (er == REG_OKAY)
subset(v, t, begin, end);
break;
chr *begin, /* beginning of relevant substring */
chr *end) /* end of same */
{
+ struct subre *left = t->child;
+ struct subre *right = left->sibling;
struct dfa *d;
struct dfa *d2;
chr *mid;
int er;
assert(t->op == '.');
- assert(t->left != NULL && t->left->cnfa.nstates > 0);
- assert(t->right != NULL && t->right->cnfa.nstates > 0);
- assert(!(t->left->flags & SHORTER));
+ assert(left != NULL && left->cnfa.nstates > 0);
+ assert(right != NULL && right->cnfa.nstates > 0);
+ assert(right->sibling == NULL);
+ assert(!(left->flags & SHORTER));
- d = getsubdfa(v, t->left);
+ d = getsubdfa(v, left);
NOERR();
- d2 = getsubdfa(v, t->right);
+ d2 = getsubdfa(v, right);
NOERR();
MDEBUG(("%d: ccondissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end)));
/* try this midpoint on for size */
if (longest(v, d2, mid, end, (int *) NULL) == end)
{
- er = cdissect(v, t->left, begin, mid);
+ er = cdissect(v, left, begin, mid);
if (er == REG_OKAY)
{
- er = cdissect(v, t->right, mid, end);
+ er = cdissect(v, right, mid, end);
if (er == REG_OKAY)
{
/* satisfaction */
return REG_NOMATCH;
}
MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid)));
- zaptreesubs(v, t->left);
- zaptreesubs(v, t->right);
+ zaptreesubs(v, left);
+ zaptreesubs(v, right);
}
/* can't get here */
chr *begin, /* beginning of relevant substring */
chr *end) /* end of same */
{
+ struct subre *left = t->child;
+ struct subre *right = left->sibling;
struct dfa *d;
struct dfa *d2;
chr *mid;
int er;
assert(t->op == '.');
- assert(t->left != NULL && t->left->cnfa.nstates > 0);
- assert(t->right != NULL && t->right->cnfa.nstates > 0);
- assert(t->left->flags & SHORTER);
+ assert(left != NULL && left->cnfa.nstates > 0);
+ assert(right != NULL && right->cnfa.nstates > 0);
+ assert(right->sibling == NULL);
+ assert(left->flags & SHORTER);
- d = getsubdfa(v, t->left);
+ d = getsubdfa(v, left);
NOERR();
- d2 = getsubdfa(v, t->right);
+ d2 = getsubdfa(v, right);
NOERR();
MDEBUG(("%d: crevcondissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end)));
/* try this midpoint on for size */
if (longest(v, d2, mid, end, (int *) NULL) == end)
{
- er = cdissect(v, t->left, begin, mid);
+ er = cdissect(v, left, begin, mid);
if (er == REG_OKAY)
{
- er = cdissect(v, t->right, mid, end);
+ er = cdissect(v, right, mid, end);
if (er == REG_OKAY)
{
/* satisfaction */
return REG_NOMATCH;
}
MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid)));
- zaptreesubs(v, t->left);
- zaptreesubs(v, t->right);
+ zaptreesubs(v, left);
+ zaptreesubs(v, right);
}
/* can't get here */
struct dfa *d;
int er;
- /* We loop, rather than tail-recurse, to handle a chain of alternatives */
+ assert(t->op == '|');
+
+ t = t->child;
+ /* there should be at least 2 alternatives */
+ assert(t != NULL && t->sibling != NULL);
+
while (t != NULL)
{
- assert(t->op == '|');
- assert(t->left != NULL && t->left->cnfa.nstates > 0);
+ assert(t->cnfa.nstates > 0);
MDEBUG(("%d: caltdissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end)));
- d = getsubdfa(v, t->left);
+ d = getsubdfa(v, t);
NOERR();
if (longest(v, d, begin, end, (int *) NULL) == end)
{
MDEBUG(("%d: caltdissect matched\n", t->id));
- er = cdissect(v, t->left, begin, end);
+ er = cdissect(v, t, begin, end);
if (er != REG_NOMATCH)
return er;
}
NOERR();
- t = t->right;
+ t = t->sibling;
}
return REG_NOMATCH;
int er;
assert(t->op == '*');
- assert(t->left != NULL && t->left->cnfa.nstates > 0);
- assert(!(t->left->flags & SHORTER));
+ assert(t->child != NULL && t->child->cnfa.nstates > 0);
+ assert(!(t->child->flags & SHORTER));
assert(begin <= end);
MDEBUG(("%d: citerdissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end)));
return REG_ESPACE;
endpts[0] = begin;
- d = getsubdfa(v, t->left);
+ d = getsubdfa(v, t->child);
if (ISERR())
{
FREE(endpts);
for (i = nverified + 1; i <= k; i++)
{
- zaptreesubs(v, t->left);
- er = cdissect(v, t->left, endpts[i - 1], endpts[i]);
+ zaptreesubs(v, t->child);
+ er = cdissect(v, t->child, endpts[i - 1], endpts[i]);
if (er == REG_OKAY)
{
nverified = i;
int er;
assert(t->op == '*');
- assert(t->left != NULL && t->left->cnfa.nstates > 0);
- assert(t->left->flags & SHORTER);
+ assert(t->child != NULL && t->child->cnfa.nstates > 0);
+ assert(t->child->flags & SHORTER);
assert(begin <= end);
MDEBUG(("%d: creviterdissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end)));
return REG_ESPACE;
endpts[0] = begin;
- d = getsubdfa(v, t->left);
+ d = getsubdfa(v, t->child);
if (ISERR())
{
FREE(endpts);
for (i = nverified + 1; i <= k; i++)
{
- zaptreesubs(v, t->left);
- er = cdissect(v, t->left, endpts[i - 1], endpts[i]);
+ zaptreesubs(v, t->child);
+ er = cdissect(v, t->child, endpts[i - 1], endpts[i]);
if (er == REG_OKAY)
{
nverified = i;