if (hitstopp != NULL)
*hitstopp = 0;
+ /* if this is a backref to a known string, just match against that */
+ if (d->backno >= 0)
+ {
+ assert((size_t) d->backno < v->nmatch);
+ if (v->pmatch[d->backno].rm_so >= 0)
+ {
+ cp = dfa_backref(v, d, start, start, stop, false);
+ if (cp == v->stop && stop == v->stop && hitstopp != NULL)
+ *hitstopp = 1;
+ return cp;
+ }
+ }
+
/* fast path for matchall NFAs */
if (d->cnfa->flags & MATCHALL)
{
if (hitstopp != NULL)
*hitstopp = 0;
+ /* if this is a backref to a known string, just match against that */
+ if (d->backno >= 0)
+ {
+ assert((size_t) d->backno < v->nmatch);
+ if (v->pmatch[d->backno].rm_so >= 0)
+ {
+ cp = dfa_backref(v, d, start, min, max, true);
+ if (cp != NULL && coldp != NULL)
+ *coldp = start;
+ /* there is no case where we should set *hitstopp */
+ return cp;
+ }
+ }
+
/* fast path for matchall NFAs */
if (d->cnfa->flags & MATCHALL)
{
return 1;
}
+/*
+ * dfa_backref - find best match length for a known backref string
+ *
+ * When the backref's referent is already available, we can deliver an exact
+ * answer with considerably less work than running the backref node's NFA.
+ *
+ * The referent is the text recorded in v->pmatch[d->backno].  A candidate
+ * match is some whole number of consecutive copies of that text beginning
+ * at "start".  The number of copies must be at least d->backmin and, unless
+ * d->backmax is DUPINF (treated here as "no upper limit"), at most
+ * d->backmax; in addition the match must end at or after "min" and at or
+ * before "max".  If "shortest" is true the smallest acceptable number of
+ * copies is chosen, otherwise the largest.
+ *
+ * v supplies pmatch[], the start-of-string pointer and the chr comparison
+ * function (v->g->compare); d supplies backno, backmin and backmax.
+ *
+ * Return match endpoint for longest or shortest valid repeated match,
+ * or NULL if there is no valid match.  NULL is also returned if the
+ * referent is unset (rm_so == -1).
+ *
+ * NOTE(review): presumably callers guarantee start <= max; the repetition
+ * arithmetic below does not defend against max < start -- confirm.
+ *
+ * Should be in sync with cbrdissect(), although that has the different task
+ * of checking a match to a predetermined section of the string.
+ */
+static chr *
+dfa_backref(struct vars *v,
+ struct dfa *d,
+ chr *start, /* where the match should start */
+ chr *min, /* match must end at or after here */
+ chr *max, /* match must end at or before here */
+ bool shortest) /* true: prefer fewest repetitions; false: most */
+{
+ int n = d->backno; /* index into v->pmatch[] of the referent */
+ int backmin = d->backmin; /* lower bound on repetition count */
+ int backmax = d->backmax; /* upper bound, or DUPINF */
+ size_t numreps; /* copies of the referent matched so far */
+ size_t minreps; /* fewest copies that satisfy min and backmin */
+ size_t maxreps; /* most copies that satisfy max and backmax */
+ size_t brlen; /* length of the referent, in chrs */
+ chr *brstring; /* first chr of the referent */
+ chr *p; /* current end of the matched copies */
+
+ /* get the backreferenced string (caller should have checked this) */
+ if (v->pmatch[n].rm_so == -1)
+ return NULL;
+ brstring = v->start + v->pmatch[n].rm_so; /* offsets are relative to v->start */
+ brlen = v->pmatch[n].rm_eo - v->pmatch[n].rm_so;
+
+ /* special-case zero-length backreference to avoid divide by zero */
+ if (brlen == 0)
+ {
+ /*
+ * matches only a zero-length string, but any number of repetitions
+ * can be considered to be present
+ */
+ if (min == start && backmin <= backmax)
+ return start;
+ return NULL; /* min != start, or the repetition bounds are empty */
+ }
+
+ /*
+ * convert min and max into numbers of possible repetitions of the backref
+ * string, rounding appropriately
+ */
+ if (min <= start)
+ minreps = 0;
+ else
+ minreps = (min - start - 1) / brlen + 1; /* ceiling of (min - start) / brlen */
+ maxreps = (max - start) / brlen; /* floor: only whole copies ending by max */
+
+ /* apply bounds, then see if there is any allowed match length */
+ if (minreps < backmin)
+ minreps = backmin;
+ if (backmax != DUPINF && maxreps > backmax)
+ maxreps = backmax;
+ if (maxreps < minreps)
+ return NULL;
+
+ /* quick exit if zero-repetitions match is valid and preferred */
+ if (shortest && minreps == 0)
+ return start; /* no string comparison is needed for zero copies */
+
+ /* okay, compare the actual string contents */
+ p = start;
+ numreps = 0;
+ while (numreps < maxreps)
+ {
+ if ((*v->g->compare) (brstring, p, brlen) != 0)
+ break; /* nonzero result is treated as a mismatch: stop counting */
+ p += brlen;
+ numreps++;
+ if (shortest && numreps >= minreps)
+ break; /* shortest mode: the first acceptable count wins */
+ }
+
+ if (numreps >= minreps)
+ return p; /* p == start + numreps * brlen */
+ return NULL; /* a mismatch occurred before minreps copies were seen */
+}
+
/*
* lastcold - determine last point at which no progress had been made
*/
d->lastpost = NULL;
d->lastnopr = NULL;
d->search = d->ssets;
+ d->backno = -1; /* may be set by caller */
+ d->backmin = d->backmax = 0;
/* initialization of sset fields is done as needed */
chr *lastpost; /* location of last cache-flushed success */
chr *lastnopr; /* location of last cache-flushed NOPROGRESS */
struct sset *search; /* replacement-search-pointer memory */
+ int backno; /* if DFA for a backref, subno it refers to */
+ short backmin; /* min repetitions for backref */
+ short backmax; /* max repetitions for backref */
bool ismalloced; /* should this struct dfa be freed? */
bool arraysmalloced; /* should its subsidiary arrays be freed? */
};
static chr *longest(struct vars *, struct dfa *, chr *, chr *, int *);
static chr *shortest(struct vars *, struct dfa *, chr *, chr *, chr *, chr **, int *);
static int matchuntil(struct vars *, struct dfa *, chr *, struct sset **, chr **);
+static chr *dfa_backref(struct vars *, struct dfa *, chr *, chr *, chr *, bool);
static chr *lastcold(struct vars *, struct dfa *);
static struct dfa *newdfa(struct vars *, struct cnfa *, struct colormap *, struct smalldfa *);
static void freedfa(struct dfa *);
getsubdfa(struct vars *v,
struct subre *t)
{
- if (v->subdfas[t->id] == NULL)
+ struct dfa *d = v->subdfas[t->id];
+
+ if (d == NULL)
{
- v->subdfas[t->id] = newdfa(v, &t->cnfa, &v->g->cmap, DOMALLOC);
+ d = newdfa(v, &t->cnfa, &v->g->cmap, DOMALLOC);
if (ISERR())
return NULL;
+ /* set up additional info if this is a backref node */
+ if (t->op == 'b')
+ {
+ d->backno = t->backno;
+ d->backmin = t->min;
+ d->backmax = t->max;
+ }
+ v->subdfas[t->id] = d;
}
- return v->subdfas[t->id];
+ return d;
}
/*
v->ladfas[n] = newdfa(v, &sub->cnfa, &v->g->cmap, DOMALLOC);
if (ISERR())
return NULL;
+ /* a LACON can't contain a backref, so nothing else to do */
}
return v->ladfas[n];
}
/*
* cbrdissect - dissect match for backref node
+ *
+ * The backref match might already have been verified by dfa_backref(),
+ * but we don't know that for sure so must check it here.
*/
static int /* regexec return code */
cbrdissect(struct vars *v,