Implement lookbehind constraints in our regular-expression engine.

author Tom Lane <[email protected]>

Fri, 30 Oct 2015 23:14:19 +0000 (19:14 -0400)

committer Tom Lane <[email protected]>

Fri, 30 Oct 2015 23:14:19 +0000 (19:14 -0400)
author Tom Lane <[email protected]>
Fri, 30 Oct 2015 23:14:19 +0000 (19:14 -0400)
committer Tom Lane <[email protected]>
Fri, 30 Oct 2015 23:14:19 +0000 (19:14 -0400)
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml

index 2946122350b25b5a774bfa89cf35eea578d949b5..4d482ec91f02949ec542aca51801698e60416009 100644 (file)
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -4477,13 +4477,27 @@ SELECT foo FROM regexp_split_to_table('the quick brown fox', E'\\s*') AS foo;
         where no substring matching <replaceable>re</> begins
         (AREs only) </entry>
         </row>
+
+       <row>
+       <entry> <literal>(?&lt;=</><replaceable>re</><literal>)</> </entry>
+       <entry> <firstterm>positive lookbehind</> matches at any point
+       where a substring matching <replaceable>re</> ends
+       (AREs only) </entry>
+       </row>
+
+       <row>
+       <entry> <literal>(?&lt;!</><replaceable>re</><literal>)</> </entry>
+       <entry> <firstterm>negative lookbehind</> matches at any point
+       where no substring matching <replaceable>re</> ends
+       (AREs only) </entry>
+       </row>
        </tbody>
       </tgroup>
      </table>
  
     <para>
-    Lookahead constraints cannot contain <firstterm>back references</>
-    (see <xref linkend="posix-escape-sequences">),
+    Lookahead and lookbehind constraints cannot contain <firstterm>back
+    references</> (see <xref linkend="posix-escape-sequences">),
      and all parentheses within them are considered non-capturing.
     </para>
     </sect3>
@@ -5355,7 +5369,7 @@ SELECT regexp_matches('abc01234xyz', '(?:(.*?)(\d+)(.*)){1,1}');
      the lack of special treatment for a trailing newline,
      the addition of complemented bracket expressions to the things
      affected by newline-sensitive matching,
-    the restrictions on parentheses and back references in lookahead
+    the restrictions on parentheses and back references in lookahead/lookbehind
      constraints, and the longest/shortest-match (rather than first-match)
      matching semantics.
     </para>
diff --git a/src/backend/regex/README b/src/backend/regex/README

index 5c24d3dfe9de08641344aa14a97e0d1d1e0ca11a..6c9f48315e3a2be030288d98db53427b6016db78 100644 (file)
--- a/src/backend/regex/README
+++ b/src/backend/regex/README
@@ -332,10 +332,10 @@ The possible arc types are:
      as "$0->to_state" or "$1->to_state" for end-of-string and end-of-line
      constraints respectively.
  
-    LACON constraints, which represent "(?=re)" and "(?!re)" constraints,
-    i.e. the input starting at this point must match (or not match) a
-    given sub-RE, but the matching input is not consumed.  These are
-    dumped as ":subtree_number:->to_state".
+    LACON constraints, which represent "(?=re)", "(?!re)", "(?<=re)", and
+    "(?<!re)" constraints, i.e. the input starting/ending at this point must
+    match (or not match) a given sub-RE, but the matching input is not
+    consumed.  These are dumped as ":subtree_number:->to_state".
  
  If you see anything else (especially any question marks) in the display of
  an arc, it's dumpnfa() trying to tell you that there's something fishy
diff --git a/src/backend/regex/re_syntax.n b/src/backend/regex/re_syntax.n

index f37bb85abdb8a2450caf643a8214c39643b435f8..4621bfc25f46c847675d25015fe89e807ddca957 100644 (file)
--- a/src/backend/regex/re_syntax.n
+++ b/src/backend/regex/re_syntax.n
@@ -196,10 +196,18 @@ where a substring matching \fIre\fR begins
  \fB(?!\fIre\fB)\fR
  \fInegative lookahead\fR (AREs only), matches at any point
  where no substring matching \fIre\fR begins
+.TP
+\fB(?<=\fIre\fB)\fR
+\fIpositive lookbehind\fR (AREs only), matches at any point
+where a substring matching \fIre\fR ends
+.TP
+\fB(?<!\fIre\fB)\fR
+\fInegative lookbehind\fR (AREs only), matches at any point
+where no substring matching \fIre\fR ends
  .RE
  .PP
-The lookahead constraints may not contain back references (see later),
-and all parentheses within them are considered non-capturing.
+Lookahead and lookbehind constraints may not contain back references
+(see later), and all parentheses within them are considered non-capturing.
  .PP
  An RE may not end with `\fB\e\fR'.
  
@@ -856,7 +864,8 @@ Incompatibilities of note include `\fB\eb\fR', `\fB\eB\fR',
  the lack of special treatment for a trailing newline,
  the addition of complemented bracket expressions to the things
  affected by newline-sensitive matching,
-the restrictions on parentheses and back references in lookahead constraints,
+the restrictions on parentheses and back references in lookahead/lookbehind
+constraints,
  and the longest/shortest-match (rather than first-match) matching semantics.
  .PP
  The matching rules for REs containing both normal and non-greedy quantifiers
diff --git a/src/backend/regex/regc_lex.c b/src/backend/regex/regc_lex.c

index f6ed9f09ea4d2f4cc7f3066bc4562c6c3b80df5a..bfd9dcd2a492b25142d9f932bf74c00b6324dfc1 100644 (file)
--- a/src/backend/regex/regc_lex.c
+++ b/src/backend/regex/regc_lex.c
@@ -582,6 +582,8 @@ next(struct vars * v)
             {
                 NOTE(REG_UNONPOSIX);
                 v->now++;
+               if (ATEOS())
+                   FAILW(REG_BADRPT);
                 switch (*v->now++)
                 {
                     case CHR(':'):      /* non-capturing paren */
@@ -596,12 +598,31 @@ next(struct vars * v)
                         return next(v);
                         break;
                     case CHR('='):      /* positive lookahead */
-                       NOTE(REG_ULOOKAHEAD);
-                       RETV(LACON, 1);
+                       NOTE(REG_ULOOKAROUND);
+                       RETV(LACON, LATYPE_AHEAD_POS);
                         break;
                     case CHR('!'):      /* negative lookahead */
-                       NOTE(REG_ULOOKAHEAD);
-                       RETV(LACON, 0);
+                       NOTE(REG_ULOOKAROUND);
+                       RETV(LACON, LATYPE_AHEAD_NEG);
+                       break;
+                   case CHR('<'):
+                       if (ATEOS())
+                           FAILW(REG_BADRPT);
+                       switch (*v->now++)
+                       {
+                           case CHR('='):      /* positive lookbehind */
+                               NOTE(REG_ULOOKAROUND);
+                               RETV(LACON, LATYPE_BEHIND_POS);
+                               break;
+                           case CHR('!'):      /* negative lookbehind */
+                               NOTE(REG_ULOOKAROUND);
+                               RETV(LACON, LATYPE_BEHIND_NEG);
+                               break;
+                           default:
+                               FAILW(REG_BADRPT);
+                               break;
+                       }
+                       assert(NOTREACHED);
                         break;
                     default:
                         FAILW(REG_BADRPT);
diff --git a/src/backend/regex/regc_nfa.c b/src/backend/regex/regc_nfa.c

index 6f04321cd357822f76bbb58b8dd75f883d220be3..cd9a3239bd3267c2da7ea42d00e0baadc60d0d41 100644 (file)
--- a/src/backend/regex/regc_nfa.c
+++ b/src/backend/regex/regc_nfa.c
@@ -1348,6 +1348,49 @@ cleartraverse(struct nfa * nfa,
         cleartraverse(nfa, a->to);
  }
  
+/*
+ * single_color_transition - does getting from s1 to s2 cross one PLAIN arc?
+ *
+ * If traversing from s1 to s2 requires a single PLAIN match (possibly of any
+ * of a set of colors), return a state whose outarc list contains only PLAIN
+ * arcs of those color(s).  Otherwise return NULL.
+ *
+ * This is used before optimizing the NFA, so there may be EMPTY arcs, which
+ * we should ignore; the possibility of an EMPTY is why the result state could
+ * be different from s1.
+ *
+ * It's worth troubling to handle multiple parallel PLAIN arcs here because a
+ * bracket construct such as [abc] might yield either one or several parallel
+ * PLAIN arcs depending on earlier atoms in the expression.  We'd rather that
+ * that implementation detail not create user-visible performance differences.
+ */
+static struct state *
+single_color_transition(struct state * s1, struct state * s2)
+{
+   struct arc *a;
+
+   /* Ignore leading EMPTY arc, if any */
+   if (s1->nouts == 1 && s1->outs->type == EMPTY)
+       s1 = s1->outs->to;
+   /* Likewise for any trailing EMPTY arc */
+   if (s2->nins == 1 && s2->ins->type == EMPTY)
+       s2 = s2->ins->from;
+   /* Perhaps we could have a single-state loop in between, if so reject */
+   if (s1 == s2)
+       return NULL;
+   /* s1 must have at least one outarc... */
+   if (s1->outs == NULL)
+       return NULL;
+   /* ... and they must all be PLAIN arcs to s2 */
+   for (a = s1->outs; a != NULL; a = a->outchain)
+   {
+       if (a->type != PLAIN || a->to != s2)
+           return NULL;
+   }
+   /* OK, return s1 as the possessor of the relevant outarcs */
+   return s1;
+}
+
  /*
   * specialcolors - fill in special colors for an NFA
   */
diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c

index b733bc7824e80500e3d278adf63e9e9f066040f4..aa759c264861b5e357c47910bf893fafe8ba1224 100644 (file)
--- a/src/backend/regex/regcomp.c
+++ b/src/backend/regex/regcomp.c
@@ -57,6 +57,8 @@ static const chr *scanplain(struct vars *);
  static void onechr(struct vars *, chr, struct state *, struct state *);
  static void dovec(struct vars *, struct cvec *, struct state *, struct state *);
  static void wordchrs(struct vars *);
+static void processlacon(struct vars *, struct state *, struct state *, int,
+            struct state *, struct state *);
  static struct subre *subre(struct vars *, int, int, struct state *, struct state *);
  static void freesubre(struct vars *, struct subre *);
  static void freesrnode(struct vars *, struct subre *);
@@ -65,7 +67,7 @@ static int    numst(struct subre *, int);
  static void markst(struct subre *);
  static void cleanst(struct vars *);
  static long nfatree(struct vars *, struct subre *, FILE *);
-static long nfanode(struct vars *, struct subre *, FILE *);
+static long nfanode(struct vars *, struct subre *, int, FILE *);
  static int newlacon(struct vars *, struct state *, struct state *, int);
  static void freelacons(struct subre *, int);
  static void rfree(regex_t *);
@@ -146,6 +148,7 @@ static void deltraverse(struct nfa *, struct state *, struct state *);
  static void dupnfa(struct nfa *, struct state *, struct state *, struct state *, struct state *);
  static void duptraverse(struct nfa *, struct state *, struct state *);
  static void cleartraverse(struct nfa *, struct state *);
+static struct state *single_color_transition(struct state *, struct state *);
  static void specialcolors(struct nfa *);
  static long optimize(struct nfa *, FILE *);
  static void pullback(struct nfa *, FILE *);
@@ -245,8 +248,9 @@ struct vars
     int         ntree;          /* number of tree nodes, plus one */
     struct cvec *cv;            /* interface cvec */
     struct cvec *cv2;           /* utility cvec */
-   struct subre *lacons;       /* lookahead-constraint vector */
-   int         nlacons;        /* size of lacons */
+   struct subre *lacons;       /* lookaround-constraint vector */
+   int         nlacons;        /* size of lacons[]; note that only slots
+                                * numbered 1 .. nlacons-1 are used */
     size_t      spaceused;      /* approx. space used for compilation */
  };
  
@@ -277,7 +281,7 @@ struct vars
  #define CCLASS 'C'             /* start of [: */
  #define END 'X'                    /* end of [. [= [: */
  #define RANGE  'R'             /* - within [] which might be range delim. */
-#define LACON  'L'             /* lookahead constraint subRE */
+#define LACON  'L'             /* lookaround constraint subRE */
  #define AHEAD  'a'             /* color-lookahead arc */
  #define BEHIND 'r'             /* color-lookbehind arc */
  #define WBDRY  'w'             /* word boundary constraint */
@@ -432,11 +436,15 @@ pg_regcomp(regex_t *re,
     assert(v->nlacons == 0 || v->lacons != NULL);
     for (i = 1; i < v->nlacons; i++)
     {
+       struct subre *lasub = &v->lacons[i];
+
  #ifdef REG_DEBUG
         if (debug != NULL)
             fprintf(debug, "\n\n\n========= LA%d ==========\n", i);
  #endif
-       nfanode(v, &v->lacons[i], debug);
+
+       /* Prepend .* to pattern if it's a lookbehind LACON */
+       nfanode(v, lasub, !LATYPE_IS_AHEAD(lasub->subno), debug);
     }
     CNOERR();
     if (v->tree->flags & SHORTER)
@@ -640,7 +648,7 @@ makesearch(struct vars * v,
  static struct subre *
  parse(struct vars * v,
       int stopper,              /* EOS or ')' */
-     int type,                 /* LACON (lookahead subRE) or PLAIN */
+     int type,                 /* LACON (lookaround subRE) or PLAIN */
       struct state * init,      /* initial state */
       struct state * final)     /* final state */
  {
@@ -719,7 +727,7 @@ parse(struct vars * v,
  static struct subre *
  parsebranch(struct vars * v,
             int stopper,        /* EOS or ')' */
-           int type,           /* LACON (lookahead subRE) or PLAIN */
+           int type,           /* LACON (lookaround subRE) or PLAIN */
             struct state * left,    /* leftmost state */
             struct state * right,       /* rightmost state */
             int partial)        /* is this only part of a branch? */
@@ -768,7 +776,7 @@ parsebranch(struct vars * v,
  static void
  parseqatom(struct vars * v,
            int stopper,         /* EOS or ')' */
-          int type,            /* LACON (lookahead subRE) or PLAIN */
+          int type,            /* LACON (lookaround subRE) or PLAIN */
            struct state * lp,   /* left state to hang it on */
            struct state * rp,   /* right state to hang it on */
            struct subre * top)  /* subtree top */
@@ -782,7 +790,7 @@ parseqatom(struct vars * v,
     struct subre *atom;         /* atom's subtree */
     struct subre *t;
     int         cap;            /* capturing parens? */
-   int         pos;            /* positive lookahead? */
+   int         latype;         /* lookaround constraint type */
     int         subno;          /* capturing-parens or backref number */
     int         atomtype;
     int         qprefer;        /* quantifier short/long preference */
@@ -866,19 +874,18 @@ parseqatom(struct vars * v,
             nonword(v, AHEAD, s, rp);
             return;
             break;
-       case LACON:             /* lookahead constraint */
-           pos = v->nextvalue;
+       case LACON:             /* lookaround constraint */
+           latype = v->nextvalue;
             NEXT();
             s = newstate(v->nfa);
             s2 = newstate(v->nfa);
             NOERR();
             t = parse(v, ')', LACON, s, s2);
             freesubre(v, t);    /* internal structure irrelevant */
-           assert(SEE(')') || ISERR());
-           NEXT();
-           n = newlacon(v, s, s2, pos);
             NOERR();
-           ARCV(LACON, n);
+           assert(SEE(')'));
+           NEXT();
+           processlacon(v, s, s2, latype, lp, rp);
             return;
             break;
             /* then errors, to get them out of the way */
@@ -1633,6 +1640,75 @@ wordchrs(struct vars * v)
     v->wordchrs = left;
  }
  
+/*
+ * processlacon - generate the NFA representation of a LACON
+ *
+ * In the general case this is just newlacon() + newarc(), but some cases
+ * can be optimized.
+ */
+static void
+processlacon(struct vars * v,
+            struct state * begin,      /* start of parsed LACON sub-re */
+            struct state * end,    /* end of parsed LACON sub-re */
+            int latype,
+            struct state * lp, /* left state to hang it on */
+            struct state * rp) /* right state to hang it on */
+{
+   struct state *s1;
+   int         n;
+
+   /*
+    * Check for lookaround RE consisting of a single plain color arc (or set
+    * of arcs); this would typically be a simple chr or a bracket expression.
+    */
+   s1 = single_color_transition(begin, end);
+   switch (latype)
+   {
+       case LATYPE_AHEAD_POS:
+           /* If lookahead RE is just colorset C, convert to AHEAD(C) */
+           if (s1 != NULL)
+           {
+               cloneouts(v->nfa, s1, lp, rp, AHEAD);
+               return;
+           }
+           break;
+       case LATYPE_AHEAD_NEG:
+           /* If lookahead RE is just colorset C, convert to AHEAD(^C)|$ */
+           if (s1 != NULL)
+           {
+               colorcomplement(v->nfa, v->cm, AHEAD, s1, lp, rp);
+               newarc(v->nfa, '$', 1, lp, rp);
+               newarc(v->nfa, '$', 0, lp, rp);
+               return;
+           }
+           break;
+       case LATYPE_BEHIND_POS:
+           /* If lookbehind RE is just colorset C, convert to BEHIND(C) */
+           if (s1 != NULL)
+           {
+               cloneouts(v->nfa, s1, lp, rp, BEHIND);
+               return;
+           }
+           break;
+       case LATYPE_BEHIND_NEG:
+           /* If lookbehind RE is just colorset C, convert to BEHIND(^C)|^ */
+           if (s1 != NULL)
+           {
+               colorcomplement(v->nfa, v->cm, BEHIND, s1, lp, rp);
+               newarc(v->nfa, '^', 1, lp, rp);
+               newarc(v->nfa, '^', 0, lp, rp);
+               return;
+           }
+           break;
+       default:
+           assert(NOTREACHED);
+   }
+
+   /* General case: we need a LACON subre and arc */
+   n = newlacon(v, begin, end, latype);
+   newarc(v->nfa, LACON, n, lp, rp);
+}
+
  /*
   * subre - allocate a subre
   */
@@ -1826,15 +1902,18 @@ nfatree(struct vars * v,
     if (t->right != NULL)
         (DISCARD) nfatree(v, t->right, f);
  
-   return nfanode(v, t, f);
+   return nfanode(v, t, 0, f);
  }
  
  /*
- * nfanode - do one NFA for nfatree
+ * nfanode - do one NFA for nfatree or lacons
+ *
+ * If converttosearch is true, apply makesearch() to the NFA.
   */
  static long                        /* optimize results */
  nfanode(struct vars * v,
         struct subre * t,
+       int converttosearch,
         FILE *f)                /* for debug output */
  {
     struct nfa *nfa;
@@ -1855,10 +1934,11 @@ nfanode(struct vars * v,
     NOERRZ();
     dupnfa(nfa, t->begin, t->end, nfa->init, nfa->final);
     if (!ISERR())
-   {
         specialcolors(nfa);
+   if (!ISERR())
         ret = optimize(nfa, f);
-   }
+   if (converttosearch && !ISERR())
+       makesearch(v, nfa);
     if (!ISERR())
         compact(nfa, &t->cnfa);
  
@@ -1867,13 +1947,13 @@ nfanode(struct vars * v,
  }
  
  /*
- * newlacon - allocate a lookahead-constraint subRE
+ * newlacon - allocate a lookaround-constraint subRE
   */
  static int                     /* lacon number */
  newlacon(struct vars * v,
          struct state * begin,
          struct state * end,
-        int pos)
+        int latype)
  {
     int         n;
     struct subre *newlacons;
@@ -1900,13 +1980,13 @@ newlacon(struct vars * v,
     sub = &v->lacons[n];
     sub->begin = begin;
     sub->end = end;
-   sub->subno = pos;
+   sub->subno = latype;
     ZAPCNFA(sub->cnfa);
     return n;
  }
  
  /*
- * freelacons - free lookahead-constraint subRE vector
+ * freelacons - free lookaround-constraint subRE vector
   */
  static void
  freelacons(struct subre * subs,
@@ -2020,9 +2100,29 @@ dump(regex_t *re,
     }
     for (i = 1; i < g->nlacons; i++)
     {
-       fprintf(f, "\nla%d (%s):\n", i,
-               (g->lacons[i].subno) ? "positive" : "negative");
-       dumpcnfa(&g->lacons[i].cnfa, f);
+       struct subre *lasub = &g->lacons[i];
+       const char *latype;
+
+       switch (lasub->subno)
+       {
+           case LATYPE_AHEAD_POS:
+               latype = "positive lookahead";
+               break;
+           case LATYPE_AHEAD_NEG:
+               latype = "negative lookahead";
+               break;
+           case LATYPE_BEHIND_POS:
+               latype = "positive lookbehind";
+               break;
+           case LATYPE_BEHIND_NEG:
+               latype = "negative lookbehind";
+               break;
+           default:
+               latype = "???";
+               break;
+       }
+       fprintf(f, "\nla%d (%s):\n", i, latype);
+       dumpcnfa(&lasub->cnfa, f);
     }
     fprintf(f, "\n");
     dumpst(g->tree, f, 0);
diff --git a/src/backend/regex/rege_dfa.c b/src/backend/regex/rege_dfa.c

index a37e4b0ef96660c47b01c9fe32e10a0b39d511b9..7d90242acefd50b33fc7c2551e1cf830b5752fee 100644 (file)
--- a/src/backend/regex/rege_dfa.c
+++ b/src/backend/regex/rege_dfa.c
@@ -286,6 +286,130 @@ shortest(struct vars * v,
     return cp;
  }
  
+/*
+ * matchuntil - incremental matching engine
+ *
+ * This is meant for use with a search-style NFA (that is, the pattern is
+ * known to act as though it had a leading .*).  We determine whether a
+ * match exists starting at v->start and ending at probe.  Multiple calls
+ * require only O(N) time not O(N^2) so long as the probe values are
+ * nondecreasing.  *lastcss and *lastcp must be initialized to NULL before
+ * starting a series of calls.
+ *
+ * Returns 1 if a match exists, 0 if not.
+ * Internal errors also return 0, with v->err set.
+ */
+static int
+matchuntil(struct vars * v,
+          struct dfa * d,
+          chr *probe,          /* we want to know if a match ends here */
+          struct sset ** lastcss,      /* state storage across calls */
+          chr **lastcp)        /* state storage across calls */
+{
+   chr        *cp = *lastcp;
+   color       co;
+   struct sset *css = *lastcss;
+   struct sset *ss;
+   struct colormap *cm = d->cm;
+
+   /* initialize and startup, or restart, if necessary */
+   if (cp == NULL || cp > probe)
+   {
+       cp = v->start;
+       css = initialize(v, d, cp);
+       if (css == NULL)
+           return 0;
+
+       FDEBUG((">>> startup >>>\n"));
+       co = d->cnfa->bos[(v->eflags & REG_NOTBOL) ? 0 : 1];
+       FDEBUG(("color %ld\n", (long) co));
+
+       css = miss(v, d, css, co, cp, v->start);
+       if (css == NULL)
+           return 0;
+       css->lastseen = cp;
+   }
+   else if (css == NULL)
+   {
+       /* we previously found that no match is possible beyond *lastcp */
+       return 0;
+   }
+   ss = css;
+
+   /*
+    * This is the main text-scanning loop.  It seems worth having two copies
+    * to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG
+    * builds, when you're not actively tracing.
+    */
+#ifdef REG_DEBUG
+   if (v->eflags & REG_FTRACE)
+   {
+       while (cp < probe)
+       {
+           FDEBUG((">>> at c%d >>>\n", (int) (css - d->ssets)));
+           co = GETCOLOR(cm, *cp);
+           FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
+           ss = css->outs[co];
+           if (ss == NULL)
+           {
+               ss = miss(v, d, css, co, cp + 1, v->start);
+               if (ss == NULL)
+                   break;      /* NOTE BREAK OUT */
+           }
+           cp++;
+           ss->lastseen = cp;
+           css = ss;
+       }
+   }
+   else
+#endif
+   {
+       while (cp < probe)
+       {
+           co = GETCOLOR(cm, *cp);
+           ss = css->outs[co];
+           if (ss == NULL)
+           {
+               ss = miss(v, d, css, co, cp + 1, v->start);
+               if (ss == NULL)
+                   break;      /* NOTE BREAK OUT */
+           }
+           cp++;
+           ss->lastseen = cp;
+           css = ss;
+       }
+   }
+
+   *lastcss = ss;
+   *lastcp = cp;
+
+   if (ss == NULL)
+       return 0;               /* impossible match, or internal error */
+
+   /* We need to process one more chr, or the EOS symbol, to check match */
+   if (cp < v->stop)
+   {
+       FDEBUG((">>> at c%d >>>\n", (int) (css - d->ssets)));
+       co = GETCOLOR(cm, *cp);
+       FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
+       ss = css->outs[co];
+       if (ss == NULL)
+           ss = miss(v, d, css, co, cp + 1, v->start);
+   }
+   else
+   {
+       assert(cp == v->stop);
+       co = d->cnfa->eos[(v->eflags & REG_NOTEOL) ? 0 : 1];
+       FDEBUG(("color %ld\n", (long) co));
+       ss = miss(v, d, css, co, cp, v->start);
+   }
+
+   if (ss == NULL || !(ss->flags & POSTSTATE))
+       return 0;
+
+   return 1;
+}
+
  /*
   * lastcold - determine last point at which no progress had been made
   */
@@ -613,19 +737,19 @@ miss(struct vars * v,
  }
  
  /*
- * lacon - lookahead-constraint checker for miss()
+ * lacon - lookaround-constraint checker for miss()
   */
  static int                     /* predicate:  constraint satisfied? */
  lacon(struct vars * v,
       struct cnfa * pcnfa,      /* parent cnfa */
       chr *cp,
-     pcolor co)                /* "color" of the lookahead constraint */
+     pcolor co)                /* "color" of the lookaround constraint */
  {
     int         n;
     struct subre *sub;
     struct dfa *d;
-   struct smalldfa sd;
     chr        *end;
+   int         satisfied;
  
     /* Since this is recursive, it could be driven to stack overflow */
     if (STACK_TOO_DEEP(v->re))
@@ -635,19 +759,35 @@ lacon(struct vars * v,
     }
  
     n = co - pcnfa->ncolors;
-   assert(n < v->g->nlacons && v->g->lacons != NULL);
+   assert(n > 0 && n < v->g->nlacons && v->g->lacons != NULL);
     FDEBUG(("=== testing lacon %d\n", n));
     sub = &v->g->lacons[n];
-   d = newdfa(v, &sub->cnfa, &v->g->cmap, &sd);
+   d = getladfa(v, n);
     if (d == NULL)
-   {
-       ERR(REG_ESPACE);
         return 0;
+   if (LATYPE_IS_AHEAD(sub->subno))
+   {
+       /* used to use longest() here, but shortest() could be much cheaper */
+       end = shortest(v, d, cp, cp, v->stop,
+                      (chr **) NULL, (int *) NULL);
+       satisfied = LATYPE_IS_POS(sub->subno) ? (end != NULL) : (end == NULL);
+   }
+   else
+   {
+       /*
+        * To avoid doing O(N^2) work when repeatedly testing a lookbehind
+        * constraint in an N-character string, we use matchuntil() which can
+        * cache the DFA state across calls.  We only need to restart if the
+        * probe point decreases, which is not common.  The NFA we're using is
+        * a search NFA, so it doesn't mind scanning over stuff before the
+        * nominal match.
+        */
+       satisfied = matchuntil(v, d, cp, &v->lblastcss[n], &v->lblastcp[n]);
+       if (!LATYPE_IS_POS(sub->subno))
+           satisfied = !satisfied;
     }
-   end = longest(v, d, cp, v->stop, (int *) NULL);
-   freedfa(d);
-   FDEBUG(("=== lacon %d match %d\n", n, (end != NULL)));
-   return (sub->subno) ? (end != NULL) : (end == NULL);
+   FDEBUG(("=== lacon %d satisfied %d\n", n, satisfied));
+   return satisfied;
  }
  
  /*
diff --git a/src/backend/regex/regexec.c b/src/backend/regex/regexec.c

index 8a21f2cb7870b544165d363f62a0424f067e23cb..82659a0f2f468532662ac29583c9f536e727a1be 100644 (file)
--- a/src/backend/regex/regexec.c
+++ b/src/backend/regex/regexec.c
@@ -112,7 +112,10 @@ struct vars
     chr        *search_start;   /* search start of string */
     chr        *stop;           /* just past end of string */
     int         err;            /* error code if any (0 none) */
-   struct dfa **subdfas;       /* per-subre DFAs */
+   struct dfa **subdfas;       /* per-tree-subre DFAs */
+   struct dfa **ladfas;        /* per-lacon-subre DFAs */
+   struct sset **lblastcss;    /* per-lacon-subre lookbehind restart data */
+   chr       **lblastcp;       /* per-lacon-subre lookbehind restart data */
     struct smalldfa dfa1;
     struct smalldfa dfa2;
  };
@@ -132,6 +135,7 @@ struct vars
   */
  /* === regexec.c === */
  static struct dfa *getsubdfa(struct vars *, struct subre *);
+static struct dfa *getladfa(struct vars *, int);
  static int find(struct vars *, struct cnfa *, struct colormap *);
  static int cfind(struct vars *, struct cnfa *, struct colormap *);
  static int cfindloop(struct vars *, struct cnfa *, struct colormap *, struct dfa *, struct dfa *, chr **);
@@ -149,6 +153,7 @@ static int  creviterdissect(struct vars *, struct subre *, chr *, chr *);
  /* === rege_dfa.c === */
  static chr *longest(struct vars *, struct dfa *, chr *, chr *, int *);
  static chr *shortest(struct vars *, struct dfa *, chr *, chr *, chr *, chr **, int *);
+static int matchuntil(struct vars *, struct dfa *, chr *, struct sset **, chr **);
  static chr *lastcold(struct vars *, struct dfa *);
  static struct dfa *newdfa(struct vars *, struct cnfa *, struct colormap *, struct smalldfa *);
  static void freedfa(struct dfa *);
@@ -226,21 +231,54 @@ pg_regexec(regex_t *re,
     v->search_start = (chr *) string + search_start;
     v->stop = (chr *) string + len;
     v->err = 0;
+   v->subdfas = NULL;
+   v->ladfas = NULL;
+   v->lblastcss = NULL;
+   v->lblastcp = NULL;
+   /* below this point, "goto cleanup" will behave sanely */
+
     assert(v->g->ntree >= 0);
     n = (size_t) v->g->ntree;
     if (n <= LOCALDFAS)
         v->subdfas = subdfas;
     else
-       v->subdfas = (struct dfa **) MALLOC(n * sizeof(struct dfa *));
-   if (v->subdfas == NULL)
     {
-       if (v->pmatch != pmatch && v->pmatch != mat)
-           FREE(v->pmatch);
-       return REG_ESPACE;
+       v->subdfas = (struct dfa **) MALLOC(n * sizeof(struct dfa *));
+       if (v->subdfas == NULL)
+       {
+           st = REG_ESPACE;
+           goto cleanup;
+       }
     }
     for (i = 0; i < n; i++)
         v->subdfas[i] = NULL;
  
+   assert(v->g->nlacons >= 0);
+   n = (size_t) v->g->nlacons;
+   if (n > 0)
+   {
+       v->ladfas = (struct dfa **) MALLOC(n * sizeof(struct dfa *));
+       if (v->ladfas == NULL)
+       {
+           st = REG_ESPACE;
+           goto cleanup;
+       }
+       for (i = 0; i < n; i++)
+           v->ladfas[i] = NULL;
+       v->lblastcss = (struct sset **) MALLOC(n * sizeof(struct sset *));
+       v->lblastcp = (chr **) MALLOC(n * sizeof(chr *));
+       if (v->lblastcss == NULL || v->lblastcp == NULL)
+       {
+           st = REG_ESPACE;
+           goto cleanup;
+       }
+       for (i = 0; i < n; i++)
+       {
+           v->lblastcss[i] = NULL;
+           v->lblastcp[i] = NULL;
+       }
+   }
+
     /* do it */
     assert(v->g->tree != NULL);
     if (backref)
@@ -257,22 +295,40 @@ pg_regexec(regex_t *re,
     }
  
     /* clean up */
+cleanup:
     if (v->pmatch != pmatch && v->pmatch != mat)
         FREE(v->pmatch);
-   n = (size_t) v->g->ntree;
-   for (i = 0; i < n; i++)
+   if (v->subdfas != NULL)
+   {
+       n = (size_t) v->g->ntree;
+       for (i = 0; i < n; i++)
+       {
+           if (v->subdfas[i] != NULL)
+               freedfa(v->subdfas[i]);
+       }
+       if (v->subdfas != subdfas)
+           FREE(v->subdfas);
+   }
+   if (v->ladfas != NULL)
     {
-       if (v->subdfas[i] != NULL)
-           freedfa(v->subdfas[i]);
+       n = (size_t) v->g->nlacons;
+       for (i = 0; i < n; i++)
+       {
+           if (v->ladfas[i] != NULL)
+               freedfa(v->ladfas[i]);
+       }
+       FREE(v->ladfas);
     }
-   if (v->subdfas != subdfas)
-       FREE(v->subdfas);
+   if (v->lblastcss != NULL)
+       FREE(v->lblastcss);
+   if (v->lblastcp != NULL)
+       FREE(v->lblastcp);
  
     return st;
  }
  
  /*
- * getsubdfa - create or re-fetch the DFA for a subre node
+ * getsubdfa - create or re-fetch the DFA for a tree subre node
   *
   * We only need to create the DFA once per overall regex execution.
   * The DFA will be freed by the cleanup step in pg_regexec().
@@ -290,6 +346,28 @@ getsubdfa(struct vars * v,
     return v->subdfas[t->id];
  }
  
+/*
+ * getladfa - create or re-fetch the DFA for a LACON subre node
+ *
+ * Same as above, but for LACONs.
+ */
+static struct dfa *
+getladfa(struct vars * v,
+        int n)
+{
+   assert(n > 0 && n < v->g->nlacons && v->g->lacons != NULL);
+
+   if (v->ladfas[n] == NULL)
+   {
+       struct subre *sub = &v->g->lacons[n];
+
+       v->ladfas[n] = newdfa(v, &sub->cnfa, &v->g->cmap, DOMALLOC);
+       if (ISERR())
+           return NULL;
+   }
+   return v->ladfas[n];
+}
+
  /*
   * find - find a match for the main NFA (no-complications case)
   */
diff --git a/src/backend/regex/regexport.c b/src/backend/regex/regexport.c

index c5524ae492756e7aaf33149b1de9d741616618db..91340719e88f87f9ba86bea276539881872c51bc 100644 (file)
--- a/src/backend/regex/regexport.c
+++ b/src/backend/regex/regexport.c
@@ -6,7 +6,7 @@
   * In this implementation, the NFA defines a necessary but not sufficient
   * condition for a string to match the regex: that is, there can be strings
   * that match the NFA but don't match the full regex, but not vice versa.
- * Thus, for example, it is okay for the functions below to ignore lookahead
+ * Thus, for example, it is okay for the functions below to ignore lookaround
   * constraints, which merely constrain the string some more.
   *
   * Notice that these functions return info into caller-provided arrays
diff --git a/src/backend/regex/regprefix.c b/src/backend/regex/regprefix.c

index ce41620a0b4b1b740955da99aacdd5f502de9b36..86928453260b20d9df647d4cc284cd308e130106 100644 (file)
--- a/src/backend/regex/regprefix.c
+++ b/src/backend/regex/regprefix.c
@@ -36,7 +36,7 @@ static int findprefix(struct cnfa * cnfa, struct colormap * cm,
   * the common prefix or exact value, of length *slength (measured in chrs
   * not bytes!).
   *
- * This function does not analyze all complex cases (such as lookahead
+ * This function does not analyze all complex cases (such as lookaround
   * constraints) exactly.  Therefore it is possible that some strings matching
   * the reported prefix or exact-match string do not satisfy the regex.  But
   * it should never be the case that a string satisfying the regex does not
diff --git a/src/include/regex/regex.h b/src/include/regex/regex.h

index 5e1b692d26c130d09cbe4f7e3150ba6cbba6f66a..2f89dc9326bfab16a10bd6ba070a6b1230c3ff0c 100644 (file)
--- a/src/include/regex/regex.h
+++ b/src/include/regex/regex.h
@@ -58,7 +58,7 @@ typedef struct
     size_t      re_nsub;        /* number of subexpressions */
     long        re_info;        /* information about RE */
  #define  REG_UBACKREF       000001
-#define  REG_ULOOKAHEAD         000002
+#define  REG_ULOOKAROUND    000002
  #define  REG_UBOUNDS    000004
  #define  REG_UBRACES    000010
  #define  REG_UBSALNUM       000020
diff --git a/src/include/regex/regguts.h b/src/include/regex/regguts.h

index 19fe991c74faa5213261ce3b060071a8c7f2eec2..2ceffa6563b04b7b9024f7dbebf2c37a8c392c31 100644 (file)
--- a/src/include/regex/regguts.h
+++ b/src/include/regex/regguts.h
@@ -89,13 +89,19 @@
   */
  
  #define NOTREACHED 0
-#define xxx        1
  
  #define DUPMAX _POSIX2_RE_DUP_MAX
  #define DUPINF (DUPMAX+1)
  
  #define REMAGIC 0xfed7         /* magic number for main struct */
  
+/* Type codes for lookaround constraints */
+#define LATYPE_AHEAD_POS   03  /* positive lookahead */
+#define LATYPE_AHEAD_NEG   02  /* negative lookahead */
+#define LATYPE_BEHIND_POS  01  /* positive lookbehind */
+#define LATYPE_BEHIND_NEG  00  /* negative lookbehind */
+#define LATYPE_IS_POS(la)  ((la) & 01)
+#define LATYPE_IS_AHEAD(la) ((la) & 02)
  
  
  /*
@@ -351,7 +357,7 @@ struct nfa
   *
   * The non-dummy carc structs are of two types: plain arcs and LACON arcs.
   * Plain arcs just store the transition color number as "co".  LACON arcs
- * store the lookahead constraint number plus cnfa.ncolors as "co".  LACON
+ * store the lookaround constraint number plus cnfa.ncolors as "co".  LACON
   * arcs can be distinguished from plain by testing for co >= cnfa.ncolors.
   */
  struct carc
@@ -365,7 +371,7 @@ struct cnfa
     int         nstates;        /* number of states */
     int         ncolors;        /* number of colors (max color in use + 1) */
     int         flags;
-#define  HASLACONS 01          /* uses lookahead constraints */
+#define  HASLACONS 01          /* uses lookaround constraints */
     int         pre;            /* setup state number */
     int         post;           /* teardown state number */
     color       bos[2];         /* colors, if any, assigned to BOS and BOL */
@@ -433,7 +439,8 @@ struct subre
  #define  PREF2(f1, f2)  ((PREF(f1) != 0) ? PREF(f1) : PREF(f2))
  #define  COMBINE(f1, f2) (UP((f1)|(f2)) | PREF2(f1, f2))
     short       id;             /* ID of subre (1..ntree-1) */
-   int         subno;          /* subexpression number (for 'b' and '(') */
+   int         subno;          /* subexpression number for 'b' and '(', or
+                                * LATYPE code for lookaround constraint */
     short       min;            /* min repetitions for iteration or backref */
     short       max;            /* max repetitions for iteration or backref */
     struct subre *left;         /* left child, if any (also freelist chain) */
@@ -479,6 +486,7 @@ struct guts
     int         ntree;          /* number of subre's, plus one */
     struct colormap cmap;
     int         FUNCPTR(compare, (const chr *, const chr *, size_t));
-   struct subre *lacons;       /* lookahead-constraint vector */
-   int         nlacons;        /* size of lacons */
+   struct subre *lacons;       /* lookaround-constraint vector */
+   int         nlacons;        /* size of lacons[]; note that only slots
+                                * numbered 1 .. nlacons-1 are used */
  };
diff --git a/src/test/regress/expected/regex.out b/src/test/regress/expected/regex.out

index be151858a3884a9c9f32d64608d80e959e8f912c..f0e2fc9eb896786c3206ad88e68868867be317fe 100644 (file)
--- a/src/test/regress/expected/regex.out
+++ b/src/test/regress/expected/regex.out
@@ -90,6 +90,175 @@ select substring('a' from '((a)+)');
   a
  (1 row)
  
+-- Test lookahead constraints
+select regexp_matches('ab', 'a(?=b)b*');
+ regexp_matches 
+----------------
+ {ab}
+(1 row)
+
+select regexp_matches('a', 'a(?=b)b*');
+ regexp_matches 
+----------------
+(0 rows)
+
+select regexp_matches('abc', 'a(?=b)b*(?=c)c*');
+ regexp_matches 
+----------------
+ {abc}
+(1 row)
+
+select regexp_matches('ab', 'a(?=b)b*(?=c)c*');
+ regexp_matches 
+----------------
+(0 rows)
+
+select regexp_matches('ab', 'a(?!b)b*');
+ regexp_matches 
+----------------
+(0 rows)
+
+select regexp_matches('a', 'a(?!b)b*');
+ regexp_matches 
+----------------
+ {a}
+(1 row)
+
+select regexp_matches('b', '(?=b)b');
+ regexp_matches 
+----------------
+ {b}
+(1 row)
+
+select regexp_matches('a', '(?=b)b');
+ regexp_matches 
+----------------
+(0 rows)
+
+-- Test lookbehind constraints
+select regexp_matches('abb', '(?<=a)b*');
+ regexp_matches 
+----------------
+ {bb}
+(1 row)
+
+select regexp_matches('a', 'a(?<=a)b*');
+ regexp_matches 
+----------------
+ {a}
+(1 row)
+
+select regexp_matches('abc', 'a(?<=a)b*(?<=b)c*');
+ regexp_matches 
+----------------
+ {abc}
+(1 row)
+
+select regexp_matches('ab', 'a(?<=a)b*(?<=b)c*');
+ regexp_matches 
+----------------
+ {ab}
+(1 row)
+
+select regexp_matches('ab', 'a*(?<!a)b*');
+ regexp_matches 
+----------------
+ {""}
+(1 row)
+
+select regexp_matches('ab', 'a*(?<!a)b+');
+ regexp_matches 
+----------------
+(0 rows)
+
+select regexp_matches('b', 'a*(?<!a)b+');
+ regexp_matches 
+----------------
+ {b}
+(1 row)
+
+select regexp_matches('a', 'a(?<!a)b*');
+ regexp_matches 
+----------------
+(0 rows)
+
+select regexp_matches('b', '(?<=b)b');
+ regexp_matches 
+----------------
+(0 rows)
+
+select regexp_matches('foobar', '(?<=f)b+');
+ regexp_matches 
+----------------
+(0 rows)
+
+select regexp_matches('foobar', '(?<=foo)b+');
+ regexp_matches 
+----------------
+ {b}
+(1 row)
+
+select regexp_matches('foobar', '(?<=oo)b+');
+ regexp_matches 
+----------------
+ {b}
+(1 row)
+
+-- Test optimization of single-chr-or-bracket-expression lookaround constraints
+select 'xz' ~ 'x(?=[xy])';
+ ?column? 
+----------
+ f
+(1 row)
+
+select 'xy' ~ 'x(?=[xy])';
+ ?column? 
+----------
+ t
+(1 row)
+
+select 'xz' ~ 'x(?![xy])';
+ ?column? 
+----------
+ t
+(1 row)
+
+select 'xy' ~ 'x(?![xy])';
+ ?column? 
+----------
+ f
+(1 row)
+
+select 'x'  ~ 'x(?![xy])';
+ ?column? 
+----------
+ t
+(1 row)
+
+select 'xyy' ~ '(?<=[xy])yy+';
+ ?column? 
+----------
+ t
+(1 row)
+
+select 'zyy' ~ '(?<=[xy])yy+';
+ ?column? 
+----------
+ f
+(1 row)
+
+select 'xyy' ~ '(?<![xy])yy+';
+ ?column? 
+----------
+ f
+(1 row)
+
+select 'zyy' ~ '(?<![xy])yy+';
+ ?column? 
+----------
+ t
+(1 row)
+
  -- Test conversion of regex patterns to indexable conditions
  explain (costs off) select * from pg_proc where proname ~ 'abc';
              QUERY PLAN             
diff --git a/src/test/regress/sql/regex.sql b/src/test/regress/sql/regex.sql

index c59fa35f24d81f5e2c4aceae0ada1ca3832f0ac4..d3030af295d3852714c2dfa13ea2ca9597ef686d 100644 (file)
--- a/src/test/regress/sql/regex.sql
+++ b/src/test/regress/sql/regex.sql
@@ -25,6 +25,41 @@ select substring('asd TO foo' from ' TO (([a-z0-9._]+|"([^"]+|"")+")+)');
  select substring('a' from '((a))+');
  select substring('a' from '((a)+)');
  
+-- Test lookahead constraints
+select regexp_matches('ab', 'a(?=b)b*');
+select regexp_matches('a', 'a(?=b)b*');
+select regexp_matches('abc', 'a(?=b)b*(?=c)c*');
+select regexp_matches('ab', 'a(?=b)b*(?=c)c*');
+select regexp_matches('ab', 'a(?!b)b*');
+select regexp_matches('a', 'a(?!b)b*');
+select regexp_matches('b', '(?=b)b');
+select regexp_matches('a', '(?=b)b');
+
+-- Test lookbehind constraints
+select regexp_matches('abb', '(?<=a)b*');
+select regexp_matches('a', 'a(?<=a)b*');
+select regexp_matches('abc', 'a(?<=a)b*(?<=b)c*');
+select regexp_matches('ab', 'a(?<=a)b*(?<=b)c*');
+select regexp_matches('ab', 'a*(?<!a)b*');
+select regexp_matches('ab', 'a*(?<!a)b+');
+select regexp_matches('b', 'a*(?<!a)b+');
+select regexp_matches('a', 'a(?<!a)b*');
+select regexp_matches('b', '(?<=b)b');
+select regexp_matches('foobar', '(?<=f)b+');
+select regexp_matches('foobar', '(?<=foo)b+');
+select regexp_matches('foobar', '(?<=oo)b+');
+
+-- Test optimization of single-chr-or-bracket-expression lookaround constraints
+select 'xz' ~ 'x(?=[xy])';
+select 'xy' ~ 'x(?=[xy])';
+select 'xz' ~ 'x(?![xy])';
+select 'xy' ~ 'x(?![xy])';
+select 'x'  ~ 'x(?![xy])';
+select 'xyy' ~ '(?<=[xy])yy+';
+select 'zyy' ~ '(?<=[xy])yy+';
+select 'xyy' ~ '(?<![xy])yy+';
+select 'zyy' ~ '(?<![xy])yy+';
+
  -- Test conversion of regex patterns to indexable conditions
  explain (costs off) select * from pg_proc where proname ~ 'abc';
  explain (costs off) select * from pg_proc where proname ~ '^abc';
author	Tom Lane <[email protected]>
	Fri, 30 Oct 2015 23:14:19 +0000 (19:14 -0400)
committer	Tom Lane <[email protected]>
	Fri, 30 Oct 2015 23:14:19 +0000 (19:14 -0400)
doc/src/sgml/func.sgml		patch \| blob \| blame \| history
src/backend/regex/README		patch \| blob \| blame \| history
src/backend/regex/re_syntax.n		patch \| blob \| blame \| history
src/backend/regex/regc_lex.c		patch \| blob \| blame \| history
src/backend/regex/regc_nfa.c		patch \| blob \| blame \| history
src/backend/regex/regcomp.c		patch \| blob \| blame \| history
src/backend/regex/rege_dfa.c		patch \| blob \| blame \| history
src/backend/regex/regexec.c		patch \| blob \| blame \| history
src/backend/regex/regexport.c		patch \| blob \| blame \| history
src/backend/regex/regprefix.c		patch \| blob \| blame \| history
src/include/regex/regex.h		patch \| blob \| blame \| history
src/include/regex/regguts.h		patch \| blob \| blame \| history
src/test/regress/expected/regex.out		patch \| blob \| blame \| history
src/test/regress/sql/regex.sql		patch \| blob \| blame \| history