Change regexp engine's ccondissect/crevdissect routines to perform DFA
authorTom Lane <[email protected]>
Mon, 1 Feb 2010 02:46:08 +0000 (02:46 +0000)
committerTom Lane <[email protected]>
Mon, 1 Feb 2010 02:46:08 +0000 (02:46 +0000)
matching before recursing instead of after.  The DFA match eliminates
unworkable midpoint choices a lot faster than the recursive check, in most
cases, so doing it first can speed things up; particularly in pathological
cases such as recently exhibited by Michael Glaesemann.

In addition, apply some cosmetic changes that were applied upstream (in the
Tcl project) at the same time, in order to sync with upstream version 1.15
of regexec.c.

Upstream apparently intends to backpatch this, so I will too.  The
pathological behavior could be unpleasant if encountered in the field,
which seems to justify any risk of introducing new bugs.

Tom Lane, reviewed by Donal K. Fellows of Tcl project

src/backend/regex/regexec.c

index 11032b431a56bc77a5f51c82ddb331f96bd95440..9704169e73c3e5b6d5a23860adb19c5e8ad5b753 100644 (file)
@@ -27,7 +27,7 @@
  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- * $Header: /cvsroot/pgsql/src/backend/regex/regexec.c,v 1.23 2003/08/08 21:41:56 momjian Exp $
+ * $Header: /cvsroot/pgsql/src/backend/regex/regexec.c,v 1.23.4.1 2010/02/01 02:46:08 tgl Exp $
  *
  */
 
@@ -142,6 +142,7 @@ static int  dissect(struct vars *, struct subre *, chr *, chr *);
 static int     condissect(struct vars *, struct subre *, chr *, chr *);
 static int     altdissect(struct vars *, struct subre *, chr *, chr *);
 static int     cdissect(struct vars *, struct subre *, chr *, chr *);
+static int     ccaptdissect(struct vars *, struct subre *, chr *, chr *);
 static int     ccondissect(struct vars *, struct subre *, chr *, chr *);
 static int     crevdissect(struct vars *, struct subre *, chr *, chr *);
 static int     cbrdissect(struct vars *, struct subre *, chr *, chr *);
@@ -559,27 +560,21 @@ dissect(struct vars * v,
                case '=':                               /* terminal node */
                        assert(t->left == NULL && t->right == NULL);
                        return REG_OKAY;        /* no action, parent did the work */
-                       break;
                case '|':                               /* alternation */
                        assert(t->left != NULL);
                        return altdissect(v, t, begin, end);
-                       break;
                case 'b':                               /* back ref -- shouldn't be calling us! */
                        return REG_ASSERT;
-                       break;
                case '.':                               /* concatenation */
                        assert(t->left != NULL && t->right != NULL);
                        return condissect(v, t, begin, end);
-                       break;
                case '(':                               /* capturing */
                        assert(t->left != NULL && t->right == NULL);
                        assert(t->subno > 0);
                        subset(v, t, begin, end);
                        return dissect(v, t->left, begin, end);
-                       break;
                default:
                        return REG_ASSERT;
-                       break;
        }
 }
 
@@ -709,8 +704,6 @@ cdissect(struct vars * v,
                 chr *begin,                    /* beginning of relevant substring */
                 chr *end)                              /* end of same */
 {
-       int                     er;
-
        assert(t != NULL);
        MDEBUG(("cdissect %ld-%ld %c\n", LOFF(begin), LOFF(end), t->op));
 
@@ -719,33 +712,42 @@ cdissect(struct vars * v,
                case '=':                               /* terminal node */
                        assert(t->left == NULL && t->right == NULL);
                        return REG_OKAY;        /* no action, parent did the work */
-                       break;
                case '|':                               /* alternation */
                        assert(t->left != NULL);
                        return caltdissect(v, t, begin, end);
-                       break;
                case 'b':                               /* back ref -- shouldn't be calling us! */
                        assert(t->left == NULL && t->right == NULL);
                        return cbrdissect(v, t, begin, end);
-                       break;
                case '.':                               /* concatenation */
                        assert(t->left != NULL && t->right != NULL);
                        return ccondissect(v, t, begin, end);
-                       break;
                case '(':                               /* capturing */
                        assert(t->left != NULL && t->right == NULL);
-                       assert(t->subno > 0);
-                       er = cdissect(v, t->left, begin, end);
-                       if (er == REG_OKAY)
-                               subset(v, t, begin, end);
-                       return er;
-                       break;
+                       return ccaptdissect(v, t, begin, end);
                default:
                        return REG_ASSERT;
-                       break;
        }
 }
 
+/*
+ * ccaptdissect - capture subexpression matches (with complications)
+ */
+static int                                             /* regexec return code */
+ccaptdissect(struct vars * v,
+                        struct subre * t,
+                        chr *begin,            /* beginning of relevant substring */
+                        chr *end)                      /* end of same */
+{
+       int                     er;
+
+       assert(t->subno > 0);
+
+       er = cdissect(v, t->left, begin, end);
+       if (er == REG_OKAY)
+               subset(v, t, begin, end);
+       return er;
+}
+
 /*
  * ccondissect - concatenation subexpression matches (with complications)
  * The retry memory stores the offset of the trial midpoint from begin,
@@ -803,17 +805,27 @@ ccondissect(struct vars * v,
        for (;;)
        {
                /* try this midpoint on for size */
-               er = cdissect(v, t->left, begin, mid);
-               if (er == REG_OKAY &&
-                       longest(v, d2, mid, end, (int *) NULL) == end &&
-                       (er = cdissect(v, t->right, mid, end)) ==
-                       REG_OKAY)
-                       break;                          /* NOTE BREAK OUT */
-               if (er != REG_OKAY && er != REG_NOMATCH)
+               if (longest(v, d2, mid, end, (int *) NULL) == end)
                {
-                       freedfa(d);
-                       freedfa(d2);
-                       return er;
+                       er = cdissect(v, t->left, begin, mid);
+                       if (er == REG_OKAY)
+                       {
+                               er = cdissect(v, t->right, mid, end);
+                               if (er == REG_OKAY)
+                               {
+                                       /* satisfaction */
+                                       MDEBUG(("successful\n"));
+                                       freedfa(d);
+                                       freedfa(d2);
+                                       return REG_OKAY;
+                               }
+                       }
+                       if (er != REG_OKAY && er != REG_NOMATCH)
+                       {
+                               freedfa(d);
+                               freedfa(d2);
+                               return er;
+                       }
                }
 
                /* that midpoint didn't work, find a new one */
@@ -840,11 +852,8 @@ ccondissect(struct vars * v,
                zapmem(v, t->right);
        }
 
-       /* satisfaction */
-       MDEBUG(("successful\n"));
-       freedfa(d);
-       freedfa(d2);
-       return REG_OKAY;
+       /* can't get here */
+       return REG_ASSERT;
 }
 
 /*
@@ -903,17 +912,27 @@ crevdissect(struct vars * v,
        for (;;)
        {
                /* try this midpoint on for size */
-               er = cdissect(v, t->left, begin, mid);
-               if (er == REG_OKAY &&
-                       longest(v, d2, mid, end, (int *) NULL) == end &&
-                       (er = cdissect(v, t->right, mid, end)) ==
-                       REG_OKAY)
-                       break;                          /* NOTE BREAK OUT */
-               if (er != REG_OKAY && er != REG_NOMATCH)
+               if (longest(v, d2, mid, end, (int *) NULL) == end)
                {
-                       freedfa(d);
-                       freedfa(d2);
-                       return er;
+                       er = cdissect(v, t->left, begin, mid);
+                       if (er == REG_OKAY)
+                       {
+                               er = cdissect(v, t->right, mid, end);
+                               if (er == REG_OKAY)
+                               {
+                                       /* satisfaction */
+                                       MDEBUG(("successful\n"));
+                                       freedfa(d);
+                                       freedfa(d2);
+                                       return REG_OKAY;
+                               }
+                       }
+                       if (er != REG_OKAY && er != REG_NOMATCH)
+                       {
+                               freedfa(d);
+                               freedfa(d2);
+                               return er;
+                       }
                }
 
                /* that midpoint didn't work, find a new one */
@@ -940,11 +959,8 @@ crevdissect(struct vars * v,
                zapmem(v, t->right);
        }
 
-       /* satisfaction */
-       MDEBUG(("successful\n"));
-       freedfa(d);
-       freedfa(d2);
-       return REG_OKAY;
+       /* can't get here */
+       return REG_ASSERT;
 }
 
 /*