Avoid determining regexp subexpression matches, when possible.

author Tom Lane <[email protected]>

Mon, 9 Aug 2021 15:26:34 +0000 (11:26 -0400)

committer Tom Lane <[email protected]>

Mon, 9 Aug 2021 15:26:34 +0000 (11:26 -0400)
author Tom Lane <[email protected]>
Mon, 9 Aug 2021 15:26:34 +0000 (11:26 -0400)
committer Tom Lane <[email protected]>
Mon, 9 Aug 2021 15:26:34 +0000 (11:26 -0400)
diff --git a/contrib/pg_trgm/trgm_regexp.c b/contrib/pg_trgm/trgm_regexp.c

index bf1dea6352e09e3926bbc7b44f144e7988f92a5f..64816dd3703969186fd1972c0560c7c5eb0077f2 100644 (file)
--- a/contrib/pg_trgm/trgm_regexp.c
+++ b/contrib/pg_trgm/trgm_regexp.c
@@ -541,9 +541,11 @@ createTrgmNFA(text *text_re, Oid collation,
      * Stage 1: Compile the regexp into a NFA, using the regexp library.
      */
  #ifdef IGNORECASE
-   RE_compile(&regex, text_re, REG_ADVANCED | REG_ICASE, collation);
+   RE_compile(&regex, text_re,
+              REG_ADVANCED | REG_NOSUB | REG_ICASE, collation);
  #else
-   RE_compile(&regex, text_re, REG_ADVANCED, collation);
+   RE_compile(&regex, text_re,
+              REG_ADVANCED | REG_NOSUB, collation);
  #endif
  
     /*
diff --git a/src/backend/regex/regc_lex.c b/src/backend/regex/regc_lex.c

index 7673dab76f486791643c1a31de7a2cc6773d8f7f..45727ffa01f719d64030a6aec2e1922c9e3a83e0 100644 (file)
--- a/src/backend/regex/regc_lex.c
+++ b/src/backend/regex/regc_lex.c
@@ -528,10 +528,7 @@ next(struct vars *v)
                 }
                 assert(NOTREACHED);
             }
-           if (v->cflags & REG_NOSUB)
-               RETV('(', 0);   /* all parens non-capturing */
-           else
-               RETV('(', 1);
+           RETV('(', 1);
             break;
         case CHR(')'):
             if (LASTTYPE('('))
diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c

index 60a220c57ab69df02decdc22a2e01b11f06fb499..ae3a7b6a38c08faf7bf763f6db7f01ab32b0df2a 100644 (file)
--- a/src/backend/regex/regcomp.c
+++ b/src/backend/regex/regcomp.c
@@ -65,7 +65,7 @@ static struct subre *subre(struct vars *, int, int, struct state *, struct state
  static void freesubre(struct vars *, struct subre *);
  static void freesubreandsiblings(struct vars *, struct subre *);
  static void freesrnode(struct vars *, struct subre *);
-static void optst(struct vars *, struct subre *);
+static void removecaptures(struct vars *, struct subre *);
  static int numst(struct subre *, int);
  static void markst(struct subre *);
  static void cleanst(struct vars *);
@@ -431,7 +431,8 @@ pg_regcomp(regex_t *re,
         dumpst(v->tree, debug, 1);
     }
  #endif
-   optst(v, v->tree);
+   if (v->cflags & REG_NOSUB)
+       removecaptures(v, v->tree);
     v->ntree = numst(v->tree, 1);
     markst(v->tree);
     cleanst(v);
@@ -1013,14 +1014,16 @@ parseqatom(struct vars *v,
             break;
         case BACKREF:           /* the Feature From The Black Lagoon */
             INSIST(type != LACON, REG_ESUBREG);
-           INSIST(v->nextvalue < v->nsubs, REG_ESUBREG);
-           INSIST(v->subs[v->nextvalue] != NULL, REG_ESUBREG);
+           subno = v->nextvalue;
+           assert(subno > 0);
+           INSIST(subno < v->nsubs, REG_ESUBREG);
+           NOERRN();
+           INSIST(v->subs[subno] != NULL, REG_ESUBREG);
             NOERRN();
-           assert(v->nextvalue > 0);
             atom = subre(v, 'b', BACKR, lp, rp);
             NOERRN();
-           subno = v->nextvalue;
             atom->backno = subno;
+           v->subs[subno]->flags |= BRUSE;
             EMPTYARC(lp, rp);   /* temporarily, so there's something */
             NEXT();
             break;
@@ -2141,19 +2144,54 @@ freesrnode(struct vars *v,      /* might be NULL */
  }
  
  /*
- * optst - optimize a subRE subtree
+ * removecaptures - remove unnecessary capture subREs
+ *
+ * If the caller said that it doesn't care about subexpression match data,
+ * we may delete the "capture" markers on subREs that are not referenced
+ * by any backrefs, and then simplify anything that's become non-messy.
+ * Call this only if REG_NOSUB flag is set.
   */
  static void
-optst(struct vars *v,
-     struct subre *t)
+removecaptures(struct vars *v,
+              struct subre *t)
  {
+   struct subre *t2;
+
+   assert(t != NULL);
+
+   /*
+    * If this isn't itself a backref target, clear capno and tentatively
+    * clear CAP flag.
+    */
+   if (!(t->flags & BRUSE))
+   {
+       t->capno = 0;
+       t->flags &= ~CAP;
+   }
+
+   /* Now recurse to children */
+   for (t2 = t->child; t2 != NULL; t2 = t2->sibling)
+   {
+       removecaptures(v, t2);
+       /* Propagate child CAP flag back up, if it's still set */
+       if (t2->flags & CAP)
+           t->flags |= CAP;
+   }
+
     /*
-    * DGP (2007-11-13): I assume it was the programmer's intent to eventually
-    * come back and add code to optimize subRE trees, but the routine coded
-    * just spends effort traversing the tree and doing nothing. We can do
-    * nothing with less effort.
+    * If t now contains neither captures nor backrefs, there's no longer any
+    * need to care where its sub-match boundaries are, so we can reduce it to
+    * a simple DFA node.  (Note in particular that MIXED child greediness is
+    * not a hindrance here, so we don't use the MESSY() macro.)
      */
-   return;
+   if ((t->flags & (CAP | BACKR)) == 0)
+   {
+       if (t->child)
+           freesubreandsiblings(v, t->child);
+       t->child = NULL;
+       t->op = '=';
+       t->flags &= ~MIXED;
+   }
  }
  
  /*
@@ -2501,6 +2539,8 @@ stdump(struct subre *t,
         fprintf(f, " hascapture");
     if (t->flags & BACKR)
         fprintf(f, " hasbackref");
+   if (t->flags & BRUSE)
+       fprintf(f, " isreferenced");
     if (!(t->flags & INUSE))
         fprintf(f, " UNUSED");
     if (t->latype != (char) -1)
diff --git a/src/backend/regex/regexec.c b/src/backend/regex/regexec.c

index 5b9a0878203224effce404e8e798cfaa37348cc4..db54ebfba40ad455e6afc1b4e1b7cb9b25e4be02 100644 (file)
--- a/src/backend/regex/regexec.c
+++ b/src/backend/regex/regexec.c
@@ -213,12 +213,9 @@ pg_regexec(regex_t *re,
         return REG_NOMATCH;
     backref = (v->g->info & REG_UBACKREF) ? 1 : 0;
     v->eflags = flags;
-   if (v->g->cflags & REG_NOSUB)
-       nmatch = 0;             /* override client */
-   v->nmatch = nmatch;
-   if (backref)
+   if (backref && nmatch <= v->g->nsub)
     {
-       /* need work area */
+       /* need larger work area */
         if (v->g->nsub + 1 <= LOCALMAT)
             v->pmatch = mat;
         else
@@ -229,7 +226,17 @@ pg_regexec(regex_t *re,
         v->nmatch = v->g->nsub + 1;
     }
     else
+   {
+       /* we can store results directly in caller's array */
         v->pmatch = pmatch;
+       /* ensure any extra entries in caller's array are filled with -1 */
+       if (nmatch > 0)
+           zapallsubs(pmatch, nmatch);
+       /* then forget about extra entries, to avoid useless work in find() */
+       if (nmatch > v->g->nsub + 1)
+           nmatch = v->g->nsub + 1;
+       v->nmatch = nmatch;
+   }
     v->details = details;
     v->start = (chr *) string;
     v->search_start = (chr *) string + search_start;
@@ -290,12 +297,20 @@ pg_regexec(regex_t *re,
     else
         st = find(v, &v->g->tree->cnfa, &v->g->cmap);
  
-   /* copy (portion of) match vector over if necessary */
-   if (st == REG_OKAY && v->pmatch != pmatch && nmatch > 0)
+   /* on success, ensure caller's match vector is filled correctly */
+   if (st == REG_OKAY && nmatch > 0)
     {
-       zapallsubs(pmatch, nmatch);
-       n = (nmatch < v->nmatch) ? nmatch : v->nmatch;
-       memcpy(VS(pmatch), VS(v->pmatch), n * sizeof(regmatch_t));
+       if (v->pmatch != pmatch)
+       {
+           /* copy portion of match vector over from (larger) work area */
+           assert(nmatch <= v->nmatch);
+           memcpy(VS(pmatch), VS(v->pmatch), nmatch * sizeof(regmatch_t));
+       }
+       if (v->g->cflags & REG_NOSUB)
+       {
+           /* don't expose possibly-partial sub-match results to caller */
+           zapallsubs(pmatch, nmatch);
+       }
     }
  
     /* clean up */
@@ -752,7 +767,6 @@ cdissect(struct vars *v,
             break;
         case '(':               /* no-op capture node */
             assert(t->child != NULL);
-           assert(t->capno > 0);
             er = cdissect(v, t->child, begin, end);
             break;
         default:
diff --git a/src/backend/utils/adt/jsonpath_gram.y b/src/backend/utils/adt/jsonpath_gram.y

index de3d97931ef4593caf8fb277343c52ab6a0fdd22..bd5d4488a06878be75480248b90bcf4189e94af4 100644 (file)
--- a/src/backend/utils/adt/jsonpath_gram.y
+++ b/src/backend/utils/adt/jsonpath_gram.y
@@ -583,6 +583,14 @@ jspConvertRegexFlags(uint32 xflags)
                      errmsg("XQuery \"x\" flag (expanded regular expressions) is not implemented")));
     }
  
+   /*
+    * We'll never need sub-match details at execution.  While
+    * RE_compile_and_execute would set this flag anyway, force it on here to
+    * ensure that the regex cache entries created by makeItemLikeRegex are
+    * useful.
+    */
+   cflags |= REG_NOSUB;
+
     return cflags;
  }
  
diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c

index 484d4265fd84e104b664d795b78e21f80bed6246..268cee1cbeddeffc74add77992015265a3077ec3 100644 (file)
--- a/src/backend/utils/adt/regexp.c
+++ b/src/backend/utils/adt/regexp.c
@@ -347,6 +347,10 @@ RE_compile_and_execute(text *text_re, char *dat, int dat_len,
  {
     regex_t    *re;
  
+   /* Use REG_NOSUB if caller does not want sub-match details */
+   if (nmatch < 2)
+       cflags |= REG_NOSUB;
+
     /* Compile RE */
     re = RE_compile_and_cache(text_re, cflags, collation);
  
@@ -1412,6 +1416,7 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
     int         orig_len;
     pg_wchar   *wide_str;
     int         wide_len;
+   int         cflags;
     regex_t    *cpattern;
     regmatch_t *pmatch;
     int         pmatch_len;
@@ -1430,7 +1435,10 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
     wide_len = pg_mb2wchar_with_len(VARDATA_ANY(orig_str), wide_str, orig_len);
  
     /* set up the compiled pattern */
-   cpattern = RE_compile_and_cache(pattern, re_flags->cflags, collation);
+   cflags = re_flags->cflags;
+   if (!use_subpatterns)
+       cflags |= REG_NOSUB;
+   cpattern = RE_compile_and_cache(pattern, cflags, collation);
  
     /* do we want to remember subpatterns? */
     if (use_subpatterns && cpattern->re_nsub > 0)
@@ -1952,7 +1960,7 @@ regexp_fixed_prefix(text *text_re, bool case_insensitive, Oid collation,
     if (case_insensitive)
         cflags |= REG_ICASE;
  
-   re = RE_compile_and_cache(text_re, cflags, collation);
+   re = RE_compile_and_cache(text_re, cflags | REG_NOSUB, collation);
  
     /* Examine it to see if there's a fixed prefix */
     re_result = pg_regprefix(re, &str, &slen);
diff --git a/src/include/regex/regex.h b/src/include/regex/regex.h

index 2b48a19fb7d05a7edeba7cd39c197ecd97dbf933..0455ae8069e6ecc9e00b37753b7e2f48bf037cdf 100644 (file)
--- a/src/include/regex/regex.h
+++ b/src/include/regex/regex.h
@@ -106,7 +106,7 @@ typedef struct
  #define REG_QUOTE  000004      /* no special characters, none */
  #define REG_NOSPEC REG_QUOTE   /* historical synonym */
  #define REG_ICASE  000010      /* ignore case */
-#define REG_NOSUB  000020      /* don't care about subexpressions */
+#define REG_NOSUB  000020      /* caller doesn't need subexpr match data */
  #define REG_EXPANDED   000040  /* expanded format, white space & comments */
  #define REG_NLSTOP 000100      /* \n doesn't match . or [^ ] */
  #define REG_NLANCH 000200      /* ^ matches after \n, $ before */
diff --git a/src/include/regex/regguts.h b/src/include/regex/regguts.h

index 8db631c83bb0f22c46f52c999bcf40fe3928dee6..91a52840c4743ec34bd41044a822f01d58af1fea 100644 (file)
--- a/src/include/regex/regguts.h
+++ b/src/include/regex/regguts.h
@@ -477,13 +477,14 @@ struct subre
  #define  MIXED  04             /* mixed preference below */
  #define  CAP    010            /* capturing parens here or below */
  #define  BACKR  020            /* back reference here or below */
+#define  BRUSE  040            /* is referenced by a back reference */
  #define  INUSE  0100           /* in use in final tree */
-#define  NOPROP  03                /* bits which may not propagate up */
+#define  UPPROP  (MIXED|CAP|BACKR) /* flags which should propagate up */
  #define  LMIX(f) ((f)<<2)      /* LONGER -> MIXED */
  #define  SMIX(f) ((f)<<1)      /* SHORTER -> MIXED */
-#define  UP(f)  (((f)&~NOPROP) | (LMIX(f) & SMIX(f) & MIXED))
+#define  UP(f)  (((f)&UPPROP) | (LMIX(f) & SMIX(f) & MIXED))
  #define  MESSY(f)   ((f)&(MIXED|CAP|BACKR))
-#define  PREF(f) ((f)&NOPROP)
+#define  PREF(f)    ((f)&(LONGER|SHORTER))
  #define  PREF2(f1, f2)  ((PREF(f1) != 0) ? PREF(f1) : PREF(f2))
  #define  COMBINE(f1, f2) (UP((f1)|(f2)) | PREF2(f1, f2))
     char        latype;         /* LATYPE code, if lookaround constraint */
diff --git a/src/test/modules/test_regex/expected/test_regex.out b/src/test/modules/test_regex/expected/test_regex.out

index 44da7d201909aaaf3bd7b75b20a5044126dde194..5a6cdf47c2fc443d1b516824687bb3e06bf2c6fa 100644 (file)
--- a/src/test/modules/test_regex/expected/test_regex.out
+++ b/src/test/modules/test_regex/expected/test_regex.out
@@ -146,12 +146,12 @@ select * from test_regex('(a)e', 'ae', '-');
   {ae,a}
  (2 rows)
  
--- expectMatch 4.2  o      (a)e        ae
-select * from test_regex('(a)e', 'ae', 'o');
- test_regex 
-------------
- {0}
- {NULL}
+-- expectMatch 4.2  oPR    (.)\1e      abeaae  aae {}
+select * from test_regex('(.)\1e', 'abeaae', 'oPR');
+           test_regex           
+--------------------------------
+ {1,REG_UBACKREF,REG_UNONPOSIX}
+ {aae,NULL}
  (2 rows)
  
  -- expectMatch 4.3  b      {\(a\)b}    ab  ab  a
@@ -2658,6 +2658,20 @@ select * from test_regex('(\w+(?= )).*\1', 'abc abcd abd', 'HLRP');
   {"abc abc",abc}
  (2 rows)
  
+-- exercise oversize-regmatch_t-array paths in regexec()
+-- (that case is not reachable via test_regex, sadly)
+select substring('fffoooooooooooooooooooooooooooooooo', '^(.)\1(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)');
+ substring 
+-----------
+ f
+(1 row)
+
+select regexp_split_to_array('abcxxxdefyyyghi', '((.))(\1\2)');
+ regexp_split_to_array 
+-----------------------
+ {abc,def,ghi}
+(1 row)
+
  -- doing 15 "octal escapes vs back references"
  -- # initial zero is always octal
  -- expectMatch 15.1  MP    "a\\010b"   "a\bb"  "a\bb"
@@ -3476,6 +3490,22 @@ select * from test_regex('((.))(\2){0}', 'xy', 'RPQ');
   {x,x,x,NULL}
  (2 rows)
  
+-- expectMatch 21.37 RP    ((.))(\2)   xyy yy  y   y   y
+select * from test_regex('((.))(\2)', 'xyy', 'RP');
+           test_regex           
+--------------------------------
+ {3,REG_UBACKREF,REG_UNONPOSIX}
+ {yy,y,y,y}
+(2 rows)
+
+-- expectMatch 21.38 oRP   ((.))(\2)   xyy yy  {}  {}  {}
+select * from test_regex('((.))(\2)', 'xyy', 'oRP');
+           test_regex           
+--------------------------------
+ {3,REG_UBACKREF,REG_UNONPOSIX}
+ {yy,NULL,NULL,NULL}
+(2 rows)
+
  -- doing 22 "multicharacter collating elements"
  -- # again ugh
  -- MCCEs are not implemented in Postgres, so we skip all these tests
diff --git a/src/test/modules/test_regex/sql/test_regex.sql b/src/test/modules/test_regex/sql/test_regex.sql

index 9224fdfdd3a76bc014261f5cd0e2bc934a42e7ff..3419564203af4844771b567c72af78da27e56544 100644 (file)
--- a/src/test/modules/test_regex/sql/test_regex.sql
+++ b/src/test/modules/test_regex/sql/test_regex.sql
@@ -63,8 +63,8 @@ select * from test_regex('ab', 'ab', 'b');
  
  -- expectMatch 4.1  -      (a)e        ae  ae  a
  select * from test_regex('(a)e', 'ae', '-');
--- expectMatch 4.2  o      (a)e        ae
-select * from test_regex('(a)e', 'ae', 'o');
+-- expectMatch 4.2  oPR    (.)\1e      abeaae  aae {}
+select * from test_regex('(.)\1e', 'abeaae', 'oPR');
  -- expectMatch 4.3  b      {\(a\)b}    ab  ab  a
  select * from test_regex('\(a\)b', 'ab', 'b');
  -- expectMatch 4.4  -      a((b)c)     abc abc bc  b
@@ -775,6 +775,11 @@ select * from test_regex('(^\w+).*\1', 'abc abc abc', 'LRP');
  select * from test_regex('(^\w+\M).*\1', 'abc abcd abd', 'LRP');
  select * from test_regex('(\w+(?= )).*\1', 'abc abcd abd', 'HLRP');
  
+-- exercise oversize-regmatch_t-array paths in regexec()
+-- (that case is not reachable via test_regex, sadly)
+select substring('fffoooooooooooooooooooooooooooooooo', '^(.)\1(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)');
+select regexp_split_to_array('abcxxxdefyyyghi', '((.))(\1\2)');
+
  -- doing 15 "octal escapes vs back references"
  
  -- # initial zero is always octal
@@ -1011,6 +1016,10 @@ select * from test_regex('(a*)*', 'bc', 'N');
  select * from test_regex(' TO (([a-z0-9._]+|"([^"]+|"")+")+)', 'asd TO foo', 'M');
  -- expectMatch 21.36 RPQ   ((.))(\2){0}    xy  x   x   x   {}
  select * from test_regex('((.))(\2){0}', 'xy', 'RPQ');
+-- expectMatch 21.37 RP    ((.))(\2)   xyy yy  y   y   y
+select * from test_regex('((.))(\2)', 'xyy', 'RP');
+-- expectMatch 21.38 oRP   ((.))(\2)   xyy yy  {}  {}  {}
+select * from test_regex('((.))(\2)', 'xyy', 'oRP');
  
  -- doing 22 "multicharacter collating elements"
  -- # again ugh
author	Tom Lane <[email protected]>
	Mon, 9 Aug 2021 15:26:34 +0000 (11:26 -0400)
committer	Tom Lane <[email protected]>
	Mon, 9 Aug 2021 15:26:34 +0000 (11:26 -0400)
contrib/pg_trgm/trgm_regexp.c		patch \| blob \| blame \| history
src/backend/regex/regc_lex.c		patch \| blob \| blame \| history
src/backend/regex/regcomp.c		patch \| blob \| blame \| history
src/backend/regex/regexec.c		patch \| blob \| blame \| history
src/backend/utils/adt/jsonpath_gram.y		patch \| blob \| blame \| history
src/backend/utils/adt/regexp.c		patch \| blob \| blame \| history
src/include/regex/regex.h		patch \| blob \| blame \| history
src/include/regex/regguts.h		patch \| blob \| blame \| history
src/test/modules/test_regex/expected/test_regex.out		patch \| blob \| blame \| history
src/test/modules/test_regex/sql/test_regex.sql		patch \| blob \| blame \| history