Improve regex compiler's arc moving/copying logic.

author Tom Lane <[email protected]>

Tue, 17 Aug 2021 16:00:02 +0000 (12:00 -0400)

committer Tom Lane <[email protected]>

Tue, 17 Aug 2021 16:00:02 +0000 (12:00 -0400)
author Tom Lane <[email protected]>
Tue, 17 Aug 2021 16:00:02 +0000 (12:00 -0400)
committer Tom Lane <[email protected]>
Tue, 17 Aug 2021 16:00:02 +0000 (12:00 -0400)
diff --git a/src/backend/regex/regc_nfa.c b/src/backend/regex/regc_nfa.c

index 6d77c59e121396bbe5198c13a79e2ff83ab90ec9..059cf64df07a1c03fa4997779f6f609485d47485 100644 (file)
--- a/src/backend/regex/regc_nfa.c
+++ b/src/backend/regex/regc_nfa.c
@@ -777,6 +777,10 @@ sortouts_cmp(const void *a, const void *b)
   * However, if we have a whole lot of arcs to deal with, retail duplicate
   * checks become too slow.  In that case we proceed by sorting and merging
   * the arc lists, and then we can indeed just update the arcs in-place.
+ *
+ * On the other hand, it's also true that this is frequently called with
+ * a brand-new newState that has no existing in-arcs.  In that case,
+ * de-duplication is unnecessary, so we can just blindly move all the arcs.
   */
  static void
  moveins(struct nfa *nfa,
@@ -785,7 +789,18 @@ moveins(struct nfa *nfa,
  {
     assert(oldState != newState);
  
-   if (!BULK_ARC_OP_USE_SORT(oldState->nins, newState->nins))
+   if (newState->nins == 0)
+   {
+       /* No need for de-duplication */
+       struct arc *a;
+
+       while ((a = oldState->ins) != NULL)
+       {
+           createarc(nfa, a->type, a->co, a->from, newState);
+           freearc(nfa, a);
+       }
+   }
+   else if (!BULK_ARC_OP_USE_SORT(oldState->nins, newState->nins))
     {
         /* With not too many arcs, just do them one at a time */
         struct arc *a;
@@ -869,6 +884,11 @@ moveins(struct nfa *nfa,
  
  /*
   * copyins - copy in arcs of a state to another state
+ *
+ * The comments for moveins() apply here as well.  However, in current
+ * usage, this is *only* called with brand-new target states, so that
+ * only the "no need for de-duplication" code path is ever reached.
+ * We keep the rest #ifdef'd out in case it's needed in the future.
   */
  static void
  copyins(struct nfa *nfa,
@@ -876,8 +896,18 @@ copyins(struct nfa *nfa,
         struct state *newState)
  {
     assert(oldState != newState);
+   assert(newState->nins == 0);    /* see comment above */
+
+   if (newState->nins == 0)
+   {
+       /* No need for de-duplication */
+       struct arc *a;
  
-   if (!BULK_ARC_OP_USE_SORT(oldState->nins, newState->nins))
+       for (a = oldState->ins; a != NULL; a = a->inchain)
+           createarc(nfa, a->type, a->co, a->from, newState);
+   }
+#ifdef NOT_USED                    /* see comment above */
+   else if (!BULK_ARC_OP_USE_SORT(oldState->nins, newState->nins))
     {
         /* With not too many arcs, just do them one at a time */
         struct arc *a;
@@ -944,6 +974,7 @@ copyins(struct nfa *nfa,
             createarc(nfa, a->type, a->co, a->from, newState);
         }
     }
+#endif                         /* NOT_USED */
  }
  
  /*
@@ -1058,7 +1089,18 @@ moveouts(struct nfa *nfa,
  {
     assert(oldState != newState);
  
-   if (!BULK_ARC_OP_USE_SORT(oldState->nouts, newState->nouts))
+   if (newState->nouts == 0)
+   {
+       /* No need for de-duplication */
+       struct arc *a;
+
+       while ((a = oldState->outs) != NULL)
+       {
+           createarc(nfa, a->type, a->co, newState, a->to);
+           freearc(nfa, a);
+       }
+   }
+   else if (!BULK_ARC_OP_USE_SORT(oldState->nouts, newState->nouts))
     {
         /* With not too many arcs, just do them one at a time */
         struct arc *a;
@@ -1142,6 +1184,8 @@ moveouts(struct nfa *nfa,
  
  /*
   * copyouts - copy out arcs of a state to another state
+ *
+ * See comments for copyins()
   */
  static void
  copyouts(struct nfa *nfa,
@@ -1149,8 +1193,18 @@ copyouts(struct nfa *nfa,
          struct state *newState)
  {
     assert(oldState != newState);
+   assert(newState->nouts == 0);   /* see comment above */
+
+   if (newState->nouts == 0)
+   {
+       /* No need for de-duplication */
+       struct arc *a;
  
-   if (!BULK_ARC_OP_USE_SORT(oldState->nouts, newState->nouts))
+       for (a = oldState->outs; a != NULL; a = a->outchain)
+           createarc(nfa, a->type, a->co, newState, a->to);
+   }
+#ifdef NOT_USED                    /* see comment above */
+   else if (!BULK_ARC_OP_USE_SORT(oldState->nouts, newState->nouts))
     {
         /* With not too many arcs, just do them one at a time */
         struct arc *a;
@@ -1217,6 +1271,7 @@ copyouts(struct nfa *nfa,
             createarc(nfa, a->type, a->co, newState, a->to);
         }
     }
+#endif                         /* NOT_USED */
  }
  
  /*
@@ -1975,6 +2030,7 @@ combine(struct nfa *nfa,
             else if (a->co == RAINBOW)
             {
                 /* con is incompatible if it's for a pseudocolor */
+               /* (this is hypothetical; we make no such constraints today) */
                 if (nfa->cm->cd[con->co].flags & PSEUDO)
                     return INCOMPATIBLE;
                 /* otherwise, constraint constrains arc to be only its color */
@@ -2001,6 +2057,7 @@ combine(struct nfa *nfa,
             else if (a->co == RAINBOW)
             {
                 /* con is incompatible if it's for a pseudocolor */
+               /* (this is hypothetical; we make no such constraints today) */
                 if (nfa->cm->cd[con->co].flags & PSEUDO)
                     return INCOMPATIBLE;
                 /* otherwise, constraint constrains arc to be only its color */
@@ -3562,6 +3619,7 @@ carc_cmp(const void *a, const void *b)
         return -1;
     if (aa->to > bb->to)
         return +1;
+   /* This is unreached, since there should be no duplicate arcs now: */
     return 0;
  }
  
diff --git a/src/test/modules/test_regex/expected/test_regex.out b/src/test/modules/test_regex/expected/test_regex.out

index 5a6cdf47c2fc443d1b516824687bb3e06bf2c6fa..6242d0baa9a0ecca5297a584217e5ed9cf8dd944 100644 (file)
--- a/src/test/modules/test_regex/expected/test_regex.out
+++ b/src/test/modules/test_regex/expected/test_regex.out
@@ -937,6 +937,34 @@ select * from test_regex('a[[=x=]]', 'az', '+Lb');
   {0,REG_ULOCALE}
  (1 row)
  
+-- expectMatch 9.9b  &iL   {a[[=Y=]]}  ay  ay
+select * from test_regex('a[[=Y=]]', 'ay', 'iL');
+   test_regex    
+-----------------
+ {0,REG_ULOCALE}
+ {ay}
+(2 rows)
+
+select * from test_regex('a[[=Y=]]', 'ay', 'iLb');
+   test_regex    
+-----------------
+ {0,REG_ULOCALE}
+ {ay}
+(2 rows)
+
+-- expectNomatch   9.9c  &L    {a[[=Y=]]}  ay
+select * from test_regex('a[[=Y=]]', 'ay', 'L');
+   test_regex    
+-----------------
+ {0,REG_ULOCALE}
+(1 row)
+
+select * from test_regex('a[[=Y=]]', 'ay', 'Lb');
+   test_regex    
+-----------------
+ {0,REG_ULOCALE}
+(1 row)
+
  -- expectError 9.10 &      {a[0-[=x=]]}    ERANGE
  select * from test_regex('a[0-[=x=]]', '', '');
  ERROR:  invalid regular expression: invalid character range
@@ -2932,6 +2960,34 @@ select * from test_regex('a[^b-d]', 'aC', 'iMb');
   {0,REG_UUNPORT}
  (1 row)
  
+-- expectMatch 19.6 &iM    {a[B-Z]}    aC  aC
+select * from test_regex('a[B-Z]', 'aC', 'iM');
+   test_regex    
+-----------------
+ {0,REG_UUNPORT}
+ {aC}
+(2 rows)
+
+select * from test_regex('a[B-Z]', 'aC', 'iMb');
+   test_regex    
+-----------------
+ {0,REG_UUNPORT}
+ {aC}
+(2 rows)
+
+-- expectNomatch   19.7 &iM    {a[^B-Z]}   aC
+select * from test_regex('a[^B-Z]', 'aC', 'iM');
+   test_regex    
+-----------------
+ {0,REG_UUNPORT}
+(1 row)
+
+select * from test_regex('a[^B-Z]', 'aC', 'iMb');
+   test_regex    
+-----------------
+ {0,REG_UUNPORT}
+(1 row)
+
  -- doing 20 "directors and embedded options"
  -- expectError 20.1  &     ***?        BADPAT
  select * from test_regex('***?', '', '');
@@ -3850,6 +3906,14 @@ select * from test_regex('^([^/]+?)(?:/([^/]+?))(?:/([^/]+?))?$', 'foo/bar/baz',
   {foo/bar/baz,foo,bar,baz}
  (2 rows)
  
+-- expectMatch 24.14 PRT   {^(.+?)(?:/(.+?))(?:/(.+?)\3)?$}    {foo/bar/baz/quux}  {foo/bar/baz/quux}  {foo}   {bar/baz/quux}  {}
+select * from test_regex('^(.+?)(?:/(.+?))(?:/(.+?)\3)?$', 'foo/bar/baz/quux', 'PRT');
+                  test_regex                  
+----------------------------------------------
+ {3,REG_UBACKREF,REG_UNONPOSIX,REG_USHORTEST}
+ {foo/bar/baz/quux,foo,bar/baz/quux,NULL}
+(2 rows)
+
  -- doing 25 "mixed quantifiers"
  -- # this is very incomplete as yet
  -- # should include |
@@ -4926,3 +4990,59 @@ select * from test_regex('(\Y)+', 'foo', 'LNP');
   {"",""}
  (2 rows)
  
+-- and now, tests not from either Spencer or the Tcl project
+-- These cases exercise additional code paths in pushfwd()/push()/combine()
+select * from test_regex('a\Y(?=45)', 'a45', 'HLP');
+                  test_regex                   
+-----------------------------------------------
+ {0,REG_ULOOKAROUND,REG_UNONPOSIX,REG_ULOCALE}
+ {a}
+(2 rows)
+
+select * from test_regex('a(?=.)c', 'ac', 'HP');
+            test_regex             
+-----------------------------------
+ {0,REG_ULOOKAROUND,REG_UNONPOSIX}
+ {ac}
+(2 rows)
+
+select * from test_regex('a(?=.).*(?=3)3*', 'azz33', 'HP');
+            test_regex             
+-----------------------------------
+ {0,REG_ULOOKAROUND,REG_UNONPOSIX}
+ {azz33}
+(2 rows)
+
+select * from test_regex('a(?=\w)\w*(?=.).*', 'az3%', 'HLP');
+                  test_regex                   
+-----------------------------------------------
+ {0,REG_ULOOKAROUND,REG_UNONPOSIX,REG_ULOCALE}
+ {az3%}
+(2 rows)
+
+-- These exercise the bulk-arc-movement paths in moveins() and moveouts();
+-- you may need to make them longer if you change BULK_ARC_OP_USE_SORT()
+select * from test_regex('ABCDEFGHIJKLMNOPQRSTUVWXYZ(?:\w|a|b|c|d|e|f|0|1|2|3|4|5|6|Q)',
+                         'ABCDEFGHIJKLMNOPQRSTUVWXYZ3', 'LP');
+          test_regex           
+-------------------------------
+ {0,REG_UNONPOSIX,REG_ULOCALE}
+ {ABCDEFGHIJKLMNOPQRSTUVWXYZ3}
+(2 rows)
+
+select * from test_regex('ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789(\Y\Y)+',
+                         'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789Z', 'LP');
+                test_regex                 
+-------------------------------------------
+ {1,REG_UNONPOSIX,REG_ULOCALE}
+ {ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789,""}
+(2 rows)
+
+select * from test_regex('((x|xabcdefghijklmnopqrstuvwxyz0123456789)x*|[^y]z)$',
+                         'az', '');
+  test_regex  
+--------------
+ {2}
+ {az,az,NULL}
+(2 rows)
+
diff --git a/src/test/modules/test_regex/expected/test_regex_utf8.out b/src/test/modules/test_regex/expected/test_regex_utf8.out

index 112698ac618bc9f51bd3774858808bc956f843dc..3b56f36c0711e2a44f70531c375e11d17f18a45d 100644 (file)
--- a/src/test/modules/test_regex/expected/test_regex_utf8.out
+++ b/src/test/modules/test_regex/expected/test_regex_utf8.out
@@ -98,3 +98,109 @@ select * from test_regex('[[:alnum:]]*[[:upper:]]*[\u1000-\u2000]*\u1237',
   {0,REG_UBBS,REG_UNONPOSIX,REG_UUNPORT,REG_ULOCALE}
  (1 row)
  
+select * from test_regex('[[:alnum:]]*[[:upper:]]*[\u1000-\u2000]*\u1237',
+  E'\u1500\u1237', 'iELMP');
+                     test_regex                     
+----------------------------------------------------
+ {0,REG_UBBS,REG_UNONPOSIX,REG_UUNPORT,REG_ULOCALE}
+ {ᔀሷ}
+(2 rows)
+
+-- systematically test char classes
+select * from test_regex('[[:alnum:]]+',  E'x\u1500\u1237', 'L');
+   test_regex    
+-----------------
+ {0,REG_ULOCALE}
+ {xᔀሷ}
+(2 rows)
+
+select * from test_regex('[[:alpha:]]+',  E'x\u1500\u1237', 'L');
+   test_regex    
+-----------------
+ {0,REG_ULOCALE}
+ {xᔀሷ}
+(2 rows)
+
+select * from test_regex('[[:ascii:]]+',  E'x\u1500\u1237', 'L');
+   test_regex    
+-----------------
+ {0,REG_ULOCALE}
+ {x}
+(2 rows)
+
+select * from test_regex('[[:blank:]]+',  E'x \t\u1500\u1237', 'L');
+   test_regex    
+-----------------
+ {0,REG_ULOCALE}
+ {"      "}
+(2 rows)
+
+select * from test_regex('[[:cntrl:]]+',  E'x\u1500\u1237', 'L');
+   test_regex    
+-----------------
+ {0,REG_ULOCALE}
+(1 row)
+
+select * from test_regex('[[:digit:]]+',  E'x9\u1500\u1237', 'L');
+   test_regex    
+-----------------
+ {0,REG_ULOCALE}
+ {9}
+(2 rows)
+
+select * from test_regex('[[:graph:]]+',  E'x\u1500\u1237', 'L');
+   test_regex    
+-----------------
+ {0,REG_ULOCALE}
+ {xᔀሷ}
+(2 rows)
+
+select * from test_regex('[[:lower:]]+',  E'x\u1500\u1237', 'L');
+   test_regex    
+-----------------
+ {0,REG_ULOCALE}
+ {x}
+(2 rows)
+
+select * from test_regex('[[:print:]]+',  E'x\u1500\u1237', 'L');
+   test_regex    
+-----------------
+ {0,REG_ULOCALE}
+ {xᔀሷ}
+(2 rows)
+
+select * from test_regex('[[:punct:]]+',  E'x.\u1500\u1237', 'L');
+   test_regex    
+-----------------
+ {0,REG_ULOCALE}
+ {.}
+(2 rows)
+
+select * from test_regex('[[:space:]]+',  E'x \t\u1500\u1237', 'L');
+   test_regex    
+-----------------
+ {0,REG_ULOCALE}
+ {"      "}
+(2 rows)
+
+select * from test_regex('[[:upper:]]+',  E'xX\u1500\u1237', 'L');
+   test_regex    
+-----------------
+ {0,REG_ULOCALE}
+ {X}
+(2 rows)
+
+select * from test_regex('[[:xdigit:]]+',  E'xa9\u1500\u1237', 'L');
+   test_regex    
+-----------------
+ {0,REG_ULOCALE}
+ {a9}
+(2 rows)
+
+select * from test_regex('[[:word:]]+',  E'x_\u1500\u1237', 'L');
+   test_regex    
+-----------------
+ {0,REG_ULOCALE}
+ {x_ᔀሷ}
+(2 rows)
+
diff --git a/src/test/modules/test_regex/sql/test_regex.sql b/src/test/modules/test_regex/sql/test_regex.sql

index 3419564203af4844771b567c72af78da27e56544..389b8b61b3bff87f7849bb8dcc9effe5744873b0 100644 (file)
--- a/src/test/modules/test_regex/sql/test_regex.sql
+++ b/src/test/modules/test_regex/sql/test_regex.sql
@@ -304,6 +304,12 @@ select * from test_regex('a[[=x=]]', 'ay', '+Lb');
  -- expectNomatch   9.9  &+L    {a[[=x=]]}  az
  select * from test_regex('a[[=x=]]', 'az', '+L');
  select * from test_regex('a[[=x=]]', 'az', '+Lb');
+-- expectMatch 9.9b  &iL   {a[[=Y=]]}  ay  ay
+select * from test_regex('a[[=Y=]]', 'ay', 'iL');
+select * from test_regex('a[[=Y=]]', 'ay', 'iLb');
+-- expectNomatch   9.9c  &L    {a[[=Y=]]}  ay
+select * from test_regex('a[[=Y=]]', 'ay', 'L');
+select * from test_regex('a[[=Y=]]', 'ay', 'Lb');
  -- expectError 9.10 &      {a[0-[=x=]]}    ERANGE
  select * from test_regex('a[0-[=x=]]', '', '');
  select * from test_regex('a[0-[=x=]]', '', 'b');
@@ -864,6 +870,12 @@ select * from test_regex('a[b-d]', 'aC', 'iMb');
  -- expectNomatch   19.5 &iM    {a[^b-d]}   aC
  select * from test_regex('a[^b-d]', 'aC', 'iM');
  select * from test_regex('a[^b-d]', 'aC', 'iMb');
+-- expectMatch 19.6 &iM    {a[B-Z]}    aC  aC
+select * from test_regex('a[B-Z]', 'aC', 'iM');
+select * from test_regex('a[B-Z]', 'aC', 'iMb');
+-- expectNomatch   19.7 &iM    {a[^B-Z]}   aC
+select * from test_regex('a[^B-Z]', 'aC', 'iM');
+select * from test_regex('a[^B-Z]', 'aC', 'iMb');
  
  -- doing 20 "directors and embedded options"
  
@@ -1171,6 +1183,8 @@ select * from test_regex('z*4', '123zzzz456', '-');
  select * from test_regex('z*?4', '123zzzz456', 'PT');
  -- expectMatch 24.13 PT    {^([^/]+?)(?:/([^/]+?))(?:/([^/]+?))?$} {foo/bar/baz}   {foo/bar/baz} {foo} {bar} {baz}
  select * from test_regex('^([^/]+?)(?:/([^/]+?))(?:/([^/]+?))?$', 'foo/bar/baz', 'PT');
+-- expectMatch 24.14 PRT   {^(.+?)(?:/(.+?))(?:/(.+?)\3)?$}    {foo/bar/baz/quux}  {foo/bar/baz/quux}  {foo}   {bar/baz/quux}  {}
+select * from test_regex('^(.+?)(?:/(.+?))(?:/(.+?)\3)?$', 'foo/bar/baz/quux', 'PRT');
  
  -- doing 25 "mixed quantifiers"
  -- # this is very incomplete as yet
@@ -1741,3 +1755,21 @@ select * from test_regex(repeat('x*y*z*', 200), 'x', 'N');
  --     regexp {(\Y)+} foo
  -- } 1
  select * from test_regex('(\Y)+', 'foo', 'LNP');
+
+
+-- and now, tests not from either Spencer or the Tcl project
+
+-- These cases exercise additional code paths in pushfwd()/push()/combine()
+select * from test_regex('a\Y(?=45)', 'a45', 'HLP');
+select * from test_regex('a(?=.)c', 'ac', 'HP');
+select * from test_regex('a(?=.).*(?=3)3*', 'azz33', 'HP');
+select * from test_regex('a(?=\w)\w*(?=.).*', 'az3%', 'HLP');
+
+-- These exercise the bulk-arc-movement paths in moveins() and moveouts();
+-- you may need to make them longer if you change BULK_ARC_OP_USE_SORT()
+select * from test_regex('ABCDEFGHIJKLMNOPQRSTUVWXYZ(?:\w|a|b|c|d|e|f|0|1|2|3|4|5|6|Q)',
+                         'ABCDEFGHIJKLMNOPQRSTUVWXYZ3', 'LP');
+select * from test_regex('ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789(\Y\Y)+',
+                         'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789Z', 'LP');
+select * from test_regex('((x|xabcdefghijklmnopqrstuvwxyz0123456789)x*|[^y]z)$',
+                         'az', '');
diff --git a/src/test/modules/test_regex/sql/test_regex_utf8.sql b/src/test/modules/test_regex/sql/test_regex_utf8.sql

index cfd9396194feef2b97805fc18cf431d09f297244..f23907162e4fdaad422c148b4928fd335842796d 100644 (file)
--- a/src/test/modules/test_regex/sql/test_regex_utf8.sql
+++ b/src/test/modules/test_regex/sql/test_regex_utf8.sql
@@ -58,3 +58,21 @@ select * from test_regex('[[:alnum:]]*[[:upper:]]*[\u1000-\u2000]*\u1237',
    E'\u1500\u1237', 'ELMP');
  select * from test_regex('[[:alnum:]]*[[:upper:]]*[\u1000-\u2000]*\u1237',
    E'A\u1239', 'ELMP');
+select * from test_regex('[[:alnum:]]*[[:upper:]]*[\u1000-\u2000]*\u1237',
+  E'\u1500\u1237', 'iELMP');
+
+-- systematically test char classes
+select * from test_regex('[[:alnum:]]+',  E'x\u1500\u1237', 'L');
+select * from test_regex('[[:alpha:]]+',  E'x\u1500\u1237', 'L');
+select * from test_regex('[[:ascii:]]+',  E'x\u1500\u1237', 'L');
+select * from test_regex('[[:blank:]]+',  E'x \t\u1500\u1237', 'L');
+select * from test_regex('[[:cntrl:]]+',  E'x\u1500\u1237', 'L');
+select * from test_regex('[[:digit:]]+',  E'x9\u1500\u1237', 'L');
+select * from test_regex('[[:graph:]]+',  E'x\u1500\u1237', 'L');
+select * from test_regex('[[:lower:]]+',  E'x\u1500\u1237', 'L');
+select * from test_regex('[[:print:]]+',  E'x\u1500\u1237', 'L');
+select * from test_regex('[[:punct:]]+',  E'x.\u1500\u1237', 'L');
+select * from test_regex('[[:space:]]+',  E'x \t\u1500\u1237', 'L');
+select * from test_regex('[[:upper:]]+',  E'xX\u1500\u1237', 'L');
+select * from test_regex('[[:xdigit:]]+',  E'xa9\u1500\u1237', 'L');
+select * from test_regex('[[:word:]]+',  E'x_\u1500\u1237', 'L');
author	Tom Lane <[email protected]>
	Tue, 17 Aug 2021 16:00:02 +0000 (12:00 -0400)
committer	Tom Lane <[email protected]>
	Tue, 17 Aug 2021 16:00:02 +0000 (12:00 -0400)
src/backend/regex/regc_nfa.c		patch \| blob \| blame \| history
src/test/modules/test_regex/expected/test_regex.out		patch \| blob \| blame \| history
src/test/modules/test_regex/expected/test_regex_utf8.out		patch \| blob \| blame \| history
src/test/modules/test_regex/sql/test_regex.sql		patch \| blob \| blame \| history
src/test/modules/test_regex/sql/test_regex_utf8.sql		patch \| blob \| blame \| history