Fix regex_fixed_prefix() to cope reasonably well with regex patterns of the

author Tom Lane <[email protected]>

Wed, 3 Jan 2007 22:39:26 +0000 (22:39 +0000)

committer Tom Lane <[email protected]>

Wed, 3 Jan 2007 22:39:26 +0000 (22:39 +0000)
author Tom Lane <[email protected]>
Wed, 3 Jan 2007 22:39:26 +0000 (22:39 +0000)
committer Tom Lane <[email protected]>
Wed, 3 Jan 2007 22:39:26 +0000 (22:39 +0000)
diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c

index d249772064b9b2553236d883538c30393008b7c4..a477d8332050d93be331e96029033a2a0e756cc8 100644 (file)
--- a/src/backend/utils/adt/regexp.c
+++ b/src/backend/utils/adt/regexp.c
@@ -8,7 +8,7 @@
   *
   *
   * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/backend/utils/adt/regexp.c,v 1.66 2006/10/04 00:29:59 momjian Exp $
+ *   $PostgreSQL: pgsql/src/backend/utils/adt/regexp.c,v 1.67 2007/01/03 22:39:26 tgl Exp $
   *
   *     Alistair Crooks added the code for the regex caching
   *     agc - cached the regular expressions used - there's a good chance
@@ -624,3 +624,12 @@ similar_escape(PG_FUNCTION_ARGS)
  
     PG_RETURN_TEXT_P(result);
  }
+
+/*
+ * report whether regex_flavor is currently BASIC
+ */
+bool
+regex_flavor_is_basic(void)
+{
+   return (regex_flavor == REG_BASIC);
+}
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c

index 73e82cd5e0194a7a740eb7ed7c09569522a0aaf5..0f934ef9d7ceff868cc58e1ef42d72b6a3abee8f 100644 (file)
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -15,7 +15,7 @@
   *
   *
   * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.216 2006/12/23 00:43:11 tgl Exp $
+ *   $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.217 2007/01/03 22:39:26 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -3805,7 +3805,10 @@ get_variable_maximum(PlannerInfo *root, VariableStatData *vardata,
   * These routines support analysis of LIKE and regular-expression patterns
   * by the planner/optimizer.  It's important that they agree with the
   * regular-expression code in backend/regex/ and the LIKE code in
- * backend/utils/adt/like.c.
+ * backend/utils/adt/like.c.  Also, the computation of the fixed prefix
+ * must be conservative: if we report a string longer than the true fixed
+ * prefix, the query may produce actually wrong answers, rather than just
+ * getting a bad selectivity estimate!
   *
   * Note that the prefix-analysis functions are called from
   * backend/optimizer/path/indxpath.c as well as from routines in this file.
@@ -3837,6 +3840,7 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive,
     Oid         typeid = patt_const->consttype;
     int         pos,
                 match_pos;
+   bool        is_multibyte = (pg_database_encoding_max_length() > 1);
  
     /* the right-hand const is type text or bytea */
     Assert(typeid == BYTEAOID || typeid == TEXTOID);
@@ -3880,11 +3884,16 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive,
         }
  
         /*
-        * XXX I suspect isalpha() is not an adequately locale-sensitive test
-        * for characters that can vary under case folding?
+        * XXX In multibyte character sets, we can't trust isalpha, so assume
+        * any multibyte char is potentially case-varying.
          */
-       if (case_insensitive && isalpha((unsigned char) patt[pos]))
-           break;
+       if (case_insensitive)
+       {
+           if (is_multibyte && (unsigned char) patt[pos] >= 0x80)
+               break;
+           if (isalpha((unsigned char) patt[pos]))
+               break;
+       }
  
         /*
          * NOTE: this code used to think that %% meant a literal %, but
@@ -3929,11 +3938,13 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
     int         pos,
                 match_pos,
                 prev_pos,
-               prev_match_pos,
-               paren_depth;
+               prev_match_pos;
+   bool        have_leading_paren;
     char       *patt;
     char       *rest;
     Oid         typeid = patt_const->consttype;
+   bool        is_basic = regex_flavor_is_basic();
+   bool        is_multibyte = (pg_database_encoding_max_length() > 1);
  
     /*
      * Should be unnecessary, there are no bytea regex operators defined. As
@@ -3948,8 +3959,19 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
     /* the right-hand const is type text for all of these */
     patt = DatumGetCString(DirectFunctionCall1(textout, patt_const->constvalue));
  
+   /*
+    * Check for ARE director prefix.  It's worth our trouble to recognize
+    * this because similar_escape() uses it.
+    */
+   pos = 0;
+   if (strncmp(patt, "***:", 4) == 0)
+   {
+       pos = 4;
+       is_basic = false;
+   }
+
     /* Pattern must be anchored left */
-   if (patt[0] != '^')
+   if (patt[pos] != '^')
     {
         rest = patt;
  
@@ -3958,72 +3980,86 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
  
         return Pattern_Prefix_None;
     }
+   pos++;
  
     /*
-    * If unquoted | is present at paren level 0 in pattern, then there are
-    * multiple alternatives for the start of the string.
+    * If '|' is present in pattern, then there may be multiple alternatives
+    * for the start of the string.  (There are cases where this isn't so,
+    * for instance if the '|' is inside parens, but detecting that reliably
+    * is too hard.)
      */
-   paren_depth = 0;
-   for (pos = 1; patt[pos]; pos++)
+   if (strchr(patt + pos, '|') != NULL)
     {
-       if (patt[pos] == '|' && paren_depth == 0)
-       {
-           rest = patt;
+       rest = patt;
  
-           *prefix_const = NULL;
-           *rest_const = string_to_const(rest, typeid);
+       *prefix_const = NULL;
+       *rest_const = string_to_const(rest, typeid);
  
-           return Pattern_Prefix_None;
-       }
-       else if (patt[pos] == '(')
-           paren_depth++;
-       else if (patt[pos] == ')' && paren_depth > 0)
-           paren_depth--;
-       else if (patt[pos] == '\\')
-       {
-           /* backslash quotes the next character */
-           pos++;
-           if (patt[pos] == '\0')
-               break;
-       }
+       return Pattern_Prefix_None;
     }
  
     /* OK, allocate space for pattern */
     match = palloc(strlen(patt) + 1);
     prev_match_pos = match_pos = 0;
  
-   /* note start at pos 1 to skip leading ^ */
-   for (prev_pos = pos = 1; patt[pos];)
+   /*
+    * We special-case the syntax '^(...)$' because psql uses it.  But beware:
+    * in BRE mode these parentheses are just ordinary characters.  Also,
+    * sequences beginning "(?" are not what they seem, unless they're "(?:".
+    * (We should recognize that, too, because of similar_escape().)
+    *
+    * Note: it's a bit bogus to be depending on the current regex_flavor
+    * setting here, because the setting could change before the pattern is
+    * used.  We minimize the risk by trusting the flavor as little as we can,
+    * but perhaps it would be a good idea to get rid of the "basic" setting.
+    */
+   have_leading_paren = false;
+   if (patt[pos] == '(' && !is_basic &&
+       (patt[pos + 1] != '?' || patt[pos + 2] == ':'))
+   {
+       have_leading_paren = true;
+       pos += (patt[pos + 1] != '?' ? 1 : 3);
+   }
+
+   /* Scan remainder of pattern */
+   prev_pos = pos;
+   while (patt[pos])
     {
         int         len;
  
         /*
          * Check for characters that indicate multiple possible matches here.
-        * XXX I suspect isalpha() is not an adequately locale-sensitive test
-        * for characters that can vary under case folding?
+        * Also, drop out at ')' or '$' so the termination test works right.
          */
         if (patt[pos] == '.' ||
             patt[pos] == '(' ||
+           patt[pos] == ')' ||
             patt[pos] == '[' ||
-           patt[pos] == '$' ||
-           (case_insensitive && isalpha((unsigned char) patt[pos])))
+           patt[pos] == '^' ||
+           patt[pos] == '$')
             break;
  
         /*
-        * In AREs, backslash followed by alphanumeric is an escape, not a
-        * quoted character.  Must treat it as having multiple possible
-        * matches.
+        * XXX In multibyte character sets, we can't trust isalpha, so assume
+        * any multibyte char is potentially case-varying.
          */
-       if (patt[pos] == '\\' && isalnum((unsigned char) patt[pos + 1]))
-           break;
+       if (case_insensitive)
+       {
+           if (is_multibyte && (unsigned char) patt[pos] >= 0x80)
+               break;
+           if (isalpha((unsigned char) patt[pos]))
+               break;
+       }
  
         /*
          * Check for quantifiers.  Except for +, this means the preceding
          * character is optional, so we must remove it from the prefix too!
+        * Note: in BREs, \{ is a quantifier.
          */
         if (patt[pos] == '*' ||
             patt[pos] == '?' ||
-           patt[pos] == '{')
+           patt[pos] == '{' ||
+           (patt[pos] == '\\' && patt[pos + 1] == '{'))
         {
             match_pos = prev_match_pos;
             pos = prev_pos;
@@ -4034,9 +4070,19 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
             pos = prev_pos;
             break;
         }
+
+       /*
+        * Normally, backslash quotes the next character.  But in AREs,
+        * backslash followed by alphanumeric is an escape, not a quoted
+        * character.  Must treat it as having multiple possible matches.
+        * In BREs, \( is a parenthesis, so don't trust that either.
+        * Note: since only ASCII alphanumerics are escapes, we don't have
+        * to be paranoid about multibyte here.
+        */
         if (patt[pos] == '\\')
         {
-           /* backslash quotes the next character */
+           if (isalnum((unsigned char) patt[pos + 1]) || patt[pos + 1] == '(')
+               break;
             pos++;
             if (patt[pos] == '\0')
                 break;
@@ -4054,6 +4100,9 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
     match[match_pos] = '\0';
     rest = &patt[pos];
  
+   if (have_leading_paren && patt[pos] == ')')
+       pos++;
+
     if (patt[pos] == '$' && patt[pos + 1] == '\0')
     {
         rest = &patt[pos + 1];
diff --git a/src/include/utils/builtins.h b/src/include/utils/builtins.h

index 56e63d7f73c0a38c86d594957ab14b9ba59d3bbf..170fa28ca8de992157d2df0c84af09145d68d086 100644 (file)
--- a/src/include/utils/builtins.h
+++ b/src/include/utils/builtins.h
@@ -7,7 +7,7 @@
   * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
- * $PostgreSQL: pgsql/src/include/utils/builtins.h,v 1.283 2006/12/30 21:21:55 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/utils/builtins.h,v 1.284 2007/01/03 22:39:26 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -477,6 +477,7 @@ extern Datum textregexsubstr(PG_FUNCTION_ARGS);
  extern Datum textregexreplace_noopt(PG_FUNCTION_ARGS);
  extern Datum textregexreplace(PG_FUNCTION_ARGS);
  extern Datum similar_escape(PG_FUNCTION_ARGS);
+extern bool regex_flavor_is_basic(void);
  
  /* regproc.c */
  extern Datum regprocin(PG_FUNCTION_ARGS);
author	Tom Lane <[email protected]>
	Wed, 3 Jan 2007 22:39:26 +0000 (22:39 +0000)
committer	Tom Lane <[email protected]>
	Wed, 3 Jan 2007 22:39:26 +0000 (22:39 +0000)
src/backend/utils/adt/regexp.c		patch \| blob \| blame \| history
src/backend/utils/adt/selfuncs.c		patch \| blob \| blame \| history
src/include/utils/builtins.h		patch \| blob \| blame \| history