Improve statistics estimation for single-column GROUP BY in sub-queries

author Alexander Korotkov <[email protected]>

Wed, 19 Feb 2025 09:56:54 +0000 (11:56 +0200)

committer Alexander Korotkov <[email protected]>

Wed, 19 Feb 2025 09:59:30 +0000 (11:59 +0200)
author Alexander Korotkov <[email protected]>
Wed, 19 Feb 2025 09:56:54 +0000 (11:56 +0200)
committer Alexander Korotkov <[email protected]>
Wed, 19 Feb 2025 09:59:30 +0000 (11:59 +0200)
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c

index d3d1e485bb2559393e9476ac4b874b7899b0bf76..c2918c9c83130dc0dbf4bf2752b3b4df866a46f2 100644 (file)
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -322,10 +322,10 @@ var_eq_const(VariableStatData *vardata, Oid oproid, Oid collation,
     }
  
     /*
-    * If we matched the var to a unique index or DISTINCT clause, assume
-    * there is exactly one match regardless of anything else.  (This is
-    * slightly bogus, since the index or clause's equality operator might be
-    * different from ours, but it's much more likely to be right than
+    * If we matched the var to a unique index, DISTINCT or GROUP-BY clause,
+    * assume there is exactly one match regardless of anything else.  (This
+    * is slightly bogus, since the index or clause's equality operator might
+    * be different from ours, but it's much more likely to be right than
      * ignoring the information.)
      */
     if (vardata->isunique && vardata->rel && vardata->rel->tuples >= 1.0)
@@ -484,10 +484,10 @@ var_eq_non_const(VariableStatData *vardata, Oid oproid, Oid collation,
     }
  
     /*
-    * If we matched the var to a unique index or DISTINCT clause, assume
-    * there is exactly one match regardless of anything else.  (This is
-    * slightly bogus, since the index or clause's equality operator might be
-    * different from ours, but it's much more likely to be right than
+    * If we matched the var to a unique index, DISTINCT or GROUP-BY clause,
+    * assume there is exactly one match regardless of anything else.  (This
+    * is slightly bogus, since the index or clause's equality operator might
+    * be different from ours, but it's much more likely to be right than
      * ignoring the information.)
      */
     if (vardata->isunique && vardata->rel && vardata->rel->tuples >= 1.0)
@@ -5018,11 +5018,11 @@ ReleaseDummy(HeapTuple tuple)
   * atttype, atttypmod: actual type/typmod of the "var" expression.  This is
   *     commonly the same as the exposed type of the variable argument,
   *     but can be different in binary-compatible-type cases.
- * isunique: true if we were able to match the var to a unique index or a
- *     single-column DISTINCT clause, implying its values are unique for
- *     this query.  (Caution: this should be trusted for statistical
- *     purposes only, since we do not check indimmediate nor verify that
- *     the exact same definition of equality applies.)
+ * isunique: true if we were able to match the var to a unique index, a
+ *     single-column DISTINCT or GROUP-BY clause, implying its values are
+ *     unique for this query.  (Caution: this should be trusted for
+ *     statistical purposes only, since we do not check indimmediate nor
+ *     verify that the exact same definition of equality applies.)
   * acl_ok: true if current user has permission to read the column(s)
   *     underlying the pg_statistic entry.  This is consulted by
   *     statistic_proc_security_check().
@@ -5680,15 +5680,14 @@ examine_simple_variable(PlannerInfo *root, Var *var,
         Assert(IsA(subquery, Query));
  
         /*
-        * Punt if subquery uses set operations or GROUP BY, as these will
-        * mash underlying columns' stats beyond recognition.  (Set ops are
-        * particularly nasty; if we forged ahead, we would return stats
+        * Punt if subquery uses set operations or grouping sets, as these
+        * will mash underlying columns' stats beyond recognition.  (Set ops
+        * are particularly nasty; if we forged ahead, we would return stats
          * relevant to only the leftmost subselect...)  DISTINCT is also
          * problematic, but we check that later because there is a possibility
          * of learning something even with it.
          */
         if (subquery->setOperations ||
-           subquery->groupClause ||
             subquery->groupingSets)
             return;
  
@@ -5718,6 +5717,16 @@ examine_simple_variable(PlannerInfo *root, Var *var,
             return;
         }
  
+       /* The same idea as with DISTINCT clause works for a GROUP-BY too */
+       if (subquery->groupClause)
+       {
+           if (list_length(subquery->groupClause) == 1 &&
+               targetIsInSortList(ste, InvalidOid, subquery->groupClause))
+               vardata->isunique = true;
+           /* cannot go further */
+           return;
+       }
+
         /*
          * If the sub-query originated from a view with the security_barrier
          * attribute, we must not look at the variable's statistics, though it
@@ -5869,11 +5878,11 @@ get_variable_numdistinct(VariableStatData *vardata, bool *isdefault)
     }
  
     /*
-    * If there is a unique index or DISTINCT clause for the variable, assume
-    * it is unique no matter what pg_statistic says; the statistics could be
-    * out of date, or we might have found a partial unique index that proves
-    * the var is unique for this query.  However, we'd better still believe
-    * the null-fraction statistic.
+    * If there is a unique index, DISTINCT or GROUP-BY clause for the
+    * variable, assume it is unique no matter what pg_statistic says; the
+    * statistics could be out of date, or we might have found a partial
+    * unique index that proves the var is unique for this query.  However,
+    * we'd better still believe the null-fraction statistic.
      */
     if (vardata->isunique)
         stadistinct = -1.0 * (1.0 - stanullfrac);
diff --git a/src/include/utils/selfuncs.h b/src/include/utils/selfuncs.h

index b5e95c0ff6152eeb1a5c0c9f3b45d55e7a163608..d35939651f3032742bbd7ed48fb3c7c8804b2c49 100644 (file)
--- a/src/include/utils/selfuncs.h
+++ b/src/include/utils/selfuncs.h
@@ -94,7 +94,8 @@ typedef struct VariableStatData
     Oid         vartype;        /* exposed type of expression */
     Oid         atttype;        /* actual type (after stripping relabel) */
     int32       atttypmod;      /* actual typmod (after stripping relabel) */
-   bool        isunique;       /* matches unique index or DISTINCT clause */
+   bool        isunique;       /* matches unique index, DISTINCT or GROUP-BY
+                                * clause */
     bool        acl_ok;         /* result of ACL check on table or column */
  } VariableStatData;
  
diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out

index 9a820404d3f8134c9268d5e5775a53ce732dda94..904d3e623f5f3578caac48e3ec46faafd52394ab 100644 (file)
--- a/src/test/regress/expected/stats_ext.out
+++ b/src/test/regress/expected/stats_ext.out
@@ -3368,3 +3368,18 @@ NOTICE:  drop cascades to 2 other objects
  DETAIL:  drop cascades to table tststats.priv_test_tbl
  drop cascades to view tststats.priv_test_view
  DROP USER regress_stats_user1;
+CREATE TABLE grouping_unique (x integer);
+INSERT INTO grouping_unique (x) SELECT gs FROM generate_series(1,1000) AS gs;
+ANALYZE grouping_unique;
+-- Optimiser treat GROUP-BY operator as an 'uniqueser' of the input
+SELECT * FROM check_estimated_rows('
+  SELECT * FROM generate_series(1, 1) t1 LEFT JOIN (
+    SELECT x FROM grouping_unique t2 GROUP BY x) AS q1
+  ON t1.t1 = q1.x;
+');
+ estimated | actual 
+-----------+--------
+         1 |      1
+(1 row)
+
+DROP TABLE grouping_unique;
diff --git a/src/test/regress/sql/stats_ext.sql b/src/test/regress/sql/stats_ext.sql

index 75b04e5a136002518f59a7d3dfe2ca48b7f33e70..88b33ccaef825f6d6d3dcba3500cf96f5d9404b1 100644 (file)
--- a/src/test/regress/sql/stats_ext.sql
+++ b/src/test/regress/sql/stats_ext.sql
@@ -1707,3 +1707,15 @@ RESET SESSION AUTHORIZATION;
  DROP TABLE stats_ext_tbl;
  DROP SCHEMA tststats CASCADE;
  DROP USER regress_stats_user1;
+
+CREATE TABLE grouping_unique (x integer);
+INSERT INTO grouping_unique (x) SELECT gs FROM generate_series(1,1000) AS gs;
+ANALYZE grouping_unique;
+
+-- Optimiser treat GROUP-BY operator as an 'uniqueser' of the input
+SELECT * FROM check_estimated_rows('
+  SELECT * FROM generate_series(1, 1) t1 LEFT JOIN (
+    SELECT x FROM grouping_unique t2 GROUP BY x) AS q1
+  ON t1.t1 = q1.x;
+');
+DROP TABLE grouping_unique;
author	Alexander Korotkov <[email protected]>
	Wed, 19 Feb 2025 09:56:54 +0000 (11:56 +0200)
committer	Alexander Korotkov <[email protected]>
	Wed, 19 Feb 2025 09:59:30 +0000 (11:59 +0200)
src/backend/utils/adt/selfuncs.c		patch \| blob \| blame \| history
src/include/utils/selfuncs.h		patch \| blob \| blame \| history
src/test/regress/expected/stats_ext.out		patch \| blob \| blame \| history
src/test/regress/sql/stats_ext.sql		patch \| blob \| blame \| history