Have the planner consider Incremental Sort for DISTINCT
author      David Rowley <[email protected]>
            Tue, 10 Jan 2023 21:25:43 +0000 (10:25 +1300)
committer   David Rowley <[email protected]>
            Tue, 10 Jan 2023 21:25:43 +0000 (10:25 +1300)
Prior to this, we only considered doing a full sort of the cheapest input
path and uniquifying any path which was already sorted in the required sort
order.  Here we adjust create_final_distinct_paths() so that it also adds an
Incremental Sort path on any path which has presorted keys.
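
For example (borrowing the regression test added to select_distinct below,
which uses the tenk1 table with its tenk1_hundred index and disables
sequential scans), a DISTINCT on (hundred, two) can now reuse the index's
ordering on "hundred" and only needs to sort incrementally on "two":

    EXPLAIN (costs off)
    SELECT DISTINCT hundred, two FROM tenk1;
     Unique
       ->  Incremental Sort
             Sort Key: hundred, two
             Presorted Key: hundred
             ->  Index Scan using tenk1_hundred on tenk1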

Additionally, this adjusts the parallel distinct code so that we now
consider sorting the cheapest partial path and incrementally sorting any
partial paths with presorted keys.  Previously we didn't consider any
sorting for parallel distinct and only added a unique path atop any path
which had the required pathkeys already.
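
As a sketch of the new parallel plan shape (taken from the updated
select_distinct test below, where distinct_func() is a parallel-safe helper
defined by that test and parallelism is encouraged by the settings the test
uses), each worker now sorts and uniquifies its partial result before the
Gather Merge:

    EXPLAIN (costs off)
    SELECT DISTINCT distinct_func(1) FROM tenk1;
     Unique
       ->  Gather Merge
             Workers Planned: 2
             ->  Unique
                   ->  Sort
                         Sort Key: (distinct_func(1))
                         ->  Parallel Seq Scan on tenk1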

Author: David Rowley
Reviewed-by: Richard Guo
Discussion: https://postgr.es/m/CAApHDvo8Lz2H=42urBbfP65LTcEUOh288MT7DsG2_EWtW1AXHQ@mail.gmail.com

src/backend/optimizer/plan/planner.c
src/test/regress/expected/incremental_sort.out
src/test/regress/expected/select_distinct.out
src/test/regress/expected/window.out
src/test/regress/sql/select_distinct.sql

src/backend/optimizer/plan/planner.c
index 000d757bdd86b0f24c2fd9c28a8a9a0dfb7d7569..044fb246665482ee8520167a45f8c5e1c3416dcd 100644 (file)
@@ -4654,22 +4654,63 @@ create_partial_distinct_paths(PlannerInfo *root, RelOptInfo *input_rel,
                                          cheapest_partial_path->rows,
                                          NULL, NULL);
 
-   /* first try adding unique paths atop of sorted paths */
+   /*
+    * Try sorting the cheapest path and incrementally sorting any paths with
+    * presorted keys and put a unique path atop of those.
+    */
    if (grouping_is_sortable(parse->distinctClause))
    {
        foreach(lc, input_rel->partial_pathlist)
        {
-           Path       *path = (Path *) lfirst(lc);
+           Path       *input_path = (Path *) lfirst(lc);
+           Path       *sorted_path;
+           bool        is_sorted;
+           int         presorted_keys;
 
-           if (pathkeys_contained_in(root->distinct_pathkeys, path->pathkeys))
+           is_sorted = pathkeys_count_contained_in(root->distinct_pathkeys,
+                                                   input_path->pathkeys,
+                                                   &presorted_keys);
+
+           if (is_sorted)
+               sorted_path = input_path;
+           else
            {
-               add_partial_path(partial_distinct_rel, (Path *)
-                                create_upper_unique_path(root,
-                                                         partial_distinct_rel,
-                                                         path,
-                                                         list_length(root->distinct_pathkeys),
-                                                         numDistinctRows));
+               /*
+                * Try at least sorting the cheapest path and also try
+                * incrementally sorting any path which is partially sorted
+                * already (no need to deal with paths which have presorted
+                * keys when incremental sort is disabled unless it's the
+                * cheapest partial path).
+                */
+               if (input_path != cheapest_partial_path &&
+                   (presorted_keys == 0 || !enable_incremental_sort))
+                   continue;
+
+               /*
+                * We've no need to consider both a sort and incremental sort.
+                * We'll just do a sort if there are no presorted keys and an
+                * incremental sort when there are presorted keys.
+                */
+               if (presorted_keys == 0 || !enable_incremental_sort)
+                   sorted_path = (Path *) create_sort_path(root,
+                                                           partial_distinct_rel,
+                                                           input_path,
+                                                           root->distinct_pathkeys,
+                                                           -1.0);
+               else
+                   sorted_path = (Path *) create_incremental_sort_path(root,
+                                                                       partial_distinct_rel,
+                                                                       input_path,
+                                                                       root->distinct_pathkeys,
+                                                                       presorted_keys,
+                                                                       -1.0);
            }
+
+           add_partial_path(partial_distinct_rel, (Path *)
+                            create_upper_unique_path(root, partial_distinct_rel,
+                                                     sorted_path,
+                                                     list_length(root->distinct_pathkeys),
+                                                     numDistinctRows));
        }
    }
 
@@ -4773,9 +4814,11 @@ create_final_distinct_paths(PlannerInfo *root, RelOptInfo *input_rel,
    if (grouping_is_sortable(parse->distinctClause))
    {
        /*
-        * First, if we have any adequately-presorted paths, just stick a
-        * Unique node on those.  Then consider doing an explicit sort of the
-        * cheapest input path and Unique'ing that.
+        * First, if we have any adequately-presorted paths, just stick a
+        * Unique node on those.  We also consider doing an explicit sort of
+        * the cheapest input path and Unique'ing that.  If any paths have
+        * presorted keys then we'll create an incremental sort atop of those
+        * before adding a unique node on the top.
         *
         * When we have DISTINCT ON, we must sort by the more rigorous of
         * DISTINCT and ORDER BY, else it won't have the desired behavior.
@@ -4785,8 +4828,8 @@ create_final_distinct_paths(PlannerInfo *root, RelOptInfo *input_rel,
         * the other.)
         */
        List       *needed_pathkeys;
-       Path       *path;
        ListCell   *lc;
+       double      limittuples = root->distinct_pathkeys == NIL ? 1.0 : -1.0;
 
        if (parse->hasDistinctOn &&
            list_length(root->distinct_pathkeys) <
@@ -4797,96 +4840,89 @@ create_final_distinct_paths(PlannerInfo *root, RelOptInfo *input_rel,
 
        foreach(lc, input_rel->pathlist)
        {
-           path = (Path *) lfirst(lc);
+           Path       *input_path = (Path *) lfirst(lc);
+           Path       *sorted_path;
+           bool        is_sorted;
+           int         presorted_keys;
 
-           if (pathkeys_contained_in(needed_pathkeys, path->pathkeys))
+           is_sorted = pathkeys_count_contained_in(needed_pathkeys,
+                                                   input_path->pathkeys,
+                                                   &presorted_keys);
+
+           if (is_sorted)
+               sorted_path = input_path;
+           else
            {
                /*
-                * distinct_pathkeys may have become empty if all of the
-                * pathkeys were determined to be redundant.  If all of the
-                * pathkeys are redundant then each DISTINCT target must only
-                * allow a single value, therefore all resulting tuples must
-                * be identical (or at least indistinguishable by an equality
-                * check).  We can uniquify these tuples simply by just taking
-                * the first tuple.  All we do here is add a path to do "LIMIT
-                * 1" atop of 'path'.  When doing a DISTINCT ON we may still
-                * have a non-NIL sort_pathkeys list, so we must still only do
-                * this with paths which are correctly sorted by
-                * sort_pathkeys.
+                * Try at least sorting the cheapest path and also try
+                * incrementally sorting any path which is partially sorted
+                * already (no need to deal with paths which have presorted
+                * keys when incremental sort is disabled unless it's the
+                * cheapest input path).
                 */
-               if (root->distinct_pathkeys == NIL)
-               {
-                   Node       *limitCount;
-
-                   limitCount = (Node *) makeConst(INT8OID, -1, InvalidOid,
-                                                   sizeof(int64),
-                                                   Int64GetDatum(1), false,
-                                                   FLOAT8PASSBYVAL);
+               if (input_path != cheapest_input_path &&
+                   (presorted_keys == 0 || !enable_incremental_sort))
+                   continue;
 
-                   /*
-                    * If the query already has a LIMIT clause, then we could
-                    * end up with a duplicate LimitPath in the final plan.
-                    * That does not seem worth troubling over too much.
-                    */
-                   add_path(distinct_rel, (Path *)
-                            create_limit_path(root, distinct_rel, path, NULL,
-                                              limitCount, LIMIT_OPTION_COUNT,
-                                              0, 1));
-               }
+               /*
+                * We've no need to consider both a sort and incremental sort.
+                * We'll just do a sort if there are no presorted keys and an
+                * incremental sort when there are presorted keys.
+                */
+               if (presorted_keys == 0 || !enable_incremental_sort)
+                   sorted_path = (Path *) create_sort_path(root,
+                                                           distinct_rel,
+                                                           input_path,
+                                                           needed_pathkeys,
+                                                           limittuples);
                else
-               {
-                   add_path(distinct_rel, (Path *)
-                            create_upper_unique_path(root, distinct_rel,
-                                                     path,
-                                                     list_length(root->distinct_pathkeys),
-                                                     numDistinctRows));
-               }
+                   sorted_path = (Path *) create_incremental_sort_path(root,
+                                                                       distinct_rel,
+                                                                       input_path,
+                                                                       needed_pathkeys,
+                                                                       presorted_keys,
+                                                                       limittuples);
            }
-       }
 
-       /* For explicit-sort case, always use the more rigorous clause */
-       if (list_length(root->distinct_pathkeys) <
-           list_length(root->sort_pathkeys))
-       {
-           needed_pathkeys = root->sort_pathkeys;
-           /* Assert checks that parser didn't mess up... */
-           Assert(pathkeys_contained_in(root->distinct_pathkeys,
-                                        needed_pathkeys));
-       }
-       else
-           needed_pathkeys = root->distinct_pathkeys;
+           /*
+            * distinct_pathkeys may have become empty if all of the pathkeys
+            * were determined to be redundant.  If all of the pathkeys are
+            * redundant then each DISTINCT target must only allow a single
+            * value, therefore all resulting tuples must be identical (or at
+            * least indistinguishable by an equality check).  We can uniquify
+            * these tuples simply by just taking the first tuple.  All we do
+            * here is add a path to do "LIMIT 1" atop of 'sorted_path'.  When
+            * doing a DISTINCT ON we may still have a non-NIL sort_pathkeys
+            * list, so we must still only do this with paths which are
+            * correctly sorted by sort_pathkeys.
+            */
+           if (root->distinct_pathkeys == NIL)
+           {
+               Node       *limitCount;
 
-       path = cheapest_input_path;
-       if (!pathkeys_contained_in(needed_pathkeys, path->pathkeys))
-           path = (Path *) create_sort_path(root, distinct_rel,
-                                            path,
-                                            needed_pathkeys,
-                                            root->distinct_pathkeys == NIL ?
-                                            1.0 : -1.0);
+               limitCount = (Node *) makeConst(INT8OID, -1, InvalidOid,
+                                               sizeof(int64),
+                                               Int64GetDatum(1), false,
+                                               FLOAT8PASSBYVAL);
 
-       /*
-        * As above, use a LimitPath instead of a UniquePath when all of the
-        * distinct_pathkeys are redundant and we're only going to get a
-        * series of tuples all with the same values anyway.
-        */
-       if (root->distinct_pathkeys == NIL)
-       {
-           Node       *limitCount = (Node *) makeConst(INT8OID, -1, InvalidOid,
-                                                       sizeof(int64),
-                                                       Int64GetDatum(1), false,
-                                                       FLOAT8PASSBYVAL);
-
-           add_path(distinct_rel, (Path *)
-                    create_limit_path(root, distinct_rel, path, NULL,
-                                      limitCount, LIMIT_OPTION_COUNT, 0, 1));
-       }
-       else
-       {
-           add_path(distinct_rel, (Path *)
-                    create_upper_unique_path(root, distinct_rel,
-                                             path,
-                                             list_length(root->distinct_pathkeys),
-                                             numDistinctRows));
+               /*
+                * If the query already has a LIMIT clause, then we could end
+                * up with a duplicate LimitPath in the final plan. That does
+                * not seem worth troubling over too much.
+                */
+               add_path(distinct_rel, (Path *)
+                        create_limit_path(root, distinct_rel, sorted_path,
+                                          NULL, limitCount,
+                                          LIMIT_OPTION_COUNT, 0, 1));
+           }
+           else
+           {
+               add_path(distinct_rel, (Path *)
+                        create_upper_unique_path(root, distinct_rel,
+                                                 sorted_path,
+                                                 list_length(root->distinct_pathkeys),
+                                                 numDistinctRows));
+           }
        }
    }
 
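As the new comments in create_final_distinct_paths() note, when
enable_incremental_sort is disabled only the cheapest input path is
considered and it receives a full sort.  A quick way to compare the two
behaviours (a sketch only; the plan shape chosen still depends on costing
and on other settings such as enable_hashagg):

    SET enable_incremental_sort TO off;
    EXPLAIN (costs off) SELECT DISTINCT hundred, two FROM tenk1;
    RESET enable_incremental_sort;
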
src/test/regress/expected/incremental_sort.out
index 1a1e8b2365b4be5f5ee438db487f00c08393c03b..0c3433f8e58e7b89bee72cb566a6d0a2e5c541d5 100644 (file)
@@ -1484,15 +1484,16 @@ explain (costs off) select * from t union select * from t order by 1,3;
 -- Full sort, not just incremental sort can be pushed below a gather merge path
 -- by generate_useful_gather_paths.
 explain (costs off) select distinct a,b from t;
-                QUERY PLAN                
-------------------------------------------
+                   QUERY PLAN                   
+------------------------------------------------
  Unique
    ->  Gather Merge
          Workers Planned: 2
-         ->  Sort
-               Sort Key: a, b
-               ->  Parallel Seq Scan on t
-(6 rows)
+         ->  Unique
+               ->  Sort
+                     Sort Key: a, b
+                     ->  Parallel Seq Scan on t
+(7 rows)
 
 drop table t;
 -- Sort pushdown can't go below where expressions are part of the rel target.

src/test/regress/expected/select_distinct.out
index 6ce889d87c1d2f4cf2bc69196bbae07b2ac8dcbb..1fc07f220fb948e5a60dcb01e976e64dd6e8e715 100644 (file)
@@ -171,6 +171,20 @@ SELECT DISTINCT g%1000 FROM generate_series(0,9999) g;
 SET jit_above_cost TO DEFAULT;
 CREATE TABLE distinct_group_2 AS
 SELECT DISTINCT (g%1000)::text FROM generate_series(0,9999) g;
+SET enable_seqscan = 0;
+-- Check to see we get an incremental sort plan
+EXPLAIN (costs off)
+SELECT DISTINCT hundred, two FROM tenk1;
+                     QUERY PLAN                      
+-----------------------------------------------------
+ Unique
+   ->  Incremental Sort
+         Sort Key: hundred, two
+         Presorted Key: hundred
+         ->  Index Scan using tenk1_hundred on tenk1
+(5 rows)
+
+RESET enable_seqscan;
 SET enable_hashagg=TRUE;
 -- Produce results with hash aggregation.
 SET enable_sort=FALSE;
@@ -265,15 +279,16 @@ $$ LANGUAGE plpgsql PARALLEL SAFE;
 -- Ensure we do parallel distinct now that the function is parallel safe
 EXPLAIN (COSTS OFF)
 SELECT DISTINCT distinct_func(1) FROM tenk1;
-                  QUERY PLAN                  
-----------------------------------------------
+                     QUERY PLAN                     
+----------------------------------------------------
  Unique
-   ->  Sort
-         Sort Key: (distinct_func(1))
-         ->  Gather
-               Workers Planned: 2
-               ->  Parallel Seq Scan on tenk1
-(6 rows)
+   ->  Gather Merge
+         Workers Planned: 2
+         ->  Unique
+               ->  Sort
+                     Sort Key: (distinct_func(1))
+                     ->  Parallel Seq Scan on tenk1
+(7 rows)
 
 RESET max_parallel_workers_per_gather;
 RESET min_parallel_table_scan_size;

src/test/regress/expected/window.out
index 90e89fb5b68256fa5da94e253627253eda812c8e..b2c6605e60c9d705272a7363e5624851fc141d28 100644 (file)
@@ -3944,8 +3944,9 @@ ORDER BY depname, enroll_date;
                                           QUERY PLAN                                           
 -----------------------------------------------------------------------------------------------
  Unique
-   ->  Sort
+   ->  Incremental Sort
          Sort Key: depname, enroll_date, empno, (sum(salary) OVER (?)), (min(salary) OVER (?))
+         Presorted Key: depname, enroll_date
          ->  WindowAgg
                ->  Incremental Sort
                      Sort Key: depname, enroll_date
@@ -3954,7 +3955,7 @@ ORDER BY depname, enroll_date;
                            ->  Sort
                                  Sort Key: depname, empno
                                  ->  Seq Scan on empsalary
-(11 rows)
+(12 rows)
 
 -- As above but adjust the ORDER BY clause to help ensure the plan with the
 -- minimum amount of sorting wasn't a fluke.
@@ -3970,8 +3971,9 @@ ORDER BY depname, empno;
                                           QUERY PLAN                                           
 -----------------------------------------------------------------------------------------------
  Unique
-   ->  Sort
+   ->  Incremental Sort
          Sort Key: depname, empno, enroll_date, (sum(salary) OVER (?)), (min(salary) OVER (?))
+         Presorted Key: depname, empno
          ->  WindowAgg
                ->  Incremental Sort
                      Sort Key: depname, empno
@@ -3980,7 +3982,7 @@ ORDER BY depname, empno;
                            ->  Sort
                                  Sort Key: depname, enroll_date
                                  ->  Seq Scan on empsalary
-(11 rows)
+(12 rows)
 
 RESET enable_hashagg;
 -- Test Sort node reordering

src/test/regress/sql/select_distinct.sql
index 34020adad1dbf11a72677987f615a0909d1a8150..1643526d991807b38c7d651d3a4390a784f02edb 100644 (file)
@@ -69,6 +69,14 @@ SET jit_above_cost TO DEFAULT;
 CREATE TABLE distinct_group_2 AS
 SELECT DISTINCT (g%1000)::text FROM generate_series(0,9999) g;
 
+SET enable_seqscan = 0;
+
+-- Check to see we get an incremental sort plan
+EXPLAIN (costs off)
+SELECT DISTINCT hundred, two FROM tenk1;
+
+RESET enable_seqscan;
+
 SET enable_hashagg=TRUE;
 
 -- Produce results with hash aggregation.