Skip to content

Commit 2d443ae

Browse files
committed
Fix cardinality estimates for parallel joins.
For a partial path, the cardinality estimate needs to reflect the number of rows we think each worker will see, rather than the total number of rows; otherwise, costing will go wrong. The previous coding got this completely wrong for parallel joins. Unfortunately, this change may destabilize plans for users of 9.6 who have enabled parallel query, but since 9.6 is still fairly new I'm hoping expectations won't be too settled yet. Also, this is really a brown-paper-bag bug, so leaving it unfixed for the entire lifetime of 9.6 seems unwise. Related reports (whose import I initially failed to recognize) by Tomas Vondra and Tom Lane. Discussion: http://postgr.es/m/CA+TgmoaDxZ5z5Kw_oCQoymNxNoVaTCXzPaODcOuao=CzK8dMZw@mail.gmail.com
1 parent d2d7163 commit 2d443ae

File tree

1 file changed

+48
-26
lines changed

1 file changed

+48
-26
lines changed

src/backend/optimizer/path/costsize.c

Lines changed: 48 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,7 @@ static Selectivity get_foreign_key_join_selectivity(PlannerInfo *root,
161161
static void set_rel_width(PlannerInfo *root, RelOptInfo *rel);
162162
static double relation_byte_size(double tuples, int width);
163163
static double page_size(double tuples, int width);
164+
static double get_parallel_divisor(Path *path);
164165

165166

166167
/*
@@ -238,32 +239,7 @@ cost_seqscan(Path *path, PlannerInfo *root,
238239
/* Adjust costing for parallelism, if used. */
239240
if (path->parallel_workers > 0)
240241
{
241-
double parallel_divisor = path->parallel_workers;
242-
double leader_contribution;
243-
244-
/*
245-
* Early experience with parallel query suggests that when there is
246-
* only one worker, the leader often makes a very substantial
247-
* contribution to executing the parallel portion of the plan, but as
248-
* more workers are added, it does less and less, because it's busy
249-
* reading tuples from the workers and doing whatever non-parallel
250-
* post-processing is needed. By the time we reach 4 workers, the
251-
* leader no longer makes a meaningful contribution. Thus, for now,
252-
* estimate that the leader spends 30% of its time servicing each
253-
* worker, and the remainder executing the parallel plan.
254-
*/
255-
leader_contribution = 1.0 - (0.3 * path->parallel_workers);
256-
if (leader_contribution > 0)
257-
parallel_divisor += leader_contribution;
258-
259-
/*
260-
* In the case of a parallel plan, the row count needs to represent
261-
* the number of tuples processed per worker. Otherwise, higher-level
262-
* plan nodes that appear below the gather will be costed incorrectly,
263-
* because they'll anticipate receiving more rows than any given copy
264-
* will actually get.
265-
*/
266-
path->rows = clamp_row_est(path->rows / parallel_divisor);
242+
double parallel_divisor = get_parallel_divisor(path);
267243

268244
/* The CPU cost is divided among all the workers. */
269245
cpu_run_cost /= parallel_divisor;
@@ -274,6 +250,12 @@ cost_seqscan(Path *path, PlannerInfo *root,
274250
* prefetching. For now, we assume that the disk run cost can't be
275251
* amortized at all.
276252
*/
253+
254+
/*
255+
* In the case of a parallel plan, the row count needs to represent
256+
* the number of tuples processed per worker.
257+
*/
258+
path->rows = clamp_row_est(path->rows / parallel_divisor);
277259
}
278260

279261
path->startup_cost = startup_cost;
@@ -2014,6 +1996,10 @@ final_cost_nestloop(PlannerInfo *root, NestPath *path,
20141996
else
20151997
path->path.rows = path->path.parent->rows;
20161998

1999+
/* For partial paths, scale row estimate. */
2000+
if (path->path.parallel_workers > 0)
2001+
path->path.rows /= get_parallel_divisor(&path->path);
2002+
20172003
/*
20182004
* We could include disable_cost in the preliminary estimate, but that
20192005
* would amount to optimizing for the case where the join method is
@@ -2432,6 +2418,10 @@ final_cost_mergejoin(PlannerInfo *root, MergePath *path,
24322418
else
24332419
path->jpath.path.rows = path->jpath.path.parent->rows;
24342420

2421+
/* For partial paths, scale row estimate. */
2422+
if (path->jpath.path.parallel_workers > 0)
2423+
path->jpath.path.rows /= get_parallel_divisor(&path->jpath.path);
2424+
24352425
/*
24362426
* We could include disable_cost in the preliminary estimate, but that
24372427
* would amount to optimizing for the case where the join method is
@@ -2811,6 +2801,10 @@ final_cost_hashjoin(PlannerInfo *root, HashPath *path,
28112801
else
28122802
path->jpath.path.rows = path->jpath.path.parent->rows;
28132803

2804+
/* For partial paths, scale row estimate. */
2805+
if (path->jpath.path.parallel_workers > 0)
2806+
path->jpath.path.rows /= get_parallel_divisor(&path->jpath.path);
2807+
28142808
/*
28152809
* We could include disable_cost in the preliminary estimate, but that
28162810
* would amount to optimizing for the case where the join method is
@@ -4799,3 +4793,31 @@ page_size(double tuples, int width)
47994793
{
48004794
return ceil(relation_byte_size(tuples, width) / BLCKSZ);
48014795
}
4796+
4797+
/*
4798+
* Return the divisor used to scale a path's totals down to per-process
4799+
* amounts: the workers budgeted for the path plus the leader's estimated share.
4800+
*/
4801+
static double
4802+
get_parallel_divisor(Path *path)
4803+
{
4804+
double parallel_divisor = path->parallel_workers;
4805+
double leader_contribution;
4806+
4807+
/*
4808+
* Early experience with parallel query suggests that when there is only
4809+
* one worker, the leader often makes a very substantial contribution to
4810+
* executing the parallel portion of the plan, but as more workers are
4811+
* added, it does less and less, because it's busy reading tuples from the
4812+
* workers and doing whatever non-parallel post-processing is needed. By
4813+
* the time we reach 4 workers, the leader no longer makes a meaningful
4814+
* contribution. Thus, for now, estimate that the leader spends 30% of
4815+
* its time servicing each worker, and the remainder executing the
4816+
* parallel plan.
4817+
*/
4818+
leader_contribution = 1.0 - (0.3 * path->parallel_workers);
4819+
if (leader_contribution > 0) /* a non-positive share means the leader adds nothing */
4820+
parallel_divisor += leader_contribution;
4821+
4822+
return parallel_divisor;
4823+
}

0 commit comments

Comments
 (0)