return numdistinct;
}
+/*
+ * Try to estimate the bucket size of the hash join inner side when the join
+ * condition contains two or more clauses by employing extended statistics.
+ *
+ * The main idea of this approach is that a distinct-value estimate produced
+ * by multivariate extended statistics on two or more columns yields a smaller
+ * bucket size than an estimate built from each column separately.
+ *
+ * IMPORTANT: the way different estimates are combined here must stay in sync
+ * with the caller's method of combining them.
+ *
+ * Return the list of clauses for which no extended statistics could be used.
+ */
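+/*
+ * A hypothetical illustration of the idea above (the numbers are invented,
+ * not taken from any real statistics): with ndistinct(x) = 49 and
+ * ndistinct(y) = 51, per-column estimation gives bucket sizes 1/49 and 1/51,
+ * and the caller keeps the largest of them, 1/49.  A multivariate ndistinct
+ * on (x, y) can be as high as 49 * 51 = 2499, which yields the much smaller
+ * bucket size 1/2499.
+ */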
+List *
+estimate_multivariate_bucketsize(PlannerInfo *root, RelOptInfo *inner,
+ List *hashclauses,
+ Selectivity *innerbucketsize)
+{
+ List *clauses = list_copy(hashclauses);
+ List *otherclauses = NIL;
+ double ndistinct = 1.0;
+
+ if (list_length(hashclauses) <= 1)
+
+ /*
+ * Nothing to do for a single clause. Could we employ univariate
+ * extended statistics here?
+ */
+ return hashclauses;
+
+ while (clauses != NIL)
+ {
+ ListCell *lc;
+ int relid = -1;
+ List *varinfos = NIL;
+ List *origin_rinfos = NIL;
+ double mvndistinct;
+ List *origin_varinfos;
+ int group_relid = -1;
+ RelOptInfo *group_rel = NULL;
+ ListCell *lc1,
+ *lc2;
+
+ /*
+ * Find clauses referencing the same single base relation and try to
+ * estimate such a group with extended statistics. Create a varinfo
+ * for each clause approved for the group; push a clause to
+ * otherclauses if it can't be estimated here, or skip it to process
+ * it on the next iteration.
+ */
+ foreach(lc, clauses)
+ {
+ RestrictInfo *rinfo = lfirst_node(RestrictInfo, lc);
+ Node *expr;
+ Relids relids;
+ GroupVarInfo *varinfo;
+
+ /*
+ * Find the inner side of the join, for which we need to estimate
+ * the number of buckets. Use outer_is_left because the
+ * clause_sides_match_join routine has already been called on the
+ * hash clauses.
+ */
+ relids = rinfo->outer_is_left ?
+ rinfo->right_relids : rinfo->left_relids;
+ expr = rinfo->outer_is_left ?
+ get_rightop(rinfo->clause) : get_leftop(rinfo->clause);
+
+ if (bms_get_singleton_member(relids, &relid) &&
+ root->simple_rel_array[relid]->statlist != NIL)
+ {
+ /*
+ * This inner-side expression references only one relation.
+ * Extended statistics on this clause can exist.
+ */
+ if (group_relid < 0)
+ {
+ RangeTblEntry *rte = root->simple_rte_array[relid];
+
+ if (!rte || (rte->relkind != RELKIND_RELATION &&
+ rte->relkind != RELKIND_MATVIEW &&
+ rte->relkind != RELKIND_FOREIGN_TABLE &&
+ rte->relkind != RELKIND_PARTITIONED_TABLE))
+ {
+ /* Extended statistics can't exist in principle */
+ otherclauses = lappend(otherclauses, rinfo);
+ clauses = foreach_delete_current(clauses, lc);
+ continue;
+ }
+
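+ /* Remember the relation that forms the current group */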
+ group_relid = relid;
+ group_rel = root->simple_rel_array[relid];
+ }
+ else if (group_relid != relid)
+
+ /*
+ * While forming a group for one relation, skip clauses
+ * referencing other relations; they are reconsidered on the
+ * next iteration of the outer loop.
+ */
+ continue;
+
+ varinfo = (GroupVarInfo *) palloc(sizeof(GroupVarInfo));
+ varinfo->var = expr;
+ varinfo->rel = root->simple_rel_array[relid];
+ varinfo->ndistinct = 0.0;
+ varinfo->isdefault = false;
+ varinfos = lappend(varinfos, varinfo);
+
+ /*
+ * Remember the link to the RestrictInfo in case the clause
+ * fails to be estimated.
+ */
+ origin_rinfos = lappend(origin_rinfos, rinfo);
+ }
+ else
+ /* This clause can't be estimated with extended statistics */
+ otherclauses = lappend(otherclauses, rinfo);
+
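+ /*
+ * The clause has been consumed, either by the group or by
+ * otherclauses, so remove it from the work list.
+ */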
+ clauses = foreach_delete_current(clauses, lc);
+ }
+
+ if (list_length(varinfos) < 2)
+ {
+ /*
+ * Multivariate statistics don't apply to a single column except
+ * in the case of expressions, and that case is not implemented
+ * yet.
+ */
+ otherclauses = list_concat(otherclauses, origin_rinfos);
+ list_free_deep(varinfos);
+ list_free(origin_rinfos);
+ continue;
+ }
+
+ Assert(group_rel != NULL);
+
+ /* Employ the extended statistics. */
+ origin_varinfos = varinfos;
+ for (;;)
+ {
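+ /*
+ * On success, estimate_multivariate_ndistinct() removes the
+ * covered entries from varinfos, so keep calling it until no
+ * extended statistics object matches the remaining entries.
+ */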
+ bool estimated = estimate_multivariate_ndistinct(root,
+ group_rel,
+ &varinfos,
+ &mvndistinct);
+
+ if (!estimated)
+ break;
+
+ /*
+ * We have an estimate. Use the ndistinct value in a way
+ * consistent with the caller's logic (see final_cost_hashjoin).
+ */
+ if (ndistinct < mvndistinct)
+ ndistinct = mvndistinct;
+ Assert(ndistinct >= 1.0);
+ }
+
+ Assert(list_length(origin_varinfos) == list_length(origin_rinfos));
+
+ /* Collect unmatched clauses as otherclauses. */
+ forboth(lc1, origin_varinfos, lc2, origin_rinfos)
+ {
+ GroupVarInfo *vinfo = lfirst(lc1);
+
+ if (!list_member_ptr(varinfos, vinfo))
+ /* Already estimated */
+ continue;
+
+ /* Can't be estimated here - push to the returning list */
+ otherclauses = lappend(otherclauses, lfirst(lc2));
+ }
+ }
+
+ *innerbucketsize = 1.0 / ndistinct;
+ return otherclauses;
+}
+
/*
* Estimate hash bucket statistics when the specified expression is used
* as a hash key for the given number of buckets.
(1 row)
DROP TABLE grouping_unique;
+--
+-- Extended statistics on sb_2 (x, y, z) improve the bucket size estimation,
+-- so the optimizer can choose a hash join.
+--
+CREATE TABLE sb_1 AS
+ SELECT gs % 10 AS x, gs % 10 AS y, gs % 10 AS z
+ FROM generate_series(1, 1e4) AS gs;
+CREATE TABLE sb_2 AS
+ SELECT gs % 49 AS x, gs % 51 AS y, gs % 73 AS z, 'abc' || gs AS payload
+ FROM generate_series(1, 1e4) AS gs;
+ANALYZE sb_1, sb_2;
+-- During hash join estimation, the number of distinct values is calculated
+-- for each column separately.  The optimizer takes the smallest of these
+-- ndistinct values, which corresponds to the largest hash bucket size.  That
+-- estimate is quite pessimistic, because the columns may be correlated.
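+-- A rough sketch of the arithmetic (the planner's actual numbers may differ
+-- slightly): the per-column ndistinct estimates for b.x, b.y and b.z are
+-- about 49, 51 and 73, so the per-column logic settles on a bucket size of
+-- about 1/49.  Since 49, 51 and 73 are pairwise coprime and sb_2 has only
+-- 1e4 rows, almost every row carries a distinct (x, y, z) triple, so the
+-- multivariate ndistinct is close to 1e4, i.e. a bucket size near 1/10000.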
+EXPLAIN (COSTS OFF) -- Choose merge join
+SELECT * FROM sb_1 a, sb_2 b WHERE a.x = b.x AND a.y = b.y AND a.z = b.z;
+ QUERY PLAN
+-------------------------------------------------------------
+ Merge Join
+ Merge Cond: ((a.z = b.z) AND (a.x = b.x) AND (a.y = b.y))
+ -> Sort
+ Sort Key: a.z, a.x, a.y
+ -> Seq Scan on sb_1 a
+ -> Sort
+ Sort Key: b.z, b.x, b.y
+ -> Seq Scan on sb_2 b
+(8 rows)
+
+-- The ndistinct extended statistics on (x, y, z) provide a more reliable
+-- bucket size estimate.
+CREATE STATISTICS extstat_sb_2 (ndistinct) ON x, y, z FROM sb_2;
+ANALYZE sb_2;
+EXPLAIN (COSTS OFF) -- Choose hash join
+SELECT * FROM sb_1 a, sb_2 b WHERE a.x = b.x AND a.y = b.y AND a.z = b.z;
+ QUERY PLAN
+------------------------------------------------------------
+ Hash Join
+ Hash Cond: ((a.x = b.x) AND (a.y = b.y) AND (a.z = b.z))
+ -> Seq Scan on sb_1 a
+ -> Hash
+ -> Seq Scan on sb_2 b
+(5 rows)
+
+DROP TABLE sb_1, sb_2 CASCADE;
ON t1.t1 = q1.x;
');
DROP TABLE grouping_unique;
+
+--
+-- Extended statistics on sb_2 (x, y, z) improve the bucket size estimation,
+-- so the optimizer can choose a hash join.
+--
+CREATE TABLE sb_1 AS
+ SELECT gs % 10 AS x, gs % 10 AS y, gs % 10 AS z
+ FROM generate_series(1, 1e4) AS gs;
+CREATE TABLE sb_2 AS
+ SELECT gs % 49 AS x, gs % 51 AS y, gs % 73 AS z, 'abc' || gs AS payload
+ FROM generate_series(1, 1e4) AS gs;
+ANALYZE sb_1, sb_2;
+
+-- During hash join estimation, the number of distinct values is calculated
+-- for each column separately.  The optimizer takes the smallest of these
+-- ndistinct values, which corresponds to the largest hash bucket size.  That
+-- estimate is quite pessimistic, because the columns may be correlated.
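+-- A rough sketch of the arithmetic (the planner's actual numbers may differ
+-- slightly): the per-column ndistinct estimates for b.x, b.y and b.z are
+-- about 49, 51 and 73, so the per-column logic settles on a bucket size of
+-- about 1/49.  Since 49, 51 and 73 are pairwise coprime and sb_2 has only
+-- 1e4 rows, almost every row carries a distinct (x, y, z) triple, so the
+-- multivariate ndistinct is close to 1e4, i.e. a bucket size near 1/10000.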
+EXPLAIN (COSTS OFF) -- Choose merge join
+SELECT * FROM sb_1 a, sb_2 b WHERE a.x = b.x AND a.y = b.y AND a.z = b.z;
+
+-- The ndistinct extended statistics on (x, y, z) provide a more reliable
+-- bucket size estimate.
+CREATE STATISTICS extstat_sb_2 (ndistinct) ON x, y, z FROM sb_2;
+ANALYZE sb_2;
+
+EXPLAIN (COSTS OFF) -- Choose hash join
+SELECT * FROM sb_1 a, sb_2 b WHERE a.x = b.x AND a.y = b.y AND a.z = b.z;
+
+DROP TABLE sb_1, sb_2 CASCADE;