<literal>d</literal> for n-distinct statistics,
<literal>f</literal> for functional dependency statistics, and
<literal>m</literal> for most common values (MCV) list statistics
+ <literal>e</literal> for expression statistics
</para></entry>
</row>
+
+ <row>
+ <entry role="catalog_table_entry"><para role="column_definition">
+ <structfield>stxexprs</structfield> <type>pg_node_tree</type>
+ </para>
+ <para>
+ Expression trees (in <function>nodeToString()</function>
+ representation) for statistics object attributes that are not simple
+ column references. This is a list with one element per expression.
+ Null if all statistics object attributes are simple references.
+ </para></entry>
+ </row>
+
</tbody>
</tgroup>
</table>
(references <link linkend="catalog-pg-statistic-ext"><structname>pg_statistic_ext</structname></link>.<structfield>oid</structfield>)
</para>
<para>
- Extended statistic object containing the definition for this data
+ Extended statistics object containing the definition for this data
</para></entry>
</row>
<structname>pg_mcv_list</structname> type
</para></entry>
</row>
+
+ <row>
+ <entry role="catalog_table_entry"><para role="column_definition">
+ <structfield>stxexprs</structfield> <type>pg_node_tree</type>
+ </para>
+ <para>
+ A list of any expressions covered by this statistics object.
+ </para></entry>
+ </row>
</tbody>
</tgroup>
</table>
see <xref linkend="logical-replication-publication"/>.
</para></entry>
</row>
+
+ <row>
+ <entry role="catalog_table_entry"><para role="column_definition">
+ <structfield>stxdexpr</structfield> <type>pg_statistic[]</type>
+ </para>
+ <para>
+ Per-expression statistics, serialized as an array of
+ <structname>pg_statistic</structname> type
+ </para></entry>
+ </row>
</tbody>
</tgroup>
</table>
<entry>extended planner statistics</entry>
</row>
+ <row>
+ <entry><link linkend="view-pg-stats-ext-exprs"><structname>pg_stats_ext_exprs</structname></link></entry>
+ <entry>extended planner statistics for expressions</entry>
+ </row>
+
<row>
<entry><link linkend="view-pg-tables"><structname>pg_tables</structname></link></entry>
<entry>tables</entry>
(references <link linkend="catalog-pg-attribute"><structname>pg_attribute</structname></link>.<structfield>attname</structfield>)
</para>
<para>
- Name of the column described by this row
+ Names of the columns included in the extended statistics object
</para></entry>
</row>
+ <row>
+ <entry role="catalog_table_entry"><para role="column_definition">
+ <structfield>exprs</structfield> <type>text[]</type>
+ </para>
+ <para>
+ Expressions included in the extended statistics object
+ </para></entry>
+ </row>
+
<row>
<entry role="catalog_table_entry"><para role="column_definition">
<structfield>inherited</structfield> <type>bool</type>
<para>
The view <structname>pg_stats_ext</structname> provides access to
- the information stored in the <link
+ information about each extended statistics object in the database,
+ combining information stored in the <link
linkend="catalog-pg-statistic-ext"><structname>pg_statistic_ext</structname></link>
and <link linkend="catalog-pg-statistic-ext-data"><structname>pg_statistic_ext_data</structname></link>
catalogs. This view allows access only to rows of
(references <link linkend="catalog-pg-namespace"><structname>pg_namespace</structname></link>.<structfield>nspname</structfield>)
</para>
<para>
- Name of schema containing extended statistic
+ Name of schema containing extended statistics object
</para></entry>
</row>
(references <link linkend="catalog-pg-statistic-ext"><structname>pg_statistic_ext</structname></link>.<structfield>stxname</structfield>)
</para>
<para>
- Name of extended statistics
+ Name of extended statistics object
</para></entry>
</row>
(references <link linkend="catalog-pg-authid"><structname>pg_authid</structname></link>.<structfield>rolname</structfield>)
</para>
<para>
- Owner of the extended statistics
+ Owner of the extended statistics object
</para></entry>
</row>
(references <link linkend="catalog-pg-attribute"><structname>pg_attribute</structname></link>.<structfield>attname</structfield>)
</para>
<para>
- Names of the columns the extended statistics is defined on
+ Names of the columns the extended statistics object is defined on
</para></entry>
</row>
<structfield>kinds</structfield> <type>char[]</type>
</para>
<para>
- Types of extended statistics enabled for this record
+ Types of extended statistics object enabled for this record
</para></entry>
</row>
</sect1>
+ <sect1 id="view-pg-stats-ext-exprs">
+ <title><structname>pg_stats_ext_exprs</structname></title>
+
+ <indexterm zone="view-pg-stats-ext-exprs">
+ <primary>pg_stats_ext_exprs</primary>
+ </indexterm>
+
+ <para>
+ The view <structname>pg_stats_ext_exprs</structname> provides access to
+ information about all expressions included in extended statistics objects,
+ combining information stored in the <link
+ linkend="catalog-pg-statistic-ext"><structname>pg_statistic_ext</structname></link>
+ and <link linkend="catalog-pg-statistic-ext-data"><structname>pg_statistic_ext_data</structname></link>
+ catalogs. This view allows access only to rows of
+ <link linkend="catalog-pg-statistic-ext"><structname>pg_statistic_ext</structname></link> and <link linkend="catalog-pg-statistic-ext-data"><structname>pg_statistic_ext_data</structname></link>
+ that correspond to tables the user has permission to read, and therefore
+ it is safe to allow public read access to this view.
+ </para>
+
+ <para>
+ <structname>pg_stats_ext_exprs</structname> is also designed to present
+ the information in a more readable format than the underlying catalogs
+ — at the cost that its schema must be extended whenever the structure
+ of statistics in <link linkend="catalog-pg-statistic"><structname>pg_statistic</structname></link> changes.
+ </para>
+
+ <table>
+ <title><structname>pg_stats_ext_exprs</structname> Columns</title>
+ <tgroup cols="1">
+ <thead>
+ <row>
+ <entry role="catalog_table_entry"><para role="column_definition">
+ Column Type
+ </para>
+ <para>
+ Description
+ </para></entry>
+ </row>
+ </thead>
+
+ <tbody>
+ <row>
+ <entry role="catalog_table_entry"><para role="column_definition">
+ <structfield>schemaname</structfield> <type>name</type>
+ (references <link linkend="catalog-pg-namespace"><structname>pg_namespace</structname></link>.<structfield>nspname</structfield>)
+ </para>
+ <para>
+ Name of schema containing table
+ </para></entry>
+ </row>
+
+ <row>
+ <entry role="catalog_table_entry"><para role="column_definition">
+ <structfield>tablename</structfield> <type>name</type>
+ (references <link linkend="catalog-pg-class"><structname>pg_class</structname></link>.<structfield>relname</structfield>)
+ </para>
+ <para>
+ Name of table the statistics object is defined on
+ </para></entry>
+ </row>
+
+ <row>
+ <entry role="catalog_table_entry"><para role="column_definition">
+ <structfield>statistics_schemaname</structfield> <type>name</type>
+ (references <link linkend="catalog-pg-namespace"><structname>pg_namespace</structname></link>.<structfield>nspname</structfield>)
+ </para>
+ <para>
+ Name of schema containing extended statistics object
+ </para></entry>
+ </row>
+
+ <row>
+ <entry role="catalog_table_entry"><para role="column_definition">
+ <structfield>statistics_name</structfield> <type>name</type>
+ (references <link linkend="catalog-pg-statistic-ext"><structname>pg_statistic_ext</structname></link>.<structfield>stxname</structfield>)
+ </para>
+ <para>
+ Name of extended statistics object
+ </para></entry>
+ </row>
+
+ <row>
+ <entry role="catalog_table_entry"><para role="column_definition">
+ <structfield>statistics_owner</structfield> <type>name</type>
+ (references <link linkend="catalog-pg-authid"><structname>pg_authid</structname></link>.<structfield>rolname</structfield>)
+ </para>
+ <para>
+ Owner of the extended statistics object
+ </para></entry>
+ </row>
+
+ <row>
+ <entry role="catalog_table_entry"><para role="column_definition">
+ <structfield>expr</structfield> <type>text</type>
+ </para>
+ <para>
+ Expression included in the extended statistics object
+ </para></entry>
+ </row>
+
+ <row>
+ <entry role="catalog_table_entry"><para role="column_definition">
+ <structfield>null_frac</structfield> <type>float4</type>
+ </para>
+ <para>
+ Fraction of expression entries that are null
+ </para></entry>
+ </row>
+
+ <row>
+ <entry role="catalog_table_entry"><para role="column_definition">
+ <structfield>avg_width</structfield> <type>int4</type>
+ </para>
+ <para>
+ Average width in bytes of expression's entries
+ </para></entry>
+ </row>
+
+ <row>
+ <entry role="catalog_table_entry"><para role="column_definition">
+ <structfield>n_distinct</structfield> <type>float4</type>
+ </para>
+ <para>
+ If greater than zero, the estimated number of distinct values in the
+ expression. If less than zero, the negative of the number of distinct
+ values divided by the number of rows. (The negated form is used when
+ <command>ANALYZE</command> believes that the number of distinct values is
+ likely to increase as the table grows; the positive form is used when
+ the expression seems to have a fixed number of possible values.) For
+ example, -1 indicates a unique expression in which the number of distinct
+ values is the same as the number of rows.
+ </para></entry>
+ </row>
+
+ <row>
+ <entry role="catalog_table_entry"><para role="column_definition">
+ <structfield>most_common_vals</structfield> <type>anyarray</type>
+ </para>
+ <para>
+ A list of the most common values in the expression. (Null if
+ no values seem to be more common than any others.)
+ </para></entry>
+ </row>
+
+ <row>
+ <entry role="catalog_table_entry"><para role="column_definition">
+ <structfield>most_common_freqs</structfield> <type>float4[]</type>
+ </para>
+ <para>
+ A list of the frequencies of the most common values,
+ i.e., number of occurrences of each divided by total number of rows.
+ (Null when <structfield>most_common_vals</structfield> is.)
+ </para></entry>
+ </row>
+
+ <row>
+ <entry role="catalog_table_entry"><para role="column_definition">
+ <structfield>histogram_bounds</structfield> <type>anyarray</type>
+ </para>
+ <para>
+ A list of values that divide the expression's values into groups of
+ approximately equal population. The values in
+ <structfield>most_common_vals</structfield>, if present, are omitted from this
+ histogram calculation. (This expression is null if the expression data type
+ does not have a <literal><</literal> operator or if the
+ <structfield>most_common_vals</structfield> list accounts for the entire
+ population.)
+ </para></entry>
+ </row>
+
+ <row>
+ <entry role="catalog_table_entry"><para role="column_definition">
+ <structfield>correlation</structfield> <type>float4</type>
+ </para>
+ <para>
+ Statistical correlation between physical row ordering and
+ logical ordering of the expression values. This ranges from -1 to +1.
+ When the value is near -1 or +1, an index scan on the expression will
+ be estimated to be cheaper than when it is near zero, due to reduction
+ of random access to the disk. (This expression is null if the expression's
+ data type does not have a <literal><</literal> operator.)
+ </para></entry>
+ </row>
+
+ <row>
+ <entry role="catalog_table_entry"><para role="column_definition">
+ <structfield>most_common_elems</structfield> <type>anyarray</type>
+ </para>
+ <para>
+ A list of non-null element values most often appearing within values of
+ the expression. (Null for scalar types.)
+ </para></entry>
+ </row>
+
+ <row>
+ <entry role="catalog_table_entry"><para role="column_definition">
+ <structfield>most_common_elem_freqs</structfield> <type>float4[]</type>
+ </para>
+ <para>
+ A list of the frequencies of the most common element values, i.e., the
+ fraction of rows containing at least one instance of the given value.
+ Two or three additional values follow the per-element frequencies;
+ these are the minimum and maximum of the preceding per-element
+ frequencies, and optionally the frequency of null elements.
+ (Null when <structfield>most_common_elems</structfield> is.)
+ </para></entry>
+ </row>
+
+ <row>
+ <entry role="catalog_table_entry"><para role="column_definition">
+ <structfield>elem_count_histogram</structfield> <type>float4[]</type>
+ </para>
+ <para>
+ A histogram of the counts of distinct non-null element values within the
+ values of the expression, followed by the average number of distinct
+ non-null elements. (Null for scalar types.)
+ </para></entry>
+ </row>
+ </tbody>
+ </tgroup>
+ </table>
+
+ <para>
+ The maximum number of entries in the array fields can be controlled on a
+ column-by-column basis using the <link linkend="sql-altertable"><command>ALTER
+ TABLE SET STATISTICS</command></link> command, or globally by setting the
+ <xref linkend="guc-default-statistics-target"/> run-time parameter.
+ </para>
+
+ </sect1>
+
<sect1 id="view-pg-tables">
<title><structname>pg_tables</structname></title>
<refsynopsisdiv>
<synopsis>
+CREATE STATISTICS [ IF NOT EXISTS ] <replaceable class="parameter">statistics_name</replaceable>
+ ON ( <replaceable class="parameter">expression</replaceable> )
+ FROM <replaceable class="parameter">table_name</replaceable>
+
CREATE STATISTICS [ IF NOT EXISTS ] <replaceable class="parameter">statistics_name</replaceable>
[ ( <replaceable class="parameter">statistics_kind</replaceable> [, ... ] ) ]
- ON <replaceable class="parameter">column_name</replaceable>, <replaceable class="parameter">column_name</replaceable> [, ...]
+ ON { <replaceable class="parameter">column_name</replaceable> | ( <replaceable class="parameter">expression</replaceable> ) }, { <replaceable class="parameter">column_name</replaceable> | ( <replaceable class="parameter">expression</replaceable> ) } [, ...]
FROM <replaceable class="parameter">table_name</replaceable>
</synopsis>
database and will be owned by the user issuing the command.
</para>
+ <para>
+ The <command>CREATE STATISTICS</command> command has two basic forms. The
+ first form allows univariate statistics for a single expression to be
+ collected, providing benefits similar to an expression index without the
+ overhead of index maintenance. This form does not allow the statistics
+ kind to be specified, since the various statistics kinds refer only to
+ multivariate statistics. The second form of the command allows
+ multivariate statistics on multiple columns and/or expressions to be
+ collected, optionally specifying which statistics kinds to include. This
+ form will also automatically cause univariate statistics to be collected on
+ any expressions included in the list.
+ </para>
+
<para>
If a schema name is given (for example, <literal>CREATE STATISTICS
myschema.mystat ...</literal>) then the statistics object is created in the
<term><replaceable class="parameter">statistics_kind</replaceable></term>
<listitem>
<para>
- A statistics kind to be computed in this statistics object.
+ A multivariate statistics kind to be computed in this statistics object.
Currently supported kinds are
<literal>ndistinct</literal>, which enables n-distinct statistics,
<literal>dependencies</literal>, which enables functional
dependency statistics, and <literal>mcv</literal> which enables
most-common values lists.
If this clause is omitted, all supported statistics kinds are
- included in the statistics object.
+ included in the statistics object. Univariate expression statistics are
+ built automatically if the statistics definition includes any complex
+ expressions rather than just simple column references.
For more information, see <xref linkend="planner-stats-extended"/>
and <xref linkend="multivariate-statistics-examples"/>.
</para>
<listitem>
<para>
The name of a table column to be covered by the computed statistics.
- At least two column names must be given; the order of the column names
- is insignificant.
+ This is only allowed when building multivariate statistics. At least
+ two column names or expressions must be specified, and their order is
+ not significant.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term><replaceable class="parameter">expression</replaceable></term>
+ <listitem>
+ <para>
+ An expression to be covered by the computed statistics. This may be
+ used to build univariate statistics on a single expression, or as part
+ of a list of multiple column names and/or expressions to build
+ multivariate statistics. In the latter case, separate univariate
+ statistics are built automatically for each expression in the list.
</para>
</listitem>
</varlistentry>
reading it. Once created, however, the ownership of the statistics
object is independent of the underlying table(s).
</para>
+
+ <para>
+ Expression statistics are per-expression and are similar to creating an
+ index on the expression, except that they avoid the overhead of index
+ maintenance. Expression statistics are built automatically for each
+ expression in the statistics object definition.
+ </para>
</refsect1>
<refsect1 id="sql-createstatistics-examples">
in the table, allowing it to generate better estimates in both cases.
</para>
+ <para>
+ Create table <structname>t3</structname> with a single timestamp column,
+ and run queries using expressions on that column. Without extended
+ statistics, the planner has no information about the data distribution for
+ the expressions, and uses default estimates. The planner also does not
+ realize that the value of the date truncated to the month is fully
+ determined by the value of the date truncated to the day. Then expression
+ and ndistinct statistics are built on those two expressions:
+
+<programlisting>
+CREATE TABLE t3 (
+ a timestamp
+);
+
+INSERT INTO t3 SELECT i FROM generate_series('2020-01-01'::timestamp,
+ '2020-12-31'::timestamp,
+ '1 minute'::interval) s(i);
+
+ANALYZE t3;
+
+-- the number of matching rows will be drastically underestimated:
+EXPLAIN ANALYZE SELECT * FROM t3
+ WHERE date_trunc('month', a) = '2020-01-01'::timestamp;
+
+EXPLAIN ANALYZE SELECT * FROM t3
+ WHERE date_trunc('day', a) BETWEEN '2020-01-01'::timestamp
+ AND '2020-06-30'::timestamp;
+
+EXPLAIN ANALYZE SELECT date_trunc('month', a), date_trunc('day', a)
+ FROM t3 GROUP BY 1, 2;
+
+-- build ndistinct statistics on the pair of expressions (per-expression
+-- statistics are built automatically)
+CREATE STATISTICS s3 (ndistinct) ON date_trunc('month', a), date_trunc('day', a) FROM t3;
+
+ANALYZE t3;
+
+-- now the row count estimates are more accurate:
+EXPLAIN ANALYZE SELECT * FROM t3
+ WHERE date_trunc('month', a) = '2020-01-01'::timestamp;
+
+EXPLAIN ANALYZE SELECT * FROM t3
+ WHERE date_trunc('day', a) BETWEEN '2020-01-01'::timestamp
+ AND '2020-06-30'::timestamp;
+
+EXPLAIN ANALYZE SELECT date_trunc('month', a), date_trunc('day', a)
+ FROM t3 GROUP BY 1, 2;
+</programlisting>
+
+ Without expression and ndistinct statistics, the planner has no information
+ about the number of distinct values for the expressions, and has to rely
+ on default estimates. The equality and range conditions are assumed to have
+ 0.5% selectivity, and the number of distinct values in the expression is
+ assumed to be the same as for the column (i.e. unique). This results in a
+ significant underestimate of the row count in the first two queries. Moreover,
+ the planner has no information about the relationship between the expressions,
+ so it assumes the two <literal>WHERE</literal> and <literal>GROUP BY</literal>
+ conditions are independent, and multiplies their selectivities together to
+ arrive at a severe overestimate of the group count in the aggregate query.
+ This is further exacerbated by the lack of accurate statistics for the
+ expressions, forcing the planner to use a default ndistinct estimate for the
+ expression derived from ndistinct for the column. With such statistics, the
+ planner recognizes that the conditions are correlated, and arrives at much
+ more accurate estimates.
+ </para>
+
</refsect1>
<refsect1>
# Note: the order of this list determines the order in which the catalog
# header files are assembled into postgres.bki. BKI_BOOTSTRAP catalogs
-# must appear first, and there are reputedly other, undocumented ordering
-# dependencies.
+# must appear first, and pg_statistic before pg_statistic_ext_data, and
+# there are reputedly other, undocumented ordering dependencies.
CATALOG_HEADERS := \
pg_proc.h pg_type.h pg_attribute.h pg_class.h \
pg_attrdef.h pg_constraint.h pg_inherits.h pg_index.h pg_operator.h \
pg_opfamily.h pg_opclass.h pg_am.h pg_amop.h pg_amproc.h \
pg_language.h pg_largeobject_metadata.h pg_largeobject.h pg_aggregate.h \
- pg_statistic_ext.h pg_statistic_ext_data.h \
- pg_statistic.h pg_rewrite.h pg_trigger.h pg_event_trigger.h pg_description.h \
+ pg_statistic.h pg_statistic_ext.h pg_statistic_ext_data.h \
+ pg_rewrite.h pg_trigger.h pg_event_trigger.h pg_description.h \
pg_cast.h pg_enum.h pg_namespace.h pg_conversion.h pg_depend.h \
pg_database.h pg_db_role_setting.h pg_tablespace.h \
pg_authid.h pg_auth_members.h pg_shdepend.h pg_shdescription.h \
JOIN pg_attribute a
ON (a.attrelid = s.stxrelid AND a.attnum = k)
) AS attnames,
+ pg_get_statisticsobjdef_expressions(s.oid) as exprs,
s.stxkind AS kinds,
sd.stxdndistinct AS n_distinct,
sd.stxddependencies AS dependencies,
WHERE NOT has_column_privilege(c.oid, a.attnum, 'select') )
AND (c.relrowsecurity = false OR NOT row_security_active(c.oid));
+CREATE VIEW pg_stats_ext_exprs WITH (security_barrier) AS
+ SELECT cn.nspname AS schemaname,
+ c.relname AS tablename,
+ sn.nspname AS statistics_schemaname,
+ s.stxname AS statistics_name,
+ pg_get_userbyid(s.stxowner) AS statistics_owner,
+ stat.expr,
+ (stat.a).stanullfrac AS null_frac,
+ (stat.a).stawidth AS avg_width,
+ (stat.a).stadistinct AS n_distinct,
+ (CASE
+ WHEN (stat.a).stakind1 = 1 THEN (stat.a).stavalues1
+ WHEN (stat.a).stakind2 = 1 THEN (stat.a).stavalues2
+ WHEN (stat.a).stakind3 = 1 THEN (stat.a).stavalues3
+ WHEN (stat.a).stakind4 = 1 THEN (stat.a).stavalues4
+ WHEN (stat.a).stakind5 = 1 THEN (stat.a).stavalues5
+ END) AS most_common_vals,
+ (CASE
+ WHEN (stat.a).stakind1 = 1 THEN (stat.a).stanumbers1
+ WHEN (stat.a).stakind2 = 1 THEN (stat.a).stanumbers2
+ WHEN (stat.a).stakind3 = 1 THEN (stat.a).stanumbers3
+ WHEN (stat.a).stakind4 = 1 THEN (stat.a).stanumbers4
+ WHEN (stat.a).stakind5 = 1 THEN (stat.a).stanumbers5
+ END) AS most_common_freqs,
+ (CASE
+ WHEN (stat.a).stakind1 = 2 THEN (stat.a).stavalues1
+ WHEN (stat.a).stakind2 = 2 THEN (stat.a).stavalues2
+ WHEN (stat.a).stakind3 = 2 THEN (stat.a).stavalues3
+ WHEN (stat.a).stakind4 = 2 THEN (stat.a).stavalues4
+ WHEN (stat.a).stakind5 = 2 THEN (stat.a).stavalues5
+ END) AS histogram_bounds,
+ (CASE
+ WHEN (stat.a).stakind1 = 3 THEN (stat.a).stanumbers1[1]
+ WHEN (stat.a).stakind2 = 3 THEN (stat.a).stanumbers2[1]
+ WHEN (stat.a).stakind3 = 3 THEN (stat.a).stanumbers3[1]
+ WHEN (stat.a).stakind4 = 3 THEN (stat.a).stanumbers4[1]
+ WHEN (stat.a).stakind5 = 3 THEN (stat.a).stanumbers5[1]
+ END) correlation,
+ (CASE
+ WHEN (stat.a).stakind1 = 4 THEN (stat.a).stavalues1
+ WHEN (stat.a).stakind2 = 4 THEN (stat.a).stavalues2
+ WHEN (stat.a).stakind3 = 4 THEN (stat.a).stavalues3
+ WHEN (stat.a).stakind4 = 4 THEN (stat.a).stavalues4
+ WHEN (stat.a).stakind5 = 4 THEN (stat.a).stavalues5
+ END) AS most_common_elems,
+ (CASE
+ WHEN (stat.a).stakind1 = 4 THEN (stat.a).stanumbers1
+ WHEN (stat.a).stakind2 = 4 THEN (stat.a).stanumbers2
+ WHEN (stat.a).stakind3 = 4 THEN (stat.a).stanumbers3
+ WHEN (stat.a).stakind4 = 4 THEN (stat.a).stanumbers4
+ WHEN (stat.a).stakind5 = 4 THEN (stat.a).stanumbers5
+ END) AS most_common_elem_freqs,
+ (CASE
+ WHEN (stat.a).stakind1 = 5 THEN (stat.a).stanumbers1
+ WHEN (stat.a).stakind2 = 5 THEN (stat.a).stanumbers2
+ WHEN (stat.a).stakind3 = 5 THEN (stat.a).stanumbers3
+ WHEN (stat.a).stakind4 = 5 THEN (stat.a).stanumbers4
+ WHEN (stat.a).stakind5 = 5 THEN (stat.a).stanumbers5
+ END) AS elem_count_histogram
+ FROM pg_statistic_ext s JOIN pg_class c ON (c.oid = s.stxrelid)
+ LEFT JOIN pg_statistic_ext_data sd ON (s.oid = sd.stxoid)
+ LEFT JOIN pg_namespace cn ON (cn.oid = c.relnamespace)
+ LEFT JOIN pg_namespace sn ON (sn.oid = s.stxnamespace)
+ JOIN LATERAL (
+ SELECT unnest(pg_get_statisticsobjdef_expressions(s.oid)) AS expr,
+ unnest(sd.stxdexpr)::pg_statistic AS a
+ ) stat ON (stat.expr IS NOT NULL);
+
-- unprivileged users may read pg_statistic_ext but not pg_statistic_ext_data
REVOKE ALL on pg_statistic_ext_data FROM public;
#include "commands/comment.h"
#include "commands/defrem.h"
#include "miscadmin.h"
+#include "nodes/nodeFuncs.h"
+#include "optimizer/optimizer.h"
#include "statistics/statistics.h"
#include "utils/builtins.h"
#include "utils/fmgroids.h"
CreateStatistics(CreateStatsStmt *stmt)
{
int16 attnums[STATS_MAX_DIMENSIONS];
- int numcols = 0;
+ int nattnums = 0;
+ int numcols;
char *namestr;
NameData stxname;
Oid statoid;
Datum datavalues[Natts_pg_statistic_ext_data];
bool datanulls[Natts_pg_statistic_ext_data];
int2vector *stxkeys;
+ List *stxexprs = NIL;
+ Datum exprsDatum;
Relation statrel;
Relation datarel;
Relation rel = NULL;
Oid relid;
ObjectAddress parentobject,
myself;
- Datum types[3]; /* one for each possible type of statistic */
+ Datum types[4]; /* one for each possible type of statistic */
int ntypes;
ArrayType *stxkind;
bool build_ndistinct;
bool build_dependencies;
bool build_mcv;
+ bool build_expressions;
bool requested_type = false;
int i;
ListCell *cell;
+ ListCell *cell2;
Assert(IsA(stmt, CreateStatsStmt));
}
/*
- * Currently, we only allow simple column references in the expression
- * list. That will change someday, and again the grammar already supports
- * it so we have to enforce restrictions here. For now, we can convert
- * the expression list to a simple array of attnums. While at it, enforce
- * some constraints.
+ * Make sure no more than STATS_MAX_DIMENSIONS columns are used. There
+ * might be duplicates and so on, but we'll deal with those later.
+ */
+ numcols = list_length(stmt->exprs);
+ if (numcols > STATS_MAX_DIMENSIONS)
+ ereport(ERROR,
+ (errcode(ERRCODE_TOO_MANY_COLUMNS),
+ errmsg("cannot have more than %d columns in statistics",
+ STATS_MAX_DIMENSIONS)));
+
+ /*
+ * Convert the expression list to a simple array of attnums, but also keep
+ * a list of more complex expressions. While at it, enforce some
+ * constraints.
+ *
+ * XXX We do only the bare minimum to separate simple attribute and
+ * complex expressions - for example "(a)" will be treated as a complex
+ * expression. No matter how elaborate the check is, there'll always be a
+ * way around it, if the user is determined (consider e.g. "(a+0)"), so
+ * it's not worth protecting against it.
*/
foreach(cell, stmt->exprs)
{
Node *expr = (Node *) lfirst(cell);
- ColumnRef *cref;
- char *attname;
+ StatsElem *selem;
HeapTuple atttuple;
Form_pg_attribute attForm;
TypeCacheEntry *type;
- if (!IsA(expr, ColumnRef))
- ereport(ERROR,
- (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
- errmsg("only simple column references are allowed in CREATE STATISTICS")));
- cref = (ColumnRef *) expr;
-
- if (list_length(cref->fields) != 1)
- ereport(ERROR,
- (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
- errmsg("only simple column references are allowed in CREATE STATISTICS")));
- attname = strVal((Value *) linitial(cref->fields));
-
- atttuple = SearchSysCacheAttName(relid, attname);
- if (!HeapTupleIsValid(atttuple))
- ereport(ERROR,
- (errcode(ERRCODE_UNDEFINED_COLUMN),
- errmsg("column \"%s\" does not exist",
- attname)));
- attForm = (Form_pg_attribute) GETSTRUCT(atttuple);
-
- /* Disallow use of system attributes in extended stats */
- if (attForm->attnum <= 0)
- ereport(ERROR,
- (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
- errmsg("statistics creation on system columns is not supported")));
-
- /* Disallow data types without a less-than operator */
- type = lookup_type_cache(attForm->atttypid, TYPECACHE_LT_OPR);
- if (type->lt_opr == InvalidOid)
+ /*
+ * We should not get anything else than StatsElem, given the grammar.
+ * But let's keep it as a safety.
+ */
+ if (!IsA(expr, StatsElem))
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
- errmsg("column \"%s\" cannot be used in statistics because its type %s has no default btree operator class",
- attname, format_type_be(attForm->atttypid))));
+ errmsg("only simple column references and expressions are allowed in CREATE STATISTICS")));
- /* Make sure no more than STATS_MAX_DIMENSIONS columns are used */
- if (numcols >= STATS_MAX_DIMENSIONS)
- ereport(ERROR,
- (errcode(ERRCODE_TOO_MANY_COLUMNS),
- errmsg("cannot have more than %d columns in statistics",
- STATS_MAX_DIMENSIONS)));
+ selem = (StatsElem *) expr;
- attnums[numcols] = attForm->attnum;
- numcols++;
- ReleaseSysCache(atttuple);
+ if (selem->name) /* column reference */
+ {
+ char *attname;
+
+ attname = selem->name;
+
+ atttuple = SearchSysCacheAttName(relid, attname);
+ if (!HeapTupleIsValid(atttuple))
+ ereport(ERROR,
+ (errcode(ERRCODE_UNDEFINED_COLUMN),
+ errmsg("column \"%s\" does not exist",
+ attname)));
+ attForm = (Form_pg_attribute) GETSTRUCT(atttuple);
+
+ /* Disallow use of system attributes in extended stats */
+ if (attForm->attnum <= 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("statistics creation on system columns is not supported")));
+
+ /* Disallow data types without a less-than operator */
+ type = lookup_type_cache(attForm->atttypid, TYPECACHE_LT_OPR);
+ if (type->lt_opr == InvalidOid)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("column \"%s\" cannot be used in statistics because its type %s has no default btree operator class",
+ attname, format_type_be(attForm->atttypid))));
+
+ attnums[nattnums] = attForm->attnum;
+ nattnums++;
+ ReleaseSysCache(atttuple);
+ }
+ else /* expression */
+ {
+ Node *expr = selem->expr;
+ Oid atttype;
+
+ Assert(expr != NULL);
+
+ /*
+ * Disallow data types without a less-than operator.
+ *
+ * We ignore this for statistics on a single expression, in which
+ * case we'll build the regular statistics only (and that code can
+ * deal with such data types).
+ */
+ if (list_length(stmt->exprs) > 1)
+ {
+ atttype = exprType(expr);
+ type = lookup_type_cache(atttype, TYPECACHE_LT_OPR);
+ if (type->lt_opr == InvalidOid)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("expression cannot be used in multivariate statistics because its type %s has no default btree operator class",
+ format_type_be(atttype))));
+ }
+
+ stxexprs = lappend(stxexprs, expr);
+ }
}
/*
- * Check that at least two columns were specified in the statement. The
- * upper bound was already checked in the loop above.
- */
- if (numcols < 2)
- ereport(ERROR,
- (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
- errmsg("extended statistics require at least 2 columns")));
-
- /*
- * Sort the attnums, which makes detecting duplicates somewhat easier, and
- * it does not hurt (it does not affect the efficiency, unlike for
- * indexes, for example).
- */
- qsort(attnums, numcols, sizeof(int16), compare_int16);
-
- /*
- * Check for duplicates in the list of columns. The attnums are sorted so
- * just check consecutive elements.
+ * Parse the statistics kinds.
+ *
+ * First check that if this is the case with a single expression, there
+ * are no statistics kinds specified (we don't allow that for the simple
+ * CREATE STATISTICS form).
*/
- for (i = 1; i < numcols; i++)
+ if ((list_length(stmt->exprs) == 1) && (list_length(stxexprs) == 1))
{
- if (attnums[i] == attnums[i - 1])
+ /* statistics kinds not specified */
+ if (list_length(stmt->stat_types) > 0)
ereport(ERROR,
- (errcode(ERRCODE_DUPLICATE_COLUMN),
- errmsg("duplicate column name in statistics definition")));
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("when building statistics on a single expression, statistics kinds may not be specified")));
}
- /* Form an int2vector representation of the sorted column list */
- stxkeys = buildint2vector(attnums, numcols);
-
- /*
- * Parse the statistics kinds.
- */
+ /* OK, let's check that we recognize the statistics kinds. */
build_ndistinct = false;
build_dependencies = false;
build_mcv = false;
errmsg("unrecognized statistics kind \"%s\"",
type)));
}
- /* If no statistic type was specified, build them all. */
- if (!requested_type)
+
+ /*
+ * If no statistic type was specified, build them all (but only when the
+ * statistics is defined on more than one column/expression).
+ */
+ if ((!requested_type) && (numcols >= 2))
{
build_ndistinct = true;
build_dependencies = true;
build_mcv = true;
}
+ /*
+ * When there are non-trivial expressions, build the expression stats
+ * automatically. This allows calculating good estimates for stats that
+ * consider per-clause estimates (e.g. functional dependencies).
+ */
+ build_expressions = (list_length(stxexprs) > 0);
+
+ /*
+ * Check that at least two columns were specified in the statement, or
+ * that we're building statistics on a single expression.
+ */
+ if ((numcols < 2) && (list_length(stxexprs) != 1))
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+ errmsg("extended statistics require at least 2 columns")));
+
+ /*
+ * Sort the attnums, which makes detecting duplicates somewhat easier, and
+ * it does not hurt (it does not matter for the contents, unlike for
+ * indexes, for example).
+ */
+ qsort(attnums, nattnums, sizeof(int16), compare_int16);
+
+ /*
+ * Check for duplicates in the list of columns. The attnums are sorted so
+ * just check consecutive elements.
+ */
+ for (i = 1; i < nattnums; i++)
+ {
+ if (attnums[i] == attnums[i - 1])
+ ereport(ERROR,
+ (errcode(ERRCODE_DUPLICATE_COLUMN),
+ errmsg("duplicate column name in statistics definition")));
+ }
+
+ /*
+ * Check for duplicate expressions. We do two loops, counting the
+ * occurrences of each expression. This is O(N^2) but we only allow small
+ * number of expressions and it's not executed often.
+ *
+ * XXX We don't cross-check attributes and expressions, because it does
+ * not seem worth it. In principle we could check that expressions don't
+ * contain trivial attribute references like "(a)", but the reasoning is
+ * similar to why we don't bother with extracting columns from
+ * expressions. It's either expensive or very easy to defeat for
+ * determined user, and there's no risk if we allow such statistics (the
+ * statistics is useless, but harmless).
+ */
+ foreach(cell, stxexprs)
+ {
+ Node *expr1 = (Node *) lfirst(cell);
+ int cnt = 0;
+
+ foreach(cell2, stxexprs)
+ {
+ Node *expr2 = (Node *) lfirst(cell2);
+
+ if (equal(expr1, expr2))
+ cnt += 1;
+ }
+
+ /* every expression should find at least itself */
+ Assert(cnt >= 1);
+
+ if (cnt > 1)
+ ereport(ERROR,
+ (errcode(ERRCODE_DUPLICATE_COLUMN),
+ errmsg("duplicate expression in statistics definition")));
+ }
+
+ /* Form an int2vector representation of the sorted column list */
+ stxkeys = buildint2vector(attnums, nattnums);
+
/* construct the char array of enabled statistic types */
ntypes = 0;
if (build_ndistinct)
types[ntypes++] = CharGetDatum(STATS_EXT_DEPENDENCIES);
if (build_mcv)
types[ntypes++] = CharGetDatum(STATS_EXT_MCV);
+ if (build_expressions)
+ types[ntypes++] = CharGetDatum(STATS_EXT_EXPRESSIONS);
Assert(ntypes > 0 && ntypes <= lengthof(types));
stxkind = construct_array(types, ntypes, CHAROID, 1, true, TYPALIGN_CHAR);
+ /* convert the expressions (if any) to a text datum */
+ if (stxexprs != NIL)
+ {
+ char *exprsString;
+
+ exprsString = nodeToString(stxexprs);
+ exprsDatum = CStringGetTextDatum(exprsString);
+ pfree(exprsString);
+ }
+ else
+ exprsDatum = (Datum) 0;
+
statrel = table_open(StatisticExtRelationId, RowExclusiveLock);
/*
values[Anum_pg_statistic_ext_stxkeys - 1] = PointerGetDatum(stxkeys);
values[Anum_pg_statistic_ext_stxkind - 1] = PointerGetDatum(stxkind);
+ values[Anum_pg_statistic_ext_stxexprs - 1] = exprsDatum;
+ if (exprsDatum == (Datum) 0)
+ nulls[Anum_pg_statistic_ext_stxexprs - 1] = true;
+
/* insert it into pg_statistic_ext */
htup = heap_form_tuple(statrel->rd_att, values, nulls);
CatalogTupleInsert(statrel, htup);
datanulls[Anum_pg_statistic_ext_data_stxdndistinct - 1] = true;
datanulls[Anum_pg_statistic_ext_data_stxddependencies - 1] = true;
datanulls[Anum_pg_statistic_ext_data_stxdmcv - 1] = true;
+ datanulls[Anum_pg_statistic_ext_data_stxdexpr - 1] = true;
/* insert it into pg_statistic_ext_data */
htup = heap_form_tuple(datarel->rd_att, datavalues, datanulls);
*/
ObjectAddressSet(myself, StatisticExtRelationId, statoid);
- for (i = 0; i < numcols; i++)
+ /* add dependencies for plain column references */
+ for (i = 0; i < nattnums; i++)
{
ObjectAddressSubSet(parentobject, RelationRelationId, relid, attnums[i]);
recordDependencyOn(&myself, &parentobject, DEPENDENCY_AUTO);
}
+ /*
+ * If there are no dependencies on a column, give the statistics an auto
+ * dependency on the whole table. In most cases, this will be redundant,
+ * but it might not be if the statistics expressions contain no Vars
+ * (which might seem strange but possible). This is consistent with what
+ * we do for indexes in index_create.
+ *
+ * XXX We intentionally don't consider the expressions before adding this
+ * dependency, because recordDependencyOnSingleRelExpr may not create any
+ * dependencies for whole-row Vars.
+ */
+ if (!nattnums)
+ {
+ ObjectAddressSet(parentobject, RelationRelationId, relid);
+ recordDependencyOn(&myself, &parentobject, DEPENDENCY_AUTO);
+ }
+
+ /*
+ * Store dependencies on anything mentioned in statistics expressions,
+ * just like we do for index expressions.
+ */
+ if (stxexprs)
+ recordDependencyOnSingleRelExpr(&myself,
+ (Node *) stxexprs,
+ relid,
+ DEPENDENCY_NORMAL,
+ DEPENDENCY_AUTO, false, true);
+
/*
* Also add dependencies on namespace and owner. These are required
* because the stats object might have a different namespace and/or owner
table_close(relation, RowExclusiveLock);
}
-/*
- * Update a statistics object for ALTER COLUMN TYPE on a source column.
- *
- * This could throw an error if the type change can't be supported.
- * If it can be supported, but the stats must be recomputed, a likely choice
- * would be to set the relevant column(s) of the pg_statistic_ext_data tuple
- * to null until the next ANALYZE. (Note that the type change hasn't actually
- * happened yet, so one option that's *not* on the table is to recompute
- * immediately.)
- *
- * For both ndistinct and functional-dependencies stats, the on-disk
- * representation is independent of the source column data types, and it is
- * plausible to assume that the old statistic values will still be good for
- * the new column contents. (Obviously, if the ALTER COLUMN TYPE has a USING
- * expression that substantially alters the semantic meaning of the column
- * values, this assumption could fail. But that seems like a corner case
- * that doesn't justify zapping the stats in common cases.)
- *
- * For MCV lists that's not the case, as those statistics store the datums
- * internally. In this case we simply reset the statistics value to NULL.
- *
- * Note that "type change" includes collation change, which means we can rely
- * on the MCV list being consistent with the collation info in pg_attribute
- * during estimation.
- */
-void
-UpdateStatisticsForTypeChange(Oid statsOid, Oid relationOid, int attnum,
- Oid oldColumnType, Oid newColumnType)
-{
- HeapTuple stup,
- oldtup;
-
- Relation rel;
-
- Datum values[Natts_pg_statistic_ext_data];
- bool nulls[Natts_pg_statistic_ext_data];
- bool replaces[Natts_pg_statistic_ext_data];
-
- oldtup = SearchSysCache1(STATEXTDATASTXOID, ObjectIdGetDatum(statsOid));
- if (!HeapTupleIsValid(oldtup))
- elog(ERROR, "cache lookup failed for statistics object %u", statsOid);
-
- /*
- * When none of the defined statistics types contain datum values from the
- * table's columns then there's no need to reset the stats. Functional
- * dependencies and ndistinct stats should still hold true.
- */
- if (!statext_is_kind_built(oldtup, STATS_EXT_MCV))
- {
- ReleaseSysCache(oldtup);
- return;
- }
-
- /*
- * OK, we need to reset some statistics. So let's build the new tuple,
- * replacing the affected statistics types with NULL.
- */
- memset(nulls, 0, Natts_pg_statistic_ext_data * sizeof(bool));
- memset(replaces, 0, Natts_pg_statistic_ext_data * sizeof(bool));
- memset(values, 0, Natts_pg_statistic_ext_data * sizeof(Datum));
-
- replaces[Anum_pg_statistic_ext_data_stxdmcv - 1] = true;
- nulls[Anum_pg_statistic_ext_data_stxdmcv - 1] = true;
-
- rel = table_open(StatisticExtDataRelationId, RowExclusiveLock);
-
- /* replace the old tuple */
- stup = heap_modify_tuple(oldtup,
- RelationGetDescr(rel),
- values,
- nulls,
- replaces);
-
- ReleaseSysCache(oldtup);
- CatalogTupleUpdate(rel, &stup->t_self, stup);
-
- heap_freetuple(stup);
-
- table_close(rel, RowExclusiveLock);
-}
-
/*
* Select a nonconflicting name for a new statistics.
*
buf[0] = '\0';
foreach(lc, exprs)
{
- ColumnRef *cref = (ColumnRef *) lfirst(lc);
+ StatsElem *selem = (StatsElem *) lfirst(lc);
const char *name;
/* It should be one of these, but just skip if it happens not to be */
- if (!IsA(cref, ColumnRef))
+ if (!IsA(selem, StatsElem))
continue;
- name = strVal((Value *) linitial(cref->fields));
+ name = selem->name;
if (buflen > 0)
buf[buflen++] = '_'; /* insert _ between names */
+ /*
+ * We use fixed 'expr' for expressions, which have empty column names.
+ * For indexes this is handled in ChooseIndexColumnNames, but we have
+ * no such function for stats and it does not seem worth adding. If a
+ * better name is needed, the user can specify it explicitly.
+ */
+ if (!name)
+ name = "expr";
+
/*
* At this point we have buflen <= NAMEDATALEN. name should be less
* than NAMEDATALEN already, but use strlcpy for paranoia.
}
return pstrdup(buf);
}
+
+/*
+ * StatisticsGetRelation: given a statistics's relation OID, get the OID of
+ * the relation it is an statistics on. Uses the system cache.
+ */
+Oid
+StatisticsGetRelation(Oid statId, bool missing_ok)
+{
+ HeapTuple tuple;
+ Form_pg_statistic_ext stx;
+ Oid result;
+
+ tuple = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(statId));
+ if (!HeapTupleIsValid(tuple))
+ {
+ if (missing_ok)
+ return InvalidOid;
+ elog(ERROR, "cache lookup failed for statistics object %u", statId);
+ }
+ stx = (Form_pg_statistic_ext) GETSTRUCT(tuple);
+ Assert(stx->oid == statId);
+
+ result = stx->stxrelid;
+ ReleaseSysCache(tuple);
+ return result;
+}
#include "catalog/pg_namespace.h"
#include "catalog/pg_opclass.h"
#include "catalog/pg_tablespace.h"
+#include "catalog/pg_statistic_ext.h"
#include "catalog/pg_trigger.h"
#include "catalog/pg_type.h"
#include "catalog/storage.h"
List *changedIndexDefs; /* string definitions of same */
char *replicaIdentityIndex; /* index to reset as REPLICA IDENTITY */
char *clusterOnIndex; /* index to use for CLUSTER */
+ List *changedStatisticsOids; /* OIDs of statistics to rebuild */
+ List *changedStatisticsDefs; /* string definitions of same */
} AlteredTableInfo;
/* Struct describing one new constraint to check in Phase 3 scan */
ObjectAddresses *addrs);
static ObjectAddress ATExecAddIndex(AlteredTableInfo *tab, Relation rel,
IndexStmt *stmt, bool is_rebuild, LOCKMODE lockmode);
+static ObjectAddress ATExecAddStatistics(AlteredTableInfo *tab, Relation rel,
+ CreateStatsStmt *stmt, bool is_rebuild, LOCKMODE lockmode);
static ObjectAddress ATExecAddConstraint(List **wqueue,
AlteredTableInfo *tab, Relation rel,
Constraint *newConstraint, bool recurse, bool is_readd,
AlterTableCmd *cmd, LOCKMODE lockmode);
static void RememberConstraintForRebuilding(Oid conoid, AlteredTableInfo *tab);
static void RememberIndexForRebuilding(Oid indoid, AlteredTableInfo *tab);
+static void RememberStatisticsForRebuilding(Oid indoid, AlteredTableInfo *tab);
static void ATPostAlterTypeCleanup(List **wqueue, AlteredTableInfo *tab,
LOCKMODE lockmode);
static void ATPostAlterTypeParse(Oid oldId, Oid oldRelId, Oid refRelId,
address = ATExecAddIndex(tab, rel, (IndexStmt *) cmd->def, true,
lockmode);
break;
+ case AT_ReAddStatistics: /* ADD STATISTICS */
+ address = ATExecAddStatistics(tab, rel, (CreateStatsStmt *) cmd->def,
+ true, lockmode);
+ break;
case AT_AddConstraint: /* ADD CONSTRAINT */
/* Transform the command only during initial examination */
if (cur_pass == AT_PASS_ADD_CONSTR)
return address;
}
+/*
+ * ALTER TABLE ADD STATISTICS
+ *
+ * This is no such command in the grammar, but we use this internally to add
+ * AT_ReAddStatistics subcommands to rebuild extended statistics after a table
+ * column type change.
+ */
+static ObjectAddress
+ATExecAddStatistics(AlteredTableInfo *tab, Relation rel,
+ CreateStatsStmt *stmt, bool is_rebuild, LOCKMODE lockmode)
+{
+ ObjectAddress address;
+
+ Assert(IsA(stmt, CreateStatsStmt));
+
+ /* The CreateStatsStmt has already been through transformStatsStmt */
+ Assert(stmt->transformed);
+
+ address = CreateStatistics(stmt);
+
+ return address;
+}
+
/*
* ALTER TABLE ADD CONSTRAINT USING INDEX
*
* Give the extended-stats machinery a chance to fix anything
* that this column type change would break.
*/
- UpdateStatisticsForTypeChange(foundObject.objectId,
- RelationGetRelid(rel), attnum,
- attTup->atttypid, targettype);
+ RememberStatisticsForRebuilding(foundObject.objectId, tab);
break;
case OCLASS_PROC:
}
}
+/*
+ * Subroutine for ATExecAlterColumnType: remember that a statistics object
+ * needs to be rebuilt (which we might already know).
+ */
+static void
+RememberStatisticsForRebuilding(Oid stxoid, AlteredTableInfo *tab)
+{
+ /*
+ * This de-duplication check is critical for two independent reasons: we
+ * mustn't try to recreate the same statistics object twice, and if the
+ * statistics depends on more than one column whose type is to be altered,
+ * we must capture its definition string before applying any of the type
+ * changes. ruleutils.c will get confused if we ask again later.
+ */
+ if (!list_member_oid(tab->changedStatisticsOids, stxoid))
+ {
+ /* OK, capture the index's existing definition string */
+ char *defstring = pg_get_statisticsobjdef_string(stxoid);
+
+ tab->changedStatisticsOids = lappend_oid(tab->changedStatisticsOids,
+ stxoid);
+ tab->changedStatisticsDefs = lappend(tab->changedStatisticsDefs,
+ defstring);
+ }
+}
+
/*
* Cleanup after we've finished all the ALTER TYPE operations for a
* particular relation. We have to drop and recreate all the indexes
add_exact_object_address(&obj, objects);
}
+ /* add dependencies for new statistics */
+ forboth(oid_item, tab->changedStatisticsOids,
+ def_item, tab->changedStatisticsDefs)
+ {
+ Oid oldId = lfirst_oid(oid_item);
+ Oid relid;
+
+ relid = StatisticsGetRelation(oldId, false);
+ ATPostAlterTypeParse(oldId, relid, InvalidOid,
+ (char *) lfirst(def_item),
+ wqueue, lockmode, tab->rewrite);
+
+ ObjectAddressSet(obj, StatisticExtRelationId, oldId);
+ add_exact_object_address(&obj, objects);
+ }
+
/*
* Queue up command to restore replica identity index marking
*/
}
/*
- * Parse the previously-saved definition string for a constraint or index
- * against the newly-established column data type(s), and queue up the
- * resulting command parsetrees for execution.
+ * Parse the previously-saved definition string for a constraint, index or
+ * statistics object against the newly-established column data type(s), and
+ * queue up the resulting command parsetrees for execution.
*
* This might fail if, for example, you have a WHERE clause that uses an
* operator that's not available for the new column type.
querytree_list = lappend(querytree_list, stmt);
querytree_list = list_concat(querytree_list, afterStmts);
}
+ else if (IsA(stmt, CreateStatsStmt))
+ querytree_list = lappend(querytree_list,
+ transformStatsStmt(oldRelId,
+ (CreateStatsStmt *) stmt,
+ cmd));
else
querytree_list = lappend(querytree_list, stmt);
}
elog(ERROR, "unexpected statement subtype: %d",
(int) stmt->subtype);
}
+ else if (IsA(stm, CreateStatsStmt))
+ {
+ CreateStatsStmt *stmt = (CreateStatsStmt *) stm;
+ AlterTableCmd *newcmd;
+
+ /* keep the statistics object's comment */
+ stmt->stxcomment = GetComment(oldId, StatisticExtRelationId, 0);
+
+ newcmd = makeNode(AlterTableCmd);
+ newcmd->subtype = AT_ReAddStatistics;
+ newcmd->def = (Node *) stmt;
+ tab->subcmds[AT_PASS_MISC] =
+ lappend(tab->subcmds[AT_PASS_MISC], newcmd);
+ }
else
elog(ERROR, "unexpected statement type: %d",
(int) nodeTag(stm));
return newnode;
}
+static StatsElem *
+_copyStatsElem(const StatsElem *from)
+{
+ StatsElem *newnode = makeNode(StatsElem);
+
+ COPY_STRING_FIELD(name);
+ COPY_NODE_FIELD(expr);
+
+ return newnode;
+}
+
static ColumnDef *
_copyColumnDef(const ColumnDef *from)
{
case T_IndexElem:
retval = _copyIndexElem(from);
break;
+ case T_StatsElem:
+ retval = _copyStatsElem(from);
+ break;
case T_ColumnDef:
retval = _copyColumnDef(from);
break;
return true;
}
+
+static bool
+_equalStatsElem(const StatsElem *a, const StatsElem *b)
+{
+ COMPARE_STRING_FIELD(name);
+ COMPARE_NODE_FIELD(expr);
+
+ return true;
+}
+
static bool
_equalColumnDef(const ColumnDef *a, const ColumnDef *b)
{
case T_IndexElem:
retval = _equalIndexElem(a, b);
break;
+ case T_StatsElem:
+ retval = _equalStatsElem(a, b);
+ break;
case T_ColumnDef:
retval = _equalColumnDef(a, b);
break;
WRITE_ENUM_FIELD(nulls_ordering, SortByNulls);
}
+static void
+_outStatsElem(StringInfo str, const StatsElem *node)
+{
+ WRITE_NODE_TYPE("STATSELEM");
+
+ WRITE_STRING_FIELD(name);
+ WRITE_NODE_FIELD(expr);
+}
+
static void
_outQuery(StringInfo str, const Query *node)
{
case T_IndexElem:
_outIndexElem(str, obj);
break;
+ case T_StatsElem:
+ _outStatsElem(str, obj);
+ break;
case T_Query:
_outQuery(str, obj);
break;
#include "foreign/fdwapi.h"
#include "miscadmin.h"
#include "nodes/makefuncs.h"
+#include "nodes/nodeFuncs.h"
#include "nodes/supportnodes.h"
#include "optimizer/clauses.h"
#include "optimizer/cost.h"
static List *
get_relation_statistics(RelOptInfo *rel, Relation relation)
{
+ Index varno = rel->relid;
List *statoidlist;
List *stainfos = NIL;
ListCell *l;
HeapTuple htup;
HeapTuple dtup;
Bitmapset *keys = NULL;
+ List *exprs = NIL;
int i;
htup = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(statOid));
for (i = 0; i < staForm->stxkeys.dim1; i++)
keys = bms_add_member(keys, staForm->stxkeys.values[i]);
+ /*
+ * Preprocess expressions (if any). We read the expressions, run them
+ * through eval_const_expressions, and fix the varnos.
+ */
+ {
+ bool isnull;
+ Datum datum;
+
+ /* decode expression (if any) */
+ datum = SysCacheGetAttr(STATEXTOID, htup,
+ Anum_pg_statistic_ext_stxexprs, &isnull);
+
+ if (!isnull)
+ {
+ char *exprsString;
+
+ exprsString = TextDatumGetCString(datum);
+ exprs = (List *) stringToNode(exprsString);
+ pfree(exprsString);
+
+ /*
+ * Run the expressions through eval_const_expressions. This is
+ * not just an optimization, but is necessary, because the
+ * planner will be comparing them to similarly-processed qual
+ * clauses, and may fail to detect valid matches without this.
+ * We must not use canonicalize_qual, however, since these
+ * aren't qual expressions.
+ */
+ exprs = (List *) eval_const_expressions(NULL, (Node *) exprs);
+
+ /* May as well fix opfuncids too */
+ fix_opfuncids((Node *) exprs);
+
+ /*
+ * Modify the copies we obtain from the relcache to have the
+ * correct varno for the parent relation, so that they match
+ * up correctly against qual clauses.
+ */
+ if (varno != 1)
+ ChangeVarNodes((Node *) exprs, 1, varno, 0);
+ }
+ }
+
/* add one StatisticExtInfo for each kind built */
if (statext_is_kind_built(dtup, STATS_EXT_NDISTINCT))
{
info->rel = rel;
info->kind = STATS_EXT_NDISTINCT;
info->keys = bms_copy(keys);
+ info->exprs = exprs;
stainfos = lappend(stainfos, info);
}
info->rel = rel;
info->kind = STATS_EXT_DEPENDENCIES;
info->keys = bms_copy(keys);
+ info->exprs = exprs;
stainfos = lappend(stainfos, info);
}
info->rel = rel;
info->kind = STATS_EXT_MCV;
info->keys = bms_copy(keys);
+ info->exprs = exprs;
+
+ stainfos = lappend(stainfos, info);
+ }
+
+ if (statext_is_kind_built(dtup, STATS_EXT_EXPRESSIONS))
+ {
+ StatisticExtInfo *info = makeNode(StatisticExtInfo);
+
+ info->statOid = statOid;
+ info->rel = rel;
+ info->kind = STATS_EXT_EXPRESSIONS;
+ info->keys = bms_copy(keys);
+ info->exprs = exprs;
stainfos = lappend(stainfos, info);
}
WindowDef *windef;
JoinExpr *jexpr;
IndexElem *ielem;
+ StatsElem *selem;
Alias *alias;
RangeVar *range;
IntoClause *into;
old_aggr_definition old_aggr_list
oper_argtypes RuleActionList RuleActionMulti
opt_column_list columnList opt_name_list
- sort_clause opt_sort_clause sortby_list index_params
+ sort_clause opt_sort_clause sortby_list index_params stats_params
opt_include opt_c_include index_including_params
name_list role_list from_clause from_list opt_array_bounds
qualified_name_list any_name any_name_list type_name_list
%type <list> func_alias_clause
%type <sortby> sortby
%type <ielem> index_elem index_elem_options
+%type <selem> stats_param
%type <node> table_ref
%type <jexpr> joined_table
%type <range> relation_expr
CreateStatsStmt:
CREATE STATISTICS any_name
- opt_name_list ON expr_list FROM from_list
+ opt_name_list ON stats_params FROM from_list
{
CreateStatsStmt *n = makeNode(CreateStatsStmt);
n->defnames = $3;
$$ = (Node *)n;
}
| CREATE STATISTICS IF_P NOT EXISTS any_name
- opt_name_list ON expr_list FROM from_list
+ opt_name_list ON stats_params FROM from_list
{
CreateStatsStmt *n = makeNode(CreateStatsStmt);
n->defnames = $6;
}
;
+/*
+ * Statistics attributes can be either simple column references, or arbitrary
+ * expressions in parens. For compatibility with index attributes permitted
+ * in CREATE INDEX, we allow an expression that's just a function call to be
+ * written without parens.
+ */
+
+stats_params: stats_param { $$ = list_make1($1); }
+ | stats_params ',' stats_param { $$ = lappend($1, $3); }
+ ;
+
+stats_param: ColId
+ {
+ $$ = makeNode(StatsElem);
+ $$->name = $1;
+ $$->expr = NULL;
+ }
+ | func_expr_windowless
+ {
+ $$ = makeNode(StatsElem);
+ $$->name = NULL;
+ $$->expr = $1;
+ }
+ | '(' a_expr ')'
+ {
+ $$ = makeNode(StatsElem);
+ $$->name = NULL;
+ $$->expr = $2;
+ }
+ ;
/*****************************************************************************
*
else
err = _("grouping operations are not allowed in index predicates");
+ break;
+ case EXPR_KIND_STATS_EXPRESSION:
+ if (isAgg)
+ err = _("aggregate functions are not allowed in statistics expressions");
+ else
+ err = _("grouping operations are not allowed in statistics expressions");
+
break;
case EXPR_KIND_ALTER_COL_TRANSFORM:
if (isAgg)
case EXPR_KIND_INDEX_EXPRESSION:
err = _("window functions are not allowed in index expressions");
break;
+ case EXPR_KIND_STATS_EXPRESSION:
+ err = _("window functions are not allowed in statistics expressions");
+ break;
case EXPR_KIND_INDEX_PREDICATE:
err = _("window functions are not allowed in index predicates");
break;
case EXPR_KIND_FUNCTION_DEFAULT:
case EXPR_KIND_INDEX_EXPRESSION:
case EXPR_KIND_INDEX_PREDICATE:
+ case EXPR_KIND_STATS_EXPRESSION:
case EXPR_KIND_ALTER_COL_TRANSFORM:
case EXPR_KIND_EXECUTE_PARAMETER:
case EXPR_KIND_TRIGGER_WHEN:
case EXPR_KIND_INDEX_PREDICATE:
err = _("cannot use subquery in index predicate");
break;
+ case EXPR_KIND_STATS_EXPRESSION:
+ err = _("cannot use subquery in statistics expression");
+ break;
case EXPR_KIND_ALTER_COL_TRANSFORM:
err = _("cannot use subquery in transform expression");
break;
return "index expression";
case EXPR_KIND_INDEX_PREDICATE:
return "index predicate";
+ case EXPR_KIND_STATS_EXPRESSION:
+ return "statistics expression";
case EXPR_KIND_ALTER_COL_TRANSFORM:
return "USING";
case EXPR_KIND_EXECUTE_PARAMETER:
case EXPR_KIND_INDEX_PREDICATE:
err = _("set-returning functions are not allowed in index predicates");
break;
+ case EXPR_KIND_STATS_EXPRESSION:
+ err = _("set-returning functions are not allowed in statistics expressions");
+ break;
case EXPR_KIND_ALTER_COL_TRANSFORM:
err = _("set-returning functions are not allowed in transform expressions");
break;
stat_types = lappend(stat_types, makeString("dependencies"));
else if (enabled[i] == STATS_EXT_MCV)
stat_types = lappend(stat_types, makeString("mcv"));
+ else if (enabled[i] == STATS_EXT_EXPRESSIONS)
+ /* expression stats are not exposed to users */
+ continue;
else
elog(ERROR, "unrecognized statistics kind %c", enabled[i]);
}
/* Determine which columns the statistics are on */
for (i = 0; i < statsrec->stxkeys.dim1; i++)
{
- ColumnRef *cref = makeNode(ColumnRef);
+ StatsElem *selem = makeNode(StatsElem);
AttrNumber attnum = statsrec->stxkeys.values[i];
- cref->fields = list_make1(makeString(get_attname(heapRelid,
- attnum, false)));
- cref->location = -1;
+ selem->name = get_attname(heapRelid, attnum, false);
+ selem->expr = NULL;
- def_names = lappend(def_names, cref);
+ def_names = lappend(def_names, selem);
+ }
+
+ /*
+ * Now handle expressions, if there are any. The order (with respect to
+ * regular attributes) does not really matter for extended stats, so we
+ * simply append them after simple column references.
+ *
+ * XXX Some places during build/estimation treat expressions as if they
+ * are before atttibutes, but for the CREATE command that's entirely
+ * irrelevant.
+ */
+ datum = SysCacheGetAttr(STATEXTOID, ht_stats,
+ Anum_pg_statistic_ext_stxexprs, &isnull);
+
+ if (!isnull)
+ {
+ ListCell *lc;
+ List *exprs = NIL;
+ char *exprsString;
+
+ exprsString = TextDatumGetCString(datum);
+ exprs = (List *) stringToNode(exprsString);
+
+ foreach(lc, exprs)
+ {
+ StatsElem *selem = makeNode(StatsElem);
+
+ selem->name = NULL;
+ selem->expr = (Node *) lfirst(lc);
+
+ def_names = lappend(def_names, selem);
+ }
+
+ pfree(exprsString);
}
/* finally, build the output node */
stats->relations = list_make1(heapRel);
stats->stxcomment = NULL;
stats->if_not_exists = false;
+ stats->transformed = true; /* don't need transformStatsStmt again */
/* Clean up */
ReleaseSysCache(ht_stats);
return stmt;
}
+/*
+ * transformStatsStmt - parse analysis for CREATE STATISTICS
+ *
+ * To avoid race conditions, it's important that this function rely only on
+ * the passed-in relid (and not on stmt->relation) to determine the target
+ * relation.
+ */
+CreateStatsStmt *
+transformStatsStmt(Oid relid, CreateStatsStmt *stmt, const char *queryString)
+{
+ ParseState *pstate;
+ ParseNamespaceItem *nsitem;
+ ListCell *l;
+ Relation rel;
+
+ /* Nothing to do if statement already transformed. */
+ if (stmt->transformed)
+ return stmt;
+
+ /*
+ * We must not scribble on the passed-in CreateStatsStmt, so copy it.
+ * (This is overkill, but easy.)
+ */
+ stmt = copyObject(stmt);
+
+ /* Set up pstate */
+ pstate = make_parsestate(NULL);
+ pstate->p_sourcetext = queryString;
+
+ /*
+ * Put the parent table into the rtable so that the expressions can refer
+ * to its fields without qualification. Caller is responsible for locking
+ * relation, but we still need to open it.
+ */
+ rel = relation_open(relid, NoLock);
+ nsitem = addRangeTableEntryForRelation(pstate, rel,
+ AccessShareLock,
+ NULL, false, true);
+
+ /* no to join list, yes to namespaces */
+ addNSItemToQuery(pstate, nsitem, false, true, true);
+
+ /* take care of any expressions */
+ foreach(l, stmt->exprs)
+ {
+ StatsElem *selem = (StatsElem *) lfirst(l);
+
+ if (selem->expr)
+ {
+ /* Now do parse transformation of the expression */
+ selem->expr = transformExpr(pstate, selem->expr,
+ EXPR_KIND_STATS_EXPRESSION);
+
+ /* We have to fix its collations too */
+ assign_expr_collations(pstate, selem->expr);
+ }
+ }
+
+ /*
+ * Check that only the base rel is mentioned. (This should be dead code
+ * now that add_missing_from is history.)
+ */
+ if (list_length(pstate->p_rtable) != 1)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_COLUMN_REFERENCE),
+ errmsg("statistics expressions can refer only to the table being indexed")));
+
+ free_parsestate(pstate);
+
+ /* Close relation */
+ table_close(rel, NoLock);
+
+ /* Mark statement as successfully transformed */
+ stmt->transformed = true;
+
+ return stmt;
+}
+
/*
* transformRuleStmt -
static DependencyGenerator DependencyGenerator_init(int n, int k);
static void DependencyGenerator_free(DependencyGenerator state);
static AttrNumber *DependencyGenerator_next(DependencyGenerator state);
-static double dependency_degree(int numrows, HeapTuple *rows, int k,
- AttrNumber *dependency, VacAttrStats **stats, Bitmapset *attrs);
+static double dependency_degree(StatsBuildData *data, int k, AttrNumber *dependency);
static bool dependency_is_fully_matched(MVDependency *dependency,
Bitmapset *attnums);
static bool dependency_is_compatible_clause(Node *clause, Index relid,
AttrNumber *attnum);
+static bool dependency_is_compatible_expression(Node *clause, Index relid,
+ List *statlist, Node **expr);
static MVDependency *find_strongest_dependency(MVDependencies **dependencies,
- int ndependencies,
- Bitmapset *attnums);
+ int ndependencies, Bitmapset *attnums);
static Selectivity clauselist_apply_dependencies(PlannerInfo *root, List *clauses,
int varRelid, JoinType jointype,
SpecialJoinInfo *sjinfo,
* the last one.
*/
static double
-dependency_degree(int numrows, HeapTuple *rows, int k, AttrNumber *dependency,
- VacAttrStats **stats, Bitmapset *attrs)
+dependency_degree(StatsBuildData *data, int k, AttrNumber *dependency)
{
int i,
nitems;
MultiSortSupport mss;
SortItem *items;
- AttrNumber *attnums;
AttrNumber *attnums_dep;
- int numattrs;
/* counters valid within a group */
int group_size = 0;
mss = multi_sort_init(k);
/*
- * Transform the attrs from bitmap to an array to make accessing the i-th
- * member easier, and then construct a filtered version with only attnums
- * referenced by the dependency we validate.
+ * Translate the array of indexes to regular attnums for the dependency (we
+ * will need this to identify the columns in StatsBuildData).
*/
- attnums = build_attnums_array(attrs, &numattrs);
-
attnums_dep = (AttrNumber *) palloc(k * sizeof(AttrNumber));
for (i = 0; i < k; i++)
- attnums_dep[i] = attnums[dependency[i]];
+ attnums_dep[i] = data->attnums[dependency[i]];
/*
* Verify the dependency (a,b,...)->z, using a rather simple algorithm:
/* prepare the sort function for the dimensions */
for (i = 0; i < k; i++)
{
- VacAttrStats *colstat = stats[dependency[i]];
+ VacAttrStats *colstat = data->stats[dependency[i]];
TypeCacheEntry *type;
type = lookup_type_cache(colstat->attrtypid, TYPECACHE_LT_OPR);
* descriptor. For now that assumption holds, but it might change in the
* future for example if we support statistics on multiple tables.
*/
- items = build_sorted_items(numrows, &nitems, rows, stats[0]->tupDesc,
- mss, k, attnums_dep);
+ items = build_sorted_items(data, &nitems, mss, k, attnums_dep);
/*
* Walk through the sorted array, split it into rows according to the
pfree(items);
pfree(mss);
- pfree(attnums);
pfree(attnums_dep);
/* Compute the 'degree of validity' as (supporting/total). */
- return (n_supporting_rows * 1.0 / numrows);
+ return (n_supporting_rows * 1.0 / data->numrows);
}
/*
* (c) -> b
*/
MVDependencies *
-statext_dependencies_build(int numrows, HeapTuple *rows, Bitmapset *attrs,
- VacAttrStats **stats)
+statext_dependencies_build(StatsBuildData *data)
{
int i,
k;
- int numattrs;
- AttrNumber *attnums;
/* result */
MVDependencies *dependencies = NULL;
- /*
- * Transform the bms into an array, to make accessing i-th member easier.
- */
- attnums = build_attnums_array(attrs, &numattrs);
-
- Assert(numattrs >= 2);
+ Assert(data->nattnums >= 2);
/*
* We'll try build functional dependencies starting from the smallest ones
* included in the statistics object. We start from the smallest ones
* because we want to be able to skip already implied ones.
*/
- for (k = 2; k <= numattrs; k++)
+ for (k = 2; k <= data->nattnums; k++)
{
AttrNumber *dependency; /* array with k elements */
/* prepare a DependencyGenerator of variation */
- DependencyGenerator DependencyGenerator = DependencyGenerator_init(numattrs, k);
+ DependencyGenerator DependencyGenerator = DependencyGenerator_init(data->nattnums, k);
/* generate all possible variations of k values (out of n) */
while ((dependency = DependencyGenerator_next(DependencyGenerator)))
MVDependency *d;
/* compute how valid the dependency seems */
- degree = dependency_degree(numrows, rows, k, dependency, stats, attrs);
+ degree = dependency_degree(data, k, dependency);
/*
* if the dependency seems entirely invalid, don't store it
d->degree = degree;
d->nattributes = k;
for (i = 0; i < k; i++)
- d->attributes[i] = attnums[dependency[i]];
+ d->attributes[i] = data->attnums[dependency[i]];
/* initialize the list of dependencies */
if (dependencies == NULL)
dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum)
{
Var *var;
+ Node *clause_expr;
if (IsA(clause, RestrictInfo))
{
/* Make sure non-selected argument is a pseudoconstant. */
if (is_pseudo_constant_clause(lsecond(expr->args)))
- var = linitial(expr->args);
+ clause_expr = linitial(expr->args);
else if (is_pseudo_constant_clause(linitial(expr->args)))
- var = lsecond(expr->args);
+ clause_expr = lsecond(expr->args);
else
return false;
/*
* Reject ALL() variant, we only care about ANY/IN.
*
- * FIXME Maybe we should check if all the values are the same, and
- * allow ALL in that case? Doesn't seem very practical, though.
+ * XXX Maybe we should check if all the values are the same, and allow
+ * ALL in that case? Doesn't seem very practical, though.
*/
if (!expr->useOr)
return false;
if (!is_pseudo_constant_clause(lsecond(expr->args)))
return false;
- var = linitial(expr->args);
+ clause_expr = linitial(expr->args);
/*
* If it's not an "=" operator, just ignore the clause, as it's not
}
else if (is_orclause(clause))
{
- BoolExpr *expr = (BoolExpr *) clause;
+ BoolExpr *bool_expr = (BoolExpr *) clause;
ListCell *lc;
/* start with no attribute number */
*attnum = InvalidAttrNumber;
- foreach(lc, expr->args)
+ foreach(lc, bool_expr->args)
{
AttrNumber clause_attnum;
if (*attnum == InvalidAttrNumber)
*attnum = clause_attnum;
+ /* ensure all the variables are the same (same attnum) */
if (*attnum != clause_attnum)
return false;
}
* "NOT x" can be interpreted as "x = false", so get the argument and
* proceed with seeing if it's a suitable Var.
*/
- var = (Var *) get_notclausearg(clause);
+ clause_expr = (Node *) get_notclausearg(clause);
}
else
{
* A boolean expression "x" can be interpreted as "x = true", so
* proceed with seeing if it's a suitable Var.
*/
- var = (Var *) clause;
+ clause_expr = (Node *) clause;
}
/*
* We may ignore any RelabelType node above the operand. (There won't be
* more than one, since eval_const_expressions has been applied already.)
*/
- if (IsA(var, RelabelType))
- var = (Var *) ((RelabelType *) var)->arg;
+ if (IsA(clause_expr, RelabelType))
+ clause_expr = (Node *) ((RelabelType *) clause_expr)->arg;
/* We only support plain Vars for now */
- if (!IsA(var, Var))
+ if (!IsA(clause_expr, Var))
return false;
+ /* OK, we know we have a Var */
+ var = (Var *) clause_expr;
+
/* Ensure Var is from the correct relation */
if (var->varno != relid)
return false;
return s1;
}
+/*
+ * dependency_is_compatible_expression
+ * Determines if the expression is compatible with functional dependencies
+ *
+ * Similar to dependency_is_compatible_clause, but doesn't enforce that the
+ * expression is a simple Var. OTOH we check that there's at least one
+ * statistics object matching the expression.
+ */
+static bool
+dependency_is_compatible_expression(Node *clause, Index relid, List *statlist, Node **expr)
+{
+ List *vars;
+ ListCell *lc,
+ *lc2;
+ Node *clause_expr;
+
+ if (IsA(clause, RestrictInfo))
+ {
+ RestrictInfo *rinfo = (RestrictInfo *) clause;
+
+ /* Pseudoconstants are not interesting (they couldn't contain a Var) */
+ if (rinfo->pseudoconstant)
+ return false;
+
+ /* Clauses referencing multiple, or no, varnos are incompatible */
+ if (bms_membership(rinfo->clause_relids) != BMS_SINGLETON)
+ return false;
+
+ clause = (Node *) rinfo->clause;
+ }
+
+ if (is_opclause(clause))
+ {
+ /* If it's an opclause, check for Var = Const or Const = Var. */
+ OpExpr *expr = (OpExpr *) clause;
+
+ /* Only expressions with two arguments are candidates. */
+ if (list_length(expr->args) != 2)
+ return false;
+
+ /* Make sure non-selected argument is a pseudoconstant. */
+ if (is_pseudo_constant_clause(lsecond(expr->args)))
+ clause_expr = linitial(expr->args);
+ else if (is_pseudo_constant_clause(linitial(expr->args)))
+ clause_expr = lsecond(expr->args);
+ else
+ return false;
+
+ /*
+ * If it's not an "=" operator, just ignore the clause, as it's not
+ * compatible with functional dependencies.
+ *
+ * This uses the function for estimating selectivity, not the operator
+ * directly (a bit awkward, but well ...).
+ *
+ * XXX this is pretty dubious; probably it'd be better to check btree
+ * or hash opclass membership, so as not to be fooled by custom
+ * selectivity functions, and to be more consistent with decisions
+ * elsewhere in the planner.
+ */
+ if (get_oprrest(expr->opno) != F_EQSEL)
+ return false;
+
+ /* OK to proceed with checking "var" */
+ }
+ else if (IsA(clause, ScalarArrayOpExpr))
+ {
+ /* If it's an scalar array operator, check for Var IN Const. */
+ ScalarArrayOpExpr *expr = (ScalarArrayOpExpr *) clause;
+
+ /*
+ * Reject ALL() variant, we only care about ANY/IN.
+ *
+ * FIXME Maybe we should check if all the values are the same, and
+ * allow ALL in that case? Doesn't seem very practical, though.
+ */
+ if (!expr->useOr)
+ return false;
+
+ /* Only expressions with two arguments are candidates. */
+ if (list_length(expr->args) != 2)
+ return false;
+
+ /*
+ * We know it's always (Var IN Const), so we assume the var is the
+ * first argument, and pseudoconstant is the second one.
+ */
+ if (!is_pseudo_constant_clause(lsecond(expr->args)))
+ return false;
+
+ clause_expr = linitial(expr->args);
+
+ /*
+ * If it's not an "=" operator, just ignore the clause, as it's not
+ * compatible with functional dependencies. The operator is identified
+ * simply by looking at which function it uses to estimate
+ * selectivity. That's a bit strange, but it's what other similar
+ * places do.
+ */
+ if (get_oprrest(expr->opno) != F_EQSEL)
+ return false;
+
+ /* OK to proceed with checking "var" */
+ }
+ else if (is_orclause(clause))
+ {
+ BoolExpr *bool_expr = (BoolExpr *) clause;
+ ListCell *lc;
+
+ /* start with no expression (we'll use the first match) */
+ *expr = NULL;
+
+ foreach(lc, bool_expr->args)
+ {
+ Node *or_expr = NULL;
+
+ /*
+ * Had we found incompatible expression in the arguments, treat
+ * the whole expression as incompatible.
+ */
+ if (!dependency_is_compatible_expression((Node *) lfirst(lc), relid,
+ statlist, &or_expr))
+ return false;
+
+ if (*expr == NULL)
+ *expr = or_expr;
+
+ /* ensure all the expressions are the same */
+ if (!equal(or_expr, *expr))
+ return false;
+ }
+
+ /* the expression is already checked by the recursive call */
+ return true;
+ }
+ else if (is_notclause(clause))
+ {
+ /*
+ * "NOT x" can be interpreted as "x = false", so get the argument and
+ * proceed with seeing if it's a suitable Var.
+ */
+ clause_expr = (Node *) get_notclausearg(clause);
+ }
+ else
+ {
+ /*
+ * A boolean expression "x" can be interpreted as "x = true", so
+ * proceed with seeing if it's a suitable Var.
+ */
+ clause_expr = (Node *) clause;
+ }
+
+ /*
+ * We may ignore any RelabelType node above the operand. (There won't be
+ * more than one, since eval_const_expressions has been applied already.)
+ */
+ if (IsA(clause_expr, RelabelType))
+ clause_expr = (Node *) ((RelabelType *) clause_expr)->arg;
+
+ vars = pull_var_clause(clause_expr, 0);
+
+ foreach(lc, vars)
+ {
+ Var *var = (Var *) lfirst(lc);
+
+ /* Ensure Var is from the correct relation */
+ if (var->varno != relid)
+ return false;
+
+ /* We also better ensure the Var is from the current level */
+ if (var->varlevelsup != 0)
+ return false;
+
+ /* Also ignore system attributes (we don't allow stats on those) */
+ if (!AttrNumberIsForUserDefinedAttr(var->varattno))
+ return false;
+ }
+
+ /*
+ * Check if we actually have a matching statistics for the expression.
+ *
+ * XXX Maybe this is an overkill. We'll eliminate the expressions later.
+ */
+ foreach(lc, statlist)
+ {
+ StatisticExtInfo *info = (StatisticExtInfo *) lfirst(lc);
+
+ /* ignore stats without dependencies */
+ if (info->kind != STATS_EXT_DEPENDENCIES)
+ continue;
+
+ foreach(lc2, info->exprs)
+ {
+ Node *stat_expr = (Node *) lfirst(lc2);
+
+ if (equal(clause_expr, stat_expr))
+ {
+ *expr = stat_expr;
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
/*
* dependencies_clauselist_selectivity
* Return the estimated selectivity of (a subset of) the given clauses
MVDependency **dependencies;
int ndependencies;
int i;
+ AttrNumber attnum_offset;
+
+ /* unique expressions */
+ Node **unique_exprs;
+ int unique_exprs_cnt;
/* check if there's any stats that might be useful for us. */
if (!has_stats_of_kind(rel->statlist, STATS_EXT_DEPENDENCIES))
list_attnums = (AttrNumber *) palloc(sizeof(AttrNumber) *
list_length(clauses));
+ /*
+ * We allocate space as if every clause was a unique expression, although
+ * that's probably overkill. Some will be simple column references that
+ * we'll translate to attnums, and there might be duplicates. But it's
+ * easier and cheaper to just do one allocation than repalloc later.
+ */
+ unique_exprs = (Node **) palloc(sizeof(Node *) * list_length(clauses));
+ unique_exprs_cnt = 0;
+
/*
* Pre-process the clauses list to extract the attnums seen in each item.
* We need to determine if there's any clauses which will be useful for
*
* We also skip clauses that we already estimated using different types of
* statistics (we treat them as incompatible).
+ *
+ * To handle expressions, we assign them negative attnums, as if it was a
+ * system attribute (this is fine, as we only allow extended stats on user
+ * attributes). And then we offset everything by the number of
+ * expressions, so that we can store the values in a bitmapset.
*/
listidx = 0;
foreach(l, clauses)
{
Node *clause = (Node *) lfirst(l);
AttrNumber attnum;
+ Node *expr = NULL;
+
+ /* ignore clause by default */
+ list_attnums[listidx] = InvalidAttrNumber;
- if (!bms_is_member(listidx, *estimatedclauses) &&
- dependency_is_compatible_clause(clause, rel->relid, &attnum))
+ if (!bms_is_member(listidx, *estimatedclauses))
{
- list_attnums[listidx] = attnum;
- clauses_attnums = bms_add_member(clauses_attnums, attnum);
+ /*
+ * If it's a simple column refrence, just extract the attnum. If
+ * it's an expression, assign a negative attnum as if it was a
+ * system attribute.
+ */
+ if (dependency_is_compatible_clause(clause, rel->relid, &attnum))
+ {
+ list_attnums[listidx] = attnum;
+ }
+ else if (dependency_is_compatible_expression(clause, rel->relid,
+ rel->statlist,
+ &expr))
+ {
+ /* special attnum assigned to this expression */
+ attnum = InvalidAttrNumber;
+
+ Assert(expr != NULL);
+
+ /* If the expression is duplicate, use the same attnum. */
+ for (i = 0; i < unique_exprs_cnt; i++)
+ {
+ if (equal(unique_exprs[i], expr))
+ {
+ /* negative attribute number to expression */
+ attnum = -(i + 1);
+ break;
+ }
+ }
+
+ /* not found in the list, so add it */
+ if (attnum == InvalidAttrNumber)
+ {
+ unique_exprs[unique_exprs_cnt++] = expr;
+
+ /* after incrementing the value, to get -1, -2, ... */
+ attnum = (-unique_exprs_cnt);
+ }
+
+ /* remember which attnum was assigned to this clause */
+ list_attnums[listidx] = attnum;
+ }
}
- else
- list_attnums[listidx] = InvalidAttrNumber;
listidx++;
}
+ Assert(listidx == list_length(clauses));
+
/*
- * If there's not at least two distinct attnums then reject the whole list
- * of clauses. We must return 1.0 so the calling function's selectivity is
- * unaffected.
+ * How much we need to offset the attnums? If there are no expressions,
+ * then no offset is needed. Otherwise we need to offset enough for the
+ * lowest value (-unique_exprs_cnt) to become 1.
+ */
+ if (unique_exprs_cnt > 0)
+ attnum_offset = (unique_exprs_cnt + 1);
+ else
+ attnum_offset = 0;
+
+ /*
+ * Now that we know how many expressions there are, we can offset the
+ * values just enough to build the bitmapset.
+ */
+ for (i = 0; i < list_length(clauses); i++)
+ {
+ AttrNumber attnum;
+
+ /* ignore incompatible or already estimated clauses */
+ if (list_attnums[i] == InvalidAttrNumber)
+ continue;
+
+ /* make sure the attnum is in the expected range */
+ Assert(list_attnums[i] >= (-unique_exprs_cnt));
+ Assert(list_attnums[i] <= MaxHeapAttributeNumber);
+
+ /* make sure the attnum is positive (valid AttrNumber) */
+ attnum = list_attnums[i] + attnum_offset;
+
+ /*
+ * Either it's a regular attribute, or it's an expression, in which
+ * case we must not have seen it before (expressions are unique).
+ *
+ * XXX Check whether it's a regular attribute has to be done using the
+ * original attnum, while the second check has to use the value with
+ * an offset.
+ */
+ Assert(AttrNumberIsForUserDefinedAttr(list_attnums[i]) ||
+ !bms_is_member(attnum, clauses_attnums));
+
+ /*
+ * Remember the offset attnum, both for attributes and expressions.
+ * We'll pass list_attnums to clauselist_apply_dependencies, which
+ * uses it to identify clauses in a bitmap. We could also pass the
+ * offset, but this is more convenient.
+ */
+ list_attnums[i] = attnum;
+
+ clauses_attnums = bms_add_member(clauses_attnums, attnum);
+ }
+
+ /*
+ * If there's not at least two distinct attnums and expressions, then
+ * reject the whole list of clauses. We must return 1.0 so the calling
+ * function's selectivity is unaffected.
*/
if (bms_membership(clauses_attnums) != BMS_MULTIPLE)
{
foreach(l, rel->statlist)
{
StatisticExtInfo *stat = (StatisticExtInfo *) lfirst(l);
- Bitmapset *matched;
- BMS_Membership membership;
+ int nmatched;
+ int nexprs;
+ int k;
+ MVDependencies *deps;
/* skip statistics that are not of the correct type */
if (stat->kind != STATS_EXT_DEPENDENCIES)
continue;
- matched = bms_intersect(clauses_attnums, stat->keys);
- membership = bms_membership(matched);
- bms_free(matched);
+ /*
+ * Count matching attributes - we have to undo the attnum offsets. The
+ * input attribute numbers are not offset (expressions are not
+ * included in stat->keys, so it's not necessary). But we need to
+ * offset it before checking against clauses_attnums.
+ */
+ nmatched = 0;
+ k = -1;
+ while ((k = bms_next_member(stat->keys, k)) >= 0)
+ {
+ AttrNumber attnum = (AttrNumber) k;
- /* skip objects matching fewer than two attributes from clauses */
- if (membership != BMS_MULTIPLE)
+ /* skip expressions */
+ if (!AttrNumberIsForUserDefinedAttr(attnum))
+ continue;
+
+ /* apply the same offset as above */
+ attnum += attnum_offset;
+
+ if (bms_is_member(attnum, clauses_attnums))
+ nmatched++;
+ }
+
+ /* count matching expressions */
+ nexprs = 0;
+ for (i = 0; i < unique_exprs_cnt; i++)
+ {
+ ListCell *lc;
+
+ foreach(lc, stat->exprs)
+ {
+ Node *stat_expr = (Node *) lfirst(lc);
+
+ /* try to match it */
+ if (equal(stat_expr, unique_exprs[i]))
+ nexprs++;
+ }
+ }
+
+ /*
+ * Skip objects matching fewer than two attributes/expressions from
+ * clauses.
+ */
+ if (nmatched + nexprs < 2)
continue;
- func_dependencies[nfunc_dependencies]
- = statext_dependencies_load(stat->statOid);
+ deps = statext_dependencies_load(stat->statOid);
+
+ /*
+ * The expressions may be represented by different attnums in the
+ * stats, we need to remap them to be consistent with the clauses.
+ * That will make the later steps (e.g. picking the strongest item and
+ * so on) much simpler and cheaper, because it won't need to care
+ * about the offset at all.
+ *
+ * When we're at it, we can ignore dependencies that are not fully
+ * matched by clauses (i.e. referencing attributes or expressions that
+ * are not in the clauses).
+ *
+ * We have to do this for all statistics, as long as there are any
+ * expressions - we need to shift the attnums in all dependencies.
+ *
+ * XXX Maybe we should do this always, because it also eliminates some
+ * of the dependencies early. It might be cheaper than having to walk
+ * the longer list in find_strongest_dependency later, especially as
+ * we need to do that repeatedly?
+ *
+ * XXX We have to do this even when there are no expressions in
+ * clauses, otherwise find_strongest_dependency may fail for stats
+ * with expressions (due to lookup of negative value in bitmap). So we
+ * need to at least filter out those dependencies. Maybe we could do
+ * it in a cheaper way (if there are no expr clauses, we can just
+ * discard all negative attnums without any lookups).
+ */
+ if (unique_exprs_cnt > 0 || stat->exprs != NIL)
+ {
+ int ndeps = 0;
+
+ for (i = 0; i < deps->ndeps; i++)
+ {
+ bool skip = false;
+ MVDependency *dep = deps->deps[i];
+ int j;
+
+ for (j = 0; j < dep->nattributes; j++)
+ {
+ int idx;
+ Node *expr;
+ int k;
+ AttrNumber unique_attnum = InvalidAttrNumber;
+ AttrNumber attnum;
+
+ /* undo the per-statistics offset */
+ attnum = dep->attributes[j];
+
+ /*
+ * For regular attributes we can simply check if it
+ * matches any clause. If there's no matching clause, we
+ * can just ignore it. We need to offset the attnum
+ * though.
+ */
+ if (AttrNumberIsForUserDefinedAttr(attnum))
+ {
+ dep->attributes[j] = attnum + attnum_offset;
+
+ if (!bms_is_member(dep->attributes[j], clauses_attnums))
+ {
+ skip = true;
+ break;
+ }
+
+ continue;
+ }
+
+ /*
+ * the attnum should be a valid system attnum (-1, -2,
+ * ...)
+ */
+ Assert(AttributeNumberIsValid(attnum));
+
+ /*
+ * For expressions, we need to do two translations. First
+ * we have to translate the negative attnum to index in
+ * the list of expressions (in the statistics object).
+ * Then we need to see if there's a matching clause. The
+ * index of the unique expression determines the attnum
+ * (and we offset it).
+ */
+ idx = -(1 + attnum);
+
+ /* Is the expression index is valid? */
+ Assert((idx >= 0) && (idx < list_length(stat->exprs)));
+
+ expr = (Node *) list_nth(stat->exprs, idx);
+
+ /* try to find the expression in the unique list */
+ for (k = 0; k < unique_exprs_cnt; k++)
+ {
+ /*
+ * found a matching unique expression, use the attnum
+ * (derived from index of the unique expression)
+ */
+ if (equal(unique_exprs[k], expr))
+ {
+ unique_attnum = -(k + 1) + attnum_offset;
+ break;
+ }
+ }
+
+ /*
+ * Found no matching expression, so we can simply skip
+ * this dependency, because there's no chance it will be
+ * fully covered.
+ */
+ if (unique_attnum == InvalidAttrNumber)
+ {
+ skip = true;
+ break;
+ }
+
+ /* otherwise remap it to the new attnum */
+ dep->attributes[j] = unique_attnum;
+ }
- total_ndeps += func_dependencies[nfunc_dependencies]->ndeps;
- nfunc_dependencies++;
+ /* if found a matching dependency, keep it */
+ if (!skip)
+ {
+ /* maybe we've skipped something earlier, so move it */
+ if (ndeps != i)
+ deps->deps[ndeps] = deps->deps[i];
+
+ ndeps++;
+ }
+ }
+
+ deps->ndeps = ndeps;
+ }
+
+ /*
+ * It's possible we've removed all dependencies, in which case we
+ * don't bother adding it to the list.
+ */
+ if (deps->ndeps > 0)
+ {
+ func_dependencies[nfunc_dependencies] = deps;
+ total_ndeps += deps->ndeps;
+ nfunc_dependencies++;
+ }
}
/* if no matching stats could be found then we've nothing to do */
pfree(func_dependencies);
bms_free(clauses_attnums);
pfree(list_attnums);
+ pfree(unique_exprs);
return 1.0;
}
pfree(func_dependencies);
bms_free(clauses_attnums);
pfree(list_attnums);
+ pfree(unique_exprs);
return s1;
}
#include "catalog/pg_collation.h"
#include "catalog/pg_statistic_ext.h"
#include "catalog/pg_statistic_ext_data.h"
+#include "executor/executor.h"
#include "commands/progress.h"
#include "miscadmin.h"
#include "nodes/nodeFuncs.h"
#include "statistics/statistics.h"
#include "utils/acl.h"
#include "utils/array.h"
+#include "utils/attoptcache.h"
#include "utils/builtins.h"
+#include "utils/datum.h"
#include "utils/fmgroids.h"
#include "utils/lsyscache.h"
#include "utils/memutils.h"
#include "utils/rel.h"
#include "utils/selfuncs.h"
#include "utils/syscache.h"
+#include "utils/typcache.h"
/*
* To avoid consuming too much memory during analysis and/or too much space
Bitmapset *columns; /* attribute numbers covered by the object */
List *types; /* 'char' list of enabled statistics kinds */
int stattarget; /* statistics target (-1 for default) */
+ List *exprs; /* expressions */
} StatExtEntry;
static List *fetch_statentries_for_relation(Relation pg_statext, Oid relid);
-static VacAttrStats **lookup_var_attr_stats(Relation rel, Bitmapset *attrs,
+static VacAttrStats **lookup_var_attr_stats(Relation rel, Bitmapset *attrs, List *exprs,
int nvacatts, VacAttrStats **vacatts);
-static void statext_store(Oid relid,
+static void statext_store(Oid statOid,
MVNDistinct *ndistinct, MVDependencies *dependencies,
- MCVList *mcv, VacAttrStats **stats);
+ MCVList *mcv, Datum exprs, VacAttrStats **stats);
static int statext_compute_stattarget(int stattarget,
int natts, VacAttrStats **stats);
+/* Information needed to analyze a single simple expression. */
+typedef struct AnlExprData
+{
+ Node *expr; /* expression to analyze */
+ VacAttrStats *vacattrstat; /* statistics attrs to analyze */
+} AnlExprData;
+
+static void compute_expr_stats(Relation onerel, double totalrows,
+ AnlExprData * exprdata, int nexprs,
+ HeapTuple *rows, int numrows);
+static Datum serialize_expr_stats(AnlExprData * exprdata, int nexprs);
+static Datum expr_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull);
+static AnlExprData *build_expr_data(List *exprs, int stattarget);
+
+static StatsBuildData *make_build_data(Relation onerel, StatExtEntry *stat,
+ int numrows, HeapTuple *rows,
+ VacAttrStats **stats, int stattarget);
+
+
/*
* Compute requested extended stats, using the rows sampled for the plain
* (single-column) stats.
{
Relation pg_stext;
ListCell *lc;
- List *stats;
+ List *statslist;
MemoryContext cxt;
MemoryContext oldcxt;
int64 ext_cnt;
+ /* Do nothing if there are no columns to analyze. */
+ if (!natts)
+ return;
+
cxt = AllocSetContextCreate(CurrentMemoryContext,
"BuildRelationExtStatistics",
ALLOCSET_DEFAULT_SIZES);
oldcxt = MemoryContextSwitchTo(cxt);
pg_stext = table_open(StatisticExtRelationId, RowExclusiveLock);
- stats = fetch_statentries_for_relation(pg_stext, RelationGetRelid(onerel));
+ statslist = fetch_statentries_for_relation(pg_stext, RelationGetRelid(onerel));
/* report this phase */
- if (stats != NIL)
+ if (statslist != NIL)
{
const int index[] = {
PROGRESS_ANALYZE_PHASE,
};
const int64 val[] = {
PROGRESS_ANALYZE_PHASE_COMPUTE_EXT_STATS,
- list_length(stats)
+ list_length(statslist)
};
pgstat_progress_update_multi_param(2, index, val);
}
ext_cnt = 0;
- foreach(lc, stats)
+ foreach(lc, statslist)
{
StatExtEntry *stat = (StatExtEntry *) lfirst(lc);
MVNDistinct *ndistinct = NULL;
MVDependencies *dependencies = NULL;
MCVList *mcv = NULL;
+ Datum exprstats = (Datum) 0;
VacAttrStats **stats;
ListCell *lc2;
int stattarget;
+ StatsBuildData *data;
/*
* Check if we can build these stats based on the column analyzed. If
* not, report this fact (except in autovacuum) and move on.
*/
- stats = lookup_var_attr_stats(onerel, stat->columns,
+ stats = lookup_var_attr_stats(onerel, stat->columns, stat->exprs,
natts, vacattrstats);
if (!stats)
{
continue;
}
- /* check allowed number of dimensions */
- Assert(bms_num_members(stat->columns) >= 2 &&
- bms_num_members(stat->columns) <= STATS_MAX_DIMENSIONS);
-
/* compute statistics target for this statistics */
stattarget = statext_compute_stattarget(stat->stattarget,
bms_num_members(stat->columns),
if (stattarget == 0)
continue;
+ /* evaluate expressions (if the statistics has any) */
+ data = make_build_data(onerel, stat, numrows, rows, stats, stattarget);
+
/* compute statistic of each requested type */
foreach(lc2, stat->types)
{
char t = (char) lfirst_int(lc2);
if (t == STATS_EXT_NDISTINCT)
- ndistinct = statext_ndistinct_build(totalrows, numrows, rows,
- stat->columns, stats);
+ ndistinct = statext_ndistinct_build(totalrows, data);
else if (t == STATS_EXT_DEPENDENCIES)
- dependencies = statext_dependencies_build(numrows, rows,
- stat->columns, stats);
+ dependencies = statext_dependencies_build(data);
else if (t == STATS_EXT_MCV)
- mcv = statext_mcv_build(numrows, rows, stat->columns, stats,
- totalrows, stattarget);
+ mcv = statext_mcv_build(data, totalrows, stattarget);
+ else if (t == STATS_EXT_EXPRESSIONS)
+ {
+ AnlExprData *exprdata;
+ int nexprs;
+
+ /* should not happen, thanks to checks when defining stats */
+ if (!stat->exprs)
+ elog(ERROR, "requested expression stats, but there are no expressions");
+
+ exprdata = build_expr_data(stat->exprs, stattarget);
+ nexprs = list_length(stat->exprs);
+
+ compute_expr_stats(onerel, totalrows,
+ exprdata, nexprs,
+ rows, numrows);
+
+ exprstats = serialize_expr_stats(exprdata, nexprs);
+ }
}
/* store the statistics in the catalog */
- statext_store(stat->statOid, ndistinct, dependencies, mcv, stats);
+ statext_store(stat->statOid, ndistinct, dependencies, mcv, exprstats, stats);
/* for reporting progress */
pgstat_progress_update_param(PROGRESS_ANALYZE_EXT_STATS_COMPUTED,
++ext_cnt);
+
+ /* free the build data (allocated as a single chunk) */
+ pfree(data);
}
table_close(pg_stext, RowExclusiveLock);
MemoryContext oldcxt;
int result = 0;
+ /* If there are no columns to analyze, just return 0. */
+ if (!natts)
+ return 0;
+
cxt = AllocSetContextCreate(CurrentMemoryContext,
"ComputeExtStatisticsRows",
ALLOCSET_DEFAULT_SIZES);
* analyzed. If not, ignore it (don't report anything, we'll do that
* during the actual build BuildRelationExtStatistics).
*/
- stats = lookup_var_attr_stats(onerel, stat->columns,
+ stats = lookup_var_attr_stats(onerel, stat->columns, stat->exprs,
natts, vacattrstats);
if (!stats)
attnum = Anum_pg_statistic_ext_data_stxdmcv;
break;
+ case STATS_EXT_EXPRESSIONS:
+ attnum = Anum_pg_statistic_ext_data_stxdexpr;
+ break;
+
default:
elog(ERROR, "unexpected statistics type requested: %d", type);
}
ArrayType *arr;
char *enabled;
Form_pg_statistic_ext staForm;
+ List *exprs = NIL;
entry = palloc0(sizeof(StatExtEntry));
staForm = (Form_pg_statistic_ext) GETSTRUCT(htup);
{
Assert((enabled[i] == STATS_EXT_NDISTINCT) ||
(enabled[i] == STATS_EXT_DEPENDENCIES) ||
- (enabled[i] == STATS_EXT_MCV));
+ (enabled[i] == STATS_EXT_MCV) ||
+ (enabled[i] == STATS_EXT_EXPRESSIONS));
entry->types = lappend_int(entry->types, (int) enabled[i]);
}
+ /* decode expression (if any) */
+ datum = SysCacheGetAttr(STATEXTOID, htup,
+ Anum_pg_statistic_ext_stxexprs, &isnull);
+
+ if (!isnull)
+ {
+ char *exprsString;
+
+ exprsString = TextDatumGetCString(datum);
+ exprs = (List *) stringToNode(exprsString);
+
+ pfree(exprsString);
+
+ /*
+ * Run the expressions through eval_const_expressions. This is not
+ * just an optimization, but is necessary, because the planner
+ * will be comparing them to similarly-processed qual clauses, and
+ * may fail to detect valid matches without this. We must not use
+ * canonicalize_qual, however, since these aren't qual
+ * expressions.
+ */
+ exprs = (List *) eval_const_expressions(NULL, (Node *) exprs);
+
+ /* May as well fix opfuncids too */
+ fix_opfuncids((Node *) exprs);
+ }
+
+ entry->exprs = exprs;
+
result = lappend(result, entry);
}
return result;
}
+/*
+ * examine_attribute -- pre-analysis of a single column
+ *
+ * Determine whether the column is analyzable; if so, create and initialize
+ * a VacAttrStats struct for it. If not, return NULL.
+ */
+static VacAttrStats *
+examine_attribute(Node *expr)
+{
+ HeapTuple typtuple;
+ VacAttrStats *stats;
+ int i;
+ bool ok;
+
+ /*
+ * Create the VacAttrStats struct. Note that we only have a copy of the
+ * fixed fields of the pg_attribute tuple.
+ */
+ stats = (VacAttrStats *) palloc0(sizeof(VacAttrStats));
+
+ /* fake the attribute */
+ stats->attr = (Form_pg_attribute) palloc0(ATTRIBUTE_FIXED_PART_SIZE);
+ stats->attr->attstattarget = -1;
+
+ /*
+ * When analyzing an expression, believe the expression tree's type not
+ * the column datatype --- the latter might be the opckeytype storage
+ * type of the opclass, which is not interesting for our purposes. (Note:
+ * if we did anything with non-expression statistics columns, we'd need to
+ * figure out where to get the correct type info from, but for now that's
+ * not a problem.) It's not clear whether anyone will care about the
+ * typmod, but we store that too just in case.
+ */
+ stats->attrtypid = exprType(expr);
+ stats->attrtypmod = exprTypmod(expr);
+ stats->attrcollid = exprCollation(expr);
+
+ typtuple = SearchSysCacheCopy1(TYPEOID,
+ ObjectIdGetDatum(stats->attrtypid));
+ if (!HeapTupleIsValid(typtuple))
+ elog(ERROR, "cache lookup failed for type %u", stats->attrtypid);
+ stats->attrtype = (Form_pg_type) GETSTRUCT(typtuple);
+
+ /*
+ * We don't actually analyze individual attributes, so no need to set the
+ * memory context.
+ */
+ stats->anl_context = NULL;
+ stats->tupattnum = InvalidAttrNumber;
+
+ /*
+ * The fields describing the stats->stavalues[n] element types default to
+ * the type of the data being analyzed, but the type-specific typanalyze
+ * function can change them if it wants to store something else.
+ */
+ for (i = 0; i < STATISTIC_NUM_SLOTS; i++)
+ {
+ stats->statypid[i] = stats->attrtypid;
+ stats->statyplen[i] = stats->attrtype->typlen;
+ stats->statypbyval[i] = stats->attrtype->typbyval;
+ stats->statypalign[i] = stats->attrtype->typalign;
+ }
+
+ /*
+ * Call the type-specific typanalyze function. If none is specified, use
+ * std_typanalyze().
+ */
+ if (OidIsValid(stats->attrtype->typanalyze))
+ ok = DatumGetBool(OidFunctionCall1(stats->attrtype->typanalyze,
+ PointerGetDatum(stats)));
+ else
+ ok = std_typanalyze(stats);
+
+ if (!ok || stats->compute_stats == NULL || stats->minrows <= 0)
+ {
+ heap_freetuple(typtuple);
+ pfree(stats->attr);
+ pfree(stats);
+ return NULL;
+ }
+
+ return stats;
+}
+
+/*
+ * examine_expression -- pre-analysis of a single expression
+ *
+ * Determine whether the expression is analyzable; if so, create and initialize
+ * a VacAttrStats struct for it. If not, return NULL.
+ */
+static VacAttrStats *
+examine_expression(Node *expr, int stattarget)
+{
+ HeapTuple typtuple;
+ VacAttrStats *stats;
+ int i;
+ bool ok;
+
+ Assert(expr != NULL);
+
+ /*
+ * Create the VacAttrStats struct.
+ */
+ stats = (VacAttrStats *) palloc0(sizeof(VacAttrStats));
+
+ /*
+ * When analyzing an expression, believe the expression tree's type.
+ */
+ stats->attrtypid = exprType(expr);
+ stats->attrtypmod = exprTypmod(expr);
+
+ /*
+ * We don't allow collation to be specified in CREATE STATISTICS, so we
+ * have to use the collation specified for the expression. It's possible
+ * to specify the collation in the expression "(col COLLATE "en_US")" in
+ * which case exprCollation() does the right thing.
+ */
+ stats->attrcollid = exprCollation(expr);
+
+ /*
+ * We don't have any pg_attribute for expressions, so let's fake something
+ * reasonable into attstattarget, which is the only thing std_typanalyze
+ * needs.
+ */
+ stats->attr = (Form_pg_attribute) palloc(ATTRIBUTE_FIXED_PART_SIZE);
+
+ /*
+ * We can't have statistics target specified for the expression, so we
+ * could use either the default_statistics_target, or the target computed
+ * for the extended statistics. The second option seems more reasonable.
+ */
+ stats->attr->attstattarget = stattarget;
+
+ /* initialize some basic fields */
+ stats->attr->attrelid = InvalidOid;
+ stats->attr->attnum = InvalidAttrNumber;
+ stats->attr->atttypid = stats->attrtypid;
+
+ typtuple = SearchSysCacheCopy1(TYPEOID,
+ ObjectIdGetDatum(stats->attrtypid));
+ if (!HeapTupleIsValid(typtuple))
+ elog(ERROR, "cache lookup failed for type %u", stats->attrtypid);
+
+ stats->attrtype = (Form_pg_type) GETSTRUCT(typtuple);
+ stats->anl_context = CurrentMemoryContext; /* XXX should be using
+ * something else? */
+ stats->tupattnum = InvalidAttrNumber;
+
+ /*
+ * The fields describing the stats->stavalues[n] element types default to
+ * the type of the data being analyzed, but the type-specific typanalyze
+ * function can change them if it wants to store something else.
+ */
+ for (i = 0; i < STATISTIC_NUM_SLOTS; i++)
+ {
+ stats->statypid[i] = stats->attrtypid;
+ stats->statyplen[i] = stats->attrtype->typlen;
+ stats->statypbyval[i] = stats->attrtype->typbyval;
+ stats->statypalign[i] = stats->attrtype->typalign;
+ }
+
+ /*
+ * Call the type-specific typanalyze function. If none is specified, use
+ * std_typanalyze().
+ */
+ if (OidIsValid(stats->attrtype->typanalyze))
+ ok = DatumGetBool(OidFunctionCall1(stats->attrtype->typanalyze,
+ PointerGetDatum(stats)));
+ else
+ ok = std_typanalyze(stats);
+
+ if (!ok || stats->compute_stats == NULL || stats->minrows <= 0)
+ {
+ heap_freetuple(typtuple);
+ pfree(stats);
+ return NULL;
+ }
+
+ return stats;
+}
+
/*
* Using 'vacatts' of size 'nvacatts' as input data, return a newly built
* VacAttrStats array which includes only the items corresponding to
* to the caller that the stats should not be built.
*/
static VacAttrStats **
-lookup_var_attr_stats(Relation rel, Bitmapset *attrs,
+lookup_var_attr_stats(Relation rel, Bitmapset *attrs, List *exprs,
int nvacatts, VacAttrStats **vacatts)
{
int i = 0;
int x = -1;
+ int natts;
VacAttrStats **stats;
+ ListCell *lc;
+
+ natts = bms_num_members(attrs) + list_length(exprs);
- stats = (VacAttrStats **)
- palloc(bms_num_members(attrs) * sizeof(VacAttrStats *));
+ stats = (VacAttrStats **) palloc(natts * sizeof(VacAttrStats *));
/* lookup VacAttrStats info for the requested columns (same attnum) */
while ((x = bms_next_member(attrs, x)) >= 0)
i++;
}
+ /* also add info for expressions */
+ foreach(lc, exprs)
+ {
+ Node *expr = (Node *) lfirst(lc);
+
+ stats[i] = examine_attribute(expr);
+
+ /*
+ * XXX We need tuple descriptor later, and we just grab it from
+ * stats[0]->tupDesc (see e.g. statext_mcv_build). But as coded
+ * examine_attribute does not set that, so just grab it from the first
+ * vacatts element.
+ */
+ stats[i]->tupDesc = vacatts[0]->tupDesc;
+
+ i++;
+ }
+
return stats;
}
static void
statext_store(Oid statOid,
MVNDistinct *ndistinct, MVDependencies *dependencies,
- MCVList *mcv, VacAttrStats **stats)
+ MCVList *mcv, Datum exprs, VacAttrStats **stats)
{
Relation pg_stextdata;
HeapTuple stup,
nulls[Anum_pg_statistic_ext_data_stxdmcv - 1] = (data == NULL);
values[Anum_pg_statistic_ext_data_stxdmcv - 1] = PointerGetDatum(data);
}
+ if (exprs != (Datum) 0)
+ {
+ nulls[Anum_pg_statistic_ext_data_stxdexpr - 1] = false;
+ values[Anum_pg_statistic_ext_data_stxdexpr - 1] = exprs;
+ }
/* always replace the value (either by bytea or NULL) */
replaces[Anum_pg_statistic_ext_data_stxdndistinct - 1] = true;
replaces[Anum_pg_statistic_ext_data_stxddependencies - 1] = true;
replaces[Anum_pg_statistic_ext_data_stxdmcv - 1] = true;
+ replaces[Anum_pg_statistic_ext_data_stxdexpr - 1] = true;
/* there should already be a pg_statistic_ext_data tuple */
oldtup = SearchSysCache1(STATEXTDATASTXOID, ObjectIdGetDatum(statOid));
* is not necessary here (and when querying the bitmap).
*/
AttrNumber *
-build_attnums_array(Bitmapset *attrs, int *numattrs)
+build_attnums_array(Bitmapset *attrs, int nexprs, int *numattrs)
{
int i,
j;
j = -1;
while ((j = bms_next_member(attrs, j)) >= 0)
{
+ AttrNumber attnum = (j - nexprs);
+
/*
* Make sure the bitmap contains only user-defined attributes. As
* bitmaps can't contain negative values, this can be violated in two
* ways. Firstly, the bitmap might contain 0 as a member, and secondly
* the integer value might be larger than MaxAttrNumber.
*/
- Assert(AttrNumberIsForUserDefinedAttr(j));
- Assert(j <= MaxAttrNumber);
+ Assert(AttributeNumberIsValid(attnum));
+ Assert(attnum <= MaxAttrNumber);
+ Assert(attnum >= (-nexprs));
- attnums[i++] = (AttrNumber) j;
+ attnums[i++] = (AttrNumber) attnum;
/* protect against overflows */
Assert(i <= num);
* can simply pfree the return value to release all of it.
*/
SortItem *
-build_sorted_items(int numrows, int *nitems, HeapTuple *rows, TupleDesc tdesc,
- MultiSortSupport mss, int numattrs, AttrNumber *attnums)
+build_sorted_items(StatsBuildData *data, int *nitems,
+ MultiSortSupport mss,
+ int numattrs, AttrNumber *attnums)
{
int i,
j,
len,
- idx;
- int nvalues = numrows * numattrs;
+ nrows;
+ int nvalues = data->numrows * numattrs;
SortItem *items;
Datum *values;
bool *isnull;
char *ptr;
+ int *typlen;
/* Compute the total amount of memory we need (both items and values). */
- len = numrows * sizeof(SortItem) + nvalues * (sizeof(Datum) + sizeof(bool));
+ len = data->numrows * sizeof(SortItem) + nvalues * (sizeof(Datum) + sizeof(bool));
/* Allocate the memory and split it into the pieces. */
ptr = palloc0(len);
/* items to sort */
items = (SortItem *) ptr;
- ptr += numrows * sizeof(SortItem);
+ ptr += data->numrows * sizeof(SortItem);
/* values and null flags */
values = (Datum *) ptr;
Assert((ptr - (char *) items) == len);
/* fix the pointers to Datum and bool arrays */
- idx = 0;
- for (i = 0; i < numrows; i++)
+ nrows = 0;
+ for (i = 0; i < data->numrows; i++)
{
- bool toowide = false;
+ items[nrows].values = &values[nrows * numattrs];
+ items[nrows].isnull = &isnull[nrows * numattrs];
- items[idx].values = &values[idx * numattrs];
- items[idx].isnull = &isnull[idx * numattrs];
+ nrows++;
+ }
+
+ /* build a local cache of typlen for all attributes */
+ typlen = (int *) palloc(sizeof(int) * data->nattnums);
+ for (i = 0; i < data->nattnums; i++)
+ typlen[i] = get_typlen(data->stats[i]->attrtypid);
+
+ nrows = 0;
+ for (i = 0; i < data->numrows; i++)
+ {
+ bool toowide = false;
/* load the values/null flags from sample rows */
for (j = 0; j < numattrs; j++)
{
Datum value;
bool isnull;
+ int attlen;
+ AttrNumber attnum = attnums[j];
+
+ int idx;
+
+ /* match attnum to the pre-calculated data */
+ for (idx = 0; idx < data->nattnums; idx++)
+ {
+ if (attnum == data->attnums[idx])
+ break;
+ }
- value = heap_getattr(rows[i], attnums[j], tdesc, &isnull);
+ Assert(idx < data->nattnums);
+
+ value = data->values[idx][i];
+ isnull = data->nulls[idx][i];
+ attlen = typlen[idx];
/*
* If this is a varlena value, check if it's too wide and if yes
* on the assumption that those are small (below WIDTH_THRESHOLD)
* and will be discarded at the end of analyze.
*/
- if ((!isnull) &&
- (TupleDescAttr(tdesc, attnums[j] - 1)->attlen == -1))
+ if ((!isnull) && (attlen == -1))
{
if (toast_raw_datum_size(value) > WIDTH_THRESHOLD)
{
value = PointerGetDatum(PG_DETOAST_DATUM(value));
}
- items[idx].values[j] = value;
- items[idx].isnull[j] = isnull;
+ items[nrows].values[j] = value;
+ items[nrows].isnull[j] = isnull;
}
if (toowide)
continue;
- idx++;
+ nrows++;
}
/* store the actual number of items (ignoring the too-wide ones) */
- *nitems = idx;
+ *nitems = nrows;
/* all items were too wide */
- if (idx == 0)
+ if (nrows == 0)
{
/* everything is allocated as a single chunk */
pfree(items);
}
/* do the sort, using the multi-sort */
- qsort_arg((void *) items, idx, sizeof(SortItem),
+ qsort_arg((void *) items, nrows, sizeof(SortItem),
multi_sort_compare, mss);
return items;
return false;
}
+/*
+ * stat_find_expression
+ * Search for an expression in statistics object's list of expressions.
+ *
+ * Returns the index of the expression in the statistics object's list of
+ * expressions, or -1 if not found.
+ */
+static int
+stat_find_expression(StatisticExtInfo *stat, Node *expr)
+{
+ ListCell *lc;
+ int idx;
+
+ idx = 0;
+ foreach(lc, stat->exprs)
+ {
+ Node *stat_expr = (Node *) lfirst(lc);
+
+ if (equal(stat_expr, expr))
+ return idx;
+ idx++;
+ }
+
+ /* Expression not found */
+ return -1;
+}
+
+/*
+ * stat_covers_expressions
+ * Test whether a statistics object covers all expressions in a list.
+ *
+ * Returns true if all expressions are covered. If expr_idxs is non-NULL, it
+ * is populated with the indexes of the expressions found.
+ */
+static bool
+stat_covers_expressions(StatisticExtInfo *stat, List *exprs,
+ Bitmapset **expr_idxs)
+{
+ ListCell *lc;
+
+ foreach(lc, exprs)
+ {
+ Node *expr = (Node *) lfirst(lc);
+ int expr_idx;
+
+ expr_idx = stat_find_expression(stat, expr);
+ if (expr_idx == -1)
+ return false;
+
+ if (expr_idxs != NULL)
+ *expr_idxs = bms_add_member(*expr_idxs, expr_idx);
+ }
+
+ /* If we reach here, all expressions are covered */
+ return true;
+}
+
/*
* choose_best_statistics
* Look for and return statistics with the specified 'requiredkind' which
*/
StatisticExtInfo *
choose_best_statistics(List *stats, char requiredkind,
- Bitmapset **clause_attnums, int nclauses)
+ Bitmapset **clause_attnums, List **clause_exprs,
+ int nclauses)
{
ListCell *lc;
StatisticExtInfo *best_match = NULL;
{
int i;
StatisticExtInfo *info = (StatisticExtInfo *) lfirst(lc);
- Bitmapset *matched = NULL;
+ Bitmapset *matched_attnums = NULL;
+ Bitmapset *matched_exprs = NULL;
int num_matched;
int numkeys;
continue;
/*
- * Collect attributes in remaining (unestimated) clauses fully covered
- * by this statistic object.
+ * Collect attributes and expressions in remaining (unestimated)
+ * clauses fully covered by this statistic object.
*/
for (i = 0; i < nclauses; i++)
{
+ Bitmapset *expr_idxs = NULL;
+
/* ignore incompatible/estimated clauses */
- if (!clause_attnums[i])
+ if (!clause_attnums[i] && !clause_exprs[i])
continue;
/* ignore clauses that are not covered by this object */
- if (!bms_is_subset(clause_attnums[i], info->keys))
+ if (!bms_is_subset(clause_attnums[i], info->keys) ||
+ !stat_covers_expressions(info, clause_exprs[i], &expr_idxs))
continue;
- matched = bms_add_members(matched, clause_attnums[i]);
+ /* record attnums and indexes of expressions covered */
+ matched_attnums = bms_add_members(matched_attnums, clause_attnums[i]);
+ matched_exprs = bms_add_members(matched_exprs, expr_idxs);
}
- num_matched = bms_num_members(matched);
- bms_free(matched);
+ num_matched = bms_num_members(matched_attnums) + bms_num_members(matched_exprs);
+
+ bms_free(matched_attnums);
+ bms_free(matched_exprs);
/*
* save the actual number of keys in the stats so that we can choose
* the narrowest stats with the most matching keys.
*/
- numkeys = bms_num_members(info->keys);
+ numkeys = bms_num_members(info->keys) + list_length(info->exprs);
/*
- * Use this object when it increases the number of matched clauses or
- * when it matches the same number of attributes but these stats have
- * fewer keys than any previous match.
+ * Use this object when it increases the number of matched attributes
+ * and expressions or when it matches the same number of attributes
+ * and expressions but these stats have fewer keys than any previous
+ * match.
*/
if (num_matched > best_num_matched ||
(num_matched == best_num_matched && numkeys < best_match_keys))
*/
static bool
statext_is_compatible_clause_internal(PlannerInfo *root, Node *clause,
- Index relid, Bitmapset **attnums)
+ Index relid, Bitmapset **attnums,
+ List **exprs)
{
/* Look inside any binary-compatible relabeling (as in examine_variable) */
if (IsA(clause, RelabelType))
return true;
}
- /* (Var op Const) or (Const op Var) */
+ /* (Var/Expr op Const) or (Const op Var/Expr) */
if (is_opclause(clause))
{
RangeTblEntry *rte = root->simple_rte_array[relid];
OpExpr *expr = (OpExpr *) clause;
- Var *var;
+ Node *clause_expr;
/* Only expressions with two arguments are considered compatible. */
if (list_length(expr->args) != 2)
return false;
- /* Check if the expression has the right shape (one Var, one Const) */
- if (!examine_clause_args(expr->args, &var, NULL, NULL))
+ /* Check if the expression has the right shape */
+ if (!examine_opclause_args(expr->args, &clause_expr, NULL, NULL))
return false;
/*
case F_SCALARLESEL:
case F_SCALARGTSEL:
case F_SCALARGESEL:
- /* supported, will continue with inspection of the Var */
+ /* supported, will continue with inspection of the Var/Expr */
break;
default:
!get_func_leakproof(get_opcode(expr->opno)))
return false;
- return statext_is_compatible_clause_internal(root, (Node *) var,
- relid, attnums);
+ /* Check (Var op Const) or (Const op Var) clauses by recursing. */
+ if (IsA(clause_expr, Var))
+ return statext_is_compatible_clause_internal(root, clause_expr,
+ relid, attnums, exprs);
+
+ /* Otherwise we have (Expr op Const) or (Const op Expr). */
+ *exprs = lappend(*exprs, clause_expr);
+ return true;
}
- /* Var IN Array */
+ /* Var/Expr IN Array */
if (IsA(clause, ScalarArrayOpExpr))
{
RangeTblEntry *rte = root->simple_rte_array[relid];
ScalarArrayOpExpr *expr = (ScalarArrayOpExpr *) clause;
- Var *var;
+ Node *clause_expr;
/* Only expressions with two arguments are considered compatible. */
if (list_length(expr->args) != 2)
return false;
/* Check if the expression has the right shape (one Var, one Const) */
- if (!examine_clause_args(expr->args, &var, NULL, NULL))
+ if (!examine_opclause_args(expr->args, &clause_expr, NULL, NULL))
return false;
/*
case F_SCALARLESEL:
case F_SCALARGTSEL:
case F_SCALARGESEL:
- /* supported, will continue with inspection of the Var */
+ /* supported, will continue with inspection of the Var/Expr */
break;
default:
!get_func_leakproof(get_opcode(expr->opno)))
return false;
- return statext_is_compatible_clause_internal(root, (Node *) var,
- relid, attnums);
+ /* Check Var IN Array clauses by recursing. */
+ if (IsA(clause_expr, Var))
+ return statext_is_compatible_clause_internal(root, clause_expr,
+ relid, attnums, exprs);
+
+ /* Otherwise we have Expr IN Array. */
+ *exprs = lappend(*exprs, clause_expr);
+ return true;
}
/* AND/OR/NOT clause */
*/
if (!statext_is_compatible_clause_internal(root,
(Node *) lfirst(lc),
- relid, attnums))
+ relid, attnums, exprs))
return false;
}
return true;
}
- /* Var IS NULL */
+ /* Var/Expr IS NULL */
if (IsA(clause, NullTest))
{
NullTest *nt = (NullTest *) clause;
- /*
- * Only simple (Var IS NULL) expressions supported for now. Maybe we
- * could use examine_variable to fix this?
- */
- if (!IsA(nt->arg, Var))
- return false;
+ /* Check Var IS NULL clauses by recursing. */
+ if (IsA(nt->arg, Var))
+ return statext_is_compatible_clause_internal(root, (Node *) (nt->arg),
+ relid, attnums, exprs);
- return statext_is_compatible_clause_internal(root, (Node *) (nt->arg),
- relid, attnums);
+ /* Otherwise we have Expr IS NULL. */
+ *exprs = lappend(*exprs, nt->arg);
+ return true;
}
- return false;
+ /*
+ * Treat any other expressions as bare expressions to be matched against
+ * expressions in statistics objects.
+ */
+ *exprs = lappend(*exprs, clause);
+ return true;
}
/*
* statext_is_compatible_clause
* Determines if the clause is compatible with MCV lists.
*
- * Currently, we only support three types of clauses:
+ * Currently, we only support the following types of clauses:
*
- * (a) OpExprs of the form (Var op Const), or (Const op Var), where the op
- * is one of ("=", "<", ">", ">=", "<=")
+ * (a) OpExprs of the form (Var/Expr op Const), or (Const op Var/Expr), where
+ * the op is one of ("=", "<", ">", ">=", "<=")
*
- * (b) (Var IS [NOT] NULL)
+ * (b) (Var/Expr IS [NOT] NULL)
*
* (c) combinations using AND/OR/NOT
*
+ * (d) ScalarArrayOpExprs of the form (Var/Expr op ANY (array)) or (Var/Expr
+ * op ALL (array))
+ *
* In the future, the range of supported clauses may be expanded to more
* complex cases, for example (Var op Var).
*/
static bool
statext_is_compatible_clause(PlannerInfo *root, Node *clause, Index relid,
- Bitmapset **attnums)
+ Bitmapset **attnums, List **exprs)
{
RangeTblEntry *rte = root->simple_rte_array[relid];
RestrictInfo *rinfo = (RestrictInfo *) clause;
+ int clause_relid;
Oid userid;
/*
foreach(lc, expr->args)
{
if (!statext_is_compatible_clause(root, (Node *) lfirst(lc),
- relid, attnums))
+ relid, attnums, exprs))
return false;
}
if (rinfo->pseudoconstant)
return false;
- /* clauses referencing multiple varnos are incompatible */
- if (bms_membership(rinfo->clause_relids) != BMS_SINGLETON)
+ /* Clauses referencing other varnos are incompatible. */
+ if (!bms_get_singleton_member(rinfo->clause_relids, &clause_relid) ||
+ clause_relid != relid)
return false;
/* Check the clause and determine what attributes it references. */
if (!statext_is_compatible_clause_internal(root, (Node *) rinfo->clause,
- relid, attnums))
+ relid, attnums, exprs))
return false;
/*
- * Check that the user has permission to read all these attributes. Use
+ * Check that the user has permission to read all required attributes. Use
* checkAsUser if it's set, in case we're accessing the table via a view.
*/
userid = rte->checkAsUser ? rte->checkAsUser : GetUserId();
if (pg_class_aclcheck(rte->relid, userid, ACL_SELECT) != ACLCHECK_OK)
{
+ Bitmapset *clause_attnums;
+
/* Don't have table privilege, must check individual columns */
- if (bms_is_member(InvalidAttrNumber, *attnums))
+ if (*exprs != NIL)
+ {
+ pull_varattnos((Node *) exprs, relid, &clause_attnums);
+ clause_attnums = bms_add_members(clause_attnums, *attnums);
+ }
+ else
+ clause_attnums = *attnums;
+
+ if (bms_is_member(InvalidAttrNumber, clause_attnums))
{
/* Have a whole-row reference, must have access to all columns */
if (pg_attribute_aclcheck_all(rte->relid, userid, ACL_SELECT,
/* Check the columns referenced by the clause */
int attnum = -1;
- while ((attnum = bms_next_member(*attnums, attnum)) >= 0)
+ while ((attnum = bms_next_member(clause_attnums, attnum)) >= 0)
{
if (pg_attribute_aclcheck(rte->relid, attnum, userid,
ACL_SELECT) != ACLCHECK_OK)
bool is_or)
{
ListCell *l;
- Bitmapset **list_attnums;
+ Bitmapset **list_attnums; /* attnums extracted from the clause */
+ List **list_exprs; /* expressions matched to any statistic */
int listidx;
Selectivity sel = (is_or) ? 0.0 : 1.0;
list_attnums = (Bitmapset **) palloc(sizeof(Bitmapset *) *
list_length(clauses));
+ /* expressions extracted from complex expressions */
+ list_exprs = (List **) palloc(sizeof(Node *) * list_length(clauses));
+
/*
- * Pre-process the clauses list to extract the attnums seen in each item.
- * We need to determine if there's any clauses which will be useful for
- * selectivity estimations with extended stats. Along the way we'll record
- * all of the attnums for each clause in a list which we'll reference
- * later so we don't need to repeat the same work again. We'll also keep
- * track of all attnums seen.
+ * Pre-process the clauses list to extract the attnums and expressions
+ * seen in each item. We need to determine if there are any clauses which
+ * will be useful for selectivity estimations with extended stats. Along
+ * the way we'll record all of the attnums and expressions for each clause
+ * in lists which we'll reference later so we don't need to repeat the
+ * same work again.
*
* We also skip clauses that we already estimated using different types of
* statistics (we treat them as incompatible).
{
Node *clause = (Node *) lfirst(l);
Bitmapset *attnums = NULL;
+ List *exprs = NIL;
if (!bms_is_member(listidx, *estimatedclauses) &&
- statext_is_compatible_clause(root, clause, rel->relid, &attnums))
+ statext_is_compatible_clause(root, clause, rel->relid, &attnums, &exprs))
+ {
list_attnums[listidx] = attnums;
+ list_exprs[listidx] = exprs;
+ }
else
+ {
list_attnums[listidx] = NULL;
+ list_exprs[listidx] = NIL;
+ }
listidx++;
}
/* find the best suited statistics object for these attnums */
stat = choose_best_statistics(rel->statlist, STATS_EXT_MCV,
- list_attnums, list_length(clauses));
+ list_attnums, list_exprs,
+ list_length(clauses));
/*
* if no (additional) matching stats could be found then we've nothing
/* now filter the clauses to be estimated using the selected MCV */
stat_clauses = NIL;
- /* record which clauses are simple (single column) */
+ /* record which clauses are simple (single column or expression) */
simple_clauses = NULL;
listidx = 0;
foreach(l, clauses)
{
/*
- * If the clause is compatible with the selected statistics, mark
- * it as estimated and add it to the list to estimate.
+ * If the clause is not already estimated and is compatible with
+ * the selected statistics object (all attributes and expressions
+ * covered), mark it as estimated and add it to the list to
+ * estimate.
*/
- if (list_attnums[listidx] != NULL &&
- bms_is_subset(list_attnums[listidx], stat->keys))
+ if (!bms_is_member(listidx, *estimatedclauses) &&
+ bms_is_subset(list_attnums[listidx], stat->keys) &&
+ stat_covers_expressions(stat, list_exprs[listidx], NULL))
{
- if (bms_membership(list_attnums[listidx]) == BMS_SINGLETON)
+ /* record simple clauses (single column or expression) */
+ if ((list_attnums[listidx] == NULL &&
+ list_length(list_exprs[listidx]) == 1) ||
+ (list_exprs[listidx] == NIL &&
+ bms_membership(list_attnums[listidx]) == BMS_SINGLETON))
simple_clauses = bms_add_member(simple_clauses,
list_length(stat_clauses));
+ /* add clause to list and mark as estimated */
stat_clauses = lappend(stat_clauses, (Node *) lfirst(l));
*estimatedclauses = bms_add_member(*estimatedclauses, listidx);
bms_free(list_attnums[listidx]);
list_attnums[listidx] = NULL;
+
+ list_free(list_exprs[listidx]);
+ list_exprs[listidx] = NULL;
}
listidx++;
}
/*
- * examine_opclause_expression
- * Split expression into Var and Const parts.
+ * examine_opclause_args
+ * Split an operator expression's arguments into Expr and Const parts.
*
- * Attempts to match the arguments to either (Var op Const) or (Const op Var),
- * possibly with a RelabelType on top. When the expression matches this form,
- * returns true, otherwise returns false.
+ * Attempts to match the arguments to either (Expr op Const) or (Const op
+ * Expr), possibly with a RelabelType on top. When the expression matches this
+ * form, returns true, otherwise returns false.
*
- * Optionally returns pointers to the extracted Var/Const nodes, when passed
- * non-null pointers (varp, cstp and varonleftp). The varonleftp flag specifies
- * on which side of the operator we found the Var node.
+ * Optionally returns pointers to the extracted Expr/Const nodes, when passed
+ * non-null pointers (exprp, cstp and expronleftp). The expronleftp flag
+ * specifies on which side of the operator we found the expression node.
*/
bool
-examine_clause_args(List *args, Var **varp, Const **cstp, bool *varonleftp)
+examine_opclause_args(List *args, Node **exprp, Const **cstp,
+ bool *expronleftp)
{
- Var *var;
+ Node *expr;
Const *cst;
- bool varonleft;
+ bool expronleft;
Node *leftop,
*rightop;
if (IsA(rightop, RelabelType))
rightop = (Node *) ((RelabelType *) rightop)->arg;
- if (IsA(leftop, Var) && IsA(rightop, Const))
+ if (IsA(rightop, Const))
{
- var = (Var *) leftop;
+ expr = (Node *) leftop;
cst = (Const *) rightop;
- varonleft = true;
+ expronleft = true;
}
- else if (IsA(leftop, Const) && IsA(rightop, Var))
+ else if (IsA(leftop, Const))
{
- var = (Var *) rightop;
+ expr = (Node *) rightop;
cst = (Const *) leftop;
- varonleft = false;
+ expronleft = false;
}
else
return false;
/* return pointers to the extracted parts if requested */
- if (varp)
- *varp = var;
+ if (exprp)
+ *exprp = expr;
if (cstp)
*cstp = cst;
- if (varonleftp)
- *varonleftp = varonleft;
+ if (expronleftp)
+ *expronleftp = expronleft;
return true;
}
+
+
+/*
+ * Compute statistics about expressions of a relation.
+ */
+static void
+compute_expr_stats(Relation onerel, double totalrows,
+ AnlExprData *exprdata, int nexprs,
+ HeapTuple *rows, int numrows)
+{
+ MemoryContext expr_context,
+ old_context;
+ int ind,
+ i;
+
+ expr_context = AllocSetContextCreate(CurrentMemoryContext,
+ "Analyze Expression",
+ ALLOCSET_DEFAULT_SIZES);
+ old_context = MemoryContextSwitchTo(expr_context);
+
+ for (ind = 0; ind < nexprs; ind++)
+ {
+ AnlExprData *thisdata = &exprdata[ind];
+ VacAttrStats *stats = thisdata->vacattrstat;
+ Node *expr = thisdata->expr;
+ TupleTableSlot *slot;
+ EState *estate;
+ ExprContext *econtext;
+ Datum *exprvals;
+ bool *exprnulls;
+ ExprState *exprstate;
+ int tcnt;
+
+ /* Are we still in the main context? */
+ Assert(CurrentMemoryContext == expr_context);
+
+ /*
+ * Need an EState for evaluation of expressions. Create it in the
+ * per-expression context to be sure it gets cleaned up at the bottom
+ * of the loop.
+ */
+ estate = CreateExecutorState();
+ econtext = GetPerTupleExprContext(estate);
+
+ /* Set up expression evaluation state */
+ exprstate = ExecPrepareExpr((Expr *) expr, estate);
+
+ /* Need a slot to hold the current heap tuple, too */
+ slot = MakeSingleTupleTableSlot(RelationGetDescr(onerel),
+ &TTSOpsHeapTuple);
+
+ /* Arrange for econtext's scan tuple to be the tuple under test */
+ econtext->ecxt_scantuple = slot;
+
+ /* Compute and save expression values */
+ exprvals = (Datum *) palloc(numrows * sizeof(Datum));
+ exprnulls = (bool *) palloc(numrows * sizeof(bool));
+
+ tcnt = 0;
+ for (i = 0; i < numrows; i++)
+ {
+ Datum datum;
+ bool isnull;
+
+ /*
+ * Reset the per-tuple context each time, to reclaim any cruft
+ * left behind by evaluating the statistics expressions.
+ */
+ ResetExprContext(econtext);
+
+ /* Set up for expression evaluation */
+ ExecStoreHeapTuple(rows[i], slot, false);
+
+ /*
+ * Evaluate the expression. We do this in the per-tuple context so
+ * as not to leak memory, and then copy the result into the
+ * context created at the beginning of this function.
+ */
+ datum = ExecEvalExprSwitchContext(exprstate,
+ GetPerTupleExprContext(estate),
+ &isnull);
+ if (isnull)
+ {
+ exprvals[tcnt] = (Datum) 0;
+ exprnulls[tcnt] = true;
+ }
+ else
+ {
+ /* Make sure we copy the data into the context. */
+ Assert(CurrentMemoryContext == expr_context);
+
+ exprvals[tcnt] = datumCopy(datum,
+ stats->attrtype->typbyval,
+ stats->attrtype->typlen);
+ exprnulls[tcnt] = false;
+ }
+
+ tcnt++;
+ }
+
+ /*
+ * Now we can compute the statistics for the expression columns.
+ *
+ * XXX Unlike compute_index_stats we don't need to switch and reset
+ * memory contexts here, because we're only computing stats for a
+ * single expression (and not iterating over many indexes), so we just
+ * do it in expr_context. Note that compute_stats copies the result
+ * into stats->anl_context, so it does not disappear.
+ */
+ if (tcnt > 0)
+ {
+ AttributeOpts *aopt =
+ get_attribute_options(stats->attr->attrelid,
+ stats->attr->attnum);
+
+ stats->exprvals = exprvals;
+ stats->exprnulls = exprnulls;
+ stats->rowstride = 1;
+ stats->compute_stats(stats,
+ expr_fetch_func,
+ tcnt,
+ tcnt);
+
+ /*
+ * If the n_distinct option is specified, it overrides the above
+ * computation.
+ */
+ if (aopt != NULL && aopt->n_distinct != 0.0)
+ stats->stadistinct = aopt->n_distinct;
+ }
+
+ /* And clean up */
+ MemoryContextSwitchTo(expr_context);
+
+ ExecDropSingleTupleTableSlot(slot);
+ FreeExecutorState(estate);
+ MemoryContextResetAndDeleteChildren(expr_context);
+ }
+
+ MemoryContextSwitchTo(old_context);
+ MemoryContextDelete(expr_context);
+}
+
+
+/*
+ * Fetch function for analyzing statistics object expressions.
+ *
+ * We have not bothered to construct tuples from the data, instead the data
+ * is just in Datum arrays.
+ */
+static Datum
+expr_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull)
+{
+ int i;
+
+ /* exprvals and exprnulls are already offset for proper column */
+ i = rownum * stats->rowstride;
+ *isNull = stats->exprnulls[i];
+ return stats->exprvals[i];
+}
+
+/*
+ * Build analyze data for a list of expressions. As this is not tied
+ * directly to a relation (table or index), we have to fake some of
+ * the fields in examine_expression().
+ */
+static AnlExprData *
+build_expr_data(List *exprs, int stattarget)
+{
+ int idx;
+ int nexprs = list_length(exprs);
+ AnlExprData *exprdata;
+ ListCell *lc;
+
+ exprdata = (AnlExprData *) palloc0(nexprs * sizeof(AnlExprData));
+
+ idx = 0;
+ foreach(lc, exprs)
+ {
+ Node *expr = (Node *) lfirst(lc);
+ AnlExprData *thisdata = &exprdata[idx];
+
+ thisdata->expr = expr;
+ thisdata->vacattrstat = examine_expression(expr, stattarget);
+ idx++;
+ }
+
+ return exprdata;
+}
+
+/* form an array of pg_statistic rows (per update_attstats) */
+static Datum
+serialize_expr_stats(AnlExprData *exprdata, int nexprs)
+{
+ int exprno;
+ Oid typOid;
+ Relation sd;
+
+ ArrayBuildState *astate = NULL;
+
+ sd = table_open(StatisticRelationId, RowExclusiveLock);
+
+ /* lookup OID of composite type for pg_statistic */
+ typOid = get_rel_type_id(StatisticRelationId);
+ if (!OidIsValid(typOid))
+ ereport(ERROR,
+ (errcode(ERRCODE_WRONG_OBJECT_TYPE),
+ errmsg("relation \"pg_statistic\" does not have a composite type")));
+
+ for (exprno = 0; exprno < nexprs; exprno++)
+ {
+ int i,
+ k;
+ VacAttrStats *stats = exprdata[exprno].vacattrstat;
+
+ Datum values[Natts_pg_statistic];
+ bool nulls[Natts_pg_statistic];
+ HeapTuple stup;
+
+ if (!stats->stats_valid)
+ {
+ astate = accumArrayResult(astate,
+ (Datum) 0,
+ true,
+ typOid,
+ CurrentMemoryContext);
+ continue;
+ }
+
+ /*
+ * Construct a new pg_statistic tuple
+ */
+ for (i = 0; i < Natts_pg_statistic; ++i)
+ {
+ nulls[i] = false;
+ }
+
+ values[Anum_pg_statistic_starelid - 1] = ObjectIdGetDatum(InvalidOid);
+ values[Anum_pg_statistic_staattnum - 1] = Int16GetDatum(InvalidAttrNumber);
+ values[Anum_pg_statistic_stainherit - 1] = BoolGetDatum(false);
+ values[Anum_pg_statistic_stanullfrac - 1] = Float4GetDatum(stats->stanullfrac);
+ values[Anum_pg_statistic_stawidth - 1] = Int32GetDatum(stats->stawidth);
+ values[Anum_pg_statistic_stadistinct - 1] = Float4GetDatum(stats->stadistinct);
+ i = Anum_pg_statistic_stakind1 - 1;
+ for (k = 0; k < STATISTIC_NUM_SLOTS; k++)
+ {
+ values[i++] = Int16GetDatum(stats->stakind[k]); /* stakindN */
+ }
+ i = Anum_pg_statistic_staop1 - 1;
+ for (k = 0; k < STATISTIC_NUM_SLOTS; k++)
+ {
+ values[i++] = ObjectIdGetDatum(stats->staop[k]); /* staopN */
+ }
+ i = Anum_pg_statistic_stacoll1 - 1;
+ for (k = 0; k < STATISTIC_NUM_SLOTS; k++)
+ {
+ values[i++] = ObjectIdGetDatum(stats->stacoll[k]); /* stacollN */
+ }
+ i = Anum_pg_statistic_stanumbers1 - 1;
+ for (k = 0; k < STATISTIC_NUM_SLOTS; k++)
+ {
+ int nnum = stats->numnumbers[k];
+
+ if (nnum > 0)
+ {
+ int n;
+ Datum *numdatums = (Datum *) palloc(nnum * sizeof(Datum));
+ ArrayType *arry;
+
+ for (n = 0; n < nnum; n++)
+ numdatums[n] = Float4GetDatum(stats->stanumbers[k][n]);
+ /* XXX knows more than it should about type float4: */
+ arry = construct_array(numdatums, nnum,
+ FLOAT4OID,
+ sizeof(float4), true, TYPALIGN_INT);
+ values[i++] = PointerGetDatum(arry); /* stanumbersN */
+ }
+ else
+ {
+ nulls[i] = true;
+ values[i++] = (Datum) 0;
+ }
+ }
+ i = Anum_pg_statistic_stavalues1 - 1;
+ for (k = 0; k < STATISTIC_NUM_SLOTS; k++)
+ {
+ if (stats->numvalues[k] > 0)
+ {
+ ArrayType *arry;
+
+ arry = construct_array(stats->stavalues[k],
+ stats->numvalues[k],
+ stats->statypid[k],
+ stats->statyplen[k],
+ stats->statypbyval[k],
+ stats->statypalign[k]);
+ values[i++] = PointerGetDatum(arry); /* stavaluesN */
+ }
+ else
+ {
+ nulls[i] = true;
+ values[i++] = (Datum) 0;
+ }
+ }
+
+ stup = heap_form_tuple(RelationGetDescr(sd), values, nulls);
+
+ astate = accumArrayResult(astate,
+ heap_copy_tuple_as_datum(stup, RelationGetDescr(sd)),
+ false,
+ typOid,
+ CurrentMemoryContext);
+ }
+
+ table_close(sd, RowExclusiveLock);
+
+ return makeArrayResult(astate, CurrentMemoryContext);
+}
+
+/*
+ * Loads pg_statistic record from expression statistics for expression
+ * identified by the supplied index.
+ */
+HeapTuple
+statext_expressions_load(Oid stxoid, int idx)
+{
+ bool isnull;
+ Datum value;
+ HeapTuple htup;
+ ExpandedArrayHeader *eah;
+ HeapTupleHeader td;
+ HeapTupleData tmptup;
+ HeapTuple tup;
+
+ htup = SearchSysCache1(STATEXTDATASTXOID, ObjectIdGetDatum(stxoid));
+ if (!HeapTupleIsValid(htup))
+ elog(ERROR, "cache lookup failed for statistics object %u", stxoid);
+
+ value = SysCacheGetAttr(STATEXTDATASTXOID, htup,
+ Anum_pg_statistic_ext_data_stxdexpr, &isnull);
+ if (isnull)
+ elog(ERROR,
+ "requested statistics kind \"%c\" is not yet built for statistics object %u",
+ STATS_EXT_DEPENDENCIES, stxoid);
+
+ eah = DatumGetExpandedArray(value);
+
+ deconstruct_expanded_array(eah);
+
+ td = DatumGetHeapTupleHeader(eah->dvalues[idx]);
+
+ /* Build a temporary HeapTuple control structure */
+ tmptup.t_len = HeapTupleHeaderGetDatumLength(td);
+ tmptup.t_data = td;
+
+ tup = heap_copytuple(&tmptup);
+
+ ReleaseSysCache(htup);
+
+ return tup;
+}
+
+/*
+ * Evaluate the expressions, so that we can use the results to build
+ * all the requested statistics types. This matters especially for
+ * expensive expressions, of course.
+ */
+static StatsBuildData *
+make_build_data(Relation rel, StatExtEntry *stat, int numrows, HeapTuple *rows,
+ VacAttrStats **stats, int stattarget)
+{
+ /* evaluated expressions */
+ StatsBuildData *result;
+ char *ptr;
+ Size len;
+
+ int i;
+ int k;
+ int idx;
+ TupleTableSlot *slot;
+ EState *estate;
+ ExprContext *econtext;
+ List *exprstates = NIL;
+ int nkeys = bms_num_members(stat->columns) + list_length(stat->exprs);
+ ListCell *lc;
+
+ /* allocate everything as a single chunk, so we can free it easily */
+ len = MAXALIGN(sizeof(StatsBuildData));
+ len += MAXALIGN(sizeof(AttrNumber) * nkeys); /* attnums */
+ len += MAXALIGN(sizeof(VacAttrStats *) * nkeys); /* stats */
+
+ /* values */
+ len += MAXALIGN(sizeof(Datum *) * nkeys);
+ len += nkeys * MAXALIGN(sizeof(Datum) * numrows);
+
+ /* nulls */
+ len += MAXALIGN(sizeof(bool *) * nkeys);
+ len += nkeys * MAXALIGN(sizeof(bool) * numrows);
+
+ ptr = palloc(len);
+
+ /* set the pointers */
+ result = (StatsBuildData *) ptr;
+ ptr += MAXALIGN(sizeof(StatsBuildData));
+
+ /* attnums */
+ result->attnums = (AttrNumber *) ptr;
+ ptr += MAXALIGN(sizeof(AttrNumber) * nkeys);
+
+ /* stats */
+ result->stats = (VacAttrStats **) ptr;
+ ptr += MAXALIGN(sizeof(VacAttrStats *) * nkeys);
+
+ /* values */
+ result->values = (Datum **) ptr;
+ ptr += MAXALIGN(sizeof(Datum *) * nkeys);
+
+ /* nulls */
+ result->nulls = (bool **) ptr;
+ ptr += MAXALIGN(sizeof(bool *) * nkeys);
+
+ for (i = 0; i < nkeys; i++)
+ {
+ result->values[i] = (Datum *) ptr;
+ ptr += MAXALIGN(sizeof(Datum) * numrows);
+
+ result->nulls[i] = (bool *) ptr;
+ ptr += MAXALIGN(sizeof(bool) * numrows);
+ }
+
+ Assert((ptr - (char *) result) == len);
+
+ /* we have it allocated, so let's fill the values */
+ result->nattnums = nkeys;
+ result->numrows = numrows;
+
+ /* fill the attribute info - first attributes, then expressions */
+ idx = 0;
+ k = -1;
+ while ((k = bms_next_member(stat->columns, k)) >= 0)
+ {
+ result->attnums[idx] = k;
+ result->stats[idx] = stats[idx];
+
+ idx++;
+ }
+
+ k = -1;
+ foreach(lc, stat->exprs)
+ {
+ Node *expr = (Node *) lfirst(lc);
+
+ result->attnums[idx] = k;
+ result->stats[idx] = examine_expression(expr, stattarget);
+
+ idx++;
+ k--;
+ }
+
+ /* first extract values for all the regular attributes */
+ for (i = 0; i < numrows; i++)
+ {
+ idx = 0;
+ k = -1;
+ while ((k = bms_next_member(stat->columns, k)) >= 0)
+ {
+ result->values[idx][i] = heap_getattr(rows[i], k,
+ result->stats[idx]->tupDesc,
+ &result->nulls[idx][i]);
+
+ idx++;
+ }
+ }
+
+ /* Need an EState for evaluation expressions. */
+ estate = CreateExecutorState();
+ econtext = GetPerTupleExprContext(estate);
+
+ /* Need a slot to hold the current heap tuple, too */
+ slot = MakeSingleTupleTableSlot(RelationGetDescr(rel),
+ &TTSOpsHeapTuple);
+
+ /* Arrange for econtext's scan tuple to be the tuple under test */
+ econtext->ecxt_scantuple = slot;
+
+ /* Set up expression evaluation state */
+ exprstates = ExecPrepareExprList(stat->exprs, estate);
+
+ for (i = 0; i < numrows; i++)
+ {
+ /*
+ * Reset the per-tuple context each time, to reclaim any cruft left
+ * behind by evaluating the statistics object expressions.
+ */
+ ResetExprContext(econtext);
+
+ /* Set up for expression evaluation */
+ ExecStoreHeapTuple(rows[i], slot, false);
+
+ idx = bms_num_members(stat->columns);
+ foreach(lc, exprstates)
+ {
+ Datum datum;
+ bool isnull;
+ ExprState *exprstate = (ExprState *) lfirst(lc);
+
+ /*
+ * XXX This probably leaks memory. Maybe we should use
+ * ExecEvalExprSwitchContext but then we need to copy the result
+ * somewhere else.
+ */
+ datum = ExecEvalExpr(exprstate,
+ GetPerTupleExprContext(estate),
+ &isnull);
+ if (isnull)
+ {
+ result->values[idx][i] = (Datum) 0;
+ result->nulls[idx][i] = true;
+ }
+ else
+ {
+ result->values[idx][i] = (Datum) datum;
+ result->nulls[idx][i] = false;
+ }
+
+ idx++;
+ }
+ }
+
+ ExecDropSingleTupleTableSlot(slot);
+ FreeExecutorState(estate);
+
+ return result;
+}
((ndims) * sizeof(DimensionInfo)) + \
((nitems) * ITEM_SIZE(ndims)))
-static MultiSortSupport build_mss(VacAttrStats **stats, int numattrs);
+static MultiSortSupport build_mss(StatsBuildData *data);
static SortItem *build_distinct_groups(int numrows, SortItem *items,
MultiSortSupport mss, int *ndistinct);
*
*/
MCVList *
-statext_mcv_build(int numrows, HeapTuple *rows, Bitmapset *attrs,
- VacAttrStats **stats, double totalrows, int stattarget)
+statext_mcv_build(StatsBuildData *data, double totalrows, int stattarget)
{
int i,
numattrs,
+ numrows,
ngroups,
nitems;
- AttrNumber *attnums;
double mincount;
SortItem *items;
SortItem *groups;
MCVList *mcvlist = NULL;
MultiSortSupport mss;
- attnums = build_attnums_array(attrs, &numattrs);
-
/* comparator for all the columns */
- mss = build_mss(stats, numattrs);
+ mss = build_mss(data);
/* sort the rows */
- items = build_sorted_items(numrows, &nitems, rows, stats[0]->tupDesc,
- mss, numattrs, attnums);
+ items = build_sorted_items(data, &nitems, mss,
+ data->nattnums, data->attnums);
if (!items)
return NULL;
+ /* for convenience */
+ numattrs = data->nattnums;
+ numrows = data->numrows;
+
/* transform the sorted rows into groups (sorted by frequency) */
groups = build_distinct_groups(nitems, items, mss, &ngroups);
/* store info about data type OIDs */
for (i = 0; i < numattrs; i++)
- mcvlist->types[i] = stats[i]->attrtypid;
+ mcvlist->types[i] = data->stats[i]->attrtypid;
/* Copy the first chunk of groups into the result. */
for (i = 0; i < nitems; i++)
* build MultiSortSupport for the attributes passed in attrs
*/
static MultiSortSupport
-build_mss(VacAttrStats **stats, int numattrs)
+build_mss(StatsBuildData *data)
{
int i;
+ int numattrs = data->nattnums;
/* Sort by multiple columns (using array of SortSupport) */
MultiSortSupport mss = multi_sort_init(numattrs);
/* prepare the sort functions for all the attributes */
for (i = 0; i < numattrs; i++)
{
- VacAttrStats *colstat = stats[i];
+ VacAttrStats *colstat = data->stats[i];
TypeCacheEntry *type;
type = lookup_type_cache(colstat->attrtypid, TYPECACHE_LT_OPR);
return byteasend(fcinfo);
}
+/*
+ * match the attribute/expression to a dimension of the statistic
+ *
+ * Match the attribute/expression to statistics dimension. Optionally
+ * determine the collation.
+ */
+static int
+mcv_match_expression(Node *expr, Bitmapset *keys, List *exprs, Oid *collid)
+{
+ int idx = -1;
+
+ if (IsA(expr, Var))
+ {
+ /* simple Var, so just lookup using varattno */
+ Var *var = (Var *) expr;
+
+ if (collid)
+ *collid = var->varcollid;
+
+ idx = bms_member_index(keys, var->varattno);
+
+ /* make sure the index is valid */
+ Assert((idx >= 0) && (idx <= bms_num_members(keys)));
+ }
+ else
+ {
+ ListCell *lc;
+
+ /* expressions are stored after the simple columns */
+ idx = bms_num_members(keys);
+
+ if (collid)
+ *collid = exprCollation(expr);
+
+ /* expression - lookup in stats expressions */
+ foreach(lc, exprs)
+ {
+ Node *stat_expr = (Node *) lfirst(lc);
+
+ if (equal(expr, stat_expr))
+ break;
+
+ idx++;
+ }
+
+ /* make sure the index is valid */
+ Assert((idx >= bms_num_members(keys)) &&
+ (idx <= bms_num_members(keys) + list_length(exprs)));
+ }
+
+ return idx;
+}
+
/*
* mcv_get_match_bitmap
* Evaluate clauses using the MCV list, and update the match bitmap.
*/
static bool *
mcv_get_match_bitmap(PlannerInfo *root, List *clauses,
- Bitmapset *keys, MCVList *mcvlist, bool is_or)
+ Bitmapset *keys, List *exprs,
+ MCVList *mcvlist, bool is_or)
{
int i;
ListCell *l;
OpExpr *expr = (OpExpr *) clause;
FmgrInfo opproc;
- /* valid only after examine_clause_args returns true */
- Var *var;
+ /* valid only after examine_opclause_args returns true */
+ Node *clause_expr;
Const *cst;
- bool varonleft;
+ bool expronleft;
+ int idx;
+ Oid collid;
fmgr_info(get_opcode(expr->opno), &opproc);
- /* extract the var and const from the expression */
- if (examine_clause_args(expr->args, &var, &cst, &varonleft))
+ /* extract the var/expr and const from the expression */
+ if (!examine_opclause_args(expr->args, &clause_expr, &cst, &expronleft))
+ elog(ERROR, "incompatible clause");
+
+ /* match the attribute/expression to a dimension of the statistic */
+ idx = mcv_match_expression(clause_expr, keys, exprs, &collid);
+
+ /*
+ * Walk through the MCV items and evaluate the current clause. We
+ * can skip items that were already ruled out, and terminate if
+ * there are no remaining MCV items that might possibly match.
+ */
+ for (i = 0; i < mcvlist->nitems; i++)
{
- int idx;
+ bool match = true;
+ MCVItem *item = &mcvlist->items[i];
- /* match the attribute to a dimension of the statistic */
- idx = bms_member_index(keys, var->varattno);
+ Assert(idx >= 0);
/*
- * Walk through the MCV items and evaluate the current clause.
- * We can skip items that were already ruled out, and
- * terminate if there are no remaining MCV items that might
- * possibly match.
+ * When the MCV item or the Const value is NULL we can treat
+ * this as a mismatch. We must not call the operator because
+ * of strictness.
*/
- for (i = 0; i < mcvlist->nitems; i++)
+ if (item->isnull[idx] || cst->constisnull)
{
- bool match = true;
- MCVItem *item = &mcvlist->items[i];
-
- /*
- * When the MCV item or the Const value is NULL we can
- * treat this as a mismatch. We must not call the operator
- * because of strictness.
- */
- if (item->isnull[idx] || cst->constisnull)
- {
- matches[i] = RESULT_MERGE(matches[i], is_or, false);
- continue;
- }
+ matches[i] = RESULT_MERGE(matches[i], is_or, false);
+ continue;
+ }
- /*
- * Skip MCV items that can't change result in the bitmap.
- * Once the value gets false for AND-lists, or true for
- * OR-lists, we don't need to look at more clauses.
- */
- if (RESULT_IS_FINAL(matches[i], is_or))
- continue;
+ /*
+ * Skip MCV items that can't change result in the bitmap. Once
+ * the value gets false for AND-lists, or true for OR-lists,
+ * we don't need to look at more clauses.
+ */
+ if (RESULT_IS_FINAL(matches[i], is_or))
+ continue;
- /*
- * First check whether the constant is below the lower
- * boundary (in that case we can skip the bucket, because
- * there's no overlap).
- *
- * We don't store collations used to build the statistics,
- * but we can use the collation for the attribute itself,
- * as stored in varcollid. We do reset the statistics
- * after a type change (including collation change), so
- * this is OK. We may need to relax this after allowing
- * extended statistics on expressions.
- */
- if (varonleft)
- match = DatumGetBool(FunctionCall2Coll(&opproc,
- var->varcollid,
- item->values[idx],
- cst->constvalue));
- else
- match = DatumGetBool(FunctionCall2Coll(&opproc,
- var->varcollid,
- cst->constvalue,
- item->values[idx]));
-
- /* update the match bitmap with the result */
- matches[i] = RESULT_MERGE(matches[i], is_or, match);
- }
+ /*
+ * First check whether the constant is below the lower
+ * boundary (in that case we can skip the bucket, because
+ * there's no overlap).
+ *
+ * We don't store collations used to build the statistics, but
+ * we can use the collation for the attribute itself, as
+ * stored in varcollid. We do reset the statistics after a
+ * type change (including collation change), so this is OK.
+ * For expressions we use the collation extracted from the
+ * expression itself.
+ */
+ if (expronleft)
+ match = DatumGetBool(FunctionCall2Coll(&opproc,
+ collid,
+ item->values[idx],
+ cst->constvalue));
+ else
+ match = DatumGetBool(FunctionCall2Coll(&opproc,
+ collid,
+ cst->constvalue,
+ item->values[idx]));
+
+ /* update the match bitmap with the result */
+ matches[i] = RESULT_MERGE(matches[i], is_or, match);
}
}
else if (IsA(clause, ScalarArrayOpExpr))
ScalarArrayOpExpr *expr = (ScalarArrayOpExpr *) clause;
FmgrInfo opproc;
- /* valid only after examine_clause_args returns true */
- Var *var;
+ /* valid only after examine_opclause_args returns true */
+ Node *clause_expr;
Const *cst;
- bool varonleft;
+ bool expronleft;
+ Oid collid;
+ int idx;
+
+ /* array evaluation */
+ ArrayType *arrayval;
+ int16 elmlen;
+ bool elmbyval;
+ char elmalign;
+ int num_elems;
+ Datum *elem_values;
+ bool *elem_nulls;
fmgr_info(get_opcode(expr->opno), &opproc);
- /* extract the var and const from the expression */
- if (examine_clause_args(expr->args, &var, &cst, &varonleft))
+ /* extract the var/expr and const from the expression */
+ if (!examine_opclause_args(expr->args, &clause_expr, &cst, &expronleft))
+ elog(ERROR, "incompatible clause");
+
+ /* ScalarArrayOpExpr has the Var always on the left */
+ Assert(expronleft);
+
+ /* XXX what if (cst->constisnull == NULL)? */
+ if (!cst->constisnull)
{
- int idx;
+ arrayval = DatumGetArrayTypeP(cst->constvalue);
+ get_typlenbyvalalign(ARR_ELEMTYPE(arrayval),
+ &elmlen, &elmbyval, &elmalign);
+ deconstruct_array(arrayval,
+ ARR_ELEMTYPE(arrayval),
+ elmlen, elmbyval, elmalign,
+ &elem_values, &elem_nulls, &num_elems);
+ }
- ArrayType *arrayval;
- int16 elmlen;
- bool elmbyval;
- char elmalign;
- int num_elems;
- Datum *elem_values;
- bool *elem_nulls;
+ /* match the attribute/expression to a dimension of the statistic */
+ idx = mcv_match_expression(clause_expr, keys, exprs, &collid);
- /* ScalarArrayOpExpr has the Var always on the left */
- Assert(varonleft);
+ /*
+ * Walk through the MCV items and evaluate the current clause. We
+ * can skip items that were already ruled out, and terminate if
+ * there are no remaining MCV items that might possibly match.
+ */
+ for (i = 0; i < mcvlist->nitems; i++)
+ {
+ int j;
+ bool match = (expr->useOr ? false : true);
+ MCVItem *item = &mcvlist->items[i];
- if (!cst->constisnull)
+ /*
+ * When the MCV item or the Const value is NULL we can treat
+ * this as a mismatch. We must not call the operator because
+ * of strictness.
+ */
+ if (item->isnull[idx] || cst->constisnull)
{
- arrayval = DatumGetArrayTypeP(cst->constvalue);
- get_typlenbyvalalign(ARR_ELEMTYPE(arrayval),
- &elmlen, &elmbyval, &elmalign);
- deconstruct_array(arrayval,
- ARR_ELEMTYPE(arrayval),
- elmlen, elmbyval, elmalign,
- &elem_values, &elem_nulls, &num_elems);
+ matches[i] = RESULT_MERGE(matches[i], is_or, false);
+ continue;
}
- /* match the attribute to a dimension of the statistic */
- idx = bms_member_index(keys, var->varattno);
-
/*
- * Walk through the MCV items and evaluate the current clause.
- * We can skip items that were already ruled out, and
- * terminate if there are no remaining MCV items that might
- * possibly match.
+ * Skip MCV items that can't change result in the bitmap. Once
+ * the value gets false for AND-lists, or true for OR-lists,
+ * we don't need to look at more clauses.
*/
- for (i = 0; i < mcvlist->nitems; i++)
+ if (RESULT_IS_FINAL(matches[i], is_or))
+ continue;
+
+ for (j = 0; j < num_elems; j++)
{
- int j;
- bool match = (expr->useOr ? false : true);
- MCVItem *item = &mcvlist->items[i];
+ Datum elem_value = elem_values[j];
+ bool elem_isnull = elem_nulls[j];
+ bool elem_match;
- /*
- * When the MCV item or the Const value is NULL we can
- * treat this as a mismatch. We must not call the operator
- * because of strictness.
- */
- if (item->isnull[idx] || cst->constisnull)
+ /* NULL values always evaluate as not matching. */
+ if (elem_isnull)
{
- matches[i] = RESULT_MERGE(matches[i], is_or, false);
+ match = RESULT_MERGE(match, expr->useOr, false);
continue;
}
/*
- * Skip MCV items that can't change result in the bitmap.
- * Once the value gets false for AND-lists, or true for
- * OR-lists, we don't need to look at more clauses.
+ * Stop evaluating the array elements once we reach match
+ * value that can't change - ALL() is the same as
+ * AND-list, ANY() is the same as OR-list.
*/
- if (RESULT_IS_FINAL(matches[i], is_or))
- continue;
+ if (RESULT_IS_FINAL(match, expr->useOr))
+ break;
- for (j = 0; j < num_elems; j++)
- {
- Datum elem_value = elem_values[j];
- bool elem_isnull = elem_nulls[j];
- bool elem_match;
-
- /* NULL values always evaluate as not matching. */
- if (elem_isnull)
- {
- match = RESULT_MERGE(match, expr->useOr, false);
- continue;
- }
-
- /*
- * Stop evaluating the array elements once we reach
- * match value that can't change - ALL() is the same
- * as AND-list, ANY() is the same as OR-list.
- */
- if (RESULT_IS_FINAL(match, expr->useOr))
- break;
-
- elem_match = DatumGetBool(FunctionCall2Coll(&opproc,
- var->varcollid,
- item->values[idx],
- elem_value));
-
- match = RESULT_MERGE(match, expr->useOr, elem_match);
- }
+ elem_match = DatumGetBool(FunctionCall2Coll(&opproc,
+ collid,
+ item->values[idx],
+ elem_value));
- /* update the match bitmap with the result */
- matches[i] = RESULT_MERGE(matches[i], is_or, match);
+ match = RESULT_MERGE(match, expr->useOr, elem_match);
}
+
+ /* update the match bitmap with the result */
+ matches[i] = RESULT_MERGE(matches[i], is_or, match);
}
}
else if (IsA(clause, NullTest))
{
NullTest *expr = (NullTest *) clause;
- Var *var = (Var *) (expr->arg);
+ Node *clause_expr = (Node *) (expr->arg);
- /* match the attribute to a dimension of the statistic */
- int idx = bms_member_index(keys, var->varattno);
+ /* match the attribute/expression to a dimension of the statistic */
+ int idx = mcv_match_expression(clause_expr, keys, exprs, NULL);
/*
* Walk through the MCV items and evaluate the current clause. We
Assert(list_length(bool_clauses) >= 2);
/* build the match bitmap for the OR-clauses */
- bool_matches = mcv_get_match_bitmap(root, bool_clauses, keys,
+ bool_matches = mcv_get_match_bitmap(root, bool_clauses, keys, exprs,
mcvlist, is_orclause(clause));
/*
Assert(list_length(not_args) == 1);
/* build the match bitmap for the NOT-clause */
- not_matches = mcv_get_match_bitmap(root, not_args, keys,
+ not_matches = mcv_get_match_bitmap(root, not_args, keys, exprs,
mcvlist, false);
/*
mcv = statext_mcv_load(stat->statOid);
/* build a match bitmap for the clauses */
- matches = mcv_get_match_bitmap(root, clauses, stat->keys, mcv, false);
+ matches = mcv_get_match_bitmap(root, clauses, stat->keys, stat->exprs,
+ mcv, false);
/* sum frequencies for all the matching MCV items */
*basesel = 0.0;
/* build the match bitmap for the new clause */
new_matches = mcv_get_match_bitmap(root, list_make1(clause), stat->keys,
- mcv, false);
+ stat->exprs, mcv, false);
/*
* Sum the frequencies for all the MCV items matching this clause and also
#include "utils/syscache.h"
#include "utils/typcache.h"
-static double ndistinct_for_combination(double totalrows, int numrows,
- HeapTuple *rows, VacAttrStats **stats,
+static double ndistinct_for_combination(double totalrows, StatsBuildData *data,
int k, int *combination);
static double estimate_ndistinct(double totalrows, int numrows, int d, int f1);
static int n_choose_k(int n, int k);
*
* This computes the ndistinct estimate using the same estimator used
* in analyze.c and then computes the coefficient.
+ *
+ * To handle expressions easily, we treat them as system attributes with
+ * negative attnums, and offset everything by number of expressions to
+ * allow using Bitmapsets.
*/
MVNDistinct *
-statext_ndistinct_build(double totalrows, int numrows, HeapTuple *rows,
- Bitmapset *attrs, VacAttrStats **stats)
+statext_ndistinct_build(double totalrows, StatsBuildData *data)
{
MVNDistinct *result;
int k;
int itemcnt;
- int numattrs = bms_num_members(attrs);
+ int numattrs = data->nattnums;
int numcombs = num_combinations(numattrs);
result = palloc(offsetof(MVNDistinct, items) +
MVNDistinctItem *item = &result->items[itemcnt];
int j;
- item->attrs = NULL;
+ item->attributes = palloc(sizeof(AttrNumber) * k);
+ item->nattributes = k;
+
+ /* translate the indexes to attnums */
for (j = 0; j < k; j++)
- item->attrs = bms_add_member(item->attrs,
- stats[combination[j]]->attr->attnum);
+ {
+ item->attributes[j] = data->attnums[combination[j]];
+
+ Assert(AttributeNumberIsValid(item->attributes[j]));
+ }
+
item->ndistinct =
- ndistinct_for_combination(totalrows, numrows, rows,
- stats, k, combination);
+ ndistinct_for_combination(totalrows, data, k, combination);
itemcnt++;
Assert(itemcnt <= result->nitems);
{
int nmembers;
- nmembers = bms_num_members(ndistinct->items[i].attrs);
+ nmembers = ndistinct->items[i].nattributes;
Assert(nmembers >= 2);
len += SizeOfItem(nmembers);
for (i = 0; i < ndistinct->nitems; i++)
{
MVNDistinctItem item = ndistinct->items[i];
- int nmembers = bms_num_members(item.attrs);
- int x;
+ int nmembers = item.nattributes;
memcpy(tmp, &item.ndistinct, sizeof(double));
tmp += sizeof(double);
memcpy(tmp, &nmembers, sizeof(int));
tmp += sizeof(int);
- x = -1;
- while ((x = bms_next_member(item.attrs, x)) >= 0)
- {
- AttrNumber value = (AttrNumber) x;
-
- memcpy(tmp, &value, sizeof(AttrNumber));
- tmp += sizeof(AttrNumber);
- }
+ memcpy(tmp, item.attributes, sizeof(AttrNumber) * nmembers);
+ tmp += nmembers * sizeof(AttrNumber);
/* protect against overflows */
Assert(tmp <= ((char *) output + len));
for (i = 0; i < ndistinct->nitems; i++)
{
MVNDistinctItem *item = &ndistinct->items[i];
- int nelems;
-
- item->attrs = NULL;
/* ndistinct value */
memcpy(&item->ndistinct, tmp, sizeof(double));
tmp += sizeof(double);
/* number of attributes */
- memcpy(&nelems, tmp, sizeof(int));
+ memcpy(&item->nattributes, tmp, sizeof(int));
tmp += sizeof(int);
- Assert((nelems >= 2) && (nelems <= STATS_MAX_DIMENSIONS));
+ Assert((item->nattributes >= 2) && (item->nattributes <= STATS_MAX_DIMENSIONS));
- while (nelems-- > 0)
- {
- AttrNumber attno;
+ item->attributes
+ = (AttrNumber *) palloc(item->nattributes * sizeof(AttrNumber));
- memcpy(&attno, tmp, sizeof(AttrNumber));
- tmp += sizeof(AttrNumber);
- item->attrs = bms_add_member(item->attrs, attno);
- }
+ memcpy(item->attributes, tmp, sizeof(AttrNumber) * item->nattributes);
+ tmp += sizeof(AttrNumber) * item->nattributes;
/* still within the bytea */
Assert(tmp <= ((char *) data + VARSIZE_ANY(data)));
for (i = 0; i < ndist->nitems; i++)
{
+ int j;
MVNDistinctItem item = ndist->items[i];
- int x = -1;
- bool first = true;
if (i > 0)
appendStringInfoString(&str, ", ");
- while ((x = bms_next_member(item.attrs, x)) >= 0)
+ for (j = 0; j < item.nattributes; j++)
{
- appendStringInfo(&str, "%s%d", first ? "\"" : ", ", x);
- first = false;
+ AttrNumber attnum = item.attributes[j];
+
+ appendStringInfo(&str, "%s%d", (j == 0) ? "\"" : ", ", attnum);
}
appendStringInfo(&str, "\": %d", (int) item.ndistinct);
}
* combination of multiple columns.
*/
static double
-ndistinct_for_combination(double totalrows, int numrows, HeapTuple *rows,
- VacAttrStats **stats, int k, int *combination)
+ndistinct_for_combination(double totalrows, StatsBuildData *data,
+ int k, int *combination)
{
int i,
j;
Datum *values;
SortItem *items;
MultiSortSupport mss;
+ int numrows = data->numrows;
mss = multi_sort_init(k);
*/
for (i = 0; i < k; i++)
{
- VacAttrStats *colstat = stats[combination[i]];
+ Oid typid;
TypeCacheEntry *type;
+ Oid collid = InvalidOid;
+ VacAttrStats *colstat = data->stats[combination[i]];
+
+ typid = colstat->attrtypid;
+ collid = colstat->attrcollid;
- type = lookup_type_cache(colstat->attrtypid, TYPECACHE_LT_OPR);
+ type = lookup_type_cache(typid, TYPECACHE_LT_OPR);
if (type->lt_opr == InvalidOid) /* shouldn't happen */
elog(ERROR, "cache lookup failed for ordering operator for type %u",
- colstat->attrtypid);
+ typid);
/* prepare the sort function for this dimension */
- multi_sort_add_dimension(mss, i, type->lt_opr, colstat->attrcollid);
+ multi_sort_add_dimension(mss, i, type->lt_opr, collid);
/* accumulate all the data for this dimension into the arrays */
for (j = 0; j < numrows; j++)
{
- items[j].values[i] =
- heap_getattr(rows[j],
- colstat->attr->attnum,
- colstat->tupDesc,
- &items[j].isnull[i]);
+ items[j].values[i] = data->values[combination[i]][j];
+ items[j].isnull[i] = data->nulls[combination[i]][j];
}
}
break;
case T_CreateStatsStmt:
- address = CreateStatistics((CreateStatsStmt *) parsetree);
+ {
+ Oid relid;
+ CreateStatsStmt *stmt = (CreateStatsStmt *) parsetree;
+ RangeVar *rel = (RangeVar *) linitial(stmt->relations);
+
+ if (!IsA(rel, RangeVar))
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("only a single relation is allowed in CREATE STATISTICS")));
+
+ /*
+ * CREATE STATISTICS will influence future execution plans
+ * but does not interfere with currently executing plans.
+ * So it should be enough to take ShareUpdateExclusiveLock
+ * on relation, conflicting with ANALYZE and other DDL
+ * that sets statistical information, but not with normal
+ * queries.
+ *
+ * XXX RangeVarCallbackOwnsRelation not needed here, to
+ * keep the same behavior as before.
+ */
+ relid = RangeVarGetRelid(rel, ShareUpdateExclusiveLock, false);
+
+ /* Run parse analysis ... */
+ stmt = transformStatsStmt(relid, stmt, queryString);
+
+ address = CreateStatistics(stmt);
+ }
break;
case T_AlterStatsStmt:
bool attrsOnly, bool keysOnly,
bool showTblSpc, bool inherits,
int prettyFlags, bool missing_ok);
-static char *pg_get_statisticsobj_worker(Oid statextid, bool missing_ok);
+static char *pg_get_statisticsobj_worker(Oid statextid, bool columns_only,
+ bool missing_ok);
static char *pg_get_partkeydef_worker(Oid relid, int prettyFlags,
bool attrsOnly, bool missing_ok);
static char *pg_get_constraintdef_worker(Oid constraintId, bool fullCommand,
Oid statextid = PG_GETARG_OID(0);
char *res;
- res = pg_get_statisticsobj_worker(statextid, true);
+ res = pg_get_statisticsobj_worker(statextid, false, true);
+
+ if (res == NULL)
+ PG_RETURN_NULL();
+
+ PG_RETURN_TEXT_P(string_to_text(res));
+}
+
+/*
+ * Internal version for use by ALTER TABLE.
+ * Includes a tablespace clause in the result.
+ * Returns a palloc'd C string; no pretty-printing.
+ */
+char *
+pg_get_statisticsobjdef_string(Oid statextid)
+{
+ return pg_get_statisticsobj_worker(statextid, false, false);
+}
+
+/*
+ * pg_get_statisticsobjdef_columns
+ * Get columns and expressions for an extended statistics object
+ */
+Datum
+pg_get_statisticsobjdef_columns(PG_FUNCTION_ARGS)
+{
+ Oid statextid = PG_GETARG_OID(0);
+ char *res;
+
+ res = pg_get_statisticsobj_worker(statextid, true, true);
if (res == NULL)
PG_RETURN_NULL();
* Internal workhorse to decompile an extended statistics object.
*/
static char *
-pg_get_statisticsobj_worker(Oid statextid, bool missing_ok)
+pg_get_statisticsobj_worker(Oid statextid, bool columns_only, bool missing_ok)
{
Form_pg_statistic_ext statextrec;
HeapTuple statexttup;
bool dependencies_enabled;
bool mcv_enabled;
int i;
+ List *context;
+ ListCell *lc;
+ List *exprs = NIL;
+ bool has_exprs;
+ int ncolumns;
statexttup = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(statextid));
elog(ERROR, "cache lookup failed for statistics object %u", statextid);
}
- statextrec = (Form_pg_statistic_ext) GETSTRUCT(statexttup);
+ /* has the statistics expressions? */
+ has_exprs = !heap_attisnull(statexttup, Anum_pg_statistic_ext_stxexprs, NULL);
- initStringInfo(&buf);
-
- nsp = get_namespace_name(statextrec->stxnamespace);
- appendStringInfo(&buf, "CREATE STATISTICS %s",
- quote_qualified_identifier(nsp,
- NameStr(statextrec->stxname)));
+ statextrec = (Form_pg_statistic_ext) GETSTRUCT(statexttup);
/*
- * Decode the stxkind column so that we know which stats types to print.
+ * Get the statistics expressions, if any. (NOTE: we do not use the
+ * relcache versions of the expressions, because we want to display
+ * non-const-folded expressions.)
*/
- datum = SysCacheGetAttr(STATEXTOID, statexttup,
- Anum_pg_statistic_ext_stxkind, &isnull);
- Assert(!isnull);
- arr = DatumGetArrayTypeP(datum);
- if (ARR_NDIM(arr) != 1 ||
- ARR_HASNULL(arr) ||
- ARR_ELEMTYPE(arr) != CHAROID)
- elog(ERROR, "stxkind is not a 1-D char array");
- enabled = (char *) ARR_DATA_PTR(arr);
-
- ndistinct_enabled = false;
- dependencies_enabled = false;
- mcv_enabled = false;
-
- for (i = 0; i < ARR_DIMS(arr)[0]; i++)
+ if (has_exprs)
{
- if (enabled[i] == STATS_EXT_NDISTINCT)
- ndistinct_enabled = true;
- if (enabled[i] == STATS_EXT_DEPENDENCIES)
- dependencies_enabled = true;
- if (enabled[i] == STATS_EXT_MCV)
- mcv_enabled = true;
+ Datum exprsDatum;
+ bool isnull;
+ char *exprsString;
+
+ exprsDatum = SysCacheGetAttr(STATEXTOID, statexttup,
+ Anum_pg_statistic_ext_stxexprs, &isnull);
+ Assert(!isnull);
+ exprsString = TextDatumGetCString(exprsDatum);
+ exprs = (List *) stringToNode(exprsString);
+ pfree(exprsString);
}
+ else
+ exprs = NIL;
- /*
- * If any option is disabled, then we'll need to append the types clause
- * to show which options are enabled. We omit the types clause on purpose
- * when all options are enabled, so a pg_dump/pg_restore will create all
- * statistics types on a newer postgres version, if the statistics had all
- * options enabled on the original version.
- */
- if (!ndistinct_enabled || !dependencies_enabled || !mcv_enabled)
+ /* count the number of columns (attributes and expressions) */
+ ncolumns = statextrec->stxkeys.dim1 + list_length(exprs);
+
+ initStringInfo(&buf);
+
+ if (!columns_only)
{
- bool gotone = false;
+ nsp = get_namespace_name(statextrec->stxnamespace);
+ appendStringInfo(&buf, "CREATE STATISTICS %s",
+ quote_qualified_identifier(nsp,
+ NameStr(statextrec->stxname)));
- appendStringInfoString(&buf, " (");
+ /*
+ * Decode the stxkind column so that we know which stats types to
+ * print.
+ */
+ datum = SysCacheGetAttr(STATEXTOID, statexttup,
+ Anum_pg_statistic_ext_stxkind, &isnull);
+ Assert(!isnull);
+ arr = DatumGetArrayTypeP(datum);
+ if (ARR_NDIM(arr) != 1 ||
+ ARR_HASNULL(arr) ||
+ ARR_ELEMTYPE(arr) != CHAROID)
+ elog(ERROR, "stxkind is not a 1-D char array");
+ enabled = (char *) ARR_DATA_PTR(arr);
- if (ndistinct_enabled)
+ ndistinct_enabled = false;
+ dependencies_enabled = false;
+ mcv_enabled = false;
+
+ for (i = 0; i < ARR_DIMS(arr)[0]; i++)
{
- appendStringInfoString(&buf, "ndistinct");
- gotone = true;
+ if (enabled[i] == STATS_EXT_NDISTINCT)
+ ndistinct_enabled = true;
+ else if (enabled[i] == STATS_EXT_DEPENDENCIES)
+ dependencies_enabled = true;
+ else if (enabled[i] == STATS_EXT_MCV)
+ mcv_enabled = true;
+
+ /* ignore STATS_EXT_EXPRESSIONS (it's built automatically) */
}
- if (dependencies_enabled)
+ /*
+ * If any option is disabled, then we'll need to append the types
+ * clause to show which options are enabled. We omit the types clause
+ * on purpose when all options are enabled, so a pg_dump/pg_restore
+ * will create all statistics types on a newer postgres version, if
+ * the statistics had all options enabled on the original version.
+ *
+ * But if the statistics is defined on just a single column, it has to
+ * be an expression statistics. In that case we don't need to specify
+ * kinds.
+ */
+ if ((!ndistinct_enabled || !dependencies_enabled || !mcv_enabled) &&
+ (ncolumns > 1))
{
- appendStringInfo(&buf, "%sdependencies", gotone ? ", " : "");
- gotone = true;
- }
+ bool gotone = false;
- if (mcv_enabled)
- appendStringInfo(&buf, "%smcv", gotone ? ", " : "");
+ appendStringInfoString(&buf, " (");
- appendStringInfoChar(&buf, ')');
- }
+ if (ndistinct_enabled)
+ {
+ appendStringInfoString(&buf, "ndistinct");
+ gotone = true;
+ }
+
+ if (dependencies_enabled)
+ {
+ appendStringInfo(&buf, "%sdependencies", gotone ? ", " : "");
+ gotone = true;
+ }
- appendStringInfoString(&buf, " ON ");
+ if (mcv_enabled)
+ appendStringInfo(&buf, "%smcv", gotone ? ", " : "");
+
+ appendStringInfoChar(&buf, ')');
+ }
+
+ appendStringInfoString(&buf, " ON ");
+ }
+ /* decode simple column references */
for (colno = 0; colno < statextrec->stxkeys.dim1; colno++)
{
AttrNumber attnum = statextrec->stxkeys.values[colno];
appendStringInfoString(&buf, quote_identifier(attname));
}
- appendStringInfo(&buf, " FROM %s",
- generate_relation_name(statextrec->stxrelid, NIL));
+ context = deparse_context_for(get_relation_name(statextrec->stxrelid),
+ statextrec->stxrelid);
+
+ foreach(lc, exprs)
+ {
+ Node *expr = (Node *) lfirst(lc);
+ char *str;
+ int prettyFlags = PRETTYFLAG_INDENT;
+
+ str = deparse_expression_pretty(expr, context, false, false,
+ prettyFlags, 0);
+
+ if (colno > 0)
+ appendStringInfoString(&buf, ", ");
+
+ /* Need parens if it's not a bare function call */
+ if (looks_like_function(expr))
+ appendStringInfoString(&buf, str);
+ else
+ appendStringInfo(&buf, "(%s)", str);
+
+ colno++;
+ }
+
+ if (!columns_only)
+ appendStringInfo(&buf, " FROM %s",
+ generate_relation_name(statextrec->stxrelid, NIL));
ReleaseSysCache(statexttup);
return buf.data;
}
+/*
+ * Generate text array of expressions for statistics object.
+ */
+Datum
+pg_get_statisticsobjdef_expressions(PG_FUNCTION_ARGS)
+{
+ Oid statextid = PG_GETARG_OID(0);
+ Form_pg_statistic_ext statextrec;
+ HeapTuple statexttup;
+ Datum datum;
+ bool isnull;
+ List *context;
+ ListCell *lc;
+ List *exprs = NIL;
+ bool has_exprs;
+ char *tmp;
+ ArrayBuildState *astate = NULL;
+
+ statexttup = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(statextid));
+
+ if (!HeapTupleIsValid(statexttup))
+ elog(ERROR, "cache lookup failed for statistics object %u", statextid);
+
+ /* has the statistics expressions? */
+ has_exprs = !heap_attisnull(statexttup, Anum_pg_statistic_ext_stxexprs, NULL);
+
+ /* no expressions? we're done */
+ if (!has_exprs)
+ {
+ ReleaseSysCache(statexttup);
+ PG_RETURN_NULL();
+ }
+
+ statextrec = (Form_pg_statistic_ext) GETSTRUCT(statexttup);
+
+ /*
+ * Get the statistics expressions, and deparse them into text values.
+ */
+ datum = SysCacheGetAttr(STATEXTOID, statexttup,
+ Anum_pg_statistic_ext_stxexprs, &isnull);
+
+ Assert(!isnull);
+ tmp = TextDatumGetCString(datum);
+ exprs = (List *) stringToNode(tmp);
+ pfree(tmp);
+
+ context = deparse_context_for(get_relation_name(statextrec->stxrelid),
+ statextrec->stxrelid);
+
+ foreach(lc, exprs)
+ {
+ Node *expr = (Node *) lfirst(lc);
+ char *str;
+ int prettyFlags = PRETTYFLAG_INDENT;
+
+ str = deparse_expression_pretty(expr, context, false, false,
+ prettyFlags, 0);
+
+ astate = accumArrayResult(astate,
+ PointerGetDatum(cstring_to_text(str)),
+ false,
+ TEXTOID,
+ CurrentMemoryContext);
+ }
+
+ ReleaseSysCache(statexttup);
+
+ PG_RETURN_DATUM(makeArrayResult(astate, CurrentMemoryContext));
+}
+
/*
* pg_get_partkeydef
*
* If examine_variable is able to deduce anything about the GROUP BY
* expression, treat it as a single variable even if it's really more
* complicated.
+ *
+ * XXX This has the consequence that if there's a statistics on the
+ * expression, we don't split it into individual Vars. This affects
+ * our selection of statistics in estimate_multivariate_ndistinct,
+ * because it's probably better to use more accurate estimate for
+ * each expression and treat them as independent, than to combine
+ * estimates for the extracted variables when we don't know how that
+ * relates to the expressions.
*/
examine_variable(root, groupexpr, 0, &vardata);
if (HeapTupleIsValid(vardata.statsTuple) || vardata.isunique)
List **varinfos, double *ndistinct)
{
ListCell *lc;
- Bitmapset *attnums = NULL;
- int nmatches;
+ int nmatches_vars;
+ int nmatches_exprs;
Oid statOid = InvalidOid;
MVNDistinct *stats;
- Bitmapset *matched = NULL;
+ StatisticExtInfo *matched_info = NULL;
/* bail out immediately if the table has no extended statistics */
if (!rel->statlist)
return false;
- /* Determine the attnums we're looking for */
- foreach(lc, *varinfos)
- {
- GroupVarInfo *varinfo = (GroupVarInfo *) lfirst(lc);
- AttrNumber attnum;
-
- Assert(varinfo->rel == rel);
-
- if (!IsA(varinfo->var, Var))
- continue;
-
- attnum = ((Var *) varinfo->var)->varattno;
-
- if (!AttrNumberIsForUserDefinedAttr(attnum))
- continue;
-
- attnums = bms_add_member(attnums, attnum);
- }
-
/* look for the ndistinct statistics matching the most vars */
- nmatches = 1; /* we require at least two matches */
+ nmatches_vars = 0; /* we require at least two matches */
+ nmatches_exprs = 0;
foreach(lc, rel->statlist)
{
+ ListCell *lc2;
StatisticExtInfo *info = (StatisticExtInfo *) lfirst(lc);
- Bitmapset *shared;
- int nshared;
+ int nshared_vars = 0;
+ int nshared_exprs = 0;
/* skip statistics of other kinds */
if (info->kind != STATS_EXT_NDISTINCT)
continue;
- /* compute attnums shared by the vars and the statistics object */
- shared = bms_intersect(info->keys, attnums);
- nshared = bms_num_members(shared);
+ /*
+ * Determine how many expressions (and variables in non-matched
+ * expressions) match. We'll then use these numbers to pick the
+ * statistics object that best matches the clauses.
+ */
+ foreach(lc2, *varinfos)
+ {
+ ListCell *lc3;
+ GroupVarInfo *varinfo = (GroupVarInfo *) lfirst(lc2);
+ AttrNumber attnum;
+
+ Assert(varinfo->rel == rel);
+
+ /* simple Var, search in statistics keys directly */
+ if (IsA(varinfo->var, Var))
+ {
+ attnum = ((Var *) varinfo->var)->varattno;
+
+ /*
+ * Ignore system attributes - we don't support statistics on
+ * them, so can't match them (and it'd fail as the values are
+ * negative).
+ */
+ if (!AttrNumberIsForUserDefinedAttr(attnum))
+ continue;
+
+ if (bms_is_member(attnum, info->keys))
+ nshared_vars++;
+
+ continue;
+ }
+
+ /* expression - see if it's in the statistics */
+ foreach(lc3, info->exprs)
+ {
+ Node *expr = (Node *) lfirst(lc3);
+
+ if (equal(varinfo->var, expr))
+ {
+ nshared_exprs++;
+ break;
+ }
+ }
+ }
+
+ if (nshared_vars + nshared_exprs < 2)
+ continue;
/*
* Does this statistics object match more columns than the currently
* XXX This should break ties using name of the object, or something
* like that, to make the outcome stable.
*/
- if (nshared > nmatches)
+ if ((nshared_exprs > nmatches_exprs) ||
+ (((nshared_exprs == nmatches_exprs)) && (nshared_vars > nmatches_vars)))
{
statOid = info->statOid;
- nmatches = nshared;
- matched = shared;
+ nmatches_vars = nshared_vars;
+ nmatches_exprs = nshared_exprs;
+ matched_info = info;
}
}
/* No match? */
if (statOid == InvalidOid)
return false;
- Assert(nmatches > 1 && matched != NULL);
+
+ Assert(nmatches_vars + nmatches_exprs > 1);
stats = statext_ndistinct_load(statOid);
int i;
List *newlist = NIL;
MVNDistinctItem *item = NULL;
+ ListCell *lc2;
+ Bitmapset *matched = NULL;
+ AttrNumber attnum_offset;
+
+ /*
+ * How much we need to offset the attnums? If there are no
+ * expressions, no offset is needed. Otherwise offset enough to move
+ * the lowest one (which is equal to number of expressions) to 1.
+ */
+ if (matched_info->exprs)
+ attnum_offset = (list_length(matched_info->exprs) + 1);
+ else
+ attnum_offset = 0;
+
+ /* see what actually matched */
+ foreach(lc2, *varinfos)
+ {
+ ListCell *lc3;
+ int idx;
+ bool found = false;
+
+ GroupVarInfo *varinfo = (GroupVarInfo *) lfirst(lc2);
+
+ /*
+ * Process a simple Var expression, by matching it to keys
+ * directly. If there's a matchine expression, we'll try
+ * matching it later.
+ */
+ if (IsA(varinfo->var, Var))
+ {
+ AttrNumber attnum = ((Var *) varinfo->var)->varattno;
+
+ /*
+ * Ignore expressions on system attributes. Can't rely on
+ * the bms check for negative values.
+ */
+ if (!AttrNumberIsForUserDefinedAttr(attnum))
+ continue;
+
+ /* Is the variable covered by the statistics? */
+ if (!bms_is_member(attnum, matched_info->keys))
+ continue;
+
+ attnum = attnum + attnum_offset;
+
+ /* ensure sufficient offset */
+ Assert(AttrNumberIsForUserDefinedAttr(attnum));
+
+ matched = bms_add_member(matched, attnum);
+
+ found = true;
+ }
+
+ /*
+ * XXX Maybe we should allow searching the expressions even if we
+ * found an attribute matching the expression? That would handle
+ * trivial expressions like "(a)" but it seems fairly useless.
+ */
+ if (found)
+ continue;
+
+ /* expression - see if it's in the statistics */
+ idx = 0;
+ foreach(lc3, matched_info->exprs)
+ {
+ Node *expr = (Node *) lfirst(lc3);
+
+ if (equal(varinfo->var, expr))
+ {
+ AttrNumber attnum = -(idx + 1);
+
+ attnum = attnum + attnum_offset;
+
+ /* ensure sufficient offset */
+ Assert(AttrNumberIsForUserDefinedAttr(attnum));
+
+ matched = bms_add_member(matched, attnum);
+
+ /* there should be just one matching expression */
+ break;
+ }
+
+ idx++;
+ }
+ }
/* Find the specific item that exactly matches the combination */
for (i = 0; i < stats->nitems; i++)
{
+ int j;
MVNDistinctItem *tmpitem = &stats->items[i];
- if (bms_subset_compare(tmpitem->attrs, matched) == BMS_EQUAL)
+ if (tmpitem->nattributes != bms_num_members(matched))
+ continue;
+
+ /* assume it's the right item */
+ item = tmpitem;
+
+ /* check that all item attributes/expressions fit the match */
+ for (j = 0; j < tmpitem->nattributes; j++)
{
- item = tmpitem;
- break;
+ AttrNumber attnum = tmpitem->attributes[j];
+
+ /*
+ * Thanks to how we constructed the matched bitmap above, we
+ * can just offset all attnums the same way.
+ */
+ attnum = attnum + attnum_offset;
+
+ if (!bms_is_member(attnum, matched))
+ {
+ /* nah, it's not this item */
+ item = NULL;
+ break;
+ }
}
+
+ /*
+ * If the item has all the matched attributes, we know it's the
+ * right one - there can't be a better one. matching more.
+ */
+ if (item)
+ break;
}
- /* make sure we found an item */
+ /*
+ * Make sure we found an item. There has to be one, because ndistinct
+ * statistics includes all combinations of attributes.
+ */
if (!item)
elog(ERROR, "corrupt MVNDistinct entry");
foreach(lc, *varinfos)
{
GroupVarInfo *varinfo = (GroupVarInfo *) lfirst(lc);
- AttrNumber attnum;
+ ListCell *lc3;
+ bool found = false;
- if (!IsA(varinfo->var, Var))
+ /*
+ * Let's look at plain variables first, because it's the most
+ * common case and the check is quite cheap. We can simply get the
+ * attnum and check (with an offset) matched bitmap.
+ */
+ if (IsA(varinfo->var, Var))
{
- newlist = lappend(newlist, varinfo);
+ AttrNumber attnum = ((Var *) varinfo->var)->varattno;
+
+ /*
+ * If it's a system attribute, we're done. We don't support
+ * extended statistics on system attributes, so it's clearly
+ * not matched. Just keep the expression and continue.
+ */
+ if (!AttrNumberIsForUserDefinedAttr(attnum))
+ {
+ newlist = lappend(newlist, varinfo);
+ continue;
+ }
+
+ /* apply the same offset as above */
+ attnum += attnum_offset;
+
+ /* if it's not matched, keep the varinfo */
+ if (!bms_is_member(attnum, matched))
+ newlist = lappend(newlist, varinfo);
+
+ /* The rest of the loop deals with complex expressions. */
continue;
}
- attnum = ((Var *) varinfo->var)->varattno;
+ /*
+ * Process complex expressions, not just simple Vars.
+ *
+ * First, we search for an exact match of an expression. If we
+ * find one, we can just discard the whole GroupExprInfo, with all
+ * the variables we extracted from it.
+ *
+ * Otherwise we inspect the individual vars, and try matching it
+ * to variables in the item.
+ */
+ foreach(lc3, matched_info->exprs)
+ {
+ Node *expr = (Node *) lfirst(lc3);
+
+ if (equal(varinfo->var, expr))
+ {
+ found = true;
+ break;
+ }
+ }
- if (AttrNumberIsForUserDefinedAttr(attnum) &&
- bms_is_member(attnum, matched))
+ /* found exact match, skip */
+ if (found)
continue;
newlist = lappend(newlist, varinfo);
*join_is_reversed = false;
}
+/* statext_expressions_load copies the tuple, so just pfree it. */
+static void
+ReleaseDummy(HeapTuple tuple)
+{
+ pfree(tuple);
+}
+
/*
* examine_variable
* Try to look up statistical data about an expression.
* operator we are estimating for. FIXME later.
*/
ListCell *ilist;
+ ListCell *slist;
foreach(ilist, onerel->indexlist)
{
if (vardata->statsTuple)
break;
}
+
+ /*
+ * Search extended statistics for one with a matching expression.
+ * There might be multiple ones, so just grab the first one. In the
+ * future, we might consider the statistics target (and pick the most
+ * accurate statistics) and maybe some other parameters.
+ */
+ foreach(slist, onerel->statlist)
+ {
+ StatisticExtInfo *info = (StatisticExtInfo *) lfirst(slist);
+ ListCell *expr_item;
+ int pos;
+
+ /*
+ * Stop once we've found statistics for the expression (either
+ * from extended stats, or for an index in the preceding loop).
+ */
+ if (vardata->statsTuple)
+ break;
+
+ /* skip stats without per-expression stats */
+ if (info->kind != STATS_EXT_EXPRESSIONS)
+ continue;
+
+ pos = 0;
+ foreach(expr_item, info->exprs)
+ {
+ Node *expr = (Node *) lfirst(expr_item);
+
+ Assert(expr);
+
+ /* strip RelabelType before comparing it */
+ if (expr && IsA(expr, RelabelType))
+ expr = (Node *) ((RelabelType *) expr)->arg;
+
+ /* found a match, see if we can extract pg_statistic row */
+ if (equal(node, expr))
+ {
+ HeapTuple t = statext_expressions_load(info->statOid, pos);
+
+ /* Get index's table for permission check */
+ RangeTblEntry *rte;
+ Oid userid;
+
+ vardata->statsTuple = t;
+
+ /*
+ * XXX Not sure if we should cache the tuple somewhere.
+ * Now we just create a new copy every time.
+ */
+ vardata->freefunc = ReleaseDummy;
+
+ rte = planner_rt_fetch(onerel->relid, root);
+ Assert(rte->rtekind == RTE_RELATION);
+
+ /*
+ * Use checkAsUser if it's set, in case we're accessing
+ * the table via a view.
+ */
+ userid = rte->checkAsUser ? rte->checkAsUser : GetUserId();
+
+ /*
+ * For simplicity, we insist on the whole table being
+ * selectable, rather than trying to identify which
+ * column(s) the statistics depends on. Also require all
+ * rows to be selectable --- there must be no
+ * securityQuals from security barrier views or RLS
+ * policies.
+ */
+ vardata->acl_ok =
+ rte->securityQuals == NIL &&
+ (pg_class_aclcheck(rte->relid, userid,
+ ACL_SELECT) == ACLCHECK_OK);
+
+ /*
+ * If the user doesn't have permissions to access an
+ * inheritance child relation, check the permissions of
+ * the table actually mentioned in the query, since most
+ * likely the user does have that permission. Note that
+ * whole-table select privilege on the parent doesn't
+ * quite guarantee that the user could read all columns of
+ * the child. But in practice it's unlikely that any
+ * interesting security violation could result from
+ * allowing access to the expression stats, so we allow it
+ * anyway. See similar code in examine_simple_variable()
+ * for additional comments.
+ */
+ if (!vardata->acl_ok &&
+ root->append_rel_array != NULL)
+ {
+ AppendRelInfo *appinfo;
+ Index varno = onerel->relid;
+
+ appinfo = root->append_rel_array[varno];
+ while (appinfo &&
+ planner_rt_fetch(appinfo->parent_relid,
+ root)->rtekind == RTE_RELATION)
+ {
+ varno = appinfo->parent_relid;
+ appinfo = root->append_rel_array[varno];
+ }
+ if (varno != onerel->relid)
+ {
+ /* Repeat access check on this rel */
+ rte = planner_rt_fetch(varno, root);
+ Assert(rte->rtekind == RTE_RELATION);
+
+ userid = rte->checkAsUser ? rte->checkAsUser : GetUserId();
+
+ vardata->acl_ok =
+ rte->securityQuals == NIL &&
+ (pg_class_aclcheck(rte->relid,
+ userid,
+ ACL_SELECT) == ACLCHECK_OK);
+ }
+ }
+
+ break;
+ }
+
+ pos++;
+ }
+ }
}
}
unlike => { exclude_dump_test_schema => 1, },
},
+ 'CREATE STATISTICS extended_stats_expression' => {
+ create_order => 99,
+ create_sql => 'CREATE STATISTICS dump_test.test_ext_stats_expr
+ ON (2 * col1) FROM dump_test.test_fifth_table',
+ regexp => qr/^
+ \QCREATE STATISTICS dump_test.test_ext_stats_expr ON ((2 * col1)) FROM dump_test.test_fifth_table;\E
+ /xms,
+ like =>
+ { %full_runs, %dump_test_schema_runs, section_post_data => 1, },
+ unlike => { exclude_dump_test_schema => 1, },
+ },
+
'CREATE SEQUENCE test_table_col1_seq' => {
regexp => qr/^
\QCREATE SEQUENCE dump_test.test_table_col1_seq\E
}
/* print any extended statistics */
- if (pset.sversion >= 100000)
+ if (pset.sversion >= 140000)
+ {
+ printfPQExpBuffer(&buf,
+ "SELECT oid, "
+ "stxrelid::pg_catalog.regclass, "
+ "stxnamespace::pg_catalog.regnamespace AS nsp, "
+ "stxname,\n"
+ "pg_get_statisticsobjdef_columns(oid) AS columns,\n"
+ " 'd' = any(stxkind) AS ndist_enabled,\n"
+ " 'f' = any(stxkind) AS deps_enabled,\n"
+ " 'm' = any(stxkind) AS mcv_enabled,\n"
+ "stxstattarget\n"
+ "FROM pg_catalog.pg_statistic_ext stat\n"
+ "WHERE stxrelid = '%s'\n"
+ "ORDER BY 1;",
+ oid);
+
+ result = PSQLexec(buf.data);
+ if (!result)
+ goto error_return;
+ else
+ tuples = PQntuples(result);
+
+ if (tuples > 0)
+ {
+ printTableAddFooter(&cont, _("Statistics objects:"));
+
+ for (i = 0; i < tuples; i++)
+ {
+ bool gotone = false;
+ bool has_ndistinct;
+ bool has_dependencies;
+ bool has_mcv;
+ bool has_all;
+ bool has_some;
+
+ has_ndistinct = (strcmp(PQgetvalue(result, i, 5), "t") == 0);
+ has_dependencies = (strcmp(PQgetvalue(result, i, 6), "t") == 0);
+ has_mcv = (strcmp(PQgetvalue(result, i, 7), "t") == 0);
+
+ printfPQExpBuffer(&buf, " ");
+
+ /* statistics object name (qualified with namespace) */
+ appendPQExpBuffer(&buf, "\"%s\".\"%s\"",
+ PQgetvalue(result, i, 2),
+ PQgetvalue(result, i, 3));
+
+ /*
+ * When printing kinds we ignore expression statistics,
+ * which is used only internally and can't be specified by
+ * user. We don't print the kinds when either none are
+ * specified (in which case it has to be statistics on a
+ * single expr) or when all are specified (in which case
+ * we assume it's expanded by CREATE STATISTICS).
+ */
+ has_all = (has_ndistinct && has_dependencies && has_mcv);
+ has_some = (has_ndistinct || has_dependencies || has_mcv);
+
+ if (has_some && !has_all)
+ {
+ appendPQExpBuffer(&buf, " (");
+
+ /* options */
+ if (has_ndistinct)
+ {
+ appendPQExpBufferStr(&buf, "ndistinct");
+ gotone = true;
+ }
+
+ if (has_dependencies)
+ {
+ appendPQExpBuffer(&buf, "%sdependencies", gotone ? ", " : "");
+ gotone = true;
+ }
+
+ if (has_mcv)
+ {
+ appendPQExpBuffer(&buf, "%smcv", gotone ? ", " : "");
+ }
+
+ appendPQExpBuffer(&buf, ")");
+ }
+
+ appendPQExpBuffer(&buf, " ON %s FROM %s",
+ PQgetvalue(result, i, 4),
+ PQgetvalue(result, i, 1));
+
+ /* Show the stats target if it's not default */
+ if (strcmp(PQgetvalue(result, i, 8), "-1") != 0)
+ appendPQExpBuffer(&buf, "; STATISTICS %s",
+ PQgetvalue(result, i, 8));
+
+ printTableAddFooter(&cont, buf.data);
+ }
+ }
+ PQclear(result);
+ }
+ else if (pset.sversion >= 100000)
{
printfPQExpBuffer(&buf,
"SELECT oid, "
printfPQExpBuffer(&buf,
"SELECT \n"
"es.stxnamespace::pg_catalog.regnamespace::text AS \"%s\", \n"
- "es.stxname AS \"%s\", \n"
- "pg_catalog.format('%%s FROM %%s', \n"
- " (SELECT pg_catalog.string_agg(pg_catalog.quote_ident(a.attname),', ') \n"
- " FROM pg_catalog.unnest(es.stxkeys) s(attnum) \n"
- " JOIN pg_catalog.pg_attribute a \n"
- " ON (es.stxrelid = a.attrelid \n"
- " AND a.attnum = s.attnum \n"
- " AND NOT a.attisdropped)), \n"
- "es.stxrelid::regclass) AS \"%s\"",
+ "es.stxname AS \"%s\", \n",
gettext_noop("Schema"),
- gettext_noop("Name"),
- gettext_noop("Definition"));
+ gettext_noop("Name"));
+
+ if (pset.sversion >= 140000)
+ appendPQExpBuffer(&buf,
+ "pg_catalog.format('%%s FROM %%s', \n"
+ " pg_get_statisticsobjdef_columns(es.oid), \n"
+ " es.stxrelid::regclass) AS \"%s\"",
+ gettext_noop("Definition"));
+ else
+ appendPQExpBuffer(&buf,
+ "pg_catalog.format('%%s FROM %%s', \n"
+ " (SELECT pg_catalog.string_agg(pg_catalog.quote_ident(a.attname),', ') \n"
+ " FROM pg_catalog.unnest(es.stxkeys) s(attnum) \n"
+ " JOIN pg_catalog.pg_attribute a \n"
+ " ON (es.stxrelid = a.attrelid \n"
+ " AND a.attnum = s.attnum \n"
+ " AND NOT a.attisdropped)), \n"
+ "es.stxrelid::regclass) AS \"%s\"",
+ gettext_noop("Definition"));
appendPQExpBuffer(&buf,
",\nCASE WHEN 'd' = any(es.stxkind) THEN 'defined' \n"
*/
/* yyyymmddN */
-#define CATALOG_VERSION_NO 202103265
+#define CATALOG_VERSION_NO 202103266
#endif
proname => 'pg_get_statisticsobjdef', provolatile => 's',
prorettype => 'text', proargtypes => 'oid',
prosrc => 'pg_get_statisticsobjdef' },
+{ oid => '8887', descr => 'extended statistics columns',
+ proname => 'pg_get_statisticsobjdef_columns', provolatile => 's',
+ prorettype => 'text', proargtypes => 'oid',
+ prosrc => 'pg_get_statisticsobjdef_columns' },
+{ oid => '8886', descr => 'extended statistics expressions',
+ proname => 'pg_get_statisticsobjdef_expressions', provolatile => 's',
+ prorettype => '_text', proargtypes => 'oid',
+ prosrc => 'pg_get_statisticsobjdef_expressions' },
{ oid => '3352', descr => 'partition key description',
proname => 'pg_get_partkeydef', provolatile => 's', prorettype => 'text',
proargtypes => 'oid', prosrc => 'pg_get_partkeydef' },
#ifdef CATALOG_VARLEN
char stxkind[1] BKI_FORCE_NOT_NULL; /* statistics kinds requested
* to build */
+ pg_node_tree stxexprs; /* A list of expression trees for stats
+ * attributes that are not simple column
+ * references. */
#endif
} FormData_pg_statistic_ext;
#define STATS_EXT_NDISTINCT 'd'
#define STATS_EXT_DEPENDENCIES 'f'
#define STATS_EXT_MCV 'm'
+#define STATS_EXT_EXPRESSIONS 'e'
#endif /* EXPOSE_TO_CLIENT_CODE */
pg_ndistinct stxdndistinct; /* ndistinct coefficients (serialized) */
pg_dependencies stxddependencies; /* dependencies (serialized) */
pg_mcv_list stxdmcv; /* MCV (serialized) */
+ pg_statistic stxdexpr[1]; /* stats for expressions */
#endif
extern ObjectAddress CreateStatistics(CreateStatsStmt *stmt);
extern ObjectAddress AlterStatistics(AlterStatsStmt *stmt);
extern void RemoveStatisticsById(Oid statsOid);
-extern void UpdateStatisticsForTypeChange(Oid statsOid,
- Oid relationOid, int attnum,
- Oid oldColumnType, Oid newColumnType);
+extern Oid StatisticsGetRelation(Oid statId, bool missing_ok);
/* commands/aggregatecmds.c */
extern ObjectAddress DefineAggregate(ParseState *pstate, List *name, List *args, bool oldstyle,
T_TypeName,
T_ColumnDef,
T_IndexElem,
+ T_StatsElem,
T_Constraint,
T_DefElem,
T_RangeTblEntry,
AT_AddIdentity, /* ADD IDENTITY */
AT_SetIdentity, /* SET identity column options */
AT_DropIdentity, /* DROP IDENTITY */
- AT_AlterCollationRefreshVersion /* ALTER COLLATION ... REFRESH VERSION */
+ AT_AlterCollationRefreshVersion, /* ALTER COLLATION ... REFRESH VERSION */
+ AT_ReAddStatistics /* internal to commands/tablecmds.c */
} AlterTableType;
typedef struct ReplicaIdentityStmt
List *relations; /* rels to build stats on (list of RangeVar) */
char *stxcomment; /* comment to apply to stats, or NULL */
bool if_not_exists; /* do nothing if stats name already exists */
+ bool transformed; /* true when transformStatsStmt is finished */
} CreateStatsStmt;
+/*
+ * StatsElem - statistics parameters (used in CREATE STATISTICS)
+ *
+ * For a plain attribute, 'name' is the name of the referenced table column
+ * and 'expr' is NULL. For an expression, 'name' is NULL and 'expr' is the
+ * expression tree.
+ */
+typedef struct StatsElem
+{
+ NodeTag type;
+ char *name; /* name of attribute to index, or NULL */
+ Node *expr; /* expression to index, or NULL */
+} StatsElem;
+
+
/* ----------------------
* Alter Statistics Statement
* ----------------------
RelOptInfo *rel; /* back-link to statistic's table */
char kind; /* statistics kind of this entry */
Bitmapset *keys; /* attnums of the columns covered */
+ List *exprs; /* expressions */
} StatisticExtInfo;
/*
EXPR_KIND_FUNCTION_DEFAULT, /* default parameter value for function */
EXPR_KIND_INDEX_EXPRESSION, /* index expression */
EXPR_KIND_INDEX_PREDICATE, /* index predicate */
+ EXPR_KIND_STATS_EXPRESSION, /* extended statistics expression */
EXPR_KIND_ALTER_COL_TRANSFORM, /* transform expr in ALTER COLUMN TYPE */
EXPR_KIND_EXECUTE_PARAMETER, /* parameter value in EXECUTE */
EXPR_KIND_TRIGGER_WHEN, /* WHEN condition in CREATE TRIGGER */
List **afterStmts);
extern IndexStmt *transformIndexStmt(Oid relid, IndexStmt *stmt,
const char *queryString);
+extern CreateStatsStmt *transformStatsStmt(Oid relid, CreateStatsStmt *stmt,
+ const char *queryString);
extern void transformRuleStmt(RuleStmt *stmt, const char *queryString,
List **actions, Node **whereClause);
extern List *transformCreateSchemaStmt(CreateSchemaStmt *stmt);
int count;
} SortItem;
-extern MVNDistinct *statext_ndistinct_build(double totalrows,
- int numrows, HeapTuple *rows,
- Bitmapset *attrs, VacAttrStats **stats);
+/* a unified representation of the data the statistics is built on */
+typedef struct StatsBuildData
+{
+ int numrows;
+ int nattnums;
+ AttrNumber *attnums;
+ VacAttrStats **stats;
+ Datum **values;
+ bool **nulls;
+} StatsBuildData;
+
+
+extern MVNDistinct *statext_ndistinct_build(double totalrows, StatsBuildData *data);
extern bytea *statext_ndistinct_serialize(MVNDistinct *ndistinct);
extern MVNDistinct *statext_ndistinct_deserialize(bytea *data);
-extern MVDependencies *statext_dependencies_build(int numrows, HeapTuple *rows,
- Bitmapset *attrs, VacAttrStats **stats);
+extern MVDependencies *statext_dependencies_build(StatsBuildData *data);
extern bytea *statext_dependencies_serialize(MVDependencies *dependencies);
extern MVDependencies *statext_dependencies_deserialize(bytea *data);
-extern MCVList *statext_mcv_build(int numrows, HeapTuple *rows,
- Bitmapset *attrs, VacAttrStats **stats,
+extern MCVList *statext_mcv_build(StatsBuildData *data,
double totalrows, int stattarget);
extern bytea *statext_mcv_serialize(MCVList *mcv, VacAttrStats **stats);
extern MCVList *statext_mcv_deserialize(bytea *data);
extern int compare_scalars_simple(const void *a, const void *b, void *arg);
extern int compare_datums_simple(Datum a, Datum b, SortSupport ssup);
-extern AttrNumber *build_attnums_array(Bitmapset *attrs, int *numattrs);
+extern AttrNumber *build_attnums_array(Bitmapset *attrs, int nexprs, int *numattrs);
-extern SortItem *build_sorted_items(int numrows, int *nitems, HeapTuple *rows,
- TupleDesc tdesc, MultiSortSupport mss,
+extern SortItem *build_sorted_items(StatsBuildData *data, int *nitems,
+ MultiSortSupport mss,
int numattrs, AttrNumber *attnums);
-extern bool examine_clause_args(List *args, Var **varp,
- Const **cstp, bool *varonleftp);
+extern bool examine_opclause_args(List *args, Node **exprp,
+ Const **cstp, bool *expronleftp);
extern Selectivity mcv_combine_selectivities(Selectivity simple_sel,
Selectivity mcv_sel,
typedef struct MVNDistinctItem
{
double ndistinct; /* ndistinct value for this combination */
- Bitmapset *attrs; /* attr numbers of items */
+ int nattributes; /* number of attributes */
+ AttrNumber *attributes; /* attribute numbers */
} MVNDistinctItem;
/* A MVNDistinct object, comprising all possible combinations of columns */
extern bool has_stats_of_kind(List *stats, char requiredkind);
extern StatisticExtInfo *choose_best_statistics(List *stats, char requiredkind,
Bitmapset **clause_attnums,
+ List **clause_exprs,
int nclauses);
+extern HeapTuple statext_expressions_load(Oid stxoid, int idx);
#endif /* STATISTICS_H */
extern char *generate_opclass_name(Oid opclass);
extern char *get_range_partbound_string(List *bound_datums);
+extern char *pg_get_statisticsobjdef_string(Oid statextid);
+
#endif /* RULEUTILS_H */
CREATE INDEX ctlt1_b_key ON ctlt1 (b);
CREATE INDEX ctlt1_fnidx ON ctlt1 ((a || b));
CREATE STATISTICS ctlt1_a_b_stat ON a,b FROM ctlt1;
+CREATE STATISTICS ctlt1_expr_stat ON (a || b) FROM ctlt1;
COMMENT ON STATISTICS ctlt1_a_b_stat IS 'ab stats';
+COMMENT ON STATISTICS ctlt1_expr_stat IS 'ab expr stats';
COMMENT ON COLUMN ctlt1.a IS 'A';
COMMENT ON COLUMN ctlt1.b IS 'B';
COMMENT ON CONSTRAINT ctlt1_a_check ON ctlt1 IS 't1_a_check';
Check constraints:
"ctlt1_a_check" CHECK (length(a) > 2)
Statistics objects:
- "public"."ctlt_all_a_b_stat" (ndistinct, dependencies, mcv) ON a, b FROM ctlt_all
+ "public"."ctlt_all_a_b_stat" ON a, b FROM ctlt_all
+ "public"."ctlt_all_expr_stat" ON ((a || b)) FROM ctlt_all
SELECT c.relname, objsubid, description FROM pg_description, pg_index i, pg_class c WHERE classoid = 'pg_class'::regclass AND objoid = i.indexrelid AND c.oid = i.indexrelid AND i.indrelid = 'ctlt_all'::regclass ORDER BY c.relname, objsubid;
relname | objsubid | description
(2 rows)
SELECT s.stxname, objsubid, description FROM pg_description, pg_statistic_ext s WHERE classoid = 'pg_statistic_ext'::regclass AND objoid = s.oid AND s.stxrelid = 'ctlt_all'::regclass ORDER BY s.stxname, objsubid;
- stxname | objsubid | description
--------------------+----------+-------------
- ctlt_all_a_b_stat | 0 | ab stats
-(1 row)
+ stxname | objsubid | description
+--------------------+----------+---------------
+ ctlt_all_a_b_stat | 0 | ab stats
+ ctlt_all_expr_stat | 0 | ab expr stats
+(2 rows)
CREATE TABLE inh_error1 () INHERITS (ctlt1, ctlt4);
NOTICE: merging multiple inherited definitions of column "a"
Check constraints:
"ctlt1_a_check" CHECK (length(a) > 2)
Statistics objects:
- "public"."pg_attrdef_a_b_stat" (ndistinct, dependencies, mcv) ON a, b FROM public.pg_attrdef
+ "public"."pg_attrdef_a_b_stat" ON a, b FROM public.pg_attrdef
+ "public"."pg_attrdef_expr_stat" ON ((a || b)) FROM public.pg_attrdef
DROP TABLE public.pg_attrdef;
-- Check that LIKE isn't confused when new table masks the old, either
Check constraints:
"ctlt1_a_check" CHECK (length(a) > 2)
Statistics objects:
- "ctl_schema"."ctlt1_a_b_stat" (ndistinct, dependencies, mcv) ON a, b FROM ctlt1
+ "ctl_schema"."ctlt1_a_b_stat" ON a, b FROM ctlt1
+ "ctl_schema"."ctlt1_expr_stat" ON ((a || b)) FROM ctlt1
ROLLBACK;
DROP TABLE ctlt1, ctlt2, ctlt3, ctlt4, ctlt12_storage, ctlt12_comments, ctlt1_inh, ctlt13_inh, ctlt13_like, ctlt_all, ctla, ctlb CASCADE;
NOTICE: checking pg_aggregate {aggsortop} => pg_operator {oid}
NOTICE: checking pg_aggregate {aggtranstype} => pg_type {oid}
NOTICE: checking pg_aggregate {aggmtranstype} => pg_type {oid}
-NOTICE: checking pg_statistic_ext {stxrelid} => pg_class {oid}
-NOTICE: checking pg_statistic_ext {stxnamespace} => pg_namespace {oid}
-NOTICE: checking pg_statistic_ext {stxowner} => pg_authid {oid}
-NOTICE: checking pg_statistic_ext {stxrelid,stxkeys} => pg_attribute {attrelid,attnum}
-NOTICE: checking pg_statistic_ext_data {stxoid} => pg_statistic_ext {oid}
NOTICE: checking pg_statistic {starelid} => pg_class {oid}
NOTICE: checking pg_statistic {staop1} => pg_operator {oid}
NOTICE: checking pg_statistic {staop2} => pg_operator {oid}
NOTICE: checking pg_statistic {stacoll4} => pg_collation {oid}
NOTICE: checking pg_statistic {stacoll5} => pg_collation {oid}
NOTICE: checking pg_statistic {starelid,staattnum} => pg_attribute {attrelid,attnum}
+NOTICE: checking pg_statistic_ext {stxrelid} => pg_class {oid}
+NOTICE: checking pg_statistic_ext {stxnamespace} => pg_namespace {oid}
+NOTICE: checking pg_statistic_ext {stxowner} => pg_authid {oid}
+NOTICE: checking pg_statistic_ext {stxrelid,stxkeys} => pg_attribute {attrelid,attnum}
+NOTICE: checking pg_statistic_ext_data {stxoid} => pg_statistic_ext {oid}
NOTICE: checking pg_rewrite {ev_class} => pg_class {oid}
NOTICE: checking pg_trigger {tgrelid} => pg_class {oid}
NOTICE: checking pg_trigger {tgparentid} => pg_trigger {oid}
( SELECT array_agg(a.attname ORDER BY a.attnum) AS array_agg
FROM (unnest(s.stxkeys) k(k)
JOIN pg_attribute a ON (((a.attrelid = s.stxrelid) AND (a.attnum = k.k))))) AS attnames,
+ pg_get_statisticsobjdef_expressions(s.oid) AS exprs,
s.stxkind AS kinds,
sd.stxdndistinct AS n_distinct,
sd.stxddependencies AS dependencies,
FROM (unnest(s.stxkeys) k(k)
JOIN pg_attribute a ON (((a.attrelid = s.stxrelid) AND (a.attnum = k.k))))
WHERE (NOT has_column_privilege(c.oid, a.attnum, 'select'::text))))) AND ((c.relrowsecurity = false) OR (NOT row_security_active(c.oid))));
+pg_stats_ext_exprs| SELECT cn.nspname AS schemaname,
+ c.relname AS tablename,
+ sn.nspname AS statistics_schemaname,
+ s.stxname AS statistics_name,
+ pg_get_userbyid(s.stxowner) AS statistics_owner,
+ stat.expr,
+ (stat.a).stanullfrac AS null_frac,
+ (stat.a).stawidth AS avg_width,
+ (stat.a).stadistinct AS n_distinct,
+ CASE
+ WHEN ((stat.a).stakind1 = 1) THEN (stat.a).stavalues1
+ WHEN ((stat.a).stakind2 = 1) THEN (stat.a).stavalues2
+ WHEN ((stat.a).stakind3 = 1) THEN (stat.a).stavalues3
+ WHEN ((stat.a).stakind4 = 1) THEN (stat.a).stavalues4
+ WHEN ((stat.a).stakind5 = 1) THEN (stat.a).stavalues5
+ ELSE NULL::anyarray
+ END AS most_common_vals,
+ CASE
+ WHEN ((stat.a).stakind1 = 1) THEN (stat.a).stanumbers1
+ WHEN ((stat.a).stakind2 = 1) THEN (stat.a).stanumbers2
+ WHEN ((stat.a).stakind3 = 1) THEN (stat.a).stanumbers3
+ WHEN ((stat.a).stakind4 = 1) THEN (stat.a).stanumbers4
+ WHEN ((stat.a).stakind5 = 1) THEN (stat.a).stanumbers5
+ ELSE NULL::real[]
+ END AS most_common_freqs,
+ CASE
+ WHEN ((stat.a).stakind1 = 2) THEN (stat.a).stavalues1
+ WHEN ((stat.a).stakind2 = 2) THEN (stat.a).stavalues2
+ WHEN ((stat.a).stakind3 = 2) THEN (stat.a).stavalues3
+ WHEN ((stat.a).stakind4 = 2) THEN (stat.a).stavalues4
+ WHEN ((stat.a).stakind5 = 2) THEN (stat.a).stavalues5
+ ELSE NULL::anyarray
+ END AS histogram_bounds,
+ CASE
+ WHEN ((stat.a).stakind1 = 3) THEN (stat.a).stanumbers1[1]
+ WHEN ((stat.a).stakind2 = 3) THEN (stat.a).stanumbers2[1]
+ WHEN ((stat.a).stakind3 = 3) THEN (stat.a).stanumbers3[1]
+ WHEN ((stat.a).stakind4 = 3) THEN (stat.a).stanumbers4[1]
+ WHEN ((stat.a).stakind5 = 3) THEN (stat.a).stanumbers5[1]
+ ELSE NULL::real
+ END AS correlation,
+ CASE
+ WHEN ((stat.a).stakind1 = 4) THEN (stat.a).stavalues1
+ WHEN ((stat.a).stakind2 = 4) THEN (stat.a).stavalues2
+ WHEN ((stat.a).stakind3 = 4) THEN (stat.a).stavalues3
+ WHEN ((stat.a).stakind4 = 4) THEN (stat.a).stavalues4
+ WHEN ((stat.a).stakind5 = 4) THEN (stat.a).stavalues5
+ ELSE NULL::anyarray
+ END AS most_common_elems,
+ CASE
+ WHEN ((stat.a).stakind1 = 4) THEN (stat.a).stanumbers1
+ WHEN ((stat.a).stakind2 = 4) THEN (stat.a).stanumbers2
+ WHEN ((stat.a).stakind3 = 4) THEN (stat.a).stanumbers3
+ WHEN ((stat.a).stakind4 = 4) THEN (stat.a).stanumbers4
+ WHEN ((stat.a).stakind5 = 4) THEN (stat.a).stanumbers5
+ ELSE NULL::real[]
+ END AS most_common_elem_freqs,
+ CASE
+ WHEN ((stat.a).stakind1 = 5) THEN (stat.a).stanumbers1
+ WHEN ((stat.a).stakind2 = 5) THEN (stat.a).stanumbers2
+ WHEN ((stat.a).stakind3 = 5) THEN (stat.a).stanumbers3
+ WHEN ((stat.a).stakind4 = 5) THEN (stat.a).stanumbers4
+ WHEN ((stat.a).stakind5 = 5) THEN (stat.a).stanumbers5
+ ELSE NULL::real[]
+ END AS elem_count_histogram
+ FROM (((((pg_statistic_ext s
+ JOIN pg_class c ON ((c.oid = s.stxrelid)))
+ LEFT JOIN pg_statistic_ext_data sd ON ((s.oid = sd.stxoid)))
+ LEFT JOIN pg_namespace cn ON ((cn.oid = c.relnamespace)))
+ LEFT JOIN pg_namespace sn ON ((sn.oid = s.stxnamespace)))
+ JOIN LATERAL ( SELECT unnest(pg_get_statisticsobjdef_expressions(s.oid)) AS expr,
+ unnest(sd.stxdexpr) AS a) stat ON ((stat.expr IS NOT NULL)));
pg_tables| SELECT n.nspname AS schemaname,
c.relname AS tablename,
pg_get_userbyid(c.relowner) AS tableowner,
end;
$$;
-- Verify failures
-CREATE TABLE ext_stats_test (x int, y int, z int);
+CREATE TABLE ext_stats_test (x text, y int, z int);
CREATE STATISTICS tst;
ERROR: syntax error at or near ";"
LINE 1: CREATE STATISTICS tst;
ERROR: column "a" does not exist
CREATE STATISTICS tst ON x, x, y FROM ext_stats_test;
ERROR: duplicate column name in statistics definition
-CREATE STATISTICS tst ON x + y FROM ext_stats_test;
-ERROR: only simple column references are allowed in CREATE STATISTICS
-CREATE STATISTICS tst ON (x, y) FROM ext_stats_test;
-ERROR: only simple column references are allowed in CREATE STATISTICS
+CREATE STATISTICS tst ON x, x, y, x, x, y, x, x, y FROM ext_stats_test;
+ERROR: cannot have more than 8 columns in statistics
+CREATE STATISTICS tst ON x, x, y, x, x, (x || 'x'), (y + 1), (x || 'x'), (x || 'x'), (y + 1) FROM ext_stats_test;
+ERROR: cannot have more than 8 columns in statistics
+CREATE STATISTICS tst ON (x || 'x'), (x || 'x'), (y + 1), (x || 'x'), (x || 'x'), (y + 1), (x || 'x'), (x || 'x'), (y + 1) FROM ext_stats_test;
+ERROR: cannot have more than 8 columns in statistics
+CREATE STATISTICS tst ON (x || 'x'), (x || 'x'), y FROM ext_stats_test;
+ERROR: duplicate expression in statistics definition
CREATE STATISTICS tst (unrecognized) ON x, y FROM ext_stats_test;
ERROR: unrecognized statistics kind "unrecognized"
+-- incorrect expressions
+CREATE STATISTICS tst ON y + z FROM ext_stats_test; -- missing parentheses
+ERROR: syntax error at or near "+"
+LINE 1: CREATE STATISTICS tst ON y + z FROM ext_stats_test;
+ ^
+CREATE STATISTICS tst ON (x, y) FROM ext_stats_test; -- tuple expression
+ERROR: syntax error at or near ","
+LINE 1: CREATE STATISTICS tst ON (x, y) FROM ext_stats_test;
+ ^
DROP TABLE ext_stats_test;
-- Ensure stats are dropped sanely, and test IF NOT EXISTS while at it
CREATE TABLE ab1 (a INTEGER, b INTEGER, c INTEGER);
b | integer | | |
c | integer | | |
Statistics objects:
- "public"."ab1_b_c_stats" (ndistinct, dependencies, mcv) ON b, c FROM ab1
+ "public"."ab1_b_c_stats" ON b, c FROM ab1
-- Ensure statistics are dropped when table is
SELECT stxname FROM pg_statistic_ext WHERE stxname LIKE 'ab1%';
a | integer | | |
b | integer | | |
Statistics objects:
- "public"."ab1_a_b_stats" (ndistinct, dependencies, mcv) ON a, b FROM ab1; STATISTICS 0
+ "public"."ab1_a_b_stats" ON a, b FROM ab1; STATISTICS 0
ANALYZE ab1;
SELECT stxname, stxdndistinct, stxddependencies, stxdmcv
a | integer | | | | plain | |
b | integer | | | | plain | |
Statistics objects:
- "public"."ab1_a_b_stats" (ndistinct, dependencies, mcv) ON a, b FROM ab1
+ "public"."ab1_a_b_stats" ON a, b FROM ab1
-- partial analyze doesn't build stats either
ANALYZE ab1 (a);
ANALYZE ab1;
DROP TABLE ab1 CASCADE;
NOTICE: drop cascades to table ab1c
+-- basic test for statistics on expressions
+CREATE TABLE ab1 (a INTEGER, b INTEGER, c TIMESTAMP, d TIMESTAMPTZ);
+-- expression stats may be built on a single expression column
+CREATE STATISTICS ab1_exprstat_1 ON (a+b) FROM ab1;
+-- with a single expression, we only enable expression statistics
+CREATE STATISTICS ab1_exprstat_2 ON (a+b) FROM ab1;
+SELECT stxkind FROM pg_statistic_ext WHERE stxname = 'ab1_exprstat_2';
+ stxkind
+---------
+ {e}
+(1 row)
+
+-- adding anything to the expression builds all statistics kinds
+CREATE STATISTICS ab1_exprstat_3 ON (a+b), a FROM ab1;
+SELECT stxkind FROM pg_statistic_ext WHERE stxname = 'ab1_exprstat_3';
+ stxkind
+-----------
+ {d,f,m,e}
+(1 row)
+
+-- date_trunc on timestamptz is not immutable, but that should not matter
+CREATE STATISTICS ab1_exprstat_4 ON date_trunc('day', d) FROM ab1;
+-- date_trunc on timestamp is immutable
+CREATE STATISTICS ab1_exprstat_5 ON date_trunc('day', c) FROM ab1;
+-- insert some data and run analyze, to test that these cases build properly
+INSERT INTO ab1
+SELECT
+ generate_series(1,10),
+ generate_series(1,10),
+ generate_series('2020-10-01'::timestamp, '2020-10-10'::timestamp, interval '1 day'),
+ generate_series('2020-10-01'::timestamptz, '2020-10-10'::timestamptz, interval '1 day');
+ANALYZE ab1;
+DROP TABLE ab1;
-- Verify supported object types for extended statistics
CREATE schema tststats;
CREATE TABLE tststats.t (a int, b int, c text);
200 | 11
(1 row)
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (a+1)');
+ estimated | actual
+-----------+--------
+ 100 | 11
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100)');
+ estimated | actual
+-----------+--------
+ 100 | 11
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100), (2*c)');
+ estimated | actual
+-----------+--------
+ 100 | 11
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (a+1), (b+100)');
+ estimated | actual
+-----------+--------
+ 100 | 11
+(1 row)
+
-- correct command
CREATE STATISTICS s10 ON a, b, c FROM ndistinct;
ANALYZE ndistinct;
11 | 11
(1 row)
+-- partial improvement (match on attributes)
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (a+1)');
+ estimated | actual
+-----------+--------
+ 11 | 11
+(1 row)
+
+-- expressions - no improvement
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100)');
+ estimated | actual
+-----------+--------
+ 11 | 11
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100), (2*c)');
+ estimated | actual
+-----------+--------
+ 11 | 11
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (a+1), (b+100)');
+ estimated | actual
+-----------+--------
+ 11 | 11
+(1 row)
+
-- last two plans keep using Group Aggregate, because 'd' is not covered
-- by the statistic and while it's NULL-only we assume 200 values for it
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d');
200 | 13
(1 row)
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (a+1)');
+ estimated | actual
+-----------+--------
+ 221 | 221
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100)');
+ estimated | actual
+-----------+--------
+ 221 | 221
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100), (2*c)');
+ estimated | actual
+-----------+--------
+ 1000 | 1000
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (a+1), (b+100)');
+ estimated | actual
+-----------+--------
+ 221 | 221
+(1 row)
+
DROP STATISTICS s10;
SELECT s.stxkind, d.stxdndistinct
FROM pg_statistic_ext s, pg_statistic_ext_data d
200 | 13
(1 row)
--- functional dependencies tests
-CREATE TABLE functional_dependencies (
- filler1 TEXT,
- filler2 NUMERIC,
- a INT,
- b TEXT,
- filler3 DATE,
- c INT,
- d TEXT
-)
-WITH (autovacuum_enabled = off);
-CREATE INDEX fdeps_ab_idx ON functional_dependencies (a, b);
-CREATE INDEX fdeps_abc_idx ON functional_dependencies (a, b, c);
--- random data (no functional dependencies)
-INSERT INTO functional_dependencies (a, b, c, filler1)
- SELECT mod(i, 5), mod(i, 7), mod(i, 11), i FROM generate_series(1,1000) s(i);
-ANALYZE functional_dependencies;
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1''');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (a+1)');
estimated | actual
-----------+--------
- 29 | 29
+ 100 | 221
(1 row)
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1'' AND c = 1');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100)');
estimated | actual
-----------+--------
- 3 | 3
+ 100 | 221
(1 row)
--- create statistics
-CREATE STATISTICS func_deps_stat (dependencies) ON a, b, c FROM functional_dependencies;
-ANALYZE functional_dependencies;
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1''');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100), (2*c)');
estimated | actual
-----------+--------
- 29 | 29
+ 100 | 1000
(1 row)
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1'' AND c = 1');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (a+1), (b+100)');
estimated | actual
-----------+--------
- 3 | 3
+ 100 | 221
(1 row)
--- a => b, a => c, b => c
-TRUNCATE functional_dependencies;
-DROP STATISTICS func_deps_stat;
-INSERT INTO functional_dependencies (a, b, c, filler1)
- SELECT mod(i,100), mod(i,50), mod(i,25), i FROM generate_series(1,5000) s(i);
-ANALYZE functional_dependencies;
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1''');
+-- ndistinct estimates with statistics on expressions
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100)');
estimated | actual
-----------+--------
- 1 | 50
+ 100 | 221
(1 row)
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1'' AND c = 1');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100), (2*c)');
estimated | actual
-----------+--------
- 1 | 50
+ 100 | 1000
(1 row)
--- IN
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 51) AND b = ''1''');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (a+1), (b+100)');
estimated | actual
-----------+--------
- 2 | 100
+ 100 | 221
(1 row)
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 51) AND b IN (''1'', ''2'')');
+CREATE STATISTICS s10 (ndistinct) ON (a+1), (b+100), (2*c) FROM ndistinct;
+ANALYZE ndistinct;
+SELECT s.stxkind, d.stxdndistinct
+ FROM pg_statistic_ext s, pg_statistic_ext_data d
+ WHERE s.stxrelid = 'ndistinct'::regclass
+ AND d.stxoid = s.oid;
+ stxkind | stxdndistinct
+---------+-------------------------------------------------------------------
+ {d,e} | {"-1, -2": 221, "-1, -3": 247, "-2, -3": 323, "-1, -2, -3": 1000}
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100)');
estimated | actual
-----------+--------
- 4 | 100
+ 221 | 221
(1 row)
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 51, 52) AND b IN (''1'', ''2'')');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100), (2*c)');
estimated | actual
-----------+--------
- 8 | 200
+ 1000 | 1000
(1 row)
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 51, 52) AND b = ''1''');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (a+1), (b+100)');
estimated | actual
-----------+--------
- 4 | 100
+ 221 | 221
(1 row)
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 26, 51, 76) AND b IN (''1'', ''26'') AND c = 1');
+DROP STATISTICS s10;
+-- a mix of attributes and expressions
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b');
estimated | actual
-----------+--------
- 1 | 200
+ 100 | 221
(1 row)
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 26, 51, 76) AND b IN (''1'', ''26'') AND c IN (1)');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (2*c)');
estimated | actual
-----------+--------
- 1 | 200
+ 100 | 247
(1 row)
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 26, 27, 51, 52, 76, 77) AND b IN (''1'', ''2'', ''26'', ''27'') AND c IN (1, 2)');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (2*c)');
estimated | actual
-----------+--------
- 3 | 400
+ 100 | 1000
(1 row)
--- OR clauses referencing the same attribute
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a = 1 OR a = 51) AND b = ''1''');
+CREATE STATISTICS s10 (ndistinct) ON a, b, (2*c) FROM ndistinct;
+ANALYZE ndistinct;
+SELECT s.stxkind, d.stxdndistinct
+ FROM pg_statistic_ext s, pg_statistic_ext_data d
+ WHERE s.stxrelid = 'ndistinct'::regclass
+ AND d.stxoid = s.oid;
+ stxkind | stxdndistinct
+---------+-------------------------------------------------------------
+ {d,e} | {"3, 4": 221, "3, -1": 247, "4, -1": 323, "3, 4, -1": 1000}
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b');
estimated | actual
-----------+--------
- 2 | 100
+ 221 | 221
(1 row)
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a = 1 OR a = 51) AND (b = ''1'' OR b = ''2'')');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (2*c)');
estimated | actual
-----------+--------
- 4 | 100
+ 247 | 247
(1 row)
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a = 1 OR a = 2 OR a = 51 OR a = 52) AND (b = ''1'' OR b = ''2'')');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (2*c)');
estimated | actual
-----------+--------
- 8 | 200
+ 1000 | 1000
(1 row)
--- OR clauses referencing different attributes
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a = 1 OR b = ''1'') AND b = ''1''');
+DROP STATISTICS s10;
+-- combination of multiple ndistinct statistics, with/without expressions
+TRUNCATE ndistinct;
+-- two mostly independent groups of columns
+INSERT INTO ndistinct (a, b, c, d)
+ SELECT mod(i,3), mod(i,9), mod(i,5), mod(i,20)
+ FROM generate_series(1,1000) s(i);
+ANALYZE ndistinct;
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b');
estimated | actual
-----------+--------
- 3 | 100
+ 27 | 9
(1 row)
--- ANY
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = ANY (ARRAY[1, 51]) AND b = ''1''');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1)');
estimated | actual
-----------+--------
- 2 | 100
+ 27 | 9
(1 row)
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = ANY (ARRAY[1, 51]) AND b = ANY (ARRAY[''1'', ''2''])');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), b');
estimated | actual
-----------+--------
- 4 | 100
+ 27 | 9
(1 row)
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = ANY (ARRAY[1, 2, 51, 52]) AND b = ANY (ARRAY[''1'', ''2''])');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1)');
estimated | actual
-----------+--------
- 8 | 200
+ 27 | 9
(1 row)
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = ANY (ARRAY[1, 26, 51, 76]) AND b = ANY (ARRAY[''1'', ''26'']) AND c = 1');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1), c');
estimated | actual
-----------+--------
- 1 | 200
+ 100 | 45
(1 row)
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = ANY (ARRAY[1, 26, 51, 76]) AND b = ANY (ARRAY[''1'', ''26'']) AND c = ANY (ARRAY[1])');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (c*10)');
estimated | actual
-----------+--------
- 1 | 200
+ 100 | 45
(1 row)
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = ANY (ARRAY[1, 2, 26, 27, 51, 52, 76, 77]) AND b = ANY (ARRAY[''1'', ''2'', ''26'', ''27'']) AND c = ANY (ARRAY[1, 2])');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1), c, (d - 1)');
estimated | actual
-----------+--------
- 3 | 400
+ 100 | 180
(1 row)
--- ANY with inequalities should not benefit from functional dependencies
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a < ANY (ARRAY[1, 51]) AND b > ''1''');
+-- basic statistics on both attributes (no expressions)
+CREATE STATISTICS s11 (ndistinct) ON a, b FROM ndistinct;
+CREATE STATISTICS s12 (ndistinct) ON c, d FROM ndistinct;
+ANALYZE ndistinct;
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b');
estimated | actual
-----------+--------
- 2472 | 2400
+ 9 | 9
(1 row)
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a >= ANY (ARRAY[1, 51]) AND b <= ANY (ARRAY[''1'', ''2''])');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1)');
estimated | actual
-----------+--------
- 1441 | 1250
+ 9 | 9
(1 row)
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a <= ANY (ARRAY[1, 2, 51, 52]) AND b >= ANY (ARRAY[''1'', ''2''])');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), b');
estimated | actual
-----------+--------
- 3909 | 2550
+ 9 | 9
(1 row)
--- ALL (should not benefit from functional dependencies)
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 51) AND b = ALL (ARRAY[''1''])');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1)');
estimated | actual
-----------+--------
- 2 | 100
+ 9 | 9
(1 row)
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 51) AND b = ALL (ARRAY[''1'', ''2''])');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1), c');
estimated | actual
-----------+--------
- 1 | 0
+ 45 | 45
(1 row)
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 51, 52) AND b = ALL (ARRAY[''1'', ''2''])');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (c*10)');
estimated | actual
-----------+--------
- 1 | 0
+ 45 | 45
(1 row)
--- create statistics
-CREATE STATISTICS func_deps_stat (dependencies) ON a, b, c FROM functional_dependencies;
-ANALYZE functional_dependencies;
--- print the detected dependencies
-SELECT dependencies FROM pg_stats_ext WHERE statistics_name = 'func_deps_stat';
- dependencies
-------------------------------------------------------------------------------------------------------------
- {"3 => 4": 1.000000, "3 => 6": 1.000000, "4 => 6": 1.000000, "3, 4 => 6": 1.000000, "3, 6 => 4": 1.000000}
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1), c, (d - 1)');
+ estimated | actual
+-----------+--------
+ 100 | 180
(1 row)
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1''');
+-- replace the second statistics by statistics on expressions
+DROP STATISTICS s12;
+CREATE STATISTICS s12 (ndistinct) ON (c * 10), (d - 1) FROM ndistinct;
+ANALYZE ndistinct;
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b');
estimated | actual
-----------+--------
- 50 | 50
+ 9 | 9
(1 row)
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1'' AND c = 1');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1)');
estimated | actual
-----------+--------
- 50 | 50
+ 9 | 9
(1 row)
--- IN
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 51) AND b = ''1''');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), b');
estimated | actual
-----------+--------
- 100 | 100
+ 9 | 9
(1 row)
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 51) AND b IN (''1'', ''2'')');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1)');
estimated | actual
-----------+--------
- 100 | 100
+ 9 | 9
(1 row)
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 51, 52) AND b IN (''1'', ''2'')');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1), c');
estimated | actual
-----------+--------
- 200 | 200
+ 45 | 45
(1 row)
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 51, 52) AND b = ''1''');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (c*10)');
estimated | actual
-----------+--------
- 100 | 100
+ 45 | 45
(1 row)
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 26, 51, 76) AND b IN (''1'', ''26'') AND c = 1');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1), c, (d - 1)');
estimated | actual
-----------+--------
- 200 | 200
+ 100 | 180
(1 row)
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 26, 51, 76) AND b IN (''1'', ''26'') AND c IN (1)');
+-- replace the second statistics by statistics on both attributes and expressions
+DROP STATISTICS s12;
+CREATE STATISTICS s12 (ndistinct) ON c, d, (c * 10), (d - 1) FROM ndistinct;
+ANALYZE ndistinct;
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b');
estimated | actual
-----------+--------
- 200 | 200
+ 9 | 9
(1 row)
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 26, 27, 51, 52, 76, 77) AND b IN (''1'', ''2'', ''26'', ''27'') AND c IN (1, 2)');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1)');
estimated | actual
-----------+--------
- 400 | 400
+ 9 | 9
(1 row)
--- OR clauses referencing the same attribute
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a = 1 OR a = 51) AND b = ''1''');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), b');
estimated | actual
-----------+--------
- 99 | 100
+ 9 | 9
(1 row)
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a = 1 OR a = 51) AND (b = ''1'' OR b = ''2'')');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1)');
estimated | actual
-----------+--------
- 99 | 100
+ 9 | 9
(1 row)
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a = 1 OR a = 2 OR a = 51 OR a = 52) AND (b = ''1'' OR b = ''2'')');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1), c');
estimated | actual
-----------+--------
- 197 | 200
+ 45 | 45
(1 row)
--- OR clauses referencing different attributes are incompatible
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a = 1 OR b = ''1'') AND b = ''1''');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (c*10)');
estimated | actual
-----------+--------
- 3 | 100
+ 45 | 45
(1 row)
--- ANY
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = ANY (ARRAY[1, 51]) AND b = ''1''');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1), c, (d - 1)');
estimated | actual
-----------+--------
- 100 | 100
+ 100 | 180
(1 row)
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = ANY (ARRAY[1, 51]) AND b = ANY (ARRAY[''1'', ''2''])');
+-- replace the other statistics by statistics on both attributes and expressions
+DROP STATISTICS s11;
+CREATE STATISTICS s11 (ndistinct) ON a, b, (a*5), (b+1) FROM ndistinct;
+ANALYZE ndistinct;
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b');
estimated | actual
-----------+--------
- 100 | 100
+ 9 | 9
(1 row)
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = ANY (ARRAY[1, 2, 51, 52]) AND b = ANY (ARRAY[''1'', ''2''])');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1)');
estimated | actual
-----------+--------
- 200 | 200
+ 9 | 9
(1 row)
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = ANY (ARRAY[1, 26, 51, 76]) AND b = ANY (ARRAY[''1'', ''26'']) AND c = 1');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), b');
estimated | actual
-----------+--------
- 200 | 200
+ 9 | 9
(1 row)
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = ANY (ARRAY[1, 26, 51, 76]) AND b = ANY (ARRAY[''1'', ''26'']) AND c = ANY (ARRAY[1])');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1)');
estimated | actual
-----------+--------
- 200 | 200
+ 9 | 9
(1 row)
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = ANY (ARRAY[1, 2, 26, 27, 51, 52, 76, 77]) AND b = ANY (ARRAY[''1'', ''2'', ''26'', ''27'']) AND c = ANY (ARRAY[1, 2])');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1), c');
estimated | actual
-----------+--------
- 400 | 400
+ 45 | 45
(1 row)
--- ANY with inequalities should not benefit from functional dependencies
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a < ANY (ARRAY[1, 51]) AND b > ''1''');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (c*10)');
estimated | actual
-----------+--------
- 2472 | 2400
+ 45 | 45
(1 row)
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a >= ANY (ARRAY[1, 51]) AND b <= ANY (ARRAY[''1'', ''2''])');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1), c, (d - 1)');
estimated | actual
-----------+--------
- 1441 | 1250
+ 100 | 180
+(1 row)
+
+-- replace statistics by somewhat overlapping ones (this expected to get worse estimate
+-- because the first statistics shall be applied to 3 columns, and the second one can't
+-- be really applied)
+DROP STATISTICS s11;
+DROP STATISTICS s12;
+CREATE STATISTICS s11 (ndistinct) ON a, b, (a*5), (b+1) FROM ndistinct;
+CREATE STATISTICS s12 (ndistinct) ON a, (b+1), (c * 10) FROM ndistinct;
+ANALYZE ndistinct;
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b');
+ estimated | actual
+-----------+--------
+ 9 | 9
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1)');
+ estimated | actual
+-----------+--------
+ 9 | 9
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), b');
+ estimated | actual
+-----------+--------
+ 9 | 9
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1)');
+ estimated | actual
+-----------+--------
+ 9 | 9
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1), c');
+ estimated | actual
+-----------+--------
+ 45 | 45
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (c*10)');
+ estimated | actual
+-----------+--------
+ 100 | 45
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1), c, (d - 1)');
+ estimated | actual
+-----------+--------
+ 100 | 180
+(1 row)
+
+DROP STATISTICS s11;
+DROP STATISTICS s12;
+-- functional dependencies tests
+CREATE TABLE functional_dependencies (
+ filler1 TEXT,
+ filler2 NUMERIC,
+ a INT,
+ b TEXT,
+ filler3 DATE,
+ c INT,
+ d TEXT
+)
+WITH (autovacuum_enabled = off);
+CREATE INDEX fdeps_ab_idx ON functional_dependencies (a, b);
+CREATE INDEX fdeps_abc_idx ON functional_dependencies (a, b, c);
+-- random data (no functional dependencies)
+INSERT INTO functional_dependencies (a, b, c, filler1)
+ SELECT mod(i, 5), mod(i, 7), mod(i, 11), i FROM generate_series(1,1000) s(i);
+ANALYZE functional_dependencies;
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1''');
+ estimated | actual
+-----------+--------
+ 29 | 29
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1'' AND c = 1');
+ estimated | actual
+-----------+--------
+ 3 | 3
+(1 row)
+
+-- create statistics
+CREATE STATISTICS func_deps_stat (dependencies) ON a, b, c FROM functional_dependencies;
+ANALYZE functional_dependencies;
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1''');
+ estimated | actual
+-----------+--------
+ 29 | 29
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1'' AND c = 1');
+ estimated | actual
+-----------+--------
+ 3 | 3
+(1 row)
+
+-- a => b, a => c, b => c
+TRUNCATE functional_dependencies;
+DROP STATISTICS func_deps_stat;
+-- now do the same thing, but with expressions
+INSERT INTO functional_dependencies (a, b, c, filler1)
+ SELECT i, i, i, i FROM generate_series(1,5000) s(i);
+ANALYZE functional_dependencies;
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE mod(a, 11) = 1 AND mod(b::int, 13) = 1');
+ estimated | actual
+-----------+--------
+ 1 | 35
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE mod(a, 11) = 1 AND mod(b::int, 13) = 1 AND mod(c, 7) = 1');
+ estimated | actual
+-----------+--------
+ 1 | 5
+(1 row)
+
+-- create statistics
+CREATE STATISTICS func_deps_stat (dependencies) ON (mod(a,11)), (mod(b::int, 13)), (mod(c, 7)) FROM functional_dependencies;
+ANALYZE functional_dependencies;
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE mod(a, 11) = 1 AND mod(b::int, 13) = 1');
+ estimated | actual
+-----------+--------
+ 35 | 35
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE mod(a, 11) = 1 AND mod(b::int, 13) = 1 AND mod(c, 7) = 1');
+ estimated | actual
+-----------+--------
+ 5 | 5
+(1 row)
+
+-- a => b, a => c, b => c
+TRUNCATE functional_dependencies;
+DROP STATISTICS func_deps_stat;
+INSERT INTO functional_dependencies (a, b, c, filler1)
+ SELECT mod(i,100), mod(i,50), mod(i,25), i FROM generate_series(1,5000) s(i);
+ANALYZE functional_dependencies;
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1''');
+ estimated | actual
+-----------+--------
+ 1 | 50
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1'' AND c = 1');
+ estimated | actual
+-----------+--------
+ 1 | 50
+(1 row)
+
+-- IN
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 51) AND b = ''1''');
+ estimated | actual
+-----------+--------
+ 2 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 51) AND b IN (''1'', ''2'')');
+ estimated | actual
+-----------+--------
+ 4 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 51, 52) AND b IN (''1'', ''2'')');
+ estimated | actual
+-----------+--------
+ 8 | 200
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 51, 52) AND b = ''1''');
+ estimated | actual
+-----------+--------
+ 4 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 26, 51, 76) AND b IN (''1'', ''26'') AND c = 1');
+ estimated | actual
+-----------+--------
+ 1 | 200
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 26, 51, 76) AND b IN (''1'', ''26'') AND c IN (1)');
+ estimated | actual
+-----------+--------
+ 1 | 200
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 26, 27, 51, 52, 76, 77) AND b IN (''1'', ''2'', ''26'', ''27'') AND c IN (1, 2)');
+ estimated | actual
+-----------+--------
+ 3 | 400
+(1 row)
+
+-- OR clauses referencing the same attribute
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a = 1 OR a = 51) AND b = ''1''');
+ estimated | actual
+-----------+--------
+ 2 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a = 1 OR a = 51) AND (b = ''1'' OR b = ''2'')');
+ estimated | actual
+-----------+--------
+ 4 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a = 1 OR a = 2 OR a = 51 OR a = 52) AND (b = ''1'' OR b = ''2'')');
+ estimated | actual
+-----------+--------
+ 8 | 200
+(1 row)
+
+-- OR clauses referencing different attributes
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a = 1 OR b = ''1'') AND b = ''1''');
+ estimated | actual
+-----------+--------
+ 3 | 100
+(1 row)
+
+-- ANY
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = ANY (ARRAY[1, 51]) AND b = ''1''');
+ estimated | actual
+-----------+--------
+ 2 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = ANY (ARRAY[1, 51]) AND b = ANY (ARRAY[''1'', ''2''])');
+ estimated | actual
+-----------+--------
+ 4 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = ANY (ARRAY[1, 2, 51, 52]) AND b = ANY (ARRAY[''1'', ''2''])');
+ estimated | actual
+-----------+--------
+ 8 | 200
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = ANY (ARRAY[1, 26, 51, 76]) AND b = ANY (ARRAY[''1'', ''26'']) AND c = 1');
+ estimated | actual
+-----------+--------
+ 1 | 200
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = ANY (ARRAY[1, 26, 51, 76]) AND b = ANY (ARRAY[''1'', ''26'']) AND c = ANY (ARRAY[1])');
+ estimated | actual
+-----------+--------
+ 1 | 200
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = ANY (ARRAY[1, 2, 26, 27, 51, 52, 76, 77]) AND b = ANY (ARRAY[''1'', ''2'', ''26'', ''27'']) AND c = ANY (ARRAY[1, 2])');
+ estimated | actual
+-----------+--------
+ 3 | 400
+(1 row)
+
+-- ANY with inequalities should not benefit from functional dependencies
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a < ANY (ARRAY[1, 51]) AND b > ''1''');
+ estimated | actual
+-----------+--------
+ 2472 | 2400
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a >= ANY (ARRAY[1, 51]) AND b <= ANY (ARRAY[''1'', ''2''])');
+ estimated | actual
+-----------+--------
+ 1441 | 1250
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a <= ANY (ARRAY[1, 2, 51, 52]) AND b >= ANY (ARRAY[''1'', ''2''])');
+ estimated | actual
+-----------+--------
+ 3909 | 2550
+(1 row)
+
+-- ALL (should not benefit from functional dependencies)
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 51) AND b = ALL (ARRAY[''1''])');
+ estimated | actual
+-----------+--------
+ 2 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 51) AND b = ALL (ARRAY[''1'', ''2''])');
+ estimated | actual
+-----------+--------
+ 1 | 0
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 51, 52) AND b = ALL (ARRAY[''1'', ''2''])');
+ estimated | actual
+-----------+--------
+ 1 | 0
+(1 row)
+
+-- create statistics
+CREATE STATISTICS func_deps_stat (dependencies) ON a, b, c FROM functional_dependencies;
+ANALYZE functional_dependencies;
+-- print the detected dependencies
+SELECT dependencies FROM pg_stats_ext WHERE statistics_name = 'func_deps_stat';
+ dependencies
+------------------------------------------------------------------------------------------------------------
+ {"3 => 4": 1.000000, "3 => 6": 1.000000, "4 => 6": 1.000000, "3, 4 => 6": 1.000000, "3, 6 => 4": 1.000000}
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1''');
+ estimated | actual
+-----------+--------
+ 50 | 50
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1'' AND c = 1');
+ estimated | actual
+-----------+--------
+ 50 | 50
+(1 row)
+
+-- IN
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 51) AND b = ''1''');
+ estimated | actual
+-----------+--------
+ 100 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 51) AND b IN (''1'', ''2'')');
+ estimated | actual
+-----------+--------
+ 100 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 51, 52) AND b IN (''1'', ''2'')');
+ estimated | actual
+-----------+--------
+ 200 | 200
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 51, 52) AND b = ''1''');
+ estimated | actual
+-----------+--------
+ 100 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 26, 51, 76) AND b IN (''1'', ''26'') AND c = 1');
+ estimated | actual
+-----------+--------
+ 200 | 200
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 26, 51, 76) AND b IN (''1'', ''26'') AND c IN (1)');
+ estimated | actual
+-----------+--------
+ 200 | 200
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 26, 27, 51, 52, 76, 77) AND b IN (''1'', ''2'', ''26'', ''27'') AND c IN (1, 2)');
+ estimated | actual
+-----------+--------
+ 400 | 400
+(1 row)
+
+-- OR clauses referencing the same attribute
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a = 1 OR a = 51) AND b = ''1''');
+ estimated | actual
+-----------+--------
+ 99 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a = 1 OR a = 51) AND (b = ''1'' OR b = ''2'')');
+ estimated | actual
+-----------+--------
+ 99 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a = 1 OR a = 2 OR a = 51 OR a = 52) AND (b = ''1'' OR b = ''2'')');
+ estimated | actual
+-----------+--------
+ 197 | 200
+(1 row)
+
+-- OR clauses referencing different attributes are incompatible
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a = 1 OR b = ''1'') AND b = ''1''');
+ estimated | actual
+-----------+--------
+ 3 | 100
+(1 row)
+
+-- ANY
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = ANY (ARRAY[1, 51]) AND b = ''1''');
+ estimated | actual
+-----------+--------
+ 100 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = ANY (ARRAY[1, 51]) AND b = ANY (ARRAY[''1'', ''2''])');
+ estimated | actual
+-----------+--------
+ 100 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = ANY (ARRAY[1, 2, 51, 52]) AND b = ANY (ARRAY[''1'', ''2''])');
+ estimated | actual
+-----------+--------
+ 200 | 200
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = ANY (ARRAY[1, 26, 51, 76]) AND b = ANY (ARRAY[''1'', ''26'']) AND c = 1');
+ estimated | actual
+-----------+--------
+ 200 | 200
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = ANY (ARRAY[1, 26, 51, 76]) AND b = ANY (ARRAY[''1'', ''26'']) AND c = ANY (ARRAY[1])');
+ estimated | actual
+-----------+--------
+ 200 | 200
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = ANY (ARRAY[1, 2, 26, 27, 51, 52, 76, 77]) AND b = ANY (ARRAY[''1'', ''2'', ''26'', ''27'']) AND c = ANY (ARRAY[1, 2])');
+ estimated | actual
+-----------+--------
+ 400 | 400
+(1 row)
+
+-- ANY with inequalities should not benefit from functional dependencies
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a < ANY (ARRAY[1, 51]) AND b > ''1''');
+ estimated | actual
+-----------+--------
+ 2472 | 2400
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a >= ANY (ARRAY[1, 51]) AND b <= ANY (ARRAY[''1'', ''2''])');
+ estimated | actual
+-----------+--------
+ 1441 | 1250
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a <= ANY (ARRAY[1, 2, 51, 52]) AND b >= ANY (ARRAY[''1'', ''2''])');
+ estimated | actual
+-----------+--------
+ 3909 | 2550
+(1 row)
+
+-- ALL (should not benefit from functional dependencies)
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 51) AND b = ALL (ARRAY[''1''])');
+ estimated | actual
+-----------+--------
+ 2 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 51) AND b = ALL (ARRAY[''1'', ''2''])');
+ estimated | actual
+-----------+--------
+ 1 | 0
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 51, 52) AND b = ALL (ARRAY[''1'', ''2''])');
+ estimated | actual
+-----------+--------
+ 1 | 0
+(1 row)
+
+-- changing the type of column c causes all its stats to be dropped, reverting
+-- to default estimates without any statistics, i.e. 0.5% selectivity for each
+-- condition
+ALTER TABLE functional_dependencies ALTER COLUMN c TYPE numeric;
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1'' AND c = 1');
+ estimated | actual
+-----------+--------
+ 1 | 50
+(1 row)
+
+ANALYZE functional_dependencies;
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1'' AND c = 1');
+ estimated | actual
+-----------+--------
+ 50 | 50
+(1 row)
+
+DROP STATISTICS func_deps_stat;
+-- now try functional dependencies with expressions
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = 2 AND (b || ''X'') = ''1X''');
+ estimated | actual
+-----------+--------
+ 1 | 50
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = 2 AND (b || ''X'') = ''1X'' AND (c + 1) = 2');
+ estimated | actual
+-----------+--------
+ 1 | 50
+(1 row)
+
+-- IN
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') = ''1X''');
+ estimated | actual
+-----------+--------
+ 1 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') IN (''1X'', ''2X'')');
+ estimated | actual
+-----------+--------
+ 1 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 102, 104) AND (b || ''X'') IN (''1X'', ''2X'')');
+ estimated | actual
+-----------+--------
+ 1 | 200
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 102, 104) AND (b || ''X'') = ''1X''');
+ estimated | actual
+-----------+--------
+ 1 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 52, 102, 152) AND (b || ''X'') IN (''1X'', ''26X'') AND (c + 1) = 2');
+ estimated | actual
+-----------+--------
+ 1 | 200
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 52, 102, 152) AND (b || ''X'') IN (''1X'', ''26X'') AND (c + 1) IN (2)');
+ estimated | actual
+-----------+--------
+ 1 | 200
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 52, 54, 102, 104, 152, 154) AND (b || ''X'') IN (''1X'', ''2X'', ''26X'', ''27X'') AND (c + 1) IN (2, 3)');
+ estimated | actual
+-----------+--------
+ 1 | 400
+(1 row)
+
+-- OR clauses referencing the same attribute
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (a * 2) = 102) AND (b || ''X'') = ''1X''');
+ estimated | actual
+-----------+--------
+ 1 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (a * 2) = 102) AND ((b || ''X'') = ''1X'' OR (b || ''X'') = ''2X'')');
+ estimated | actual
+-----------+--------
+ 1 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (a * 2) = 4 OR (a * 2) = 102 OR (a * 2) = 104) AND ((b || ''X'') = ''1X'' OR (b || ''X'') = ''2X'')');
+ estimated | actual
+-----------+--------
+ 1 | 200
+(1 row)
+
+-- OR clauses referencing different attributes
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (b || ''X'') = ''1X'') AND (b || ''X'') = ''1X''');
+ estimated | actual
+-----------+--------
+ 1 | 100
+(1 row)
+
+-- ANY
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 102]) AND (b || ''X'') = ''1X''');
+ estimated | actual
+-----------+--------
+ 1 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 102]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''2X''])');
+ estimated | actual
+-----------+--------
+ 1 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 4, 102, 104]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''2X''])');
+ estimated | actual
+-----------+--------
+ 1 | 200
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 52, 102, 152]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''26X'']) AND (c + 1) = 2');
+ estimated | actual
+-----------+--------
+ 1 | 200
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 52, 102, 152]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''26X'']) AND (c + 1) = ANY (ARRAY[2])');
+ estimated | actual
+-----------+--------
+ 1 | 200
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 4, 52, 54, 102, 104, 152, 154]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''2X'', ''26X'', ''27X'']) AND (c + 1) = ANY (ARRAY[2, 3])');
+ estimated | actual
+-----------+--------
+ 1 | 400
+(1 row)
+
+-- ANY with inequalities should not benefit from functional dependencies
+-- the estimates however improve thanks to having expression statistics
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) < ANY (ARRAY[2, 102]) AND (b || ''X'') > ''1X''');
+ estimated | actual
+-----------+--------
+ 926 | 1900
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) >= ANY (ARRAY[2, 102]) AND (b || ''X'') <= ANY (ARRAY[''1X'', ''2X''])');
+ estimated | actual
+-----------+--------
+ 1543 | 2250
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) <= ANY (ARRAY[2, 4, 102, 104]) AND (b || ''X'') >= ANY (ARRAY[''1X'', ''2X''])');
+ estimated | actual
+-----------+--------
+ 2229 | 2050
+(1 row)
+
+-- ALL (should not benefit from functional dependencies)
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') = ALL (ARRAY[''1X''])');
+ estimated | actual
+-----------+--------
+ 1 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') = ALL (ARRAY[''1X'', ''2X''])');
+ estimated | actual
+-----------+--------
+ 1 | 0
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 102, 104) AND (b || ''X'') = ALL (ARRAY[''1X'', ''2X''])');
+ estimated | actual
+-----------+--------
+ 1 | 0
+(1 row)
+
+-- create statistics on expressions
+CREATE STATISTICS func_deps_stat (dependencies) ON (a * 2), (b || 'X'), (c + 1) FROM functional_dependencies;
+ANALYZE functional_dependencies;
+-- print the detected dependencies
+SELECT dependencies FROM pg_stats_ext WHERE statistics_name = 'func_deps_stat';
+ dependencies
+------------------------------------------------------------------------------------------------------------------------
+ {"-1 => -2": 1.000000, "-1 => -3": 1.000000, "-2 => -3": 1.000000, "-1, -2 => -3": 1.000000, "-1, -3 => -2": 1.000000}
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = 2 AND (b || ''X'') = ''1X''');
+ estimated | actual
+-----------+--------
+ 50 | 50
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = 2 AND (b || ''X'') = ''1X'' AND (c + 1) = 2');
+ estimated | actual
+-----------+--------
+ 50 | 50
+(1 row)
+
+-- IN
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') = ''1X''');
+ estimated | actual
+-----------+--------
+ 100 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') IN (''1X'', ''2X'')');
+ estimated | actual
+-----------+--------
+ 100 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 102, 104) AND (b || ''X'') IN (''1X'', ''2X'')');
+ estimated | actual
+-----------+--------
+ 200 | 200
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 102, 104) AND (b || ''X'') = ''1X''');
+ estimated | actual
+-----------+--------
+ 100 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 52, 102, 152) AND (b || ''X'') IN (''1X'', ''26X'') AND (c + 1) = 2');
+ estimated | actual
+-----------+--------
+ 200 | 200
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 52, 102, 152) AND (b || ''X'') IN (''1X'', ''26X'') AND (c + 1) IN (2)');
+ estimated | actual
+-----------+--------
+ 200 | 200
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 52, 54, 102, 104, 152, 154) AND (b || ''X'') IN (''1X'', ''2X'', ''26X'', ''27X'') AND (c + 1) IN (2, 3)');
+ estimated | actual
+-----------+--------
+ 400 | 400
+(1 row)
+
+-- OR clauses referencing the same attribute
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (a * 2) = 102) AND (b || ''X'') = ''1X''');
+ estimated | actual
+-----------+--------
+ 99 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (a * 2) = 102) AND ((b || ''X'') = ''1X'' OR (b || ''X'') = ''2X'')');
+ estimated | actual
+-----------+--------
+ 99 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (a * 2) = 4 OR (a * 2) = 102 OR (a * 2) = 104) AND ((b || ''X'') = ''1X'' OR (b || ''X'') = ''2X'')');
+ estimated | actual
+-----------+--------
+ 197 | 200
+(1 row)
+
+-- OR clauses referencing different attributes
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (b || ''X'') = ''1X'') AND (b || ''X'') = ''1X''');
+ estimated | actual
+-----------+--------
+ 3 | 100
+(1 row)
+
+-- ANY
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 102]) AND (b || ''X'') = ''1X''');
+ estimated | actual
+-----------+--------
+ 100 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 102]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''2X''])');
+ estimated | actual
+-----------+--------
+ 100 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 4, 102, 104]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''2X''])');
+ estimated | actual
+-----------+--------
+ 200 | 200
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 52, 102, 152]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''26X'']) AND (c + 1) = 2');
+ estimated | actual
+-----------+--------
+ 200 | 200
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 52, 102, 152]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''26X'']) AND (c + 1) = ANY (ARRAY[2])');
+ estimated | actual
+-----------+--------
+ 200 | 200
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 4, 52, 54, 102, 104, 152, 154]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''2X'', ''26X'', ''27X'']) AND (c + 1) = ANY (ARRAY[2, 3])');
+ estimated | actual
+-----------+--------
+ 400 | 400
(1 row)
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a <= ANY (ARRAY[1, 2, 51, 52]) AND b >= ANY (ARRAY[''1'', ''2''])');
+-- ANY with inequalities should not benefit from functional dependencies
+-- the estimates however improve thanks to having expression statistics
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) < ANY (ARRAY[2, 102]) AND (b || ''X'') > ''1X''');
estimated | actual
-----------+--------
- 3909 | 2550
+ 1957 | 1900
(1 row)
--- ALL (should not benefit from functional dependencies)
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 51) AND b = ALL (ARRAY[''1''])');
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) >= ANY (ARRAY[2, 102]) AND (b || ''X'') <= ANY (ARRAY[''1X'', ''2X''])');
estimated | actual
-----------+--------
- 2 | 100
+ 2933 | 2250
(1 row)
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 51) AND b = ALL (ARRAY[''1'', ''2''])');
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) <= ANY (ARRAY[2, 4, 102, 104]) AND (b || ''X'') >= ANY (ARRAY[''1X'', ''2X''])');
estimated | actual
-----------+--------
- 1 | 0
+ 3548 | 2050
(1 row)
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 51, 52) AND b = ALL (ARRAY[''1'', ''2''])');
+-- ALL (should not benefit from functional dependencies)
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') = ALL (ARRAY[''1X''])');
estimated | actual
-----------+--------
- 1 | 0
+ 2 | 100
(1 row)
--- changing the type of column c causes its single-column stats to be dropped,
--- giving a default estimate of 0.005 * 5000 = 25 for (c = 1); check multiple
--- clauses estimated with functional dependencies does not exceed this
-ALTER TABLE functional_dependencies ALTER COLUMN c TYPE numeric;
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1'' AND c = 1');
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') = ALL (ARRAY[''1X'', ''2X''])');
estimated | actual
-----------+--------
- 25 | 50
+ 1 | 0
(1 row)
-ANALYZE functional_dependencies;
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1'' AND c = 1');
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 102, 104) AND (b || ''X'') = ALL (ARRAY[''1X'', ''2X''])');
estimated | actual
-----------+--------
- 50 | 50
+ 1 | 0
(1 row)
-- check the ability to use multiple functional dependencies
1 | 1
(1 row)
+TRUNCATE mcv_lists;
+DROP STATISTICS mcv_lists_stats;
+-- random data (no MCV list), but with expression
+INSERT INTO mcv_lists (a, b, c, filler1)
+ SELECT i, i, i, i FROM generate_series(1,1000) s(i);
+ANALYZE mcv_lists;
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,7) = 1 AND mod(b::int,11) = 1');
+ estimated | actual
+-----------+--------
+ 1 | 13
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,7) = 1 AND mod(b::int,11) = 1 AND mod(c,13) = 1');
+ estimated | actual
+-----------+--------
+ 1 | 1
+(1 row)
+
+-- create statistics
+CREATE STATISTICS mcv_lists_stats (mcv) ON (mod(a,7)), (mod(b::int,11)), (mod(c,13)) FROM mcv_lists;
+ANALYZE mcv_lists;
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,7) = 1 AND mod(b::int,11) = 1');
+ estimated | actual
+-----------+--------
+ 13 | 13
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,7) = 1 AND mod(b::int,11) = 1 AND mod(c,13) = 1');
+ estimated | actual
+-----------+--------
+ 1 | 1
+(1 row)
+
-- 100 distinct combinations, all in the MCV list
TRUNCATE mcv_lists;
DROP STATISTICS mcv_lists_stats;
200 | 200
(1 row)
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 OR b = ''1'' OR c = 1 OR d IS NOT NULL');
+ estimated | actual
+-----------+--------
+ 200 | 200
+(1 row)
+
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a IN (1, 2, 51, 52) AND b IN ( ''1'', ''2'')');
estimated | actual
-----------+--------
50 | 50
(1 row)
+-- 100 distinct combinations, all in the MCV list, but with expressions
+TRUNCATE mcv_lists;
+DROP STATISTICS mcv_lists_stats;
+INSERT INTO mcv_lists (a, b, c, filler1)
+ SELECT i, i, i, i FROM generate_series(1,1000) s(i);
+ANALYZE mcv_lists;
+-- without any stats on the expressions, we have to use default selectivities, which
+-- is why the estimates here are different from the pre-computed case above
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 AND mod(b::int,10) = 1');
+ estimated | actual
+-----------+--------
+ 1 | 50
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 1 = mod(a,20) AND 1 = mod(b::int,10)');
+ estimated | actual
+-----------+--------
+ 1 | 50
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) < 1 AND mod(b::int,10) < 1');
+ estimated | actual
+-----------+--------
+ 111 | 50
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 1 > mod(a,20) AND 1 > mod(b::int,10)');
+ estimated | actual
+-----------+--------
+ 111 | 50
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 AND mod(b::int,10) = 1 AND mod(c,5) = 1');
+ estimated | actual
+-----------+--------
+ 1 | 50
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 OR mod(b::int,10) = 1 OR mod(c,25) = 1 OR d IS NOT NULL');
+ estimated | actual
+-----------+--------
+ 15 | 120
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) IN (1, 2, 51, 52, NULL) AND mod(b::int,10) IN ( 1, 2, NULL)');
+ estimated | actual
+-----------+--------
+ 1 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = ANY (ARRAY[1, 2, 51, 52]) AND mod(b::int,10) = ANY (ARRAY[1, 2])');
+ estimated | actual
+-----------+--------
+ 1 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) <= ANY (ARRAY[1, NULL, 2, 3]) AND mod(b::int,10) IN (1, 2, NULL, 3)');
+ estimated | actual
+-----------+--------
+ 11 | 150
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) < ALL (ARRAY[4, 5]) AND mod(b::int,10) IN (1, 2, 3) AND mod(c,5) > ANY (ARRAY[1, 2, 3])');
+ estimated | actual
+-----------+--------
+ 1 | 100
+(1 row)
+
+-- create statistics with expressions only (we create three separate stats, in order not to build more complex extended stats)
+CREATE STATISTICS mcv_lists_stats_1 ON (mod(a,20)) FROM mcv_lists;
+CREATE STATISTICS mcv_lists_stats_2 ON (mod(b::int,10)) FROM mcv_lists;
+CREATE STATISTICS mcv_lists_stats_3 ON (mod(c,5)) FROM mcv_lists;
+ANALYZE mcv_lists;
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 AND mod(b::int,10) = 1');
+ estimated | actual
+-----------+--------
+ 5 | 50
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 1 = mod(a,20) AND 1 = mod(b::int,10)');
+ estimated | actual
+-----------+--------
+ 5 | 50
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) < 1 AND mod(b::int,10) < 1');
+ estimated | actual
+-----------+--------
+ 5 | 50
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 1 > mod(a,20) AND 1 > mod(b::int,10)');
+ estimated | actual
+-----------+--------
+ 5 | 50
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 AND mod(b::int,10) = 1 AND mod(c,5) = 1');
+ estimated | actual
+-----------+--------
+ 1 | 50
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 OR mod(b::int,10) = 1 OR mod(c,25) = 1 OR d IS NOT NULL');
+ estimated | actual
+-----------+--------
+ 149 | 120
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) IN (1, 2, 51, 52, NULL) AND mod(b::int,10) IN ( 1, 2, NULL)');
+ estimated | actual
+-----------+--------
+ 20 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = ANY (ARRAY[1, 2, 51, 52]) AND mod(b::int,10) = ANY (ARRAY[1, 2])');
+ estimated | actual
+-----------+--------
+ 20 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) <= ANY (ARRAY[1, NULL, 2, 3]) AND mod(b::int,10) IN (1, 2, NULL, 3)');
+ estimated | actual
+-----------+--------
+ 116 | 150
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) < ALL (ARRAY[4, 5]) AND mod(b::int,10) IN (1, 2, 3) AND mod(c,5) > ANY (ARRAY[1, 2, 3])');
+ estimated | actual
+-----------+--------
+ 12 | 100
+(1 row)
+
+DROP STATISTICS mcv_lists_stats_1;
+DROP STATISTICS mcv_lists_stats_2;
+DROP STATISTICS mcv_lists_stats_3;
+-- create statistics with both MCV and expressions
+CREATE STATISTICS mcv_lists_stats (mcv) ON (mod(a,20)), (mod(b::int,10)), (mod(c,5)) FROM mcv_lists;
+ANALYZE mcv_lists;
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 AND mod(b::int,10) = 1');
+ estimated | actual
+-----------+--------
+ 50 | 50
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 1 = mod(a,20) AND 1 = mod(b::int,10)');
+ estimated | actual
+-----------+--------
+ 50 | 50
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) < 1 AND mod(b::int,10) < 1');
+ estimated | actual
+-----------+--------
+ 50 | 50
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 1 > mod(a,20) AND 1 > mod(b::int,10)');
+ estimated | actual
+-----------+--------
+ 50 | 50
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 AND mod(b::int,10) = 1 AND mod(c,5) = 1');
+ estimated | actual
+-----------+--------
+ 50 | 50
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 OR mod(b::int,10) = 1 OR mod(c,25) = 1 OR d IS NOT NULL');
+ estimated | actual
+-----------+--------
+ 105 | 120
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) IN (1, 2, 51, 52, NULL) AND mod(b::int,10) IN ( 1, 2, NULL)');
+ estimated | actual
+-----------+--------
+ 100 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = ANY (ARRAY[1, 2, 51, 52]) AND mod(b::int,10) = ANY (ARRAY[1, 2])');
+ estimated | actual
+-----------+--------
+ 100 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) <= ANY (ARRAY[1, NULL, 2, 3]) AND mod(b::int,10) IN (1, 2, NULL, 3)');
+ estimated | actual
+-----------+--------
+ 150 | 150
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) < ALL (ARRAY[4, 5]) AND mod(b::int,10) IN (1, 2, 3) AND mod(c,5) > ANY (ARRAY[1, 2, 3])');
+ estimated | actual
+-----------+--------
+ 100 | 100
+(1 row)
+
+-- we can't use the statistic for OR clauses that are not fully covered (missing 'd' attribute)
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 OR mod(b::int,10) = 1 OR mod(c,5) = 1 OR d IS NOT NULL');
+ estimated | actual
+-----------+--------
+ 200 | 200
+(1 row)
+
-- 100 distinct combinations with NULL values, all in the MCV list
TRUNCATE mcv_lists;
DROP STATISTICS mcv_lists_stats;
(1 row)
DROP TABLE mcv_lists_multi;
+-- statistics on integer expressions
+CREATE TABLE expr_stats (a int, b int, c int);
+INSERT INTO expr_stats SELECT mod(i,10), mod(i,10), mod(i,10) FROM generate_series(1,1000) s(i);
+ANALYZE expr_stats;
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE (2*a) = 0 AND (3*b) = 0');
+ estimated | actual
+-----------+--------
+ 1 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE (a+b) = 0 AND (a-b) = 0');
+ estimated | actual
+-----------+--------
+ 1 | 100
+(1 row)
+
+CREATE STATISTICS expr_stats_1 (mcv) ON (a+b), (a-b), (2*a), (3*b) FROM expr_stats;
+ANALYZE expr_stats;
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE (2*a) = 0 AND (3*b) = 0');
+ estimated | actual
+-----------+--------
+ 100 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE (a+b) = 0 AND (a-b) = 0');
+ estimated | actual
+-----------+--------
+ 100 | 100
+(1 row)
+
+DROP STATISTICS expr_stats_1;
+DROP TABLE expr_stats;
+-- statistics on a mix columns and expressions
+CREATE TABLE expr_stats (a int, b int, c int);
+INSERT INTO expr_stats SELECT mod(i,10), mod(i,10), mod(i,10) FROM generate_series(1,1000) s(i);
+ANALYZE expr_stats;
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 0 AND (2*a) = 0 AND (3*b) = 0');
+ estimated | actual
+-----------+--------
+ 1 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 3 AND b = 3 AND (a-b) = 0');
+ estimated | actual
+-----------+--------
+ 1 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 0 AND b = 1 AND (a-b) = 0');
+ estimated | actual
+-----------+--------
+ 1 | 0
+(1 row)
+
+CREATE STATISTICS expr_stats_1 (mcv) ON a, b, (2*a), (3*b), (a+b), (a-b) FROM expr_stats;
+ANALYZE expr_stats;
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 0 AND (2*a) = 0 AND (3*b) = 0');
+ estimated | actual
+-----------+--------
+ 100 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 3 AND b = 3 AND (a-b) = 0');
+ estimated | actual
+-----------+--------
+ 100 | 100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 0 AND b = 1 AND (a-b) = 0');
+ estimated | actual
+-----------+--------
+ 1 | 0
+(1 row)
+
+DROP TABLE expr_stats;
+-- statistics on expressions with different data types
+CREATE TABLE expr_stats (a int, b name, c text);
+INSERT INTO expr_stats SELECT mod(i,10), md5(mod(i,10)::text), md5(mod(i,10)::text) FROM generate_series(1,1000) s(i);
+ANALYZE expr_stats;
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 0 AND (b || c) <= ''z'' AND (c || b) >= ''0''');
+ estimated | actual
+-----------+--------
+ 11 | 100
+(1 row)
+
+CREATE STATISTICS expr_stats_1 (mcv) ON a, b, (b || c), (c || b) FROM expr_stats;
+ANALYZE expr_stats;
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 0 AND (b || c) <= ''z'' AND (c || b) >= ''0''');
+ estimated | actual
+-----------+--------
+ 100 | 100
+(1 row)
+
+DROP TABLE expr_stats;
-- Permission tests. Users should not be able to see specific data values in
-- the extended statistics, if they lack permission to see those values in
-- the underlying table.
insert into stts_t1 select i,i from generate_series(1,100) i;
analyze stts_t1;
\dX
- List of extended statistics
- Schema | Name | Definition | Ndistinct | Dependencies | MCV
-----------+------------------------+--------------------------------------+-----------+--------------+---------
- public | func_deps_stat | a, b, c FROM functional_dependencies | | defined |
- public | mcv_lists_arrays_stats | a, b, c FROM mcv_lists_arrays | | | defined
- public | mcv_lists_bool_stats | a, b, c FROM mcv_lists_bool | | | defined
- public | mcv_lists_stats | a, b, d FROM mcv_lists | | | defined
- public | stts_1 | a, b FROM stts_t1 | defined | |
- public | stts_2 | a, b FROM stts_t1 | defined | defined |
- public | stts_3 | a, b FROM stts_t1 | defined | defined | defined
- public | stts_4 | b, c FROM stts_t2 | defined | defined | defined
- public | stts_hoge | col1, col2, col3 FROM stts_t3 | defined | defined | defined
- stts_s1 | stts_foo | col1, col2 FROM stts_t3 | defined | defined | defined
- stts_s2 | stts_yama | col1, col3 FROM stts_t3 | | defined | defined
- tststats | priv_test_stats | a, b FROM tststats.priv_test_tbl | | | defined
+ List of extended statistics
+ Schema | Name | Definition | Ndistinct | Dependencies | MCV
+----------+------------------------+----------------------------------------------------------------------------------+-----------+--------------+---------
+ public | func_deps_stat | ((a * 2)), ((b || 'X'::text)), ((c + (1)::numeric)) FROM functional_dependencies | | defined |
+ public | mcv_lists_arrays_stats | a, b, c FROM mcv_lists_arrays | | | defined
+ public | mcv_lists_bool_stats | a, b, c FROM mcv_lists_bool | | | defined
+ public | mcv_lists_stats | a, b, d FROM mcv_lists | | | defined
+ public | stts_1 | a, b FROM stts_t1 | defined | |
+ public | stts_2 | a, b FROM stts_t1 | defined | defined |
+ public | stts_3 | a, b FROM stts_t1 | defined | defined | defined
+ public | stts_4 | b, c FROM stts_t2 | defined | defined | defined
+ public | stts_hoge | col1, col2, col3 FROM stts_t3 | defined | defined | defined
+ stts_s1 | stts_foo | col1, col2 FROM stts_t3 | defined | defined | defined
+ stts_s2 | stts_yama | col1, col3 FROM stts_t3 | | defined | defined
+ tststats | priv_test_stats | a, b FROM tststats.priv_test_tbl | | | defined
(12 rows)
\dX stts_?
(1 row)
\dX+
- List of extended statistics
- Schema | Name | Definition | Ndistinct | Dependencies | MCV
-----------+------------------------+--------------------------------------+-----------+--------------+---------
- public | func_deps_stat | a, b, c FROM functional_dependencies | | defined |
- public | mcv_lists_arrays_stats | a, b, c FROM mcv_lists_arrays | | | defined
- public | mcv_lists_bool_stats | a, b, c FROM mcv_lists_bool | | | defined
- public | mcv_lists_stats | a, b, d FROM mcv_lists | | | defined
- public | stts_1 | a, b FROM stts_t1 | defined | |
- public | stts_2 | a, b FROM stts_t1 | defined | defined |
- public | stts_3 | a, b FROM stts_t1 | defined | defined | defined
- public | stts_4 | b, c FROM stts_t2 | defined | defined | defined
- public | stts_hoge | col1, col2, col3 FROM stts_t3 | defined | defined | defined
- stts_s1 | stts_foo | col1, col2 FROM stts_t3 | defined | defined | defined
- stts_s2 | stts_yama | col1, col3 FROM stts_t3 | | defined | defined
- tststats | priv_test_stats | a, b FROM tststats.priv_test_tbl | | | defined
+ List of extended statistics
+ Schema | Name | Definition | Ndistinct | Dependencies | MCV
+----------+------------------------+----------------------------------------------------------------------------------+-----------+--------------+---------
+ public | func_deps_stat | ((a * 2)), ((b || 'X'::text)), ((c + (1)::numeric)) FROM functional_dependencies | | defined |
+ public | mcv_lists_arrays_stats | a, b, c FROM mcv_lists_arrays | | | defined
+ public | mcv_lists_bool_stats | a, b, c FROM mcv_lists_bool | | | defined
+ public | mcv_lists_stats | a, b, d FROM mcv_lists | | | defined
+ public | stts_1 | a, b FROM stts_t1 | defined | |
+ public | stts_2 | a, b FROM stts_t1 | defined | defined |
+ public | stts_3 | a, b FROM stts_t1 | defined | defined | defined
+ public | stts_4 | b, c FROM stts_t2 | defined | defined | defined
+ public | stts_hoge | col1, col2, col3 FROM stts_t3 | defined | defined | defined
+ stts_s1 | stts_foo | col1, col2 FROM stts_t3 | defined | defined | defined
+ stts_s2 | stts_yama | col1, col3 FROM stts_t3 | | defined | defined
+ tststats | priv_test_stats | a, b FROM tststats.priv_test_tbl | | | defined
(12 rows)
\dX+ stts_?
create role regress_stats_ext nosuperuser;
set role regress_stats_ext;
\dX
- List of extended statistics
- Schema | Name | Definition | Ndistinct | Dependencies | MCV
-----------+------------------------+--------------------------------------+-----------+--------------+---------
- public | func_deps_stat | a, b, c FROM functional_dependencies | | defined |
- public | mcv_lists_arrays_stats | a, b, c FROM mcv_lists_arrays | | | defined
- public | mcv_lists_bool_stats | a, b, c FROM mcv_lists_bool | | | defined
- public | mcv_lists_stats | a, b, d FROM mcv_lists | | | defined
- public | stts_1 | a, b FROM stts_t1 | defined | |
- public | stts_2 | a, b FROM stts_t1 | defined | defined |
- public | stts_3 | a, b FROM stts_t1 | defined | defined | defined
- public | stts_4 | b, c FROM stts_t2 | defined | defined | defined
- public | stts_hoge | col1, col2, col3 FROM stts_t3 | defined | defined | defined
- stts_s1 | stts_foo | col1, col2 FROM stts_t3 | defined | defined | defined
- stts_s2 | stts_yama | col1, col3 FROM stts_t3 | | defined | defined
- tststats | priv_test_stats | a, b FROM tststats.priv_test_tbl | | | defined
+ List of extended statistics
+ Schema | Name | Definition | Ndistinct | Dependencies | MCV
+----------+------------------------+----------------------------------------------------------------------------------+-----------+--------------+---------
+ public | func_deps_stat | ((a * 2)), ((b || 'X'::text)), ((c + (1)::numeric)) FROM functional_dependencies | | defined |
+ public | mcv_lists_arrays_stats | a, b, c FROM mcv_lists_arrays | | | defined
+ public | mcv_lists_bool_stats | a, b, c FROM mcv_lists_bool | | | defined
+ public | mcv_lists_stats | a, b, d FROM mcv_lists | | | defined
+ public | stts_1 | a, b FROM stts_t1 | defined | |
+ public | stts_2 | a, b FROM stts_t1 | defined | defined |
+ public | stts_3 | a, b FROM stts_t1 | defined | defined | defined
+ public | stts_4 | b, c FROM stts_t2 | defined | defined | defined
+ public | stts_hoge | col1, col2, col3 FROM stts_t3 | defined | defined | defined
+ stts_s1 | stts_foo | col1, col2 FROM stts_t3 | defined | defined | defined
+ stts_s2 | stts_yama | col1, col3 FROM stts_t3 | | defined | defined
+ tststats | priv_test_stats | a, b FROM tststats.priv_test_tbl | | | defined
(12 rows)
reset role;
CREATE INDEX ctlt1_b_key ON ctlt1 (b);
CREATE INDEX ctlt1_fnidx ON ctlt1 ((a || b));
CREATE STATISTICS ctlt1_a_b_stat ON a,b FROM ctlt1;
+CREATE STATISTICS ctlt1_expr_stat ON (a || b) FROM ctlt1;
COMMENT ON STATISTICS ctlt1_a_b_stat IS 'ab stats';
+COMMENT ON STATISTICS ctlt1_expr_stat IS 'ab expr stats';
COMMENT ON COLUMN ctlt1.a IS 'A';
COMMENT ON COLUMN ctlt1.b IS 'B';
COMMENT ON CONSTRAINT ctlt1_a_check ON ctlt1 IS 't1_a_check';
$$;
-- Verify failures
-CREATE TABLE ext_stats_test (x int, y int, z int);
+CREATE TABLE ext_stats_test (x text, y int, z int);
CREATE STATISTICS tst;
CREATE STATISTICS tst ON a, b;
CREATE STATISTICS tst FROM sometab;
CREATE STATISTICS tst ON a, b FROM nonexistent;
CREATE STATISTICS tst ON a, b FROM ext_stats_test;
CREATE STATISTICS tst ON x, x, y FROM ext_stats_test;
-CREATE STATISTICS tst ON x + y FROM ext_stats_test;
-CREATE STATISTICS tst ON (x, y) FROM ext_stats_test;
+CREATE STATISTICS tst ON x, x, y, x, x, y, x, x, y FROM ext_stats_test;
+CREATE STATISTICS tst ON x, x, y, x, x, (x || 'x'), (y + 1), (x || 'x'), (x || 'x'), (y + 1) FROM ext_stats_test;
+CREATE STATISTICS tst ON (x || 'x'), (x || 'x'), (y + 1), (x || 'x'), (x || 'x'), (y + 1), (x || 'x'), (x || 'x'), (y + 1) FROM ext_stats_test;
+CREATE STATISTICS tst ON (x || 'x'), (x || 'x'), y FROM ext_stats_test;
CREATE STATISTICS tst (unrecognized) ON x, y FROM ext_stats_test;
+-- incorrect expressions
+CREATE STATISTICS tst ON y + z FROM ext_stats_test; -- missing parentheses
+CREATE STATISTICS tst ON (x, y) FROM ext_stats_test; -- tuple expression
DROP TABLE ext_stats_test;
-- Ensure stats are dropped sanely, and test IF NOT EXISTS while at it
ANALYZE ab1;
DROP TABLE ab1 CASCADE;
+-- basic test for statistics on expressions
+CREATE TABLE ab1 (a INTEGER, b INTEGER, c TIMESTAMP, d TIMESTAMPTZ);
+
+-- expression stats may be built on a single expression column
+CREATE STATISTICS ab1_exprstat_1 ON (a+b) FROM ab1;
+
+-- with a single expression, we only enable expression statistics
+CREATE STATISTICS ab1_exprstat_2 ON (a+b) FROM ab1;
+SELECT stxkind FROM pg_statistic_ext WHERE stxname = 'ab1_exprstat_2';
+
+-- adding anything to the expression builds all statistics kinds
+CREATE STATISTICS ab1_exprstat_3 ON (a+b), a FROM ab1;
+SELECT stxkind FROM pg_statistic_ext WHERE stxname = 'ab1_exprstat_3';
+
+-- date_trunc on timestamptz is not immutable, but that should not matter
+CREATE STATISTICS ab1_exprstat_4 ON date_trunc('day', d) FROM ab1;
+
+-- date_trunc on timestamp is immutable
+CREATE STATISTICS ab1_exprstat_5 ON date_trunc('day', c) FROM ab1;
+
+-- insert some data and run analyze, to test that these cases build properly
+INSERT INTO ab1
+SELECT
+ generate_series(1,10),
+ generate_series(1,10),
+ generate_series('2020-10-01'::timestamp, '2020-10-10'::timestamp, interval '1 day'),
+ generate_series('2020-10-01'::timestamptz, '2020-10-10'::timestamptz, interval '1 day');
+ANALYZE ab1;
+DROP TABLE ab1;
+
-- Verify supported object types for extended statistics
CREATE schema tststats;
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (a+1)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100), (2*c)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (a+1), (b+100)');
+
-- correct command
CREATE STATISTICS s10 ON a, b, c FROM ndistinct;
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c');
+-- partial improvement (match on attributes)
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (a+1)');
+
+-- expressions - no improvement
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100), (2*c)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (a+1), (b+100)');
+
-- last two plans keep using Group Aggregate, because 'd' is not covered
-- by the statistic and while it's NULL-only we assume 200 values for it
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d');
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, d');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (a+1)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100), (2*c)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (a+1), (b+100)');
+
DROP STATISTICS s10;
SELECT s.stxkind, d.stxdndistinct
SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, d');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (a+1)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100), (2*c)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (a+1), (b+100)');
+
+-- ndistinct estimates with statistics on expressions
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100), (2*c)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (a+1), (b+100)');
+
+CREATE STATISTICS s10 (ndistinct) ON (a+1), (b+100), (2*c) FROM ndistinct;
+
+ANALYZE ndistinct;
+
+SELECT s.stxkind, d.stxdndistinct
+ FROM pg_statistic_ext s, pg_statistic_ext_data d
+ WHERE s.stxrelid = 'ndistinct'::regclass
+ AND d.stxoid = s.oid;
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100), (2*c)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (a+1), (b+100)');
+
+DROP STATISTICS s10;
+
+-- a mix of attributes and expressions
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (2*c)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (2*c)');
+
+CREATE STATISTICS s10 (ndistinct) ON a, b, (2*c) FROM ndistinct;
+
+ANALYZE ndistinct;
+
+SELECT s.stxkind, d.stxdndistinct
+ FROM pg_statistic_ext s, pg_statistic_ext_data d
+ WHERE s.stxrelid = 'ndistinct'::regclass
+ AND d.stxoid = s.oid;
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (2*c)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (2*c)');
+
+DROP STATISTICS s10;
+
+-- combination of multiple ndistinct statistics, with/without expressions
+TRUNCATE ndistinct;
+
+-- two mostly independent groups of columns
+INSERT INTO ndistinct (a, b, c, d)
+ SELECT mod(i,3), mod(i,9), mod(i,5), mod(i,20)
+ FROM generate_series(1,1000) s(i);
+
+ANALYZE ndistinct;
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), b');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1), c');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (c*10)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1), c, (d - 1)');
+
+-- basic statistics on both attributes (no expressions)
+CREATE STATISTICS s11 (ndistinct) ON a, b FROM ndistinct;
+
+CREATE STATISTICS s12 (ndistinct) ON c, d FROM ndistinct;
+
+ANALYZE ndistinct;
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), b');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1), c');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (c*10)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1), c, (d - 1)');
+
+
+-- replace the second statistics by statistics on expressions
+
+DROP STATISTICS s12;
+
+CREATE STATISTICS s12 (ndistinct) ON (c * 10), (d - 1) FROM ndistinct;
+
+ANALYZE ndistinct;
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), b');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1), c');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (c*10)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1), c, (d - 1)');
+
+
+-- replace the second statistics by statistics on both attributes and expressions
+
+DROP STATISTICS s12;
+
+CREATE STATISTICS s12 (ndistinct) ON c, d, (c * 10), (d - 1) FROM ndistinct;
+
+ANALYZE ndistinct;
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), b');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1), c');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (c*10)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1), c, (d - 1)');
+
+
+-- replace the other statistics by statistics on both attributes and expressions
+
+DROP STATISTICS s11;
+
+CREATE STATISTICS s11 (ndistinct) ON a, b, (a*5), (b+1) FROM ndistinct;
+
+ANALYZE ndistinct;
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), b');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1), c');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (c*10)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1), c, (d - 1)');
+
+
+-- replace statistics by somewhat overlapping ones (this expected to get worse estimate
+-- because the first statistics shall be applied to 3 columns, and the second one can't
+-- be really applied)
+
+DROP STATISTICS s11;
+DROP STATISTICS s12;
+
+CREATE STATISTICS s11 (ndistinct) ON a, b, (a*5), (b+1) FROM ndistinct;
+CREATE STATISTICS s12 (ndistinct) ON a, (b+1), (c * 10) FROM ndistinct;
+
+ANALYZE ndistinct;
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), b');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1), c');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (c*10)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1), c, (d - 1)');
+
+DROP STATISTICS s11;
+DROP STATISTICS s12;
+
-- functional dependencies tests
CREATE TABLE functional_dependencies (
filler1 TEXT,
TRUNCATE functional_dependencies;
DROP STATISTICS func_deps_stat;
+-- now do the same thing, but with expressions
+INSERT INTO functional_dependencies (a, b, c, filler1)
+ SELECT i, i, i, i FROM generate_series(1,5000) s(i);
+
+ANALYZE functional_dependencies;
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE mod(a, 11) = 1 AND mod(b::int, 13) = 1');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE mod(a, 11) = 1 AND mod(b::int, 13) = 1 AND mod(c, 7) = 1');
+
+-- create statistics
+CREATE STATISTICS func_deps_stat (dependencies) ON (mod(a,11)), (mod(b::int, 13)), (mod(c, 7)) FROM functional_dependencies;
+
+ANALYZE functional_dependencies;
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE mod(a, 11) = 1 AND mod(b::int, 13) = 1');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE mod(a, 11) = 1 AND mod(b::int, 13) = 1 AND mod(c, 7) = 1');
+
+-- a => b, a => c, b => c
+TRUNCATE functional_dependencies;
+DROP STATISTICS func_deps_stat;
+
INSERT INTO functional_dependencies (a, b, c, filler1)
SELECT mod(i,100), mod(i,50), mod(i,25), i FROM generate_series(1,5000) s(i);
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 51, 52) AND b = ALL (ARRAY[''1'', ''2''])');
--- changing the type of column c causes its single-column stats to be dropped,
--- giving a default estimate of 0.005 * 5000 = 25 for (c = 1); check multiple
--- clauses estimated with functional dependencies does not exceed this
+-- changing the type of column c causes all its stats to be dropped, reverting
+-- to default estimates without any statistics, i.e. 0.5% selectivity for each
+-- condition
ALTER TABLE functional_dependencies ALTER COLUMN c TYPE numeric;
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1'' AND c = 1');
SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1'' AND c = 1');
+DROP STATISTICS func_deps_stat;
+
+-- now try functional dependencies with expressions
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = 2 AND (b || ''X'') = ''1X''');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = 2 AND (b || ''X'') = ''1X'' AND (c + 1) = 2');
+
+-- IN
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') = ''1X''');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') IN (''1X'', ''2X'')');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 102, 104) AND (b || ''X'') IN (''1X'', ''2X'')');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 102, 104) AND (b || ''X'') = ''1X''');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 52, 102, 152) AND (b || ''X'') IN (''1X'', ''26X'') AND (c + 1) = 2');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 52, 102, 152) AND (b || ''X'') IN (''1X'', ''26X'') AND (c + 1) IN (2)');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 52, 54, 102, 104, 152, 154) AND (b || ''X'') IN (''1X'', ''2X'', ''26X'', ''27X'') AND (c + 1) IN (2, 3)');
+
+-- OR clauses referencing the same attribute
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (a * 2) = 102) AND (b || ''X'') = ''1X''');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (a * 2) = 102) AND ((b || ''X'') = ''1X'' OR (b || ''X'') = ''2X'')');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (a * 2) = 4 OR (a * 2) = 102 OR (a * 2) = 104) AND ((b || ''X'') = ''1X'' OR (b || ''X'') = ''2X'')');
+
+-- OR clauses referencing different attributes
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (b || ''X'') = ''1X'') AND (b || ''X'') = ''1X''');
+
+-- ANY
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 102]) AND (b || ''X'') = ''1X''');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 102]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''2X''])');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 4, 102, 104]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''2X''])');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 52, 102, 152]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''26X'']) AND (c + 1) = 2');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 52, 102, 152]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''26X'']) AND (c + 1) = ANY (ARRAY[2])');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 4, 52, 54, 102, 104, 152, 154]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''2X'', ''26X'', ''27X'']) AND (c + 1) = ANY (ARRAY[2, 3])');
+
+-- ANY with inequalities should not benefit from functional dependencies
+-- the estimates however improve thanks to having expression statistics
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) < ANY (ARRAY[2, 102]) AND (b || ''X'') > ''1X''');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) >= ANY (ARRAY[2, 102]) AND (b || ''X'') <= ANY (ARRAY[''1X'', ''2X''])');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) <= ANY (ARRAY[2, 4, 102, 104]) AND (b || ''X'') >= ANY (ARRAY[''1X'', ''2X''])');
+
+-- ALL (should not benefit from functional dependencies)
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') = ALL (ARRAY[''1X''])');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') = ALL (ARRAY[''1X'', ''2X''])');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 102, 104) AND (b || ''X'') = ALL (ARRAY[''1X'', ''2X''])');
+
+-- create statistics on expressions
+CREATE STATISTICS func_deps_stat (dependencies) ON (a * 2), (b || 'X'), (c + 1) FROM functional_dependencies;
+
+ANALYZE functional_dependencies;
+
+-- print the detected dependencies
+SELECT dependencies FROM pg_stats_ext WHERE statistics_name = 'func_deps_stat';
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = 2 AND (b || ''X'') = ''1X''');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = 2 AND (b || ''X'') = ''1X'' AND (c + 1) = 2');
+
+-- IN
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') = ''1X''');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') IN (''1X'', ''2X'')');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 102, 104) AND (b || ''X'') IN (''1X'', ''2X'')');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 102, 104) AND (b || ''X'') = ''1X''');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 52, 102, 152) AND (b || ''X'') IN (''1X'', ''26X'') AND (c + 1) = 2');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 52, 102, 152) AND (b || ''X'') IN (''1X'', ''26X'') AND (c + 1) IN (2)');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 52, 54, 102, 104, 152, 154) AND (b || ''X'') IN (''1X'', ''2X'', ''26X'', ''27X'') AND (c + 1) IN (2, 3)');
+
+-- OR clauses referencing the same attribute
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (a * 2) = 102) AND (b || ''X'') = ''1X''');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (a * 2) = 102) AND ((b || ''X'') = ''1X'' OR (b || ''X'') = ''2X'')');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (a * 2) = 4 OR (a * 2) = 102 OR (a * 2) = 104) AND ((b || ''X'') = ''1X'' OR (b || ''X'') = ''2X'')');
+
+-- OR clauses referencing different attributes
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (b || ''X'') = ''1X'') AND (b || ''X'') = ''1X''');
+
+-- ANY
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 102]) AND (b || ''X'') = ''1X''');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 102]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''2X''])');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 4, 102, 104]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''2X''])');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 52, 102, 152]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''26X'']) AND (c + 1) = 2');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 52, 102, 152]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''26X'']) AND (c + 1) = ANY (ARRAY[2])');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 4, 52, 54, 102, 104, 152, 154]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''2X'', ''26X'', ''27X'']) AND (c + 1) = ANY (ARRAY[2, 3])');
+
+-- ANY with inequalities should not benefit from functional dependencies
+-- the estimates however improve thanks to having expression statistics
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) < ANY (ARRAY[2, 102]) AND (b || ''X'') > ''1X''');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) >= ANY (ARRAY[2, 102]) AND (b || ''X'') <= ANY (ARRAY[''1X'', ''2X''])');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) <= ANY (ARRAY[2, 4, 102, 104]) AND (b || ''X'') >= ANY (ARRAY[''1X'', ''2X''])');
+
+-- ALL (should not benefit from functional dependencies)
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') = ALL (ARRAY[''1X''])');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') = ALL (ARRAY[''1X'', ''2X''])');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 102, 104) AND (b || ''X'') = ALL (ARRAY[''1X'', ''2X''])');
+
-- check the ability to use multiple functional dependencies
CREATE TABLE functional_dependencies_multi (
a INTEGER,
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 AND b = ''1'' AND c = 1');
+TRUNCATE mcv_lists;
+DROP STATISTICS mcv_lists_stats;
+
+-- random data (no MCV list), but with expression
+INSERT INTO mcv_lists (a, b, c, filler1)
+ SELECT i, i, i, i FROM generate_series(1,1000) s(i);
+
+ANALYZE mcv_lists;
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,7) = 1 AND mod(b::int,11) = 1');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,7) = 1 AND mod(b::int,11) = 1 AND mod(c,13) = 1');
+
+-- create statistics
+CREATE STATISTICS mcv_lists_stats (mcv) ON (mod(a,7)), (mod(b::int,11)), (mod(c,13)) FROM mcv_lists;
+
+ANALYZE mcv_lists;
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,7) = 1 AND mod(b::int,11) = 1');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,7) = 1 AND mod(b::int,11) = 1 AND mod(c,13) = 1');
+
-- 100 distinct combinations, all in the MCV list
TRUNCATE mcv_lists;
DROP STATISTICS mcv_lists_stats;
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 OR b = ''1'' OR c = 1 OR d IS NOT NULL');
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 OR b = ''1'' OR c = 1 OR d IS NOT NULL');
+
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a IN (1, 2, 51, 52) AND b IN ( ''1'', ''2'')');
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a IN (1, 2, 51, 52, NULL) AND b IN ( ''1'', ''2'', NULL)');
SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 AND b = ''1''');
+
+-- 100 distinct combinations, all in the MCV list, but with expressions
+TRUNCATE mcv_lists;
+DROP STATISTICS mcv_lists_stats;
+
+INSERT INTO mcv_lists (a, b, c, filler1)
+ SELECT i, i, i, i FROM generate_series(1,1000) s(i);
+
+ANALYZE mcv_lists;
+
+-- without any stats on the expressions, we have to use default selectivities, which
+-- is why the estimates here are different from the pre-computed case above
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 AND mod(b::int,10) = 1');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 1 = mod(a,20) AND 1 = mod(b::int,10)');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) < 1 AND mod(b::int,10) < 1');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 1 > mod(a,20) AND 1 > mod(b::int,10)');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 AND mod(b::int,10) = 1 AND mod(c,5) = 1');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 OR mod(b::int,10) = 1 OR mod(c,25) = 1 OR d IS NOT NULL');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) IN (1, 2, 51, 52, NULL) AND mod(b::int,10) IN ( 1, 2, NULL)');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = ANY (ARRAY[1, 2, 51, 52]) AND mod(b::int,10) = ANY (ARRAY[1, 2])');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) <= ANY (ARRAY[1, NULL, 2, 3]) AND mod(b::int,10) IN (1, 2, NULL, 3)');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) < ALL (ARRAY[4, 5]) AND mod(b::int,10) IN (1, 2, 3) AND mod(c,5) > ANY (ARRAY[1, 2, 3])');
+
+-- create statistics with expressions only (we create three separate stats, in order not to build more complex extended stats)
+CREATE STATISTICS mcv_lists_stats_1 ON (mod(a,20)) FROM mcv_lists;
+CREATE STATISTICS mcv_lists_stats_2 ON (mod(b::int,10)) FROM mcv_lists;
+CREATE STATISTICS mcv_lists_stats_3 ON (mod(c,5)) FROM mcv_lists;
+
+ANALYZE mcv_lists;
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 AND mod(b::int,10) = 1');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 1 = mod(a,20) AND 1 = mod(b::int,10)');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) < 1 AND mod(b::int,10) < 1');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 1 > mod(a,20) AND 1 > mod(b::int,10)');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 AND mod(b::int,10) = 1 AND mod(c,5) = 1');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 OR mod(b::int,10) = 1 OR mod(c,25) = 1 OR d IS NOT NULL');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) IN (1, 2, 51, 52, NULL) AND mod(b::int,10) IN ( 1, 2, NULL)');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = ANY (ARRAY[1, 2, 51, 52]) AND mod(b::int,10) = ANY (ARRAY[1, 2])');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) <= ANY (ARRAY[1, NULL, 2, 3]) AND mod(b::int,10) IN (1, 2, NULL, 3)');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) < ALL (ARRAY[4, 5]) AND mod(b::int,10) IN (1, 2, 3) AND mod(c,5) > ANY (ARRAY[1, 2, 3])');
+
+DROP STATISTICS mcv_lists_stats_1;
+DROP STATISTICS mcv_lists_stats_2;
+DROP STATISTICS mcv_lists_stats_3;
+
+-- create statistics with both MCV and expressions
+CREATE STATISTICS mcv_lists_stats (mcv) ON (mod(a,20)), (mod(b::int,10)), (mod(c,5)) FROM mcv_lists;
+
+ANALYZE mcv_lists;
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 AND mod(b::int,10) = 1');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 1 = mod(a,20) AND 1 = mod(b::int,10)');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) < 1 AND mod(b::int,10) < 1');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 1 > mod(a,20) AND 1 > mod(b::int,10)');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 AND mod(b::int,10) = 1 AND mod(c,5) = 1');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 OR mod(b::int,10) = 1 OR mod(c,25) = 1 OR d IS NOT NULL');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) IN (1, 2, 51, 52, NULL) AND mod(b::int,10) IN ( 1, 2, NULL)');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = ANY (ARRAY[1, 2, 51, 52]) AND mod(b::int,10) = ANY (ARRAY[1, 2])');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) <= ANY (ARRAY[1, NULL, 2, 3]) AND mod(b::int,10) IN (1, 2, NULL, 3)');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) < ALL (ARRAY[4, 5]) AND mod(b::int,10) IN (1, 2, 3) AND mod(c,5) > ANY (ARRAY[1, 2, 3])');
+
+-- we can't use the statistic for OR clauses that are not fully covered (missing 'd' attribute)
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 OR mod(b::int,10) = 1 OR mod(c,5) = 1 OR d IS NOT NULL');
+
-- 100 distinct combinations with NULL values, all in the MCV list
TRUNCATE mcv_lists;
DROP STATISTICS mcv_lists_stats;
DROP TABLE mcv_lists_multi;
+
+-- statistics on integer expressions
+CREATE TABLE expr_stats (a int, b int, c int);
+INSERT INTO expr_stats SELECT mod(i,10), mod(i,10), mod(i,10) FROM generate_series(1,1000) s(i);
+ANALYZE expr_stats;
+
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE (2*a) = 0 AND (3*b) = 0');
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE (a+b) = 0 AND (a-b) = 0');
+
+CREATE STATISTICS expr_stats_1 (mcv) ON (a+b), (a-b), (2*a), (3*b) FROM expr_stats;
+ANALYZE expr_stats;
+
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE (2*a) = 0 AND (3*b) = 0');
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE (a+b) = 0 AND (a-b) = 0');
+
+DROP STATISTICS expr_stats_1;
+DROP TABLE expr_stats;
+
+-- statistics on a mix columns and expressions
+CREATE TABLE expr_stats (a int, b int, c int);
+INSERT INTO expr_stats SELECT mod(i,10), mod(i,10), mod(i,10) FROM generate_series(1,1000) s(i);
+ANALYZE expr_stats;
+
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 0 AND (2*a) = 0 AND (3*b) = 0');
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 3 AND b = 3 AND (a-b) = 0');
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 0 AND b = 1 AND (a-b) = 0');
+
+CREATE STATISTICS expr_stats_1 (mcv) ON a, b, (2*a), (3*b), (a+b), (a-b) FROM expr_stats;
+ANALYZE expr_stats;
+
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 0 AND (2*a) = 0 AND (3*b) = 0');
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 3 AND b = 3 AND (a-b) = 0');
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 0 AND b = 1 AND (a-b) = 0');
+
+DROP TABLE expr_stats;
+
+-- statistics on expressions with different data types
+CREATE TABLE expr_stats (a int, b name, c text);
+INSERT INTO expr_stats SELECT mod(i,10), md5(mod(i,10)::text), md5(mod(i,10)::text) FROM generate_series(1,1000) s(i);
+ANALYZE expr_stats;
+
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 0 AND (b || c) <= ''z'' AND (c || b) >= ''0''');
+
+CREATE STATISTICS expr_stats_1 (mcv) ON a, b, (b || c), (c || b) FROM expr_stats;
+ANALYZE expr_stats;
+
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 0 AND (b || c) <= ''z'' AND (c || b) >= ''0''');
+
+DROP TABLE expr_stats;
+
+
-- Permission tests. Users should not be able to see specific data values in
-- the extended statistics, if they lack permission to see those values in
-- the underlying table.