Extended statistics on expressions

author Tomas Vondra <[email protected]>

Fri, 26 Mar 2021 22:22:01 +0000 (23:22 +0100)

committer Tomas Vondra <[email protected]>

Fri, 26 Mar 2021 23:01:11 +0000 (00:01 +0100)
author Tomas Vondra <[email protected]>
Fri, 26 Mar 2021 22:22:01 +0000 (23:22 +0100)
committer Tomas Vondra <[email protected]>
Fri, 26 Mar 2021 23:01:11 +0000 (00:01 +0100)
diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml

index 0f8703af5a585a8865d26202e4fc42e341ecc00b..f103d914a62b90097248bce456d141693c294f32 100644 (file)
--- a/doc/src/sgml/catalogs.sgml
+++ b/doc/src/sgml/catalogs.sgml
@@ -7385,8 +7385,22 @@ SCRAM-SHA-256$<replaceable>&lt;iteration count&gt;</replaceable>:<replaceable>&l
         <literal>d</literal> for n-distinct statistics,
         <literal>f</literal> for functional dependency statistics, and
         <literal>m</literal> for most common values (MCV) list statistics
+       <literal>e</literal> for expression statistics
        </para></entry>
       </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>stxexprs</structfield> <type>pg_node_tree</type>
+      </para>
+      <para>
+       Expression trees (in <function>nodeToString()</function>
+       representation) for statistics object attributes that are not simple
+       column references.  This is a list with one element per expression.
+       Null if all statistics object attributes are simple references.
+      </para></entry>
+     </row>
+
      </tbody>
     </tgroup>
    </table>
@@ -7452,7 +7466,7 @@ SCRAM-SHA-256$<replaceable>&lt;iteration count&gt;</replaceable>:<replaceable>&l
         (references <link linkend="catalog-pg-statistic-ext"><structname>pg_statistic_ext</structname></link>.<structfield>oid</structfield>)
        </para>
        <para>
-       Extended statistic object containing the definition for this data
+       Extended statistics object containing the definition for this data
        </para></entry>
       </row>
  
@@ -7484,6 +7498,15 @@ SCRAM-SHA-256$<replaceable>&lt;iteration count&gt;</replaceable>:<replaceable>&l
         <structname>pg_mcv_list</structname> type
        </para></entry>
       </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>stxexprs</structfield> <type>pg_node_tree</type>
+      </para>
+      <para>
+       A list of any expressions covered by this statistics object.
+      </para></entry>
+     </row>
      </tbody>
     </tgroup>
    </table>
@@ -7637,6 +7660,16 @@ SCRAM-SHA-256$<replaceable>&lt;iteration count&gt;</replaceable>:<replaceable>&l
         see <xref linkend="logical-replication-publication"/>.
        </para></entry>
       </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>stxdexpr</structfield> <type>pg_statistic[]</type>
+      </para>
+      <para>
+       Per-expression statistics, serialized as an array of
+       <structname>pg_statistic</structname> type
+      </para></entry>
+     </row>
      </tbody>
     </tgroup>
    </table>
@@ -9444,6 +9477,11 @@ SCRAM-SHA-256$<replaceable>&lt;iteration count&gt;</replaceable>:<replaceable>&l
        <entry>extended planner statistics</entry>
       </row>
  
+     <row>
+      <entry><link linkend="view-pg-stats-ext-exprs"><structname>pg_stats_ext_exprs</structname></link></entry>
+      <entry>extended planner statistics for expressions</entry>
+     </row>
+
       <row>
        <entry><link linkend="view-pg-tables"><structname>pg_tables</structname></link></entry>
        <entry>tables</entry>
@@ -12696,10 +12734,19 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx
         (references <link linkend="catalog-pg-attribute"><structname>pg_attribute</structname></link>.<structfield>attname</structfield>)
        </para>
        <para>
-       Name of the column described by this row
+       Names of the columns included in the extended statistics object
        </para></entry>
       </row>
  
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>exprs</structfield> <type>text[]</type>
+      </para>
+      <para>
+       Expressions included in the extended statistics object
+      </para></entry>
+      </row>
+
       <row>
        <entry role="catalog_table_entry"><para role="column_definition">
         <structfield>inherited</structfield> <type>bool</type>
@@ -12851,7 +12898,8 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx
  
    <para>
     The view <structname>pg_stats_ext</structname> provides access to
-   the information stored in the <link
+   information about each extended statistics object in the database,
+   combining information stored in the <link
     linkend="catalog-pg-statistic-ext"><structname>pg_statistic_ext</structname></link>
     and <link linkend="catalog-pg-statistic-ext-data"><structname>pg_statistic_ext_data</structname></link>
     catalogs.  This view allows access only to rows of
@@ -12908,7 +12956,7 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx
         (references <link linkend="catalog-pg-namespace"><structname>pg_namespace</structname></link>.<structfield>nspname</structfield>)
        </para>
        <para>
-       Name of schema containing extended statistic
+       Name of schema containing extended statistics object
        </para></entry>
       </row>
  
@@ -12918,7 +12966,7 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx
         (references <link linkend="catalog-pg-statistic-ext"><structname>pg_statistic_ext</structname></link>.<structfield>stxname</structfield>)
        </para>
        <para>
-       Name of extended statistics
+       Name of extended statistics object
        </para></entry>
       </row>
  
@@ -12928,7 +12976,7 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx
         (references <link linkend="catalog-pg-authid"><structname>pg_authid</structname></link>.<structfield>rolname</structfield>)
        </para>
        <para>
-       Owner of the extended statistics
+       Owner of the extended statistics object
        </para></entry>
       </row>
  
@@ -12938,7 +12986,7 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx
         (references <link linkend="catalog-pg-attribute"><structname>pg_attribute</structname></link>.<structfield>attname</structfield>)
        </para>
        <para>
-       Names of the columns the extended statistics is defined on
+       Names of the columns the extended statistics object is defined on
        </para></entry>
       </row>
  
@@ -12947,7 +12995,7 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx
         <structfield>kinds</structfield> <type>char[]</type>
        </para>
        <para>
-       Types of extended statistics enabled for this record
+       Types of extended statistics object enabled for this record
        </para></entry>
       </row>
  
@@ -13032,6 +13080,237 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx
  
   </sect1>
  
+ <sect1 id="view-pg-stats-ext-exprs">
+  <title><structname>pg_stats_ext_exprs</structname></title>
+
+  <indexterm zone="view-pg-stats-ext-exprs">
+   <primary>pg_stats_ext_exprs</primary>
+  </indexterm>
+
+  <para>
+   The view <structname>pg_stats_ext_exprs</structname> provides access to
+   information about all expressions included in extended statistics objects,
+   combining information stored in the <link
+   linkend="catalog-pg-statistic-ext"><structname>pg_statistic_ext</structname></link>
+   and <link linkend="catalog-pg-statistic-ext-data"><structname>pg_statistic_ext_data</structname></link>
+   catalogs.  This view allows access only to rows of
+   <link linkend="catalog-pg-statistic-ext"><structname>pg_statistic_ext</structname></link> and <link linkend="catalog-pg-statistic-ext-data"><structname>pg_statistic_ext_data</structname></link>
+   that correspond to tables the user has permission to read, and therefore
+   it is safe to allow public read access to this view.
+  </para>
+
+  <para>
+   <structname>pg_stats_ext_exprs</structname> is also designed to present
+   the information in a more readable format than the underlying catalogs
+   &mdash; at the cost that its schema must be extended whenever the structure
+   of statistics in <link linkend="catalog-pg-statistic"><structname>pg_statistic</structname></link> changes.
+  </para>
+
+  <table>
+   <title><structname>pg_stats_ext_exprs</structname> Columns</title>
+   <tgroup cols="1">
+    <thead>
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       Column Type
+      </para>
+      <para>
+       Description
+      </para></entry>
+     </row>
+    </thead>
+
+    <tbody>
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>schemaname</structfield> <type>name</type>
+       (references <link linkend="catalog-pg-namespace"><structname>pg_namespace</structname></link>.<structfield>nspname</structfield>)
+      </para>
+      <para>
+       Name of schema containing table
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>tablename</structfield> <type>name</type>
+       (references <link linkend="catalog-pg-class"><structname>pg_class</structname></link>.<structfield>relname</structfield>)
+      </para>
+      <para>
+       Name of table the statistics object is defined on
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>statistics_schemaname</structfield> <type>name</type>
+       (references <link linkend="catalog-pg-namespace"><structname>pg_namespace</structname></link>.<structfield>nspname</structfield>)
+      </para>
+      <para>
+       Name of schema containing extended statistics object
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>statistics_name</structfield> <type>name</type>
+       (references <link linkend="catalog-pg-statistic-ext"><structname>pg_statistic_ext</structname></link>.<structfield>stxname</structfield>)
+      </para>
+      <para>
+       Name of extended statistics object
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>statistics_owner</structfield> <type>name</type>
+       (references <link linkend="catalog-pg-authid"><structname>pg_authid</structname></link>.<structfield>rolname</structfield>)
+      </para>
+      <para>
+       Owner of the extended statistics object
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>expr</structfield> <type>text</type>
+      </para>
+      <para>
+       Expression included in the extended statistics object
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>null_frac</structfield> <type>float4</type>
+      </para>
+      <para>
+       Fraction of expression entries that are null
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>avg_width</structfield> <type>int4</type>
+      </para>
+      <para>
+       Average width in bytes of expression's entries
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>n_distinct</structfield> <type>float4</type>
+      </para>
+      <para>
+       If greater than zero, the estimated number of distinct values in the
+       expression.  If less than zero, the negative of the number of distinct
+       values divided by the number of rows.  (The negated form is used when
+       <command>ANALYZE</command> believes that the number of distinct values is
+       likely to increase as the table grows; the positive form is used when
+       the expression seems to have a fixed number of possible values.)  For
+       example, -1 indicates a unique expression in which the number of distinct
+       values is the same as the number of rows.
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>most_common_vals</structfield> <type>anyarray</type>
+      </para>
+      <para>
+       A list of the most common values in the expression. (Null if
+       no values seem to be more common than any others.)
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>most_common_freqs</structfield> <type>float4[]</type>
+      </para>
+      <para>
+       A list of the frequencies of the most common values,
+       i.e., number of occurrences of each divided by total number of rows.
+       (Null when <structfield>most_common_vals</structfield> is.)
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>histogram_bounds</structfield> <type>anyarray</type>
+      </para>
+      <para>
+       A list of values that divide the expression's values into groups of
+       approximately equal population.  The values in
+       <structfield>most_common_vals</structfield>, if present, are omitted from this
+       histogram calculation.  (This expression is null if the expression data type
+       does not have a <literal>&lt;</literal> operator or if the
+       <structfield>most_common_vals</structfield> list accounts for the entire
+       population.)
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>correlation</structfield> <type>float4</type>
+      </para>
+      <para>
+       Statistical correlation between physical row ordering and
+       logical ordering of the expression values.  This ranges from -1 to +1.
+       When the value is near -1 or +1, an index scan on the expression will
+       be estimated to be cheaper than when it is near zero, due to reduction
+       of random access to the disk.  (This expression is null if the expression's
+       data type does not have a <literal>&lt;</literal> operator.)
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>most_common_elems</structfield> <type>anyarray</type>
+      </para>
+      <para>
+       A list of non-null element values most often appearing within values of
+       the expression. (Null for scalar types.)
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>most_common_elem_freqs</structfield> <type>float4[]</type>
+      </para>
+      <para>
+       A list of the frequencies of the most common element values, i.e., the
+       fraction of rows containing at least one instance of the given value.
+       Two or three additional values follow the per-element frequencies;
+       these are the minimum and maximum of the preceding per-element
+       frequencies, and optionally the frequency of null elements.
+       (Null when <structfield>most_common_elems</structfield> is.)
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>elem_count_histogram</structfield> <type>float4[]</type>
+      </para>
+      <para>
+       A histogram of the counts of distinct non-null element values within the
+       values of the expression, followed by the average number of distinct
+       non-null elements.  (Null for scalar types.)
+      </para></entry>
+     </row>
+    </tbody>
+   </tgroup>
+  </table>
+
+  <para>
+   The maximum number of entries in the array fields can be controlled on a
+   column-by-column basis using the <link linkend="sql-altertable"><command>ALTER
+   TABLE SET STATISTICS</command></link> command, or globally by setting the
+   <xref linkend="guc-default-statistics-target"/> run-time parameter.
+  </para>
+
+ </sect1>
+
   <sect1 id="view-pg-tables">
    <title><structname>pg_tables</structname></title>
  
diff --git a/doc/src/sgml/ref/create_statistics.sgml b/doc/src/sgml/ref/create_statistics.sgml

index 4363be50c3c4d5e42bb3a0d6902efbee19690372..988f4c573ff5e86c6248c0e6bb8458cfd0d65e6f 100644 (file)
--- a/doc/src/sgml/ref/create_statistics.sgml
+++ b/doc/src/sgml/ref/create_statistics.sgml
@@ -21,9 +21,13 @@ PostgreSQL documentation
  
   <refsynopsisdiv>
  <synopsis>
+CREATE STATISTICS [ IF NOT EXISTS ] <replaceable class="parameter">statistics_name</replaceable>
+    ON ( <replaceable class="parameter">expression</replaceable> )
+    FROM <replaceable class="parameter">table_name</replaceable>
+
  CREATE STATISTICS [ IF NOT EXISTS ] <replaceable class="parameter">statistics_name</replaceable>
      [ ( <replaceable class="parameter">statistics_kind</replaceable> [, ... ] ) ]
-    ON <replaceable class="parameter">column_name</replaceable>, <replaceable class="parameter">column_name</replaceable> [, ...]
+    ON { <replaceable class="parameter">column_name</replaceable> | ( <replaceable class="parameter">expression</replaceable> ) }, { <replaceable class="parameter">column_name</replaceable> | ( <replaceable class="parameter">expression</replaceable> ) } [, ...]
      FROM <replaceable class="parameter">table_name</replaceable>
  </synopsis>
  
@@ -39,6 +43,19 @@ CREATE STATISTICS [ IF NOT EXISTS ] <replaceable class="parameter">statistics_na
     database and will be owned by the user issuing the command.
    </para>
  
+  <para>
+   The <command>CREATE STATISTICS</command> command has two basic forms. The
+   first form allows univariate statistics for a single expression to be
+   collected, providing benefits similar to an expression index without the
+   overhead of index maintenance.  This form does not allow the statistics
+   kind to be specified, since the various statistics kinds refer only to
+   multivariate statistics.  The second form of the command allows
+   multivariate statistics on multiple columns and/or expressions to be
+   collected, optionally specifying which statistics kinds to include.  This
+   form will also automatically cause univariate statistics to be collected on
+   any expressions included in the list.
+  </para>
+
    <para>
     If a schema name is given (for example, <literal>CREATE STATISTICS
     myschema.mystat ...</literal>) then the statistics object is created in the
@@ -79,14 +96,16 @@ CREATE STATISTICS [ IF NOT EXISTS ] <replaceable class="parameter">statistics_na
      <term><replaceable class="parameter">statistics_kind</replaceable></term>
      <listitem>
       <para>
-      A statistics kind to be computed in this statistics object.
+      A multivariate statistics kind to be computed in this statistics object.
        Currently supported kinds are
        <literal>ndistinct</literal>, which enables n-distinct statistics,
        <literal>dependencies</literal>, which enables functional
        dependency statistics, and <literal>mcv</literal> which enables
        most-common values lists.
        If this clause is omitted, all supported statistics kinds are
-      included in the statistics object.
+      included in the statistics object. Univariate expression statistics are
+      built automatically if the statistics definition includes any complex
+      expressions rather than just simple column references.
        For more information, see <xref linkend="planner-stats-extended"/>
        and <xref linkend="multivariate-statistics-examples"/>.
       </para>
@@ -98,8 +117,22 @@ CREATE STATISTICS [ IF NOT EXISTS ] <replaceable class="parameter">statistics_na
      <listitem>
       <para>
        The name of a table column to be covered by the computed statistics.
-      At least two column names must be given;  the order of the column names
-      is insignificant.
+      This is only allowed when building multivariate statistics.  At least
+      two column names or expressions must be specified, and their order is
+      not significant.
+     </para>
+    </listitem>
+   </varlistentry>
+
+   <varlistentry>
+    <term><replaceable class="parameter">expression</replaceable></term>
+    <listitem>
+     <para>
+      An expression to be covered by the computed statistics.  This may be
+      used to build univariate statistics on a single expression, or as part
+      of a list of multiple column names and/or expressions to build
+      multivariate statistics.  In the latter case, separate univariate
+      statistics are built automatically for each expression in the list.
       </para>
      </listitem>
     </varlistentry>
@@ -125,6 +158,13 @@ CREATE STATISTICS [ IF NOT EXISTS ] <replaceable class="parameter">statistics_na
     reading it.  Once created, however, the ownership of the statistics
     object is independent of the underlying table(s).
    </para>
+
+  <para>
+   Expression statistics are per-expression and are similar to creating an
+   index on the expression, except that they avoid the overhead of index
+   maintenance. Expression statistics are built automatically for each
+   expression in the statistics object definition.
+  </para>
   </refsect1>
  
   <refsect1 id="sql-createstatistics-examples">
@@ -196,6 +236,72 @@ EXPLAIN ANALYZE SELECT * FROM t2 WHERE (a = 1) AND (b = 2);
     in the table, allowing it to generate better estimates in both cases.
    </para>
  
+  <para>
+   Create table <structname>t3</structname> with a single timestamp column,
+   and run queries using expressions on that column.  Without extended
+   statistics, the planner has no information about the data distribution for
+   the expressions, and uses default estimates.  The planner also does not
+   realize that the value of the date truncated to the month is fully
+   determined by the value of the date truncated to the day. Then expression
+   and ndistinct statistics are built on those two expressions:
+
+<programlisting>
+CREATE TABLE t3 (
+    a   timestamp
+);
+
+INSERT INTO t3 SELECT i FROM generate_series('2020-01-01'::timestamp,
+                                             '2020-12-31'::timestamp,
+                                             '1 minute'::interval) s(i);
+
+ANALYZE t3;
+
+-- the number of matching rows will be drastically underestimated:
+EXPLAIN ANALYZE SELECT * FROM t3
+  WHERE date_trunc('month', a) = '2020-01-01'::timestamp;
+
+EXPLAIN ANALYZE SELECT * FROM t3
+  WHERE date_trunc('day', a) BETWEEN '2020-01-01'::timestamp
+                                 AND '2020-06-30'::timestamp;
+
+EXPLAIN ANALYZE SELECT date_trunc('month', a), date_trunc('day', a)
+   FROM t3 GROUP BY 1, 2;
+
+-- build ndistinct statistics on the pair of expressions (per-expression
+-- statistics are built automatically)
+CREATE STATISTICS s3 (ndistinct) ON date_trunc('month', a), date_trunc('day', a) FROM t3;
+
+ANALYZE t3;
+
+-- now the row count estimates are more accurate:
+EXPLAIN ANALYZE SELECT * FROM t3
+  WHERE date_trunc('month', a) = '2020-01-01'::timestamp;
+
+EXPLAIN ANALYZE SELECT * FROM t3
+  WHERE date_trunc('day', a) BETWEEN '2020-01-01'::timestamp
+                                 AND '2020-06-30'::timestamp;
+
+EXPLAIN ANALYZE SELECT date_trunc('month', a), date_trunc('day', a)
+   FROM t3 GROUP BY 1, 2;
+</programlisting>
+
+   Without expression and ndistinct statistics, the planner has no information
+   about the number of distinct values for the expressions, and has to rely
+   on default estimates. The equality and range conditions are assumed to have
+   0.5% selectivity, and the number of distinct values in the expression is
+   assumed to be the same as for the column (i.e. unique). This results in a
+   significant underestimate of the row count in the first two queries. Moreover,
+   the planner has no information about the relationship between the expressions,
+   so it assumes the two <literal>WHERE</literal> and <literal>GROUP BY</literal>
+   conditions are independent, and multiplies their selectivities together to
+   arrive at a severe overestimate of the group count in the aggregate query.
+   This is further exacerbated by the lack of accurate statistics for the
+   expressions, forcing the planner to use a default ndistinct estimate for the
+   expression derived from ndistinct for the column. With such statistics, the
+   planner recognizes that the conditions are correlated, and arrives at much
+   more accurate estimates.
+  </para>
+
   </refsect1>
  
   <refsect1>
diff --git a/src/backend/catalog/Makefile b/src/backend/catalog/Makefile

index 70bc2123df7bf23111cf0ca289acdb8812910a9f..e36a9602c129abbe1dd1b0b106a1cb684165ace9 100644 (file)
--- a/src/backend/catalog/Makefile
+++ b/src/backend/catalog/Makefile
@@ -49,15 +49,15 @@ include $(top_srcdir)/src/backend/common.mk
  
  # Note: the order of this list determines the order in which the catalog
  # header files are assembled into postgres.bki.  BKI_BOOTSTRAP catalogs
-# must appear first, and there are reputedly other, undocumented ordering
-# dependencies.
+# must appear first, and pg_statistic before pg_statistic_ext_data, and
+# there are reputedly other, undocumented ordering dependencies.
  CATALOG_HEADERS := \
     pg_proc.h pg_type.h pg_attribute.h pg_class.h \
     pg_attrdef.h pg_constraint.h pg_inherits.h pg_index.h pg_operator.h \
     pg_opfamily.h pg_opclass.h pg_am.h pg_amop.h pg_amproc.h \
     pg_language.h pg_largeobject_metadata.h pg_largeobject.h pg_aggregate.h \
-   pg_statistic_ext.h pg_statistic_ext_data.h \
-   pg_statistic.h pg_rewrite.h pg_trigger.h pg_event_trigger.h pg_description.h \
+   pg_statistic.h pg_statistic_ext.h pg_statistic_ext_data.h \
+   pg_rewrite.h pg_trigger.h pg_event_trigger.h pg_description.h \
     pg_cast.h pg_enum.h pg_namespace.h pg_conversion.h pg_depend.h \
     pg_database.h pg_db_role_setting.h pg_tablespace.h \
     pg_authid.h pg_auth_members.h pg_shdepend.h pg_shdescription.h \
diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql

index 0dca65dc7bb9fe920d60f84be73741bc6680662c..6483563204cfbd4242a492c2fdefd117205dbde9 100644 (file)
--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -264,6 +264,7 @@ CREATE VIEW pg_stats_ext WITH (security_barrier) AS
                    JOIN pg_attribute a
                         ON (a.attrelid = s.stxrelid AND a.attnum = k)
             ) AS attnames,
+           pg_get_statisticsobjdef_expressions(s.oid) as exprs,
             s.stxkind AS kinds,
             sd.stxdndistinct AS n_distinct,
             sd.stxddependencies AS dependencies,
@@ -290,6 +291,74 @@ CREATE VIEW pg_stats_ext WITH (security_barrier) AS
                  WHERE NOT has_column_privilege(c.oid, a.attnum, 'select') )
      AND (c.relrowsecurity = false OR NOT row_security_active(c.oid));
  
+CREATE VIEW pg_stats_ext_exprs WITH (security_barrier) AS
+    SELECT cn.nspname AS schemaname,
+           c.relname AS tablename,
+           sn.nspname AS statistics_schemaname,
+           s.stxname AS statistics_name,
+           pg_get_userbyid(s.stxowner) AS statistics_owner,
+           stat.expr,
+           (stat.a).stanullfrac AS null_frac,
+           (stat.a).stawidth AS avg_width,
+           (stat.a).stadistinct AS n_distinct,
+           (CASE
+               WHEN (stat.a).stakind1 = 1 THEN (stat.a).stavalues1
+               WHEN (stat.a).stakind2 = 1 THEN (stat.a).stavalues2
+               WHEN (stat.a).stakind3 = 1 THEN (stat.a).stavalues3
+               WHEN (stat.a).stakind4 = 1 THEN (stat.a).stavalues4
+               WHEN (stat.a).stakind5 = 1 THEN (stat.a).stavalues5
+           END) AS most_common_vals,
+           (CASE
+               WHEN (stat.a).stakind1 = 1 THEN (stat.a).stanumbers1
+               WHEN (stat.a).stakind2 = 1 THEN (stat.a).stanumbers2
+               WHEN (stat.a).stakind3 = 1 THEN (stat.a).stanumbers3
+               WHEN (stat.a).stakind4 = 1 THEN (stat.a).stanumbers4
+               WHEN (stat.a).stakind5 = 1 THEN (stat.a).stanumbers5
+           END) AS most_common_freqs,
+           (CASE
+               WHEN (stat.a).stakind1 = 2 THEN (stat.a).stavalues1
+               WHEN (stat.a).stakind2 = 2 THEN (stat.a).stavalues2
+               WHEN (stat.a).stakind3 = 2 THEN (stat.a).stavalues3
+               WHEN (stat.a).stakind4 = 2 THEN (stat.a).stavalues4
+               WHEN (stat.a).stakind5 = 2 THEN (stat.a).stavalues5
+           END) AS histogram_bounds,
+           (CASE
+               WHEN (stat.a).stakind1 = 3 THEN (stat.a).stanumbers1[1]
+               WHEN (stat.a).stakind2 = 3 THEN (stat.a).stanumbers2[1]
+               WHEN (stat.a).stakind3 = 3 THEN (stat.a).stanumbers3[1]
+               WHEN (stat.a).stakind4 = 3 THEN (stat.a).stanumbers4[1]
+               WHEN (stat.a).stakind5 = 3 THEN (stat.a).stanumbers5[1]
+           END) correlation,
+           (CASE
+               WHEN (stat.a).stakind1 = 4 THEN (stat.a).stavalues1
+               WHEN (stat.a).stakind2 = 4 THEN (stat.a).stavalues2
+               WHEN (stat.a).stakind3 = 4 THEN (stat.a).stavalues3
+               WHEN (stat.a).stakind4 = 4 THEN (stat.a).stavalues4
+               WHEN (stat.a).stakind5 = 4 THEN (stat.a).stavalues5
+           END) AS most_common_elems,
+           (CASE
+               WHEN (stat.a).stakind1 = 4 THEN (stat.a).stanumbers1
+               WHEN (stat.a).stakind2 = 4 THEN (stat.a).stanumbers2
+               WHEN (stat.a).stakind3 = 4 THEN (stat.a).stanumbers3
+               WHEN (stat.a).stakind4 = 4 THEN (stat.a).stanumbers4
+               WHEN (stat.a).stakind5 = 4 THEN (stat.a).stanumbers5
+           END) AS most_common_elem_freqs,
+           (CASE
+               WHEN (stat.a).stakind1 = 5 THEN (stat.a).stanumbers1
+               WHEN (stat.a).stakind2 = 5 THEN (stat.a).stanumbers2
+               WHEN (stat.a).stakind3 = 5 THEN (stat.a).stanumbers3
+               WHEN (stat.a).stakind4 = 5 THEN (stat.a).stanumbers4
+               WHEN (stat.a).stakind5 = 5 THEN (stat.a).stanumbers5
+           END) AS elem_count_histogram
+    FROM pg_statistic_ext s JOIN pg_class c ON (c.oid = s.stxrelid)
+         LEFT JOIN pg_statistic_ext_data sd ON (s.oid = sd.stxoid)
+         LEFT JOIN pg_namespace cn ON (cn.oid = c.relnamespace)
+         LEFT JOIN pg_namespace sn ON (sn.oid = s.stxnamespace)
+         JOIN LATERAL (
+             SELECT unnest(pg_get_statisticsobjdef_expressions(s.oid)) AS expr,
+                    unnest(sd.stxdexpr)::pg_statistic AS a
+         ) stat ON (stat.expr IS NOT NULL);
+
  -- unprivileged users may read pg_statistic_ext but not pg_statistic_ext_data
  REVOKE ALL on pg_statistic_ext_data FROM public;
  
diff --git a/src/backend/commands/statscmds.c b/src/backend/commands/statscmds.c

index 2bae205845992abbc87c4a7cddc4994c35ccb189..df4768952d5b8bd12c02fc25b7f1c018b110daa9 100644 (file)
--- a/src/backend/commands/statscmds.c
+++ b/src/backend/commands/statscmds.c
@@ -29,6 +29,8 @@
  #include "commands/comment.h"
  #include "commands/defrem.h"
  #include "miscadmin.h"
+#include "nodes/nodeFuncs.h"
+#include "optimizer/optimizer.h"
  #include "statistics/statistics.h"
  #include "utils/builtins.h"
  #include "utils/fmgroids.h"
@@ -62,7 +64,8 @@ ObjectAddress
  CreateStatistics(CreateStatsStmt *stmt)
  {
     int16       attnums[STATS_MAX_DIMENSIONS];
-   int         numcols = 0;
+   int         nattnums = 0;
+   int         numcols;
     char       *namestr;
     NameData    stxname;
     Oid         statoid;
@@ -74,21 +77,25 @@ CreateStatistics(CreateStatsStmt *stmt)
     Datum       datavalues[Natts_pg_statistic_ext_data];
     bool        datanulls[Natts_pg_statistic_ext_data];
     int2vector *stxkeys;
+   List       *stxexprs = NIL;
+   Datum       exprsDatum;
     Relation    statrel;
     Relation    datarel;
     Relation    rel = NULL;
     Oid         relid;
     ObjectAddress parentobject,
                 myself;
-   Datum       types[3];       /* one for each possible type of statistic */
+   Datum       types[4];       /* one for each possible type of statistic */
     int         ntypes;
     ArrayType  *stxkind;
     bool        build_ndistinct;
     bool        build_dependencies;
     bool        build_mcv;
+   bool        build_expressions;
     bool        requested_type = false;
     int         i;
     ListCell   *cell;
+   ListCell   *cell2;
  
     Assert(IsA(stmt, CreateStatsStmt));
  
@@ -190,101 +197,124 @@ CreateStatistics(CreateStatsStmt *stmt)
     }
  
     /*
-    * Currently, we only allow simple column references in the expression
-    * list.  That will change someday, and again the grammar already supports
-    * it so we have to enforce restrictions here.  For now, we can convert
-    * the expression list to a simple array of attnums.  While at it, enforce
-    * some constraints.
+    * Make sure no more than STATS_MAX_DIMENSIONS columns are used. There
+    * might be duplicates and so on, but we'll deal with those later.
+    */
+   numcols = list_length(stmt->exprs);
+   if (numcols > STATS_MAX_DIMENSIONS)
+       ereport(ERROR,
+               (errcode(ERRCODE_TOO_MANY_COLUMNS),
+                errmsg("cannot have more than %d columns in statistics",
+                       STATS_MAX_DIMENSIONS)));
+
+   /*
+    * Convert the expression list to a simple array of attnums, but also keep
+    * a list of more complex expressions.  While at it, enforce some
+    * constraints.
+    *
+    * XXX We do only the bare minimum to separate simple attribute and
+    * complex expressions - for example "(a)" will be treated as a complex
+    * expression. No matter how elaborate the check is, there'll always be a
+    * way around it, if the user is determined (consider e.g. "(a+0)"), so
+    * it's not worth protecting against it.
      */
     foreach(cell, stmt->exprs)
     {
         Node       *expr = (Node *) lfirst(cell);
-       ColumnRef  *cref;
-       char       *attname;
+       StatsElem  *selem;
         HeapTuple   atttuple;
         Form_pg_attribute attForm;
         TypeCacheEntry *type;
  
-       if (!IsA(expr, ColumnRef))
-           ereport(ERROR,
-                   (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-                    errmsg("only simple column references are allowed in CREATE STATISTICS")));
-       cref = (ColumnRef *) expr;
-
-       if (list_length(cref->fields) != 1)
-           ereport(ERROR,
-                   (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-                    errmsg("only simple column references are allowed in CREATE STATISTICS")));
-       attname = strVal((Value *) linitial(cref->fields));
-
-       atttuple = SearchSysCacheAttName(relid, attname);
-       if (!HeapTupleIsValid(atttuple))
-           ereport(ERROR,
-                   (errcode(ERRCODE_UNDEFINED_COLUMN),
-                    errmsg("column \"%s\" does not exist",
-                           attname)));
-       attForm = (Form_pg_attribute) GETSTRUCT(atttuple);
-
-       /* Disallow use of system attributes in extended stats */
-       if (attForm->attnum <= 0)
-           ereport(ERROR,
-                   (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-                    errmsg("statistics creation on system columns is not supported")));
-
-       /* Disallow data types without a less-than operator */
-       type = lookup_type_cache(attForm->atttypid, TYPECACHE_LT_OPR);
-       if (type->lt_opr == InvalidOid)
+       /*
+        * We should not get anything else than StatsElem, given the grammar.
+        * But let's keep it as a safety.
+        */
+       if (!IsA(expr, StatsElem))
             ereport(ERROR,
                     (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-                    errmsg("column \"%s\" cannot be used in statistics because its type %s has no default btree operator class",
-                           attname, format_type_be(attForm->atttypid))));
+                    errmsg("only simple column references and expressions are allowed in CREATE STATISTICS")));
  
-       /* Make sure no more than STATS_MAX_DIMENSIONS columns are used */
-       if (numcols >= STATS_MAX_DIMENSIONS)
-           ereport(ERROR,
-                   (errcode(ERRCODE_TOO_MANY_COLUMNS),
-                    errmsg("cannot have more than %d columns in statistics",
-                           STATS_MAX_DIMENSIONS)));
+       selem = (StatsElem *) expr;
  
-       attnums[numcols] = attForm->attnum;
-       numcols++;
-       ReleaseSysCache(atttuple);
+       if (selem->name)        /* column reference */
+       {
+           char       *attname;
+
+           attname = selem->name;
+
+           atttuple = SearchSysCacheAttName(relid, attname);
+           if (!HeapTupleIsValid(atttuple))
+               ereport(ERROR,
+                       (errcode(ERRCODE_UNDEFINED_COLUMN),
+                        errmsg("column \"%s\" does not exist",
+                               attname)));
+           attForm = (Form_pg_attribute) GETSTRUCT(atttuple);
+
+           /* Disallow use of system attributes in extended stats */
+           if (attForm->attnum <= 0)
+               ereport(ERROR,
+                       (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                        errmsg("statistics creation on system columns is not supported")));
+
+           /* Disallow data types without a less-than operator */
+           type = lookup_type_cache(attForm->atttypid, TYPECACHE_LT_OPR);
+           if (type->lt_opr == InvalidOid)
+               ereport(ERROR,
+                       (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                        errmsg("column \"%s\" cannot be used in statistics because its type %s has no default btree operator class",
+                               attname, format_type_be(attForm->atttypid))));
+
+           attnums[nattnums] = attForm->attnum;
+           nattnums++;
+           ReleaseSysCache(atttuple);
+       }
+       else                    /* expression */
+       {
+           Node       *expr = selem->expr;
+           Oid         atttype;
+
+           Assert(expr != NULL);
+
+           /*
+            * Disallow data types without a less-than operator.
+            *
+            * We ignore this for statistics on a single expression, in which
+            * case we'll build the regular statistics only (and that code can
+            * deal with such data types).
+            */
+           if (list_length(stmt->exprs) > 1)
+           {
+               atttype = exprType(expr);
+               type = lookup_type_cache(atttype, TYPECACHE_LT_OPR);
+               if (type->lt_opr == InvalidOid)
+                   ereport(ERROR,
+                           (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                            errmsg("expression cannot be used in multivariate statistics because its type %s has no default btree operator class",
+                                   format_type_be(atttype))));
+           }
+
+           stxexprs = lappend(stxexprs, expr);
+       }
     }
  
     /*
-    * Check that at least two columns were specified in the statement. The
-    * upper bound was already checked in the loop above.
-    */
-   if (numcols < 2)
-       ereport(ERROR,
-               (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
-                errmsg("extended statistics require at least 2 columns")));
-
-   /*
-    * Sort the attnums, which makes detecting duplicates somewhat easier, and
-    * it does not hurt (it does not affect the efficiency, unlike for
-    * indexes, for example).
-    */
-   qsort(attnums, numcols, sizeof(int16), compare_int16);
-
-   /*
-    * Check for duplicates in the list of columns. The attnums are sorted so
-    * just check consecutive elements.
+    * Parse the statistics kinds.
+    *
+    * First check that if this is the case with a single expression, there
+    * are no statistics kinds specified (we don't allow that for the simple
+    * CREATE STATISTICS form).
      */
-   for (i = 1; i < numcols; i++)
+   if ((list_length(stmt->exprs) == 1) && (list_length(stxexprs) == 1))
     {
-       if (attnums[i] == attnums[i - 1])
+       /* statistics kinds not specified */
+       if (list_length(stmt->stat_types) > 0)
             ereport(ERROR,
-                   (errcode(ERRCODE_DUPLICATE_COLUMN),
-                    errmsg("duplicate column name in statistics definition")));
+                   (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                    errmsg("when building statistics on a single expression, statistics kinds may not be specified")));
     }
  
-   /* Form an int2vector representation of the sorted column list */
-   stxkeys = buildint2vector(attnums, numcols);
-
-   /*
-    * Parse the statistics kinds.
-    */
+   /* OK, let's check that we recognize the statistics kinds. */
     build_ndistinct = false;
     build_dependencies = false;
     build_mcv = false;
@@ -313,14 +343,91 @@ CreateStatistics(CreateStatsStmt *stmt)
                      errmsg("unrecognized statistics kind \"%s\"",
                             type)));
     }
-   /* If no statistic type was specified, build them all. */
-   if (!requested_type)
+
+   /*
+    * If no statistic type was specified, build them all (but only when the
+    * statistics is defined on more than one column/expression).
+    */
+   if ((!requested_type) && (numcols >= 2))
     {
         build_ndistinct = true;
         build_dependencies = true;
         build_mcv = true;
     }
  
+   /*
+    * When there are non-trivial expressions, build the expression stats
+    * automatically. This allows calculating good estimates for stats that
+    * consider per-clause estimates (e.g. functional dependencies).
+    */
+   build_expressions = (list_length(stxexprs) > 0);
+
+   /*
+    * Check that at least two columns were specified in the statement, or
+    * that we're building statistics on a single expression.
+    */
+   if ((numcols < 2) && (list_length(stxexprs) != 1))
+       ereport(ERROR,
+               (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+                errmsg("extended statistics require at least 2 columns")));
+
+   /*
+    * Sort the attnums, which makes detecting duplicates somewhat easier, and
+    * it does not hurt (it does not matter for the contents, unlike for
+    * indexes, for example).
+    */
+   qsort(attnums, nattnums, sizeof(int16), compare_int16);
+
+   /*
+    * Check for duplicates in the list of columns. The attnums are sorted so
+    * just check consecutive elements.
+    */
+   for (i = 1; i < nattnums; i++)
+   {
+       if (attnums[i] == attnums[i - 1])
+           ereport(ERROR,
+                   (errcode(ERRCODE_DUPLICATE_COLUMN),
+                    errmsg("duplicate column name in statistics definition")));
+   }
+
+   /*
+    * Check for duplicate expressions. We do two loops, counting the
+    * occurrences of each expression. This is O(N^2) but we only allow small
+    * number of expressions and it's not executed often.
+    *
+    * XXX We don't cross-check attributes and expressions, because it does
+    * not seem worth it. In principle we could check that expressions don't
+    * contain trivial attribute references like "(a)", but the reasoning is
+    * similar to why we don't bother with extracting columns from
+    * expressions. It's either expensive or very easy to defeat for
+    * determined user, and there's no risk if we allow such statistics (the
+    * statistics is useless, but harmless).
+    */
+   foreach(cell, stxexprs)
+   {
+       Node       *expr1 = (Node *) lfirst(cell);
+       int         cnt = 0;
+
+       foreach(cell2, stxexprs)
+       {
+           Node       *expr2 = (Node *) lfirst(cell2);
+
+           if (equal(expr1, expr2))
+               cnt += 1;
+       }
+
+       /* every expression should find at least itself */
+       Assert(cnt >= 1);
+
+       if (cnt > 1)
+           ereport(ERROR,
+                   (errcode(ERRCODE_DUPLICATE_COLUMN),
+                    errmsg("duplicate expression in statistics definition")));
+   }
+
+   /* Form an int2vector representation of the sorted column list */
+   stxkeys = buildint2vector(attnums, nattnums);
+
     /* construct the char array of enabled statistic types */
     ntypes = 0;
     if (build_ndistinct)
@@ -329,9 +436,23 @@ CreateStatistics(CreateStatsStmt *stmt)
         types[ntypes++] = CharGetDatum(STATS_EXT_DEPENDENCIES);
     if (build_mcv)
         types[ntypes++] = CharGetDatum(STATS_EXT_MCV);
+   if (build_expressions)
+       types[ntypes++] = CharGetDatum(STATS_EXT_EXPRESSIONS);
     Assert(ntypes > 0 && ntypes <= lengthof(types));
     stxkind = construct_array(types, ntypes, CHAROID, 1, true, TYPALIGN_CHAR);
  
+   /* convert the expressions (if any) to a text datum */
+   if (stxexprs != NIL)
+   {
+       char       *exprsString;
+
+       exprsString = nodeToString(stxexprs);
+       exprsDatum = CStringGetTextDatum(exprsString);
+       pfree(exprsString);
+   }
+   else
+       exprsDatum = (Datum) 0;
+
     statrel = table_open(StatisticExtRelationId, RowExclusiveLock);
  
     /*
@@ -351,6 +472,10 @@ CreateStatistics(CreateStatsStmt *stmt)
     values[Anum_pg_statistic_ext_stxkeys - 1] = PointerGetDatum(stxkeys);
     values[Anum_pg_statistic_ext_stxkind - 1] = PointerGetDatum(stxkind);
  
+   values[Anum_pg_statistic_ext_stxexprs - 1] = exprsDatum;
+   if (exprsDatum == (Datum) 0)
+       nulls[Anum_pg_statistic_ext_stxexprs - 1] = true;
+
     /* insert it into pg_statistic_ext */
     htup = heap_form_tuple(statrel->rd_att, values, nulls);
     CatalogTupleInsert(statrel, htup);
@@ -373,6 +498,7 @@ CreateStatistics(CreateStatsStmt *stmt)
     datanulls[Anum_pg_statistic_ext_data_stxdndistinct - 1] = true;
     datanulls[Anum_pg_statistic_ext_data_stxddependencies - 1] = true;
     datanulls[Anum_pg_statistic_ext_data_stxdmcv - 1] = true;
+   datanulls[Anum_pg_statistic_ext_data_stxdexpr - 1] = true;
  
     /* insert it into pg_statistic_ext_data */
     htup = heap_form_tuple(datarel->rd_att, datavalues, datanulls);
@@ -396,12 +522,41 @@ CreateStatistics(CreateStatsStmt *stmt)
      */
     ObjectAddressSet(myself, StatisticExtRelationId, statoid);
  
-   for (i = 0; i < numcols; i++)
+   /* add dependencies for plain column references */
+   for (i = 0; i < nattnums; i++)
     {
         ObjectAddressSubSet(parentobject, RelationRelationId, relid, attnums[i]);
         recordDependencyOn(&myself, &parentobject, DEPENDENCY_AUTO);
     }
  
+   /*
+    * If there are no dependencies on a column, give the statistics an auto
+    * dependency on the whole table.  In most cases, this will be redundant,
+    * but it might not be if the statistics expressions contain no Vars
+    * (which might seem strange but possible). This is consistent with what
+    * we do for indexes in index_create.
+    *
+    * XXX We intentionally don't consider the expressions before adding this
+    * dependency, because recordDependencyOnSingleRelExpr may not create any
+    * dependencies for whole-row Vars.
+    */
+   if (!nattnums)
+   {
+       ObjectAddressSet(parentobject, RelationRelationId, relid);
+       recordDependencyOn(&myself, &parentobject, DEPENDENCY_AUTO);
+   }
+
+   /*
+    * Store dependencies on anything mentioned in statistics expressions,
+    * just like we do for index expressions.
+    */
+   if (stxexprs)
+       recordDependencyOnSingleRelExpr(&myself,
+                                       (Node *) stxexprs,
+                                       relid,
+                                       DEPENDENCY_NORMAL,
+                                       DEPENDENCY_AUTO, false, true);
+
     /*
      * Also add dependencies on namespace and owner.  These are required
      * because the stats object might have a different namespace and/or owner
@@ -582,87 +737,6 @@ RemoveStatisticsById(Oid statsOid)
     table_close(relation, RowExclusiveLock);
  }
  
-/*
- * Update a statistics object for ALTER COLUMN TYPE on a source column.
- *
- * This could throw an error if the type change can't be supported.
- * If it can be supported, but the stats must be recomputed, a likely choice
- * would be to set the relevant column(s) of the pg_statistic_ext_data tuple
- * to null until the next ANALYZE.  (Note that the type change hasn't actually
- * happened yet, so one option that's *not* on the table is to recompute
- * immediately.)
- *
- * For both ndistinct and functional-dependencies stats, the on-disk
- * representation is independent of the source column data types, and it is
- * plausible to assume that the old statistic values will still be good for
- * the new column contents.  (Obviously, if the ALTER COLUMN TYPE has a USING
- * expression that substantially alters the semantic meaning of the column
- * values, this assumption could fail.  But that seems like a corner case
- * that doesn't justify zapping the stats in common cases.)
- *
- * For MCV lists that's not the case, as those statistics store the datums
- * internally. In this case we simply reset the statistics value to NULL.
- *
- * Note that "type change" includes collation change, which means we can rely
- * on the MCV list being consistent with the collation info in pg_attribute
- * during estimation.
- */
-void
-UpdateStatisticsForTypeChange(Oid statsOid, Oid relationOid, int attnum,
-                             Oid oldColumnType, Oid newColumnType)
-{
-   HeapTuple   stup,
-               oldtup;
-
-   Relation    rel;
-
-   Datum       values[Natts_pg_statistic_ext_data];
-   bool        nulls[Natts_pg_statistic_ext_data];
-   bool        replaces[Natts_pg_statistic_ext_data];
-
-   oldtup = SearchSysCache1(STATEXTDATASTXOID, ObjectIdGetDatum(statsOid));
-   if (!HeapTupleIsValid(oldtup))
-       elog(ERROR, "cache lookup failed for statistics object %u", statsOid);
-
-   /*
-    * When none of the defined statistics types contain datum values from the
-    * table's columns then there's no need to reset the stats. Functional
-    * dependencies and ndistinct stats should still hold true.
-    */
-   if (!statext_is_kind_built(oldtup, STATS_EXT_MCV))
-   {
-       ReleaseSysCache(oldtup);
-       return;
-   }
-
-   /*
-    * OK, we need to reset some statistics. So let's build the new tuple,
-    * replacing the affected statistics types with NULL.
-    */
-   memset(nulls, 0, Natts_pg_statistic_ext_data * sizeof(bool));
-   memset(replaces, 0, Natts_pg_statistic_ext_data * sizeof(bool));
-   memset(values, 0, Natts_pg_statistic_ext_data * sizeof(Datum));
-
-   replaces[Anum_pg_statistic_ext_data_stxdmcv - 1] = true;
-   nulls[Anum_pg_statistic_ext_data_stxdmcv - 1] = true;
-
-   rel = table_open(StatisticExtDataRelationId, RowExclusiveLock);
-
-   /* replace the old tuple */
-   stup = heap_modify_tuple(oldtup,
-                            RelationGetDescr(rel),
-                            values,
-                            nulls,
-                            replaces);
-
-   ReleaseSysCache(oldtup);
-   CatalogTupleUpdate(rel, &stup->t_self, stup);
-
-   heap_freetuple(stup);
-
-   table_close(rel, RowExclusiveLock);
-}
-
  /*
   * Select a nonconflicting name for a new statistics.
   *
@@ -731,18 +805,27 @@ ChooseExtendedStatisticNameAddition(List *exprs)
     buf[0] = '\0';
     foreach(lc, exprs)
     {
-       ColumnRef  *cref = (ColumnRef *) lfirst(lc);
+       StatsElem  *selem = (StatsElem *) lfirst(lc);
         const char *name;
  
         /* It should be one of these, but just skip if it happens not to be */
-       if (!IsA(cref, ColumnRef))
+       if (!IsA(selem, StatsElem))
             continue;
  
-       name = strVal((Value *) linitial(cref->fields));
+       name = selem->name;
  
         if (buflen > 0)
             buf[buflen++] = '_';    /* insert _ between names */
  
+       /*
+        * We use fixed 'expr' for expressions, which have empty column names.
+        * For indexes this is handled in ChooseIndexColumnNames, but we have
+        * no such function for stats and it does not seem worth adding. If a
+        * better name is needed, the user can specify it explicitly.
+        */
+       if (!name)
+           name = "expr";
+
         /*
          * At this point we have buflen <= NAMEDATALEN.  name should be less
          * than NAMEDATALEN already, but use strlcpy for paranoia.
@@ -754,3 +837,29 @@ ChooseExtendedStatisticNameAddition(List *exprs)
     }
     return pstrdup(buf);
  }
+
+/*
+ * StatisticsGetRelation: given a statistics's relation OID, get the OID of
+ * the relation it is an statistics on.  Uses the system cache.
+ */
+Oid
+StatisticsGetRelation(Oid statId, bool missing_ok)
+{
+   HeapTuple   tuple;
+   Form_pg_statistic_ext stx;
+   Oid         result;
+
+   tuple = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(statId));
+   if (!HeapTupleIsValid(tuple))
+   {
+       if (missing_ok)
+           return InvalidOid;
+       elog(ERROR, "cache lookup failed for statistics object %u", statId);
+   }
+   stx = (Form_pg_statistic_ext) GETSTRUCT(tuple);
+   Assert(stx->oid == statId);
+
+   result = stx->stxrelid;
+   ReleaseSysCache(tuple);
+   return result;
+}
diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c

index efac06f72c7465603a684813cd66c618cf5c4336..88a68a4697ad1d26a5430c9b0753b549c2c10abc 100644 (file)
--- a/src/backend/commands/tablecmds.c
+++ b/src/backend/commands/tablecmds.c
@@ -41,6 +41,7 @@
  #include "catalog/pg_namespace.h"
  #include "catalog/pg_opclass.h"
  #include "catalog/pg_tablespace.h"
+#include "catalog/pg_statistic_ext.h"
  #include "catalog/pg_trigger.h"
  #include "catalog/pg_type.h"
  #include "catalog/storage.h"
@@ -188,6 +189,8 @@ typedef struct AlteredTableInfo
     List       *changedIndexDefs;   /* string definitions of same */
     char       *replicaIdentityIndex;   /* index to reset as REPLICA IDENTITY */
     char       *clusterOnIndex; /* index to use for CLUSTER */
+   List       *changedStatisticsOids;  /* OIDs of statistics to rebuild */
+   List       *changedStatisticsDefs;  /* string definitions of same */
  } AlteredTableInfo;
  
  /* Struct describing one new constraint to check in Phase 3 scan */
@@ -440,6 +443,8 @@ static ObjectAddress ATExecDropColumn(List **wqueue, Relation rel, const char *c
                                       ObjectAddresses *addrs);
  static ObjectAddress ATExecAddIndex(AlteredTableInfo *tab, Relation rel,
                                     IndexStmt *stmt, bool is_rebuild, LOCKMODE lockmode);
+static ObjectAddress ATExecAddStatistics(AlteredTableInfo *tab, Relation rel,
+                                        CreateStatsStmt *stmt, bool is_rebuild, LOCKMODE lockmode);
  static ObjectAddress ATExecAddConstraint(List **wqueue,
                                          AlteredTableInfo *tab, Relation rel,
                                          Constraint *newConstraint, bool recurse, bool is_readd,
@@ -496,6 +501,7 @@ static ObjectAddress ATExecAlterColumnType(AlteredTableInfo *tab, Relation rel,
                                            AlterTableCmd *cmd, LOCKMODE lockmode);
  static void RememberConstraintForRebuilding(Oid conoid, AlteredTableInfo *tab);
  static void RememberIndexForRebuilding(Oid indoid, AlteredTableInfo *tab);
+static void RememberStatisticsForRebuilding(Oid indoid, AlteredTableInfo *tab);
  static void ATPostAlterTypeCleanup(List **wqueue, AlteredTableInfo *tab,
                                    LOCKMODE lockmode);
  static void ATPostAlterTypeParse(Oid oldId, Oid oldRelId, Oid refRelId,
@@ -4756,6 +4762,10 @@ ATExecCmd(List **wqueue, AlteredTableInfo *tab,
             address = ATExecAddIndex(tab, rel, (IndexStmt *) cmd->def, true,
                                      lockmode);
             break;
+       case AT_ReAddStatistics:    /* ADD STATISTICS */
+           address = ATExecAddStatistics(tab, rel, (CreateStatsStmt *) cmd->def,
+                                         true, lockmode);
+           break;
         case AT_AddConstraint:  /* ADD CONSTRAINT */
             /* Transform the command only during initial examination */
             if (cur_pass == AT_PASS_ADD_CONSTR)
@@ -8283,6 +8293,29 @@ ATExecAddIndex(AlteredTableInfo *tab, Relation rel,
     return address;
  }
  
+/*
+ * ALTER TABLE ADD STATISTICS
+ *
+ * This is no such command in the grammar, but we use this internally to add
+ * AT_ReAddStatistics subcommands to rebuild extended statistics after a table
+ * column type change.
+ */
+static ObjectAddress
+ATExecAddStatistics(AlteredTableInfo *tab, Relation rel,
+                   CreateStatsStmt *stmt, bool is_rebuild, LOCKMODE lockmode)
+{
+   ObjectAddress address;
+
+   Assert(IsA(stmt, CreateStatsStmt));
+
+   /* The CreateStatsStmt has already been through transformStatsStmt */
+   Assert(stmt->transformed);
+
+   address = CreateStatistics(stmt);
+
+   return address;
+}
+
  /*
   * ALTER TABLE ADD CONSTRAINT USING INDEX
   *
@@ -11830,9 +11863,7 @@ ATExecAlterColumnType(AlteredTableInfo *tab, Relation rel,
                  * Give the extended-stats machinery a chance to fix anything
                  * that this column type change would break.
                  */
-               UpdateStatisticsForTypeChange(foundObject.objectId,
-                                             RelationGetRelid(rel), attnum,
-                                             attTup->atttypid, targettype);
+               RememberStatisticsForRebuilding(foundObject.objectId, tab);
                 break;
  
             case OCLASS_PROC:
@@ -12202,6 +12233,32 @@ RememberIndexForRebuilding(Oid indoid, AlteredTableInfo *tab)
     }
  }
  
+/*
+ * Subroutine for ATExecAlterColumnType: remember that a statistics object
+ * needs to be rebuilt (which we might already know).
+ */
+static void
+RememberStatisticsForRebuilding(Oid stxoid, AlteredTableInfo *tab)
+{
+   /*
+    * This de-duplication check is critical for two independent reasons: we
+    * mustn't try to recreate the same statistics object twice, and if the
+    * statistics depends on more than one column whose type is to be altered,
+    * we must capture its definition string before applying any of the type
+    * changes. ruleutils.c will get confused if we ask again later.
+    */
+   if (!list_member_oid(tab->changedStatisticsOids, stxoid))
+   {
+       /* OK, capture the index's existing definition string */
+       char       *defstring = pg_get_statisticsobjdef_string(stxoid);
+
+       tab->changedStatisticsOids = lappend_oid(tab->changedStatisticsOids,
+                                                stxoid);
+       tab->changedStatisticsDefs = lappend(tab->changedStatisticsDefs,
+                                            defstring);
+   }
+}
+
  /*
   * Cleanup after we've finished all the ALTER TYPE operations for a
   * particular relation.  We have to drop and recreate all the indexes
@@ -12306,6 +12363,22 @@ ATPostAlterTypeCleanup(List **wqueue, AlteredTableInfo *tab, LOCKMODE lockmode)
         add_exact_object_address(&obj, objects);
     }
  
+   /* add dependencies for new statistics */
+   forboth(oid_item, tab->changedStatisticsOids,
+           def_item, tab->changedStatisticsDefs)
+   {
+       Oid         oldId = lfirst_oid(oid_item);
+       Oid         relid;
+
+       relid = StatisticsGetRelation(oldId, false);
+       ATPostAlterTypeParse(oldId, relid, InvalidOid,
+                            (char *) lfirst(def_item),
+                            wqueue, lockmode, tab->rewrite);
+
+       ObjectAddressSet(obj, StatisticExtRelationId, oldId);
+       add_exact_object_address(&obj, objects);
+   }
+
     /*
      * Queue up command to restore replica identity index marking
      */
@@ -12354,9 +12427,9 @@ ATPostAlterTypeCleanup(List **wqueue, AlteredTableInfo *tab, LOCKMODE lockmode)
  }
  
  /*
- * Parse the previously-saved definition string for a constraint or index
- * against the newly-established column data type(s), and queue up the
- * resulting command parsetrees for execution.
+ * Parse the previously-saved definition string for a constraint, index or
+ * statistics object against the newly-established column data type(s), and
+ * queue up the resulting command parsetrees for execution.
   *
   * This might fail if, for example, you have a WHERE clause that uses an
   * operator that's not available for the new column type.
@@ -12402,6 +12475,11 @@ ATPostAlterTypeParse(Oid oldId, Oid oldRelId, Oid refRelId, char *cmd,
             querytree_list = lappend(querytree_list, stmt);
             querytree_list = list_concat(querytree_list, afterStmts);
         }
+       else if (IsA(stmt, CreateStatsStmt))
+           querytree_list = lappend(querytree_list,
+                                    transformStatsStmt(oldRelId,
+                                                       (CreateStatsStmt *) stmt,
+                                                       cmd));
         else
             querytree_list = lappend(querytree_list, stmt);
     }
@@ -12540,6 +12618,20 @@ ATPostAlterTypeParse(Oid oldId, Oid oldRelId, Oid refRelId, char *cmd,
                 elog(ERROR, "unexpected statement subtype: %d",
                      (int) stmt->subtype);
         }
+       else if (IsA(stm, CreateStatsStmt))
+       {
+           CreateStatsStmt  *stmt = (CreateStatsStmt *) stm;
+           AlterTableCmd *newcmd;
+
+           /* keep the statistics object's comment */
+           stmt->stxcomment = GetComment(oldId, StatisticExtRelationId, 0);
+
+           newcmd = makeNode(AlterTableCmd);
+           newcmd->subtype = AT_ReAddStatistics;
+           newcmd->def = (Node *) stmt;
+           tab->subcmds[AT_PASS_MISC] =
+               lappend(tab->subcmds[AT_PASS_MISC], newcmd);
+       }
         else
             elog(ERROR, "unexpected statement type: %d",
                  (int) nodeTag(stm));
diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c

index 38b56231b7df2fb0890b17e5026d295578b5fbe5..d5b1ad4567044ecc762a414c0c607dddf69318d9 100644 (file)
--- a/src/backend/nodes/copyfuncs.c
+++ b/src/backend/nodes/copyfuncs.c
@@ -2980,6 +2980,17 @@ _copyIndexElem(const IndexElem *from)
     return newnode;
  }
  
+static StatsElem *
+_copyStatsElem(const StatsElem *from)
+{
+   StatsElem  *newnode = makeNode(StatsElem);
+
+   COPY_STRING_FIELD(name);
+   COPY_NODE_FIELD(expr);
+
+   return newnode;
+}
+
  static ColumnDef *
  _copyColumnDef(const ColumnDef *from)
  {
@@ -5699,6 +5710,9 @@ copyObjectImpl(const void *from)
         case T_IndexElem:
             retval = _copyIndexElem(from);
             break;
+       case T_StatsElem:
+           retval = _copyStatsElem(from);
+           break;
         case T_ColumnDef:
             retval = _copyColumnDef(from);
             break;
diff --git a/src/backend/nodes/equalfuncs.c b/src/backend/nodes/equalfuncs.c

index 3292dda34245084dc2b5e7a4fe7f9d8d713f6410..d46909bbc4f3a9606785d13a41810ccf57a050bb 100644 (file)
--- a/src/backend/nodes/equalfuncs.c
+++ b/src/backend/nodes/equalfuncs.c
@@ -2596,6 +2596,16 @@ _equalIndexElem(const IndexElem *a, const IndexElem *b)
     return true;
  }
  
+
+static bool
+_equalStatsElem(const StatsElem *a, const StatsElem *b)
+{
+   COMPARE_STRING_FIELD(name);
+   COMPARE_NODE_FIELD(expr);
+
+   return true;
+}
+
  static bool
  _equalColumnDef(const ColumnDef *a, const ColumnDef *b)
  {
@@ -3724,6 +3734,9 @@ equal(const void *a, const void *b)
         case T_IndexElem:
             retval = _equalIndexElem(a, b);
             break;
+       case T_StatsElem:
+           retval = _equalStatsElem(a, b);
+           break;
         case T_ColumnDef:
             retval = _equalColumnDef(a, b);
             break;
diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c

index 9f7918c7e901d270272f941c4aa9bdc07ddeefd8..12561c475768035cc683b796fa8dbafcec0c99a2 100644 (file)
--- a/src/backend/nodes/outfuncs.c
+++ b/src/backend/nodes/outfuncs.c
@@ -2943,6 +2943,15 @@ _outIndexElem(StringInfo str, const IndexElem *node)
     WRITE_ENUM_FIELD(nulls_ordering, SortByNulls);
  }
  
+static void
+_outStatsElem(StringInfo str, const StatsElem *node)
+{
+   WRITE_NODE_TYPE("STATSELEM");
+
+   WRITE_STRING_FIELD(name);
+   WRITE_NODE_FIELD(expr);
+}
+
  static void
  _outQuery(StringInfo str, const Query *node)
  {
@@ -4286,6 +4295,9 @@ outNode(StringInfo str, const void *obj)
             case T_IndexElem:
                 _outIndexElem(str, obj);
                 break;
+           case T_StatsElem:
+               _outStatsElem(str, obj);
+               break;
             case T_Query:
                 _outQuery(str, obj);
                 break;
diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c

index 6c39bf893f886a92eaa1a58a79f16e91fdaf2506..0fa8875f0910d31b3b878d9a35bf0142e96b5043 100644 (file)
--- a/src/backend/optimizer/util/plancat.c
+++ b/src/backend/optimizer/util/plancat.c
@@ -34,6 +34,7 @@
  #include "foreign/fdwapi.h"
  #include "miscadmin.h"
  #include "nodes/makefuncs.h"
+#include "nodes/nodeFuncs.h"
  #include "nodes/supportnodes.h"
  #include "optimizer/clauses.h"
  #include "optimizer/cost.h"
@@ -1308,6 +1309,7 @@ get_relation_constraints(PlannerInfo *root,
  static List *
  get_relation_statistics(RelOptInfo *rel, Relation relation)
  {
+   Index       varno = rel->relid;
     List       *statoidlist;
     List       *stainfos = NIL;
     ListCell   *l;
@@ -1321,6 +1323,7 @@ get_relation_statistics(RelOptInfo *rel, Relation relation)
         HeapTuple   htup;
         HeapTuple   dtup;
         Bitmapset  *keys = NULL;
+       List       *exprs = NIL;
         int         i;
  
         htup = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(statOid));
@@ -1340,6 +1343,49 @@ get_relation_statistics(RelOptInfo *rel, Relation relation)
         for (i = 0; i < staForm->stxkeys.dim1; i++)
             keys = bms_add_member(keys, staForm->stxkeys.values[i]);
  
+       /*
+        * Preprocess expressions (if any). We read the expressions, run them
+        * through eval_const_expressions, and fix the varnos.
+        */
+       {
+           bool        isnull;
+           Datum       datum;
+
+           /* decode expression (if any) */
+           datum = SysCacheGetAttr(STATEXTOID, htup,
+                                   Anum_pg_statistic_ext_stxexprs, &isnull);
+
+           if (!isnull)
+           {
+               char       *exprsString;
+
+               exprsString = TextDatumGetCString(datum);
+               exprs = (List *) stringToNode(exprsString);
+               pfree(exprsString);
+
+               /*
+                * Run the expressions through eval_const_expressions. This is
+                * not just an optimization, but is necessary, because the
+                * planner will be comparing them to similarly-processed qual
+                * clauses, and may fail to detect valid matches without this.
+                * We must not use canonicalize_qual, however, since these
+                * aren't qual expressions.
+                */
+               exprs = (List *) eval_const_expressions(NULL, (Node *) exprs);
+
+               /* May as well fix opfuncids too */
+               fix_opfuncids((Node *) exprs);
+
+               /*
+                * Modify the copies we obtain from the relcache to have the
+                * correct varno for the parent relation, so that they match
+                * up correctly against qual clauses.
+                */
+               if (varno != 1)
+                   ChangeVarNodes((Node *) exprs, 1, varno, 0);
+           }
+       }
+
         /* add one StatisticExtInfo for each kind built */
         if (statext_is_kind_built(dtup, STATS_EXT_NDISTINCT))
         {
@@ -1349,6 +1395,7 @@ get_relation_statistics(RelOptInfo *rel, Relation relation)
             info->rel = rel;
             info->kind = STATS_EXT_NDISTINCT;
             info->keys = bms_copy(keys);
+           info->exprs = exprs;
  
             stainfos = lappend(stainfos, info);
         }
@@ -1361,6 +1408,7 @@ get_relation_statistics(RelOptInfo *rel, Relation relation)
             info->rel = rel;
             info->kind = STATS_EXT_DEPENDENCIES;
             info->keys = bms_copy(keys);
+           info->exprs = exprs;
  
             stainfos = lappend(stainfos, info);
         }
@@ -1373,6 +1421,20 @@ get_relation_statistics(RelOptInfo *rel, Relation relation)
             info->rel = rel;
             info->kind = STATS_EXT_MCV;
             info->keys = bms_copy(keys);
+           info->exprs = exprs;
+
+           stainfos = lappend(stainfos, info);
+       }
+
+       if (statext_is_kind_built(dtup, STATS_EXT_EXPRESSIONS))
+       {
+           StatisticExtInfo *info = makeNode(StatisticExtInfo);
+
+           info->statOid = statOid;
+           info->rel = rel;
+           info->kind = STATS_EXT_EXPRESSIONS;
+           info->keys = bms_copy(keys);
+           info->exprs = exprs;
  
             stainfos = lappend(stainfos, info);
         }
diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y

index 2132cf4d828c6252351dab6abac6d2e94dee6565..7ff36bc84225aa5182ce7bb1e870348b7341cd55 100644 (file)
--- a/src/backend/parser/gram.y
+++ b/src/backend/parser/gram.y
@@ -239,6 +239,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query);
     WindowDef           *windef;
     JoinExpr            *jexpr;
     IndexElem           *ielem;
+   StatsElem           *selem;
     Alias               *alias;
     RangeVar            *range;
     IntoClause          *into;
@@ -405,7 +406,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query);
                 old_aggr_definition old_aggr_list
                 oper_argtypes RuleActionList RuleActionMulti
                 opt_column_list columnList opt_name_list
-               sort_clause opt_sort_clause sortby_list index_params
+               sort_clause opt_sort_clause sortby_list index_params stats_params
                 opt_include opt_c_include index_including_params
                 name_list role_list from_clause from_list opt_array_bounds
                 qualified_name_list any_name any_name_list type_name_list
@@ -512,6 +513,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query);
  %type <list>   func_alias_clause
  %type <sortby> sortby
  %type <ielem>  index_elem index_elem_options
+%type <selem>  stats_param
  %type <node>   table_ref
  %type <jexpr>  joined_table
  %type <range>  relation_expr
@@ -4097,7 +4099,7 @@ ExistingIndex:   USING INDEX name                 { $$ = $3; }
  
  CreateStatsStmt:
             CREATE STATISTICS any_name
-           opt_name_list ON expr_list FROM from_list
+           opt_name_list ON stats_params FROM from_list
                 {
                     CreateStatsStmt *n = makeNode(CreateStatsStmt);
                     n->defnames = $3;
@@ -4109,7 +4111,7 @@ CreateStatsStmt:
                     $$ = (Node *)n;
                 }
             | CREATE STATISTICS IF_P NOT EXISTS any_name
-           opt_name_list ON expr_list FROM from_list
+           opt_name_list ON stats_params FROM from_list
                 {
                     CreateStatsStmt *n = makeNode(CreateStatsStmt);
                     n->defnames = $6;
@@ -4122,6 +4124,36 @@ CreateStatsStmt:
                 }
             ;
  
+/*
+ * Statistics attributes can be either simple column references, or arbitrary
+ * expressions in parens.  For compatibility with index attributes permitted
+ * in CREATE INDEX, we allow an expression that's just a function call to be
+ * written without parens.
+ */
+
+stats_params:  stats_param                         { $$ = list_make1($1); }
+           | stats_params ',' stats_param          { $$ = lappend($1, $3); }
+       ;
+
+stats_param:   ColId
+               {
+                   $$ = makeNode(StatsElem);
+                   $$->name = $1;
+                   $$->expr = NULL;
+               }
+           | func_expr_windowless
+               {
+                   $$ = makeNode(StatsElem);
+                   $$->name = NULL;
+                   $$->expr = $1;
+               }
+           | '(' a_expr ')'
+               {
+                   $$ = makeNode(StatsElem);
+                   $$->name = NULL;
+                   $$->expr = $2;
+               }
+       ;
  
  /*****************************************************************************
   *
diff --git a/src/backend/parser/parse_agg.c b/src/backend/parser/parse_agg.c

index 7c3e01aa22b5a9e96b7d476138a50ae35d30a0c3..ceb0bf597d679aeefd93b942d86feee1935c2b81 100644 (file)
--- a/src/backend/parser/parse_agg.c
+++ b/src/backend/parser/parse_agg.c
@@ -484,6 +484,13 @@ check_agglevels_and_constraints(ParseState *pstate, Node *expr)
             else
                 err = _("grouping operations are not allowed in index predicates");
  
+           break;
+       case EXPR_KIND_STATS_EXPRESSION:
+           if (isAgg)
+               err = _("aggregate functions are not allowed in statistics expressions");
+           else
+               err = _("grouping operations are not allowed in statistics expressions");
+
             break;
         case EXPR_KIND_ALTER_COL_TRANSFORM:
             if (isAgg)
@@ -910,6 +917,9 @@ transformWindowFuncCall(ParseState *pstate, WindowFunc *wfunc,
         case EXPR_KIND_INDEX_EXPRESSION:
             err = _("window functions are not allowed in index expressions");
             break;
+       case EXPR_KIND_STATS_EXPRESSION:
+           err = _("window functions are not allowed in statistics expressions");
+           break;
         case EXPR_KIND_INDEX_PREDICATE:
             err = _("window functions are not allowed in index predicates");
             break;
diff --git a/src/backend/parser/parse_expr.c b/src/backend/parser/parse_expr.c

index f869e159d63f7d1a4a3c29c36692da43d0491a96..03373d551fcb9cfed7cd4562bee5a4391dd92374 100644 (file)
--- a/src/backend/parser/parse_expr.c
+++ b/src/backend/parser/parse_expr.c
@@ -500,6 +500,7 @@ transformColumnRef(ParseState *pstate, ColumnRef *cref)
         case EXPR_KIND_FUNCTION_DEFAULT:
         case EXPR_KIND_INDEX_EXPRESSION:
         case EXPR_KIND_INDEX_PREDICATE:
+       case EXPR_KIND_STATS_EXPRESSION:
         case EXPR_KIND_ALTER_COL_TRANSFORM:
         case EXPR_KIND_EXECUTE_PARAMETER:
         case EXPR_KIND_TRIGGER_WHEN:
@@ -1741,6 +1742,9 @@ transformSubLink(ParseState *pstate, SubLink *sublink)
         case EXPR_KIND_INDEX_PREDICATE:
             err = _("cannot use subquery in index predicate");
             break;
+       case EXPR_KIND_STATS_EXPRESSION:
+           err = _("cannot use subquery in statistics expression");
+           break;
         case EXPR_KIND_ALTER_COL_TRANSFORM:
             err = _("cannot use subquery in transform expression");
             break;
@@ -3030,6 +3034,8 @@ ParseExprKindName(ParseExprKind exprKind)
             return "index expression";
         case EXPR_KIND_INDEX_PREDICATE:
             return "index predicate";
+       case EXPR_KIND_STATS_EXPRESSION:
+           return "statistics expression";
         case EXPR_KIND_ALTER_COL_TRANSFORM:
             return "USING";
         case EXPR_KIND_EXECUTE_PARAMETER:
diff --git a/src/backend/parser/parse_func.c b/src/backend/parser/parse_func.c

index 37cebc7d829cc658fad4553893cf75dcbc6f6082..debef1d14fba1adb0d50b6cd256fc76e1c0ce0e7 100644 (file)
--- a/src/backend/parser/parse_func.c
+++ b/src/backend/parser/parse_func.c
@@ -2503,6 +2503,9 @@ check_srf_call_placement(ParseState *pstate, Node *last_srf, int location)
         case EXPR_KIND_INDEX_PREDICATE:
             err = _("set-returning functions are not allowed in index predicates");
             break;
+       case EXPR_KIND_STATS_EXPRESSION:
+           err = _("set-returning functions are not allowed in statistics expressions");
+           break;
         case EXPR_KIND_ALTER_COL_TRANSFORM:
             err = _("set-returning functions are not allowed in transform expressions");
             break;
diff --git a/src/backend/parser/parse_utilcmd.c b/src/backend/parser/parse_utilcmd.c

index aa6c19adada7af8e1aeb592b890ff3f5a90c2add..b968c25dd69181411c47c0b7bf5d60288106a300 100644 (file)
--- a/src/backend/parser/parse_utilcmd.c
+++ b/src/backend/parser/parse_utilcmd.c
@@ -1917,6 +1917,9 @@ generateClonedExtStatsStmt(RangeVar *heapRel, Oid heapRelid,
             stat_types = lappend(stat_types, makeString("dependencies"));
         else if (enabled[i] == STATS_EXT_MCV)
             stat_types = lappend(stat_types, makeString("mcv"));
+       else if (enabled[i] == STATS_EXT_EXPRESSIONS)
+           /* expression stats are not exposed to users */
+           continue;
         else
             elog(ERROR, "unrecognized statistics kind %c", enabled[i]);
     }
@@ -1924,14 +1927,47 @@ generateClonedExtStatsStmt(RangeVar *heapRel, Oid heapRelid,
     /* Determine which columns the statistics are on */
     for (i = 0; i < statsrec->stxkeys.dim1; i++)
     {
-       ColumnRef  *cref = makeNode(ColumnRef);
+       StatsElem  *selem = makeNode(StatsElem);
         AttrNumber  attnum = statsrec->stxkeys.values[i];
  
-       cref->fields = list_make1(makeString(get_attname(heapRelid,
-                                                        attnum, false)));
-       cref->location = -1;
+       selem->name = get_attname(heapRelid, attnum, false);
+       selem->expr = NULL;
  
-       def_names = lappend(def_names, cref);
+       def_names = lappend(def_names, selem);
+   }
+
+   /*
+    * Now handle expressions, if there are any. The order (with respect to
+    * regular attributes) does not really matter for extended stats, so we
+    * simply append them after simple column references.
+    *
+    * XXX Some places during build/estimation treat expressions as if they
+    * are before atttibutes, but for the CREATE command that's entirely
+    * irrelevant.
+    */
+   datum = SysCacheGetAttr(STATEXTOID, ht_stats,
+                           Anum_pg_statistic_ext_stxexprs, &isnull);
+
+   if (!isnull)
+   {
+       ListCell   *lc;
+       List       *exprs = NIL;
+       char       *exprsString;
+
+       exprsString = TextDatumGetCString(datum);
+       exprs = (List *) stringToNode(exprsString);
+
+       foreach(lc, exprs)
+       {
+           StatsElem  *selem = makeNode(StatsElem);
+
+           selem->name = NULL;
+           selem->expr = (Node *) lfirst(lc);
+
+           def_names = lappend(def_names, selem);
+       }
+
+       pfree(exprsString);
     }
  
     /* finally, build the output node */
@@ -1942,6 +1978,7 @@ generateClonedExtStatsStmt(RangeVar *heapRel, Oid heapRelid,
     stats->relations = list_make1(heapRel);
     stats->stxcomment = NULL;
     stats->if_not_exists = false;
+   stats->transformed = true;  /* don't need transformStatsStmt again */
  
     /* Clean up */
     ReleaseSysCache(ht_stats);
@@ -2866,6 +2903,84 @@ transformIndexStmt(Oid relid, IndexStmt *stmt, const char *queryString)
     return stmt;
  }
  
+/*
+ * transformStatsStmt - parse analysis for CREATE STATISTICS
+ *
+ * To avoid race conditions, it's important that this function rely only on
+ * the passed-in relid (and not on stmt->relation) to determine the target
+ * relation.
+ */
+CreateStatsStmt *
+transformStatsStmt(Oid relid, CreateStatsStmt *stmt, const char *queryString)
+{
+   ParseState *pstate;
+   ParseNamespaceItem *nsitem;
+   ListCell   *l;
+   Relation    rel;
+
+   /* Nothing to do if statement already transformed. */
+   if (stmt->transformed)
+       return stmt;
+
+   /*
+    * We must not scribble on the passed-in CreateStatsStmt, so copy it.
+    * (This is overkill, but easy.)
+    */
+   stmt = copyObject(stmt);
+
+   /* Set up pstate */
+   pstate = make_parsestate(NULL);
+   pstate->p_sourcetext = queryString;
+
+   /*
+    * Put the parent table into the rtable so that the expressions can refer
+    * to its fields without qualification.  Caller is responsible for locking
+    * relation, but we still need to open it.
+    */
+   rel = relation_open(relid, NoLock);
+   nsitem = addRangeTableEntryForRelation(pstate, rel,
+                                          AccessShareLock,
+                                          NULL, false, true);
+
+   /* no to join list, yes to namespaces */
+   addNSItemToQuery(pstate, nsitem, false, true, true);
+
+   /* take care of any expressions */
+   foreach(l, stmt->exprs)
+   {
+       StatsElem  *selem = (StatsElem *) lfirst(l);
+
+       if (selem->expr)
+       {
+           /* Now do parse transformation of the expression */
+           selem->expr = transformExpr(pstate, selem->expr,
+                                       EXPR_KIND_STATS_EXPRESSION);
+
+           /* We have to fix its collations too */
+           assign_expr_collations(pstate, selem->expr);
+       }
+   }
+
+   /*
+    * Check that only the base rel is mentioned.  (This should be dead code
+    * now that add_missing_from is history.)
+    */
+   if (list_length(pstate->p_rtable) != 1)
+       ereport(ERROR,
+               (errcode(ERRCODE_INVALID_COLUMN_REFERENCE),
+                errmsg("statistics expressions can refer only to the table being indexed")));
+
+   free_parsestate(pstate);
+
+   /* Close relation */
+   table_close(rel, NoLock);
+
+   /* Mark statement as successfully transformed */
+   stmt->transformed = true;
+
+   return stmt;
+}
+
  
  /*
   * transformRuleStmt -
diff --git a/src/backend/statistics/dependencies.c b/src/backend/statistics/dependencies.c

index eac92851651e81227b32f72b389a29336826f9c5..cf8a6d5f68bd5167aa1f9655702782318a874f91 100644 (file)
--- a/src/backend/statistics/dependencies.c
+++ b/src/backend/statistics/dependencies.c
@@ -70,15 +70,15 @@ static void generate_dependencies(DependencyGenerator state);
  static DependencyGenerator DependencyGenerator_init(int n, int k);
  static void DependencyGenerator_free(DependencyGenerator state);
  static AttrNumber *DependencyGenerator_next(DependencyGenerator state);
-static double dependency_degree(int numrows, HeapTuple *rows, int k,
-                               AttrNumber *dependency, VacAttrStats **stats, Bitmapset *attrs);
+static double dependency_degree(StatsBuildData *data, int k, AttrNumber *dependency);
  static bool dependency_is_fully_matched(MVDependency *dependency,
                                         Bitmapset *attnums);
  static bool dependency_is_compatible_clause(Node *clause, Index relid,
                                             AttrNumber *attnum);
+static bool dependency_is_compatible_expression(Node *clause, Index relid,
+                                               List *statlist, Node **expr);
  static MVDependency *find_strongest_dependency(MVDependencies **dependencies,
-                                              int ndependencies,
-                                              Bitmapset *attnums);
+                                              int ndependencies, Bitmapset *attnums);
  static Selectivity clauselist_apply_dependencies(PlannerInfo *root, List *clauses,
                                                  int varRelid, JoinType jointype,
                                                  SpecialJoinInfo *sjinfo,
@@ -219,16 +219,13 @@ DependencyGenerator_next(DependencyGenerator state)
   * the last one.
   */
  static double
-dependency_degree(int numrows, HeapTuple *rows, int k, AttrNumber *dependency,
-                 VacAttrStats **stats, Bitmapset *attrs)
+dependency_degree(StatsBuildData *data, int k, AttrNumber *dependency)
  {
     int         i,
                 nitems;
     MultiSortSupport mss;
     SortItem   *items;
-   AttrNumber *attnums;
     AttrNumber *attnums_dep;
-   int         numattrs;
  
     /* counters valid within a group */
     int         group_size = 0;
@@ -244,15 +241,12 @@ dependency_degree(int numrows, HeapTuple *rows, int k, AttrNumber *dependency,
     mss = multi_sort_init(k);
  
     /*
-    * Transform the attrs from bitmap to an array to make accessing the i-th
-    * member easier, and then construct a filtered version with only attnums
-    * referenced by the dependency we validate.
+    * Translate the array of indexes to regular attnums for the dependency (we
+    * will need this to identify the columns in StatsBuildData).
      */
-   attnums = build_attnums_array(attrs, &numattrs);
-
     attnums_dep = (AttrNumber *) palloc(k * sizeof(AttrNumber));
     for (i = 0; i < k; i++)
-       attnums_dep[i] = attnums[dependency[i]];
+       attnums_dep[i] = data->attnums[dependency[i]];
  
     /*
      * Verify the dependency (a,b,...)->z, using a rather simple algorithm:
@@ -270,7 +264,7 @@ dependency_degree(int numrows, HeapTuple *rows, int k, AttrNumber *dependency,
     /* prepare the sort function for the dimensions */
     for (i = 0; i < k; i++)
     {
-       VacAttrStats *colstat = stats[dependency[i]];
+       VacAttrStats *colstat = data->stats[dependency[i]];
         TypeCacheEntry *type;
  
         type = lookup_type_cache(colstat->attrtypid, TYPECACHE_LT_OPR);
@@ -289,8 +283,7 @@ dependency_degree(int numrows, HeapTuple *rows, int k, AttrNumber *dependency,
      * descriptor.  For now that assumption holds, but it might change in the
      * future for example if we support statistics on multiple tables.
      */
-   items = build_sorted_items(numrows, &nitems, rows, stats[0]->tupDesc,
-                              mss, k, attnums_dep);
+   items = build_sorted_items(data, &nitems, mss, k, attnums_dep);
  
     /*
      * Walk through the sorted array, split it into rows according to the
@@ -336,11 +329,10 @@ dependency_degree(int numrows, HeapTuple *rows, int k, AttrNumber *dependency,
         pfree(items);
  
     pfree(mss);
-   pfree(attnums);
     pfree(attnums_dep);
  
     /* Compute the 'degree of validity' as (supporting/total). */
-   return (n_supporting_rows * 1.0 / numrows);
+   return (n_supporting_rows * 1.0 / data->numrows);
  }
  
  /*
@@ -360,23 +352,15 @@ dependency_degree(int numrows, HeapTuple *rows, int k, AttrNumber *dependency,
   *    (c) -> b
   */
  MVDependencies *
-statext_dependencies_build(int numrows, HeapTuple *rows, Bitmapset *attrs,
-                          VacAttrStats **stats)
+statext_dependencies_build(StatsBuildData *data)
  {
     int         i,
                 k;
-   int         numattrs;
-   AttrNumber *attnums;
  
     /* result */
     MVDependencies *dependencies = NULL;
  
-   /*
-    * Transform the bms into an array, to make accessing i-th member easier.
-    */
-   attnums = build_attnums_array(attrs, &numattrs);
-
-   Assert(numattrs >= 2);
+   Assert(data->nattnums >= 2);
  
     /*
      * We'll try build functional dependencies starting from the smallest ones
@@ -384,12 +368,12 @@ statext_dependencies_build(int numrows, HeapTuple *rows, Bitmapset *attrs,
      * included in the statistics object.  We start from the smallest ones
      * because we want to be able to skip already implied ones.
      */
-   for (k = 2; k <= numattrs; k++)
+   for (k = 2; k <= data->nattnums; k++)
     {
         AttrNumber *dependency; /* array with k elements */
  
         /* prepare a DependencyGenerator of variation */
-       DependencyGenerator DependencyGenerator = DependencyGenerator_init(numattrs, k);
+       DependencyGenerator DependencyGenerator = DependencyGenerator_init(data->nattnums, k);
  
         /* generate all possible variations of k values (out of n) */
         while ((dependency = DependencyGenerator_next(DependencyGenerator)))
@@ -398,7 +382,7 @@ statext_dependencies_build(int numrows, HeapTuple *rows, Bitmapset *attrs,
             MVDependency *d;
  
             /* compute how valid the dependency seems */
-           degree = dependency_degree(numrows, rows, k, dependency, stats, attrs);
+           degree = dependency_degree(data, k, dependency);
  
             /*
              * if the dependency seems entirely invalid, don't store it
@@ -413,7 +397,7 @@ statext_dependencies_build(int numrows, HeapTuple *rows, Bitmapset *attrs,
             d->degree = degree;
             d->nattributes = k;
             for (i = 0; i < k; i++)
-               d->attributes[i] = attnums[dependency[i]];
+               d->attributes[i] = data->attnums[dependency[i]];
  
             /* initialize the list of dependencies */
             if (dependencies == NULL)
@@ -747,6 +731,7 @@ static bool
  dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum)
  {
     Var        *var;
+   Node       *clause_expr;
  
     if (IsA(clause, RestrictInfo))
     {
@@ -774,9 +759,9 @@ dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum)
  
         /* Make sure non-selected argument is a pseudoconstant. */
         if (is_pseudo_constant_clause(lsecond(expr->args)))
-           var = linitial(expr->args);
+           clause_expr = linitial(expr->args);
         else if (is_pseudo_constant_clause(linitial(expr->args)))
-           var = lsecond(expr->args);
+           clause_expr = lsecond(expr->args);
         else
             return false;
  
@@ -805,8 +790,8 @@ dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum)
         /*
          * Reject ALL() variant, we only care about ANY/IN.
          *
-        * FIXME Maybe we should check if all the values are the same, and
-        * allow ALL in that case? Doesn't seem very practical, though.
+        * XXX Maybe we should check if all the values are the same, and allow
+        * ALL in that case? Doesn't seem very practical, though.
          */
         if (!expr->useOr)
             return false;
@@ -822,7 +807,7 @@ dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum)
         if (!is_pseudo_constant_clause(lsecond(expr->args)))
             return false;
  
-       var = linitial(expr->args);
+       clause_expr = linitial(expr->args);
  
         /*
          * If it's not an "=" operator, just ignore the clause, as it's not
@@ -838,13 +823,13 @@ dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum)
     }
     else if (is_orclause(clause))
     {
-       BoolExpr   *expr = (BoolExpr *) clause;
+       BoolExpr   *bool_expr = (BoolExpr *) clause;
         ListCell   *lc;
  
         /* start with no attribute number */
         *attnum = InvalidAttrNumber;
  
-       foreach(lc, expr->args)
+       foreach(lc, bool_expr->args)
         {
             AttrNumber  clause_attnum;
  
@@ -859,6 +844,7 @@ dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum)
             if (*attnum == InvalidAttrNumber)
                 *attnum = clause_attnum;
  
+           /* ensure all the variables are the same (same attnum) */
             if (*attnum != clause_attnum)
                 return false;
         }
@@ -872,7 +858,7 @@ dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum)
          * "NOT x" can be interpreted as "x = false", so get the argument and
          * proceed with seeing if it's a suitable Var.
          */
-       var = (Var *) get_notclausearg(clause);
+       clause_expr = (Node *) get_notclausearg(clause);
     }
     else
     {
@@ -880,20 +866,23 @@ dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum)
          * A boolean expression "x" can be interpreted as "x = true", so
          * proceed with seeing if it's a suitable Var.
          */
-       var = (Var *) clause;
+       clause_expr = (Node *) clause;
     }
  
     /*
      * We may ignore any RelabelType node above the operand.  (There won't be
      * more than one, since eval_const_expressions has been applied already.)
      */
-   if (IsA(var, RelabelType))
-       var = (Var *) ((RelabelType *) var)->arg;
+   if (IsA(clause_expr, RelabelType))
+       clause_expr = (Node *) ((RelabelType *) clause_expr)->arg;
  
     /* We only support plain Vars for now */
-   if (!IsA(var, Var))
+   if (!IsA(clause_expr, Var))
         return false;
  
+   /* OK, we know we have a Var */
+   var = (Var *) clause_expr;
+
     /* Ensure Var is from the correct relation */
     if (var->varno != relid)
         return false;
@@ -1157,6 +1146,212 @@ clauselist_apply_dependencies(PlannerInfo *root, List *clauses,
     return s1;
  }
  
+/*
+ * dependency_is_compatible_expression
+ *     Determines if the expression is compatible with functional dependencies
+ *
+ * Similar to dependency_is_compatible_clause, but doesn't enforce that the
+ * expression is a simple Var. OTOH we check that there's at least one
+ * statistics object matching the expression.
+ */
+static bool
+dependency_is_compatible_expression(Node *clause, Index relid, List *statlist, Node **expr)
+{
+   List       *vars;
+   ListCell   *lc,
+              *lc2;
+   Node       *clause_expr;
+
+   if (IsA(clause, RestrictInfo))
+   {
+       RestrictInfo *rinfo = (RestrictInfo *) clause;
+
+       /* Pseudoconstants are not interesting (they couldn't contain a Var) */
+       if (rinfo->pseudoconstant)
+           return false;
+
+       /* Clauses referencing multiple, or no, varnos are incompatible */
+       if (bms_membership(rinfo->clause_relids) != BMS_SINGLETON)
+           return false;
+
+       clause = (Node *) rinfo->clause;
+   }
+
+   if (is_opclause(clause))
+   {
+       /* If it's an opclause, check for Var = Const or Const = Var. */
+       OpExpr     *expr = (OpExpr *) clause;
+
+       /* Only expressions with two arguments are candidates. */
+       if (list_length(expr->args) != 2)
+           return false;
+
+       /* Make sure non-selected argument is a pseudoconstant. */
+       if (is_pseudo_constant_clause(lsecond(expr->args)))
+           clause_expr = linitial(expr->args);
+       else if (is_pseudo_constant_clause(linitial(expr->args)))
+           clause_expr = lsecond(expr->args);
+       else
+           return false;
+
+       /*
+        * If it's not an "=" operator, just ignore the clause, as it's not
+        * compatible with functional dependencies.
+        *
+        * This uses the function for estimating selectivity, not the operator
+        * directly (a bit awkward, but well ...).
+        *
+        * XXX this is pretty dubious; probably it'd be better to check btree
+        * or hash opclass membership, so as not to be fooled by custom
+        * selectivity functions, and to be more consistent with decisions
+        * elsewhere in the planner.
+        */
+       if (get_oprrest(expr->opno) != F_EQSEL)
+           return false;
+
+       /* OK to proceed with checking "var" */
+   }
+   else if (IsA(clause, ScalarArrayOpExpr))
+   {
+       /* If it's an scalar array operator, check for Var IN Const. */
+       ScalarArrayOpExpr *expr = (ScalarArrayOpExpr *) clause;
+
+       /*
+        * Reject ALL() variant, we only care about ANY/IN.
+        *
+        * FIXME Maybe we should check if all the values are the same, and
+        * allow ALL in that case? Doesn't seem very practical, though.
+        */
+       if (!expr->useOr)
+           return false;
+
+       /* Only expressions with two arguments are candidates. */
+       if (list_length(expr->args) != 2)
+           return false;
+
+       /*
+        * We know it's always (Var IN Const), so we assume the var is the
+        * first argument, and pseudoconstant is the second one.
+        */
+       if (!is_pseudo_constant_clause(lsecond(expr->args)))
+           return false;
+
+       clause_expr = linitial(expr->args);
+
+       /*
+        * If it's not an "=" operator, just ignore the clause, as it's not
+        * compatible with functional dependencies. The operator is identified
+        * simply by looking at which function it uses to estimate
+        * selectivity. That's a bit strange, but it's what other similar
+        * places do.
+        */
+       if (get_oprrest(expr->opno) != F_EQSEL)
+           return false;
+
+       /* OK to proceed with checking "var" */
+   }
+   else if (is_orclause(clause))
+   {
+       BoolExpr   *bool_expr = (BoolExpr *) clause;
+       ListCell   *lc;
+
+       /* start with no expression (we'll use the first match) */
+       *expr = NULL;
+
+       foreach(lc, bool_expr->args)
+       {
+           Node       *or_expr = NULL;
+
+           /*
+            * Had we found incompatible expression in the arguments, treat
+            * the whole expression as incompatible.
+            */
+           if (!dependency_is_compatible_expression((Node *) lfirst(lc), relid,
+                                                    statlist, &or_expr))
+               return false;
+
+           if (*expr == NULL)
+               *expr = or_expr;
+
+           /* ensure all the expressions are the same */
+           if (!equal(or_expr, *expr))
+               return false;
+       }
+
+       /* the expression is already checked by the recursive call */
+       return true;
+   }
+   else if (is_notclause(clause))
+   {
+       /*
+        * "NOT x" can be interpreted as "x = false", so get the argument and
+        * proceed with seeing if it's a suitable Var.
+        */
+       clause_expr = (Node *) get_notclausearg(clause);
+   }
+   else
+   {
+       /*
+        * A boolean expression "x" can be interpreted as "x = true", so
+        * proceed with seeing if it's a suitable Var.
+        */
+       clause_expr = (Node *) clause;
+   }
+
+   /*
+    * We may ignore any RelabelType node above the operand.  (There won't be
+    * more than one, since eval_const_expressions has been applied already.)
+    */
+   if (IsA(clause_expr, RelabelType))
+       clause_expr = (Node *) ((RelabelType *) clause_expr)->arg;
+
+   vars = pull_var_clause(clause_expr, 0);
+
+   foreach(lc, vars)
+   {
+       Var        *var = (Var *) lfirst(lc);
+
+       /* Ensure Var is from the correct relation */
+       if (var->varno != relid)
+           return false;
+
+       /* We also better ensure the Var is from the current level */
+       if (var->varlevelsup != 0)
+           return false;
+
+       /* Also ignore system attributes (we don't allow stats on those) */
+       if (!AttrNumberIsForUserDefinedAttr(var->varattno))
+           return false;
+   }
+
+   /*
+    * Check if we actually have a matching statistics for the expression.
+    *
+    * XXX Maybe this is an overkill. We'll eliminate the expressions later.
+    */
+   foreach(lc, statlist)
+   {
+       StatisticExtInfo *info = (StatisticExtInfo *) lfirst(lc);
+
+       /* ignore stats without dependencies */
+       if (info->kind != STATS_EXT_DEPENDENCIES)
+           continue;
+
+       foreach(lc2, info->exprs)
+       {
+           Node       *stat_expr = (Node *) lfirst(lc2);
+
+           if (equal(clause_expr, stat_expr))
+           {
+               *expr = stat_expr;
+               return true;
+           }
+       }
+   }
+
+   return false;
+}
+
  /*
   * dependencies_clauselist_selectivity
   *     Return the estimated selectivity of (a subset of) the given clauses
@@ -1204,6 +1399,11 @@ dependencies_clauselist_selectivity(PlannerInfo *root,
     MVDependency **dependencies;
     int         ndependencies;
     int         i;
+   AttrNumber  attnum_offset;
+
+   /* unique expressions */
+   Node      **unique_exprs;
+   int         unique_exprs_cnt;
  
     /* check if there's any stats that might be useful for us. */
     if (!has_stats_of_kind(rel->statlist, STATS_EXT_DEPENDENCIES))
@@ -1212,6 +1412,15 @@ dependencies_clauselist_selectivity(PlannerInfo *root,
     list_attnums = (AttrNumber *) palloc(sizeof(AttrNumber) *
                                          list_length(clauses));
  
+   /*
+    * We allocate space as if every clause was a unique expression, although
+    * that's probably overkill. Some will be simple column references that
+    * we'll translate to attnums, and there might be duplicates. But it's
+    * easier and cheaper to just do one allocation than repalloc later.
+    */
+   unique_exprs = (Node **) palloc(sizeof(Node *) * list_length(clauses));
+   unique_exprs_cnt = 0;
+
     /*
      * Pre-process the clauses list to extract the attnums seen in each item.
      * We need to determine if there's any clauses which will be useful for
@@ -1222,29 +1431,127 @@ dependencies_clauselist_selectivity(PlannerInfo *root,
      *
      * We also skip clauses that we already estimated using different types of
      * statistics (we treat them as incompatible).
+    *
+    * To handle expressions, we assign them negative attnums, as if it was a
+    * system attribute (this is fine, as we only allow extended stats on user
+    * attributes). And then we offset everything by the number of
+    * expressions, so that we can store the values in a bitmapset.
      */
     listidx = 0;
     foreach(l, clauses)
     {
         Node       *clause = (Node *) lfirst(l);
         AttrNumber  attnum;
+       Node       *expr = NULL;
+
+       /* ignore clause by default */
+       list_attnums[listidx] = InvalidAttrNumber;
  
-       if (!bms_is_member(listidx, *estimatedclauses) &&
-           dependency_is_compatible_clause(clause, rel->relid, &attnum))
+       if (!bms_is_member(listidx, *estimatedclauses))
         {
-           list_attnums[listidx] = attnum;
-           clauses_attnums = bms_add_member(clauses_attnums, attnum);
+           /*
+            * If it's a simple column refrence, just extract the attnum. If
+            * it's an expression, assign a negative attnum as if it was a
+            * system attribute.
+            */
+           if (dependency_is_compatible_clause(clause, rel->relid, &attnum))
+           {
+               list_attnums[listidx] = attnum;
+           }
+           else if (dependency_is_compatible_expression(clause, rel->relid,
+                                                        rel->statlist,
+                                                        &expr))
+           {
+               /* special attnum assigned to this expression */
+               attnum = InvalidAttrNumber;
+
+               Assert(expr != NULL);
+
+               /* If the expression is duplicate, use the same attnum. */
+               for (i = 0; i < unique_exprs_cnt; i++)
+               {
+                   if (equal(unique_exprs[i], expr))
+                   {
+                       /* negative attribute number to expression */
+                       attnum = -(i + 1);
+                       break;
+                   }
+               }
+
+               /* not found in the list, so add it */
+               if (attnum == InvalidAttrNumber)
+               {
+                   unique_exprs[unique_exprs_cnt++] = expr;
+
+                   /* after incrementing the value, to get -1, -2, ... */
+                   attnum = (-unique_exprs_cnt);
+               }
+
+               /* remember which attnum was assigned to this clause */
+               list_attnums[listidx] = attnum;
+           }
         }
-       else
-           list_attnums[listidx] = InvalidAttrNumber;
  
         listidx++;
     }
  
+   Assert(listidx == list_length(clauses));
+
     /*
-    * If there's not at least two distinct attnums then reject the whole list
-    * of clauses. We must return 1.0 so the calling function's selectivity is
-    * unaffected.
+    * How much we need to offset the attnums? If there are no expressions,
+    * then no offset is needed. Otherwise we need to offset enough for the
+    * lowest value (-unique_exprs_cnt) to become 1.
+    */
+   if (unique_exprs_cnt > 0)
+       attnum_offset = (unique_exprs_cnt + 1);
+   else
+       attnum_offset = 0;
+
+   /*
+    * Now that we know how many expressions there are, we can offset the
+    * values just enough to build the bitmapset.
+    */
+   for (i = 0; i < list_length(clauses); i++)
+   {
+       AttrNumber  attnum;
+
+       /* ignore incompatible or already estimated clauses */
+       if (list_attnums[i] == InvalidAttrNumber)
+           continue;
+
+       /* make sure the attnum is in the expected range */
+       Assert(list_attnums[i] >= (-unique_exprs_cnt));
+       Assert(list_attnums[i] <= MaxHeapAttributeNumber);
+
+       /* make sure the attnum is positive (valid AttrNumber) */
+       attnum = list_attnums[i] + attnum_offset;
+
+       /*
+        * Either it's a regular attribute, or it's an expression, in which
+        * case we must not have seen it before (expressions are unique).
+        *
+        * XXX Check whether it's a regular attribute has to be done using the
+        * original attnum, while the second check has to use the value with
+        * an offset.
+        */
+       Assert(AttrNumberIsForUserDefinedAttr(list_attnums[i]) ||
+              !bms_is_member(attnum, clauses_attnums));
+
+       /*
+        * Remember the offset attnum, both for attributes and expressions.
+        * We'll pass list_attnums to clauselist_apply_dependencies, which
+        * uses it to identify clauses in a bitmap. We could also pass the
+        * offset, but this is more convenient.
+        */
+       list_attnums[i] = attnum;
+
+       clauses_attnums = bms_add_member(clauses_attnums, attnum);
+   }
+
+   /*
+    * If there's not at least two distinct attnums and expressions, then
+    * reject the whole list of clauses. We must return 1.0 so the calling
+    * function's selectivity is unaffected.
      */
     if (bms_membership(clauses_attnums) != BMS_MULTIPLE)
     {
@@ -1272,26 +1579,203 @@ dependencies_clauselist_selectivity(PlannerInfo *root,
     foreach(l, rel->statlist)
     {
         StatisticExtInfo *stat = (StatisticExtInfo *) lfirst(l);
-       Bitmapset  *matched;
-       BMS_Membership membership;
+       int         nmatched;
+       int         nexprs;
+       int         k;
+       MVDependencies *deps;
  
         /* skip statistics that are not of the correct type */
         if (stat->kind != STATS_EXT_DEPENDENCIES)
             continue;
  
-       matched = bms_intersect(clauses_attnums, stat->keys);
-       membership = bms_membership(matched);
-       bms_free(matched);
+       /*
+        * Count matching attributes - we have to undo the attnum offsets. The
+        * input attribute numbers are not offset (expressions are not
+        * included in stat->keys, so it's not necessary). But we need to
+        * offset it before checking against clauses_attnums.
+        */
+       nmatched = 0;
+       k = -1;
+       while ((k = bms_next_member(stat->keys, k)) >= 0)
+       {
+           AttrNumber  attnum = (AttrNumber) k;
  
-       /* skip objects matching fewer than two attributes from clauses */
-       if (membership != BMS_MULTIPLE)
+           /* skip expressions */
+           if (!AttrNumberIsForUserDefinedAttr(attnum))
+               continue;
+
+           /* apply the same offset as above */
+           attnum += attnum_offset;
+
+           if (bms_is_member(attnum, clauses_attnums))
+               nmatched++;
+       }
+
+       /* count matching expressions */
+       nexprs = 0;
+       for (i = 0; i < unique_exprs_cnt; i++)
+       {
+           ListCell   *lc;
+
+           foreach(lc, stat->exprs)
+           {
+               Node       *stat_expr = (Node *) lfirst(lc);
+
+               /* try to match it */
+               if (equal(stat_expr, unique_exprs[i]))
+                   nexprs++;
+           }
+       }
+
+       /*
+        * Skip objects matching fewer than two attributes/expressions from
+        * clauses.
+        */
+       if (nmatched + nexprs < 2)
             continue;
  
-       func_dependencies[nfunc_dependencies]
-           = statext_dependencies_load(stat->statOid);
+       deps = statext_dependencies_load(stat->statOid);
+
+       /*
+        * The expressions may be represented by different attnums in the
+        * stats, we need to remap them to be consistent with the clauses.
+        * That will make the later steps (e.g. picking the strongest item and
+        * so on) much simpler and cheaper, because it won't need to care
+        * about the offset at all.
+        *
+        * When we're at it, we can ignore dependencies that are not fully
+        * matched by clauses (i.e. referencing attributes or expressions that
+        * are not in the clauses).
+        *
+        * We have to do this for all statistics, as long as there are any
+        * expressions - we need to shift the attnums in all dependencies.
+        *
+        * XXX Maybe we should do this always, because it also eliminates some
+        * of the dependencies early. It might be cheaper than having to walk
+        * the longer list in find_strongest_dependency later, especially as
+        * we need to do that repeatedly?
+        *
+        * XXX We have to do this even when there are no expressions in
+        * clauses, otherwise find_strongest_dependency may fail for stats
+        * with expressions (due to lookup of negative value in bitmap). So we
+        * need to at least filter out those dependencies. Maybe we could do
+        * it in a cheaper way (if there are no expr clauses, we can just
+        * discard all negative attnums without any lookups).
+        */
+       if (unique_exprs_cnt > 0 || stat->exprs != NIL)
+       {
+           int         ndeps = 0;
+
+           for (i = 0; i < deps->ndeps; i++)
+           {
+               bool        skip = false;
+               MVDependency *dep = deps->deps[i];
+               int         j;
+
+               for (j = 0; j < dep->nattributes; j++)
+               {
+                   int         idx;
+                   Node       *expr;
+                   int         k;
+                   AttrNumber  unique_attnum = InvalidAttrNumber;
+                   AttrNumber  attnum;
+
+                   /* undo the per-statistics offset */
+                   attnum = dep->attributes[j];
+
+                   /*
+                    * For regular attributes we can simply check if it
+                    * matches any clause. If there's no matching clause, we
+                    * can just ignore it. We need to offset the attnum
+                    * though.
+                    */
+                   if (AttrNumberIsForUserDefinedAttr(attnum))
+                   {
+                       dep->attributes[j] = attnum + attnum_offset;
+
+                       if (!bms_is_member(dep->attributes[j], clauses_attnums))
+                       {
+                           skip = true;
+                           break;
+                       }
+
+                       continue;
+                   }
+
+                   /*
+                    * the attnum should be a valid system attnum (-1, -2,
+                    * ...)
+                    */
+                   Assert(AttributeNumberIsValid(attnum));
+
+                   /*
+                    * For expressions, we need to do two translations. First
+                    * we have to translate the negative attnum to index in
+                    * the list of expressions (in the statistics object).
+                    * Then we need to see if there's a matching clause. The
+                    * index of the unique expression determines the attnum
+                    * (and we offset it).
+                    */
+                   idx = -(1 + attnum);
+
+                   /* Is the expression index is valid? */
+                   Assert((idx >= 0) && (idx < list_length(stat->exprs)));
+
+                   expr = (Node *) list_nth(stat->exprs, idx);
+
+                   /* try to find the expression in the unique list */
+                   for (k = 0; k < unique_exprs_cnt; k++)
+                   {
+                       /*
+                        * found a matching unique expression, use the attnum
+                        * (derived from index of the unique expression)
+                        */
+                       if (equal(unique_exprs[k], expr))
+                       {
+                           unique_attnum = -(k + 1) + attnum_offset;
+                           break;
+                       }
+                   }
+
+                   /*
+                    * Found no matching expression, so we can simply skip
+                    * this dependency, because there's no chance it will be
+                    * fully covered.
+                    */
+                   if (unique_attnum == InvalidAttrNumber)
+                   {
+                       skip = true;
+                       break;
+                   }
+
+                   /* otherwise remap it to the new attnum */
+                   dep->attributes[j] = unique_attnum;
+               }
  
-       total_ndeps += func_dependencies[nfunc_dependencies]->ndeps;
-       nfunc_dependencies++;
+               /* if found a matching dependency, keep it */
+               if (!skip)
+               {
+                   /* maybe we've skipped something earlier, so move it */
+                   if (ndeps != i)
+                       deps->deps[ndeps] = deps->deps[i];
+
+                   ndeps++;
+               }
+           }
+
+           deps->ndeps = ndeps;
+       }
+
+       /*
+        * It's possible we've removed all dependencies, in which case we
+        * don't bother adding it to the list.
+        */
+       if (deps->ndeps > 0)
+       {
+           func_dependencies[nfunc_dependencies] = deps;
+           total_ndeps += deps->ndeps;
+           nfunc_dependencies++;
+       }
     }
  
     /* if no matching stats could be found then we've nothing to do */
@@ -1300,6 +1784,7 @@ dependencies_clauselist_selectivity(PlannerInfo *root,
         pfree(func_dependencies);
         bms_free(clauses_attnums);
         pfree(list_attnums);
+       pfree(unique_exprs);
         return 1.0;
     }
  
@@ -1347,6 +1832,7 @@ dependencies_clauselist_selectivity(PlannerInfo *root,
     pfree(func_dependencies);
     bms_free(clauses_attnums);
     pfree(list_attnums);
+   pfree(unique_exprs);
  
     return s1;
  }
diff --git a/src/backend/statistics/extended_stats.c b/src/backend/statistics/extended_stats.c

index 7808c6a09cac71d57ac437ef37c95d1a368894de..8c75690fce816234850f31848fe4ef155646374b 100644 (file)
--- a/src/backend/statistics/extended_stats.c
+++ b/src/backend/statistics/extended_stats.c
@@ -24,6 +24,7 @@
  #include "catalog/pg_collation.h"
  #include "catalog/pg_statistic_ext.h"
  #include "catalog/pg_statistic_ext_data.h"
+#include "executor/executor.h"
  #include "commands/progress.h"
  #include "miscadmin.h"
  #include "nodes/nodeFuncs.h"
@@ -35,13 +36,16 @@
  #include "statistics/statistics.h"
  #include "utils/acl.h"
  #include "utils/array.h"
+#include "utils/attoptcache.h"
  #include "utils/builtins.h"
+#include "utils/datum.h"
  #include "utils/fmgroids.h"
  #include "utils/lsyscache.h"
  #include "utils/memutils.h"
  #include "utils/rel.h"
  #include "utils/selfuncs.h"
  #include "utils/syscache.h"
+#include "utils/typcache.h"
  
  /*
   * To avoid consuming too much memory during analysis and/or too much space
@@ -66,18 +70,38 @@ typedef struct StatExtEntry
     Bitmapset  *columns;        /* attribute numbers covered by the object */
     List       *types;          /* 'char' list of enabled statistics kinds */
     int         stattarget;     /* statistics target (-1 for default) */
+   List       *exprs;          /* expressions */
  } StatExtEntry;
  
  
  static List *fetch_statentries_for_relation(Relation pg_statext, Oid relid);
-static VacAttrStats **lookup_var_attr_stats(Relation rel, Bitmapset *attrs,
+static VacAttrStats **lookup_var_attr_stats(Relation rel, Bitmapset *attrs, List *exprs,
                                             int nvacatts, VacAttrStats **vacatts);
-static void statext_store(Oid relid,
+static void statext_store(Oid statOid,
                           MVNDistinct *ndistinct, MVDependencies *dependencies,
-                         MCVList *mcv, VacAttrStats **stats);
+                         MCVList *mcv, Datum exprs, VacAttrStats **stats);
  static int statext_compute_stattarget(int stattarget,
                                        int natts, VacAttrStats **stats);
  
+/* Information needed to analyze a single simple expression. */
+typedef struct AnlExprData
+{
+   Node       *expr;           /* expression to analyze */
+   VacAttrStats *vacattrstat;  /* statistics attrs to analyze */
+} AnlExprData;
+
+static void compute_expr_stats(Relation onerel, double totalrows,
+                              AnlExprData * exprdata, int nexprs,
+                              HeapTuple *rows, int numrows);
+static Datum serialize_expr_stats(AnlExprData * exprdata, int nexprs);
+static Datum expr_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull);
+static AnlExprData *build_expr_data(List *exprs, int stattarget);
+
+static StatsBuildData *make_build_data(Relation onerel, StatExtEntry *stat,
+                                      int numrows, HeapTuple *rows,
+                                      VacAttrStats **stats, int stattarget);
+
+
  /*
   * Compute requested extended stats, using the rows sampled for the plain
   * (single-column) stats.
@@ -92,21 +116,25 @@ BuildRelationExtStatistics(Relation onerel, double totalrows,
  {
     Relation    pg_stext;
     ListCell   *lc;
-   List       *stats;
+   List       *statslist;
     MemoryContext cxt;
     MemoryContext oldcxt;
     int64       ext_cnt;
  
+   /* Do nothing if there are no columns to analyze. */
+   if (!natts)
+       return;
+
     cxt = AllocSetContextCreate(CurrentMemoryContext,
                                 "BuildRelationExtStatistics",
                                 ALLOCSET_DEFAULT_SIZES);
     oldcxt = MemoryContextSwitchTo(cxt);
  
     pg_stext = table_open(StatisticExtRelationId, RowExclusiveLock);
-   stats = fetch_statentries_for_relation(pg_stext, RelationGetRelid(onerel));
+   statslist = fetch_statentries_for_relation(pg_stext, RelationGetRelid(onerel));
  
     /* report this phase */
-   if (stats != NIL)
+   if (statslist != NIL)
     {
         const int   index[] = {
             PROGRESS_ANALYZE_PHASE,
@@ -114,28 +142,30 @@ BuildRelationExtStatistics(Relation onerel, double totalrows,
         };
         const int64 val[] = {
             PROGRESS_ANALYZE_PHASE_COMPUTE_EXT_STATS,
-           list_length(stats)
+           list_length(statslist)
         };
  
         pgstat_progress_update_multi_param(2, index, val);
     }
  
     ext_cnt = 0;
-   foreach(lc, stats)
+   foreach(lc, statslist)
     {
         StatExtEntry *stat = (StatExtEntry *) lfirst(lc);
         MVNDistinct *ndistinct = NULL;
         MVDependencies *dependencies = NULL;
         MCVList    *mcv = NULL;
+       Datum       exprstats = (Datum) 0;
         VacAttrStats **stats;
         ListCell   *lc2;
         int         stattarget;
+       StatsBuildData *data;
  
         /*
          * Check if we can build these stats based on the column analyzed. If
          * not, report this fact (except in autovacuum) and move on.
          */
-       stats = lookup_var_attr_stats(onerel, stat->columns,
+       stats = lookup_var_attr_stats(onerel, stat->columns, stat->exprs,
                                       natts, vacattrstats);
         if (!stats)
         {
@@ -150,10 +180,6 @@ BuildRelationExtStatistics(Relation onerel, double totalrows,
             continue;
         }
  
-       /* check allowed number of dimensions */
-       Assert(bms_num_members(stat->columns) >= 2 &&
-              bms_num_members(stat->columns) <= STATS_MAX_DIMENSIONS);
-
         /* compute statistics target for this statistics */
         stattarget = statext_compute_stattarget(stat->stattarget,
                                                 bms_num_members(stat->columns),
@@ -167,28 +193,49 @@ BuildRelationExtStatistics(Relation onerel, double totalrows,
         if (stattarget == 0)
             continue;
  
+       /* evaluate expressions (if the statistics has any) */
+       data = make_build_data(onerel, stat, numrows, rows, stats, stattarget);
+
         /* compute statistic of each requested type */
         foreach(lc2, stat->types)
         {
             char        t = (char) lfirst_int(lc2);
  
             if (t == STATS_EXT_NDISTINCT)
-               ndistinct = statext_ndistinct_build(totalrows, numrows, rows,
-                                                   stat->columns, stats);
+               ndistinct = statext_ndistinct_build(totalrows, data);
             else if (t == STATS_EXT_DEPENDENCIES)
-               dependencies = statext_dependencies_build(numrows, rows,
-                                                         stat->columns, stats);
+               dependencies = statext_dependencies_build(data);
             else if (t == STATS_EXT_MCV)
-               mcv = statext_mcv_build(numrows, rows, stat->columns, stats,
-                                       totalrows, stattarget);
+               mcv = statext_mcv_build(data, totalrows, stattarget);
+           else if (t == STATS_EXT_EXPRESSIONS)
+           {
+               AnlExprData *exprdata;
+               int         nexprs;
+
+               /* should not happen, thanks to checks when defining stats */
+               if (!stat->exprs)
+                   elog(ERROR, "requested expression stats, but there are no expressions");
+
+               exprdata = build_expr_data(stat->exprs, stattarget);
+               nexprs = list_length(stat->exprs);
+
+               compute_expr_stats(onerel, totalrows,
+                                  exprdata, nexprs,
+                                  rows, numrows);
+
+               exprstats = serialize_expr_stats(exprdata, nexprs);
+           }
         }
  
         /* store the statistics in the catalog */
-       statext_store(stat->statOid, ndistinct, dependencies, mcv, stats);
+       statext_store(stat->statOid, ndistinct, dependencies, mcv, exprstats, stats);
  
         /* for reporting progress */
         pgstat_progress_update_param(PROGRESS_ANALYZE_EXT_STATS_COMPUTED,
                                      ++ext_cnt);
+
+       /* free the build data (allocated as a single chunk) */
+       pfree(data);
     }
  
     table_close(pg_stext, RowExclusiveLock);
@@ -221,6 +268,10 @@ ComputeExtStatisticsRows(Relation onerel,
     MemoryContext oldcxt;
     int         result = 0;
  
+   /* If there are no columns to analyze, just return 0. */
+   if (!natts)
+       return 0;
+
     cxt = AllocSetContextCreate(CurrentMemoryContext,
                                 "ComputeExtStatisticsRows",
                                 ALLOCSET_DEFAULT_SIZES);
@@ -241,7 +292,7 @@ ComputeExtStatisticsRows(Relation onerel,
          * analyzed. If not, ignore it (don't report anything, we'll do that
          * during the actual build BuildRelationExtStatistics).
          */
-       stats = lookup_var_attr_stats(onerel, stat->columns,
+       stats = lookup_var_attr_stats(onerel, stat->columns, stat->exprs,
                                       natts, vacattrstats);
  
         if (!stats)
@@ -349,6 +400,10 @@ statext_is_kind_built(HeapTuple htup, char type)
             attnum = Anum_pg_statistic_ext_data_stxdmcv;
             break;
  
+       case STATS_EXT_EXPRESSIONS:
+           attnum = Anum_pg_statistic_ext_data_stxdexpr;
+           break;
+
         default:
             elog(ERROR, "unexpected statistics type requested: %d", type);
     }
@@ -388,6 +443,7 @@ fetch_statentries_for_relation(Relation pg_statext, Oid relid)
         ArrayType  *arr;
         char       *enabled;
         Form_pg_statistic_ext staForm;
+       List       *exprs = NIL;
  
         entry = palloc0(sizeof(StatExtEntry));
         staForm = (Form_pg_statistic_ext) GETSTRUCT(htup);
@@ -415,10 +471,40 @@ fetch_statentries_for_relation(Relation pg_statext, Oid relid)
         {
             Assert((enabled[i] == STATS_EXT_NDISTINCT) ||
                    (enabled[i] == STATS_EXT_DEPENDENCIES) ||
-                  (enabled[i] == STATS_EXT_MCV));
+                  (enabled[i] == STATS_EXT_MCV) ||
+                  (enabled[i] == STATS_EXT_EXPRESSIONS));
             entry->types = lappend_int(entry->types, (int) enabled[i]);
         }
  
+       /* decode expression (if any) */
+       datum = SysCacheGetAttr(STATEXTOID, htup,
+                               Anum_pg_statistic_ext_stxexprs, &isnull);
+
+       if (!isnull)
+       {
+           char       *exprsString;
+
+           exprsString = TextDatumGetCString(datum);
+           exprs = (List *) stringToNode(exprsString);
+
+           pfree(exprsString);
+
+           /*
+            * Run the expressions through eval_const_expressions. This is not
+            * just an optimization, but is necessary, because the planner
+            * will be comparing them to similarly-processed qual clauses, and
+            * may fail to detect valid matches without this.  We must not use
+            * canonicalize_qual, however, since these aren't qual
+            * expressions.
+            */
+           exprs = (List *) eval_const_expressions(NULL, (Node *) exprs);
+
+           /* May as well fix opfuncids too */
+           fix_opfuncids((Node *) exprs);
+       }
+
+       entry->exprs = exprs;
+
         result = lappend(result, entry);
     }
  
@@ -427,6 +513,187 @@ fetch_statentries_for_relation(Relation pg_statext, Oid relid)
     return result;
  }
  
+/*
+ * examine_attribute -- pre-analysis of a single column
+ *
+ * Determine whether the column is analyzable; if so, create and initialize
+ * a VacAttrStats struct for it.  If not, return NULL.
+ */
+static VacAttrStats *
+examine_attribute(Node *expr)
+{
+   HeapTuple   typtuple;
+   VacAttrStats *stats;
+   int         i;
+   bool        ok;
+
+   /*
+    * Create the VacAttrStats struct.  Note that we only have a copy of the
+    * fixed fields of the pg_attribute tuple.
+    */
+   stats = (VacAttrStats *) palloc0(sizeof(VacAttrStats));
+
+   /* fake the attribute */
+   stats->attr = (Form_pg_attribute) palloc0(ATTRIBUTE_FIXED_PART_SIZE);
+   stats->attr->attstattarget = -1;
+
+   /*
+    * When analyzing an expression, believe the expression tree's type not
+    * the column datatype --- the latter might be the opckeytype storage
+    * type of the opclass, which is not interesting for our purposes.  (Note:
+    * if we did anything with non-expression statistics columns, we'd need to
+    * figure out where to get the correct type info from, but for now that's
+    * not a problem.)  It's not clear whether anyone will care about the
+    * typmod, but we store that too just in case.
+    */
+   stats->attrtypid = exprType(expr);
+   stats->attrtypmod = exprTypmod(expr);
+   stats->attrcollid = exprCollation(expr);
+
+   typtuple = SearchSysCacheCopy1(TYPEOID,
+                                  ObjectIdGetDatum(stats->attrtypid));
+   if (!HeapTupleIsValid(typtuple))
+       elog(ERROR, "cache lookup failed for type %u", stats->attrtypid);
+   stats->attrtype = (Form_pg_type) GETSTRUCT(typtuple);
+
+   /*
+    * We don't actually analyze individual attributes, so no need to set the
+    * memory context.
+    */
+   stats->anl_context = NULL;
+   stats->tupattnum = InvalidAttrNumber;
+
+   /*
+    * The fields describing the stats->stavalues[n] element types default to
+    * the type of the data being analyzed, but the type-specific typanalyze
+    * function can change them if it wants to store something else.
+    */
+   for (i = 0; i < STATISTIC_NUM_SLOTS; i++)
+   {
+       stats->statypid[i] = stats->attrtypid;
+       stats->statyplen[i] = stats->attrtype->typlen;
+       stats->statypbyval[i] = stats->attrtype->typbyval;
+       stats->statypalign[i] = stats->attrtype->typalign;
+   }
+
+   /*
+    * Call the type-specific typanalyze function.  If none is specified, use
+    * std_typanalyze().
+    */
+   if (OidIsValid(stats->attrtype->typanalyze))
+       ok = DatumGetBool(OidFunctionCall1(stats->attrtype->typanalyze,
+                                          PointerGetDatum(stats)));
+   else
+       ok = std_typanalyze(stats);
+
+   if (!ok || stats->compute_stats == NULL || stats->minrows <= 0)
+   {
+       heap_freetuple(typtuple);
+       pfree(stats->attr);
+       pfree(stats);
+       return NULL;
+   }
+
+   return stats;
+}
+
+/*
+ * examine_expression -- pre-analysis of a single expression
+ *
+ * Determine whether the expression is analyzable; if so, create and initialize
+ * a VacAttrStats struct for it.  If not, return NULL.
+ */
+static VacAttrStats *
+examine_expression(Node *expr, int stattarget)
+{
+   HeapTuple   typtuple;
+   VacAttrStats *stats;
+   int         i;
+   bool        ok;
+
+   Assert(expr != NULL);
+
+   /*
+    * Create the VacAttrStats struct.
+    */
+   stats = (VacAttrStats *) palloc0(sizeof(VacAttrStats));
+
+   /*
+    * When analyzing an expression, believe the expression tree's type.
+    */
+   stats->attrtypid = exprType(expr);
+   stats->attrtypmod = exprTypmod(expr);
+
+   /*
+    * We don't allow collation to be specified in CREATE STATISTICS, so we
+    * have to use the collation specified for the expression. It's possible
+    * to specify the collation in the expression "(col COLLATE "en_US")" in
+    * which case exprCollation() does the right thing.
+    */
+   stats->attrcollid = exprCollation(expr);
+
+   /*
+    * We don't have any pg_attribute for expressions, so let's fake something
+    * reasonable into attstattarget, which is the only thing std_typanalyze
+    * needs.
+    */
+   stats->attr = (Form_pg_attribute) palloc(ATTRIBUTE_FIXED_PART_SIZE);
+
+   /*
+    * We can't have statistics target specified for the expression, so we
+    * could use either the default_statistics_target, or the target computed
+    * for the extended statistics. The second option seems more reasonable.
+    */
+   stats->attr->attstattarget = stattarget;
+
+   /* initialize some basic fields */
+   stats->attr->attrelid = InvalidOid;
+   stats->attr->attnum = InvalidAttrNumber;
+   stats->attr->atttypid = stats->attrtypid;
+
+   typtuple = SearchSysCacheCopy1(TYPEOID,
+                                  ObjectIdGetDatum(stats->attrtypid));
+   if (!HeapTupleIsValid(typtuple))
+       elog(ERROR, "cache lookup failed for type %u", stats->attrtypid);
+
+   stats->attrtype = (Form_pg_type) GETSTRUCT(typtuple);
+   stats->anl_context = CurrentMemoryContext;  /* XXX should be using
+                                                * something else? */
+   stats->tupattnum = InvalidAttrNumber;
+
+   /*
+    * The fields describing the stats->stavalues[n] element types default to
+    * the type of the data being analyzed, but the type-specific typanalyze
+    * function can change them if it wants to store something else.
+    */
+   for (i = 0; i < STATISTIC_NUM_SLOTS; i++)
+   {
+       stats->statypid[i] = stats->attrtypid;
+       stats->statyplen[i] = stats->attrtype->typlen;
+       stats->statypbyval[i] = stats->attrtype->typbyval;
+       stats->statypalign[i] = stats->attrtype->typalign;
+   }
+
+   /*
+    * Call the type-specific typanalyze function.  If none is specified, use
+    * std_typanalyze().
+    */
+   if (OidIsValid(stats->attrtype->typanalyze))
+       ok = DatumGetBool(OidFunctionCall1(stats->attrtype->typanalyze,
+                                          PointerGetDatum(stats)));
+   else
+       ok = std_typanalyze(stats);
+
+   if (!ok || stats->compute_stats == NULL || stats->minrows <= 0)
+   {
+       heap_freetuple(typtuple);
+       pfree(stats);
+       return NULL;
+   }
+
+   return stats;
+}
+
  /*
   * Using 'vacatts' of size 'nvacatts' as input data, return a newly built
   * VacAttrStats array which includes only the items corresponding to
@@ -435,15 +702,18 @@ fetch_statentries_for_relation(Relation pg_statext, Oid relid)
   * to the caller that the stats should not be built.
   */
  static VacAttrStats **
-lookup_var_attr_stats(Relation rel, Bitmapset *attrs,
+lookup_var_attr_stats(Relation rel, Bitmapset *attrs, List *exprs,
                       int nvacatts, VacAttrStats **vacatts)
  {
     int         i = 0;
     int         x = -1;
+   int         natts;
     VacAttrStats **stats;
+   ListCell   *lc;
+
+   natts = bms_num_members(attrs) + list_length(exprs);
  
-   stats = (VacAttrStats **)
-       palloc(bms_num_members(attrs) * sizeof(VacAttrStats *));
+   stats = (VacAttrStats **) palloc(natts * sizeof(VacAttrStats *));
  
     /* lookup VacAttrStats info for the requested columns (same attnum) */
     while ((x = bms_next_member(attrs, x)) >= 0)
@@ -480,6 +750,24 @@ lookup_var_attr_stats(Relation rel, Bitmapset *attrs,
         i++;
     }
  
+   /* also add info for expressions */
+   foreach(lc, exprs)
+   {
+       Node       *expr = (Node *) lfirst(lc);
+
+       stats[i] = examine_attribute(expr);
+
+       /*
+        * XXX We need tuple descriptor later, and we just grab it from
+        * stats[0]->tupDesc (see e.g. statext_mcv_build). But as coded
+        * examine_attribute does not set that, so just grab it from the first
+        * vacatts element.
+        */
+       stats[i]->tupDesc = vacatts[0]->tupDesc;
+
+       i++;
+   }
+
     return stats;
  }
  
@@ -491,7 +779,7 @@ lookup_var_attr_stats(Relation rel, Bitmapset *attrs,
  static void
  statext_store(Oid statOid,
               MVNDistinct *ndistinct, MVDependencies *dependencies,
-             MCVList *mcv, VacAttrStats **stats)
+             MCVList *mcv, Datum exprs, VacAttrStats **stats)
  {
     Relation    pg_stextdata;
     HeapTuple   stup,
@@ -532,11 +820,17 @@ statext_store(Oid statOid,
         nulls[Anum_pg_statistic_ext_data_stxdmcv - 1] = (data == NULL);
         values[Anum_pg_statistic_ext_data_stxdmcv - 1] = PointerGetDatum(data);
     }
+   if (exprs != (Datum) 0)
+   {
+       nulls[Anum_pg_statistic_ext_data_stxdexpr - 1] = false;
+       values[Anum_pg_statistic_ext_data_stxdexpr - 1] = exprs;
+   }
  
     /* always replace the value (either by bytea or NULL) */
     replaces[Anum_pg_statistic_ext_data_stxdndistinct - 1] = true;
     replaces[Anum_pg_statistic_ext_data_stxddependencies - 1] = true;
     replaces[Anum_pg_statistic_ext_data_stxdmcv - 1] = true;
+   replaces[Anum_pg_statistic_ext_data_stxdexpr - 1] = true;
  
     /* there should already be a pg_statistic_ext_data tuple */
     oldtup = SearchSysCache1(STATEXTDATASTXOID, ObjectIdGetDatum(statOid));
@@ -668,7 +962,7 @@ compare_datums_simple(Datum a, Datum b, SortSupport ssup)
   * is not necessary here (and when querying the bitmap).
   */
  AttrNumber *
-build_attnums_array(Bitmapset *attrs, int *numattrs)
+build_attnums_array(Bitmapset *attrs, int nexprs, int *numattrs)
  {
     int         i,
                 j;
@@ -684,16 +978,19 @@ build_attnums_array(Bitmapset *attrs, int *numattrs)
     j = -1;
     while ((j = bms_next_member(attrs, j)) >= 0)
     {
+       AttrNumber  attnum = (j - nexprs);
+
         /*
          * Make sure the bitmap contains only user-defined attributes. As
          * bitmaps can't contain negative values, this can be violated in two
          * ways. Firstly, the bitmap might contain 0 as a member, and secondly
          * the integer value might be larger than MaxAttrNumber.
          */
-       Assert(AttrNumberIsForUserDefinedAttr(j));
-       Assert(j <= MaxAttrNumber);
+       Assert(AttributeNumberIsValid(attnum));
+       Assert(attnum <= MaxAttrNumber);
+       Assert(attnum >= (-nexprs));
  
-       attnums[i++] = (AttrNumber) j;
+       attnums[i++] = (AttrNumber) attnum;
  
         /* protect against overflows */
         Assert(i <= num);
@@ -710,29 +1007,31 @@ build_attnums_array(Bitmapset *attrs, int *numattrs)
   * can simply pfree the return value to release all of it.
   */
  SortItem *
-build_sorted_items(int numrows, int *nitems, HeapTuple *rows, TupleDesc tdesc,
-                  MultiSortSupport mss, int numattrs, AttrNumber *attnums)
+build_sorted_items(StatsBuildData *data, int *nitems,
+                  MultiSortSupport mss,
+                  int numattrs, AttrNumber *attnums)
  {
     int         i,
                 j,
                 len,
-               idx;
-   int         nvalues = numrows * numattrs;
+               nrows;
+   int         nvalues = data->numrows * numattrs;
  
     SortItem   *items;
     Datum      *values;
     bool       *isnull;
     char       *ptr;
+   int        *typlen;
  
     /* Compute the total amount of memory we need (both items and values). */
-   len = numrows * sizeof(SortItem) + nvalues * (sizeof(Datum) + sizeof(bool));
+   len = data->numrows * sizeof(SortItem) + nvalues * (sizeof(Datum) + sizeof(bool));
  
     /* Allocate the memory and split it into the pieces. */
     ptr = palloc0(len);
  
     /* items to sort */
     items = (SortItem *) ptr;
-   ptr += numrows * sizeof(SortItem);
+   ptr += data->numrows * sizeof(SortItem);
  
     /* values and null flags */
     values = (Datum *) ptr;
@@ -745,21 +1044,47 @@ build_sorted_items(int numrows, int *nitems, HeapTuple *rows, TupleDesc tdesc,
     Assert((ptr - (char *) items) == len);
  
     /* fix the pointers to Datum and bool arrays */
-   idx = 0;
-   for (i = 0; i < numrows; i++)
+   nrows = 0;
+   for (i = 0; i < data->numrows; i++)
     {
-       bool        toowide = false;
+       items[nrows].values = &values[nrows * numattrs];
+       items[nrows].isnull = &isnull[nrows * numattrs];
  
-       items[idx].values = &values[idx * numattrs];
-       items[idx].isnull = &isnull[idx * numattrs];
+       nrows++;
+   }
+
+   /* build a local cache of typlen for all attributes */
+   typlen = (int *) palloc(sizeof(int) * data->nattnums);
+   for (i = 0; i < data->nattnums; i++)
+       typlen[i] = get_typlen(data->stats[i]->attrtypid);
+
+   nrows = 0;
+   for (i = 0; i < data->numrows; i++)
+   {
+       bool        toowide = false;
  
         /* load the values/null flags from sample rows */
         for (j = 0; j < numattrs; j++)
         {
             Datum       value;
             bool        isnull;
+           int         attlen;
+           AttrNumber  attnum = attnums[j];
+
+           int         idx;
+
+           /* match attnum to the pre-calculated data */
+           for (idx = 0; idx < data->nattnums; idx++)
+           {
+               if (attnum == data->attnums[idx])
+                   break;
+           }
  
-           value = heap_getattr(rows[i], attnums[j], tdesc, &isnull);
+           Assert(idx < data->nattnums);
+
+           value = data->values[idx][i];
+           isnull = data->nulls[idx][i];
+           attlen = typlen[idx];
  
             /*
              * If this is a varlena value, check if it's too wide and if yes
@@ -770,8 +1095,7 @@ build_sorted_items(int numrows, int *nitems, HeapTuple *rows, TupleDesc tdesc,
              * on the assumption that those are small (below WIDTH_THRESHOLD)
              * and will be discarded at the end of analyze.
              */
-           if ((!isnull) &&
-               (TupleDescAttr(tdesc, attnums[j] - 1)->attlen == -1))
+           if ((!isnull) && (attlen == -1))
             {
                 if (toast_raw_datum_size(value) > WIDTH_THRESHOLD)
                 {
@@ -782,21 +1106,21 @@ build_sorted_items(int numrows, int *nitems, HeapTuple *rows, TupleDesc tdesc,
                 value = PointerGetDatum(PG_DETOAST_DATUM(value));
             }
  
-           items[idx].values[j] = value;
-           items[idx].isnull[j] = isnull;
+           items[nrows].values[j] = value;
+           items[nrows].isnull[j] = isnull;
         }
  
         if (toowide)
             continue;
  
-       idx++;
+       nrows++;
     }
  
     /* store the actual number of items (ignoring the too-wide ones) */
-   *nitems = idx;
+   *nitems = nrows;
  
     /* all items were too wide */
-   if (idx == 0)
+   if (nrows == 0)
     {
         /* everything is allocated as a single chunk */
         pfree(items);
@@ -804,7 +1128,7 @@ build_sorted_items(int numrows, int *nitems, HeapTuple *rows, TupleDesc tdesc,
     }
  
     /* do the sort, using the multi-sort */
-   qsort_arg((void *) items, idx, sizeof(SortItem),
+   qsort_arg((void *) items, nrows, sizeof(SortItem),
               multi_sort_compare, mss);
  
     return items;
@@ -830,6 +1154,63 @@ has_stats_of_kind(List *stats, char requiredkind)
     return false;
  }
  
+/*
+ * stat_find_expression
+ *     Search for an expression in statistics object's list of expressions.
+ *
+ * Returns the index of the expression in the statistics object's list of
+ * expressions, or -1 if not found.
+ */
+static int
+stat_find_expression(StatisticExtInfo *stat, Node *expr)
+{
+   ListCell   *lc;
+   int         idx;
+
+   idx = 0;
+   foreach(lc, stat->exprs)
+   {
+       Node       *stat_expr = (Node *) lfirst(lc);
+
+       if (equal(stat_expr, expr))
+           return idx;
+       idx++;
+   }
+
+   /* Expression not found */
+   return -1;
+}
+
+/*
+ * stat_covers_expressions
+ *         Test whether a statistics object covers all expressions in a list.
+ *
+ * Returns true if all expressions are covered.  If expr_idxs is non-NULL, it
+ * is populated with the indexes of the expressions found.
+ */
+static bool
+stat_covers_expressions(StatisticExtInfo *stat, List *exprs,
+                       Bitmapset **expr_idxs)
+{
+   ListCell   *lc;
+
+   foreach(lc, exprs)
+   {
+       Node       *expr = (Node *) lfirst(lc);
+       int         expr_idx;
+
+       expr_idx = stat_find_expression(stat, expr);
+       if (expr_idx == -1)
+           return false;
+
+       if (expr_idxs != NULL)
+           *expr_idxs = bms_add_member(*expr_idxs, expr_idx);
+   }
+
+   /* If we reach here, all expressions are covered */
+   return true;
+}
+
  /*
   * choose_best_statistics
   *     Look for and return statistics with the specified 'requiredkind' which
@@ -850,7 +1231,8 @@ has_stats_of_kind(List *stats, char requiredkind)
   */
  StatisticExtInfo *
  choose_best_statistics(List *stats, char requiredkind,
-                      Bitmapset **clause_attnums, int nclauses)
+                      Bitmapset **clause_attnums, List **clause_exprs,
+                      int nclauses)
  {
     ListCell   *lc;
     StatisticExtInfo *best_match = NULL;
@@ -861,7 +1243,8 @@ choose_best_statistics(List *stats, char requiredkind,
     {
         int         i;
         StatisticExtInfo *info = (StatisticExtInfo *) lfirst(lc);
-       Bitmapset  *matched = NULL;
+       Bitmapset  *matched_attnums = NULL;
+       Bitmapset  *matched_exprs = NULL;
         int         num_matched;
         int         numkeys;
  
@@ -870,35 +1253,43 @@ choose_best_statistics(List *stats, char requiredkind,
             continue;
  
         /*
-        * Collect attributes in remaining (unestimated) clauses fully covered
-        * by this statistic object.
+        * Collect attributes and expressions in remaining (unestimated)
+        * clauses fully covered by this statistic object.
          */
         for (i = 0; i < nclauses; i++)
         {
+           Bitmapset  *expr_idxs = NULL;
+
             /* ignore incompatible/estimated clauses */
-           if (!clause_attnums[i])
+           if (!clause_attnums[i] && !clause_exprs[i])
                 continue;
  
             /* ignore clauses that are not covered by this object */
-           if (!bms_is_subset(clause_attnums[i], info->keys))
+           if (!bms_is_subset(clause_attnums[i], info->keys) ||
+               !stat_covers_expressions(info, clause_exprs[i], &expr_idxs))
                 continue;
  
-           matched = bms_add_members(matched, clause_attnums[i]);
+           /* record attnums and indexes of expressions covered */
+           matched_attnums = bms_add_members(matched_attnums, clause_attnums[i]);
+           matched_exprs = bms_add_members(matched_exprs, expr_idxs);
         }
  
-       num_matched = bms_num_members(matched);
-       bms_free(matched);
+       num_matched = bms_num_members(matched_attnums) + bms_num_members(matched_exprs);
+
+       bms_free(matched_attnums);
+       bms_free(matched_exprs);
  
         /*
          * save the actual number of keys in the stats so that we can choose
          * the narrowest stats with the most matching keys.
          */
-       numkeys = bms_num_members(info->keys);
+       numkeys = bms_num_members(info->keys) + list_length(info->exprs);
  
         /*
-        * Use this object when it increases the number of matched clauses or
-        * when it matches the same number of attributes but these stats have
-        * fewer keys than any previous match.
+        * Use this object when it increases the number of matched attributes
+        * and expressions or when it matches the same number of attributes
+        * and expressions but these stats have fewer keys than any previous
+        * match.
          */
         if (num_matched > best_num_matched ||
             (num_matched == best_num_matched && numkeys < best_match_keys))
@@ -923,7 +1314,8 @@ choose_best_statistics(List *stats, char requiredkind,
   */
  static bool
  statext_is_compatible_clause_internal(PlannerInfo *root, Node *clause,
-                                     Index relid, Bitmapset **attnums)
+                                     Index relid, Bitmapset **attnums,
+                                     List **exprs)
  {
     /* Look inside any binary-compatible relabeling (as in examine_variable) */
     if (IsA(clause, RelabelType))
@@ -951,19 +1343,19 @@ statext_is_compatible_clause_internal(PlannerInfo *root, Node *clause,
         return true;
     }
  
-   /* (Var op Const) or (Const op Var) */
+   /* (Var/Expr op Const) or (Const op Var/Expr) */
     if (is_opclause(clause))
     {
         RangeTblEntry *rte = root->simple_rte_array[relid];
         OpExpr     *expr = (OpExpr *) clause;
-       Var        *var;
+       Node       *clause_expr;
  
         /* Only expressions with two arguments are considered compatible. */
         if (list_length(expr->args) != 2)
             return false;
  
-       /* Check if the expression has the right shape (one Var, one Const) */
-       if (!examine_clause_args(expr->args, &var, NULL, NULL))
+       /* Check if the expression has the right shape */
+       if (!examine_opclause_args(expr->args, &clause_expr, NULL, NULL))
             return false;
  
         /*
@@ -981,7 +1373,7 @@ statext_is_compatible_clause_internal(PlannerInfo *root, Node *clause,
             case F_SCALARLESEL:
             case F_SCALARGTSEL:
             case F_SCALARGESEL:
-               /* supported, will continue with inspection of the Var */
+               /* supported, will continue with inspection of the Var/Expr */
                 break;
  
             default:
@@ -1003,23 +1395,29 @@ statext_is_compatible_clause_internal(PlannerInfo *root, Node *clause,
             !get_func_leakproof(get_opcode(expr->opno)))
             return false;
  
-       return statext_is_compatible_clause_internal(root, (Node *) var,
-                                                    relid, attnums);
+       /* Check (Var op Const) or (Const op Var) clauses by recursing. */
+       if (IsA(clause_expr, Var))
+           return statext_is_compatible_clause_internal(root, clause_expr,
+                                                        relid, attnums, exprs);
+
+       /* Otherwise we have (Expr op Const) or (Const op Expr). */
+       *exprs = lappend(*exprs, clause_expr);
+       return true;
     }
  
-   /* Var IN Array */
+   /* Var/Expr IN Array */
     if (IsA(clause, ScalarArrayOpExpr))
     {
         RangeTblEntry *rte = root->simple_rte_array[relid];
         ScalarArrayOpExpr *expr = (ScalarArrayOpExpr *) clause;
-       Var        *var;
+       Node       *clause_expr;
  
         /* Only expressions with two arguments are considered compatible. */
         if (list_length(expr->args) != 2)
             return false;
  
         /* Check if the expression has the right shape (one Var, one Const) */
-       if (!examine_clause_args(expr->args, &var, NULL, NULL))
+       if (!examine_opclause_args(expr->args, &clause_expr, NULL, NULL))
             return false;
  
         /*
@@ -1037,7 +1435,7 @@ statext_is_compatible_clause_internal(PlannerInfo *root, Node *clause,
             case F_SCALARLESEL:
             case F_SCALARGTSEL:
             case F_SCALARGESEL:
-               /* supported, will continue with inspection of the Var */
+               /* supported, will continue with inspection of the Var/Expr */
                 break;
  
             default:
@@ -1059,8 +1457,14 @@ statext_is_compatible_clause_internal(PlannerInfo *root, Node *clause,
             !get_func_leakproof(get_opcode(expr->opno)))
             return false;
  
-       return statext_is_compatible_clause_internal(root, (Node *) var,
-                                                    relid, attnums);
+       /* Check Var IN Array clauses by recursing. */
+       if (IsA(clause_expr, Var))
+           return statext_is_compatible_clause_internal(root, clause_expr,
+                                                        relid, attnums, exprs);
+
+       /* Otherwise we have Expr IN Array. */
+       *exprs = lappend(*exprs, clause_expr);
+       return true;
     }
  
     /* AND/OR/NOT clause */
@@ -1093,54 +1497,62 @@ statext_is_compatible_clause_internal(PlannerInfo *root, Node *clause,
              */
             if (!statext_is_compatible_clause_internal(root,
                                                        (Node *) lfirst(lc),
-                                                      relid, attnums))
+                                                      relid, attnums, exprs))
                 return false;
         }
  
         return true;
     }
  
-   /* Var IS NULL */
+   /* Var/Expr IS NULL */
     if (IsA(clause, NullTest))
     {
         NullTest   *nt = (NullTest *) clause;
  
-       /*
-        * Only simple (Var IS NULL) expressions supported for now. Maybe we
-        * could use examine_variable to fix this?
-        */
-       if (!IsA(nt->arg, Var))
-           return false;
+       /* Check Var IS NULL clauses by recursing. */
+       if (IsA(nt->arg, Var))
+           return statext_is_compatible_clause_internal(root, (Node *) (nt->arg),
+                                                        relid, attnums, exprs);
  
-       return statext_is_compatible_clause_internal(root, (Node *) (nt->arg),
-                                                    relid, attnums);
+       /* Otherwise we have Expr IS NULL. */
+       *exprs = lappend(*exprs, nt->arg);
+       return true;
     }
  
-   return false;
+   /*
+    * Treat any other expressions as bare expressions to be matched against
+    * expressions in statistics objects.
+    */
+   *exprs = lappend(*exprs, clause);
+   return true;
  }
  
  /*
   * statext_is_compatible_clause
   *     Determines if the clause is compatible with MCV lists.
   *
- * Currently, we only support three types of clauses:
+ * Currently, we only support the following types of clauses:
   *
- * (a) OpExprs of the form (Var op Const), or (Const op Var), where the op
- * is one of ("=", "<", ">", ">=", "<=")
+ * (a) OpExprs of the form (Var/Expr op Const), or (Const op Var/Expr), where
+ * the op is one of ("=", "<", ">", ">=", "<=")
   *
- * (b) (Var IS [NOT] NULL)
+ * (b) (Var/Expr IS [NOT] NULL)
   *
   * (c) combinations using AND/OR/NOT
   *
+ * (d) ScalarArrayOpExprs of the form (Var/Expr op ANY (array)) or (Var/Expr
+ * op ALL (array))
+ *
   * In the future, the range of supported clauses may be expanded to more
   * complex cases, for example (Var op Var).
   */
  static bool
  statext_is_compatible_clause(PlannerInfo *root, Node *clause, Index relid,
-                            Bitmapset **attnums)
+                            Bitmapset **attnums, List **exprs)
  {
     RangeTblEntry *rte = root->simple_rte_array[relid];
     RestrictInfo *rinfo = (RestrictInfo *) clause;
+   int         clause_relid;
     Oid         userid;
  
     /*
@@ -1160,7 +1572,7 @@ statext_is_compatible_clause(PlannerInfo *root, Node *clause, Index relid,
         foreach(lc, expr->args)
         {
             if (!statext_is_compatible_clause(root, (Node *) lfirst(lc),
-                                             relid, attnums))
+                                             relid, attnums, exprs))
                 return false;
         }
  
@@ -1175,25 +1587,36 @@ statext_is_compatible_clause(PlannerInfo *root, Node *clause, Index relid,
     if (rinfo->pseudoconstant)
         return false;
  
-   /* clauses referencing multiple varnos are incompatible */
-   if (bms_membership(rinfo->clause_relids) != BMS_SINGLETON)
+   /* Clauses referencing other varnos are incompatible. */
+   if (!bms_get_singleton_member(rinfo->clause_relids, &clause_relid) ||
+       clause_relid != relid)
         return false;
  
     /* Check the clause and determine what attributes it references. */
     if (!statext_is_compatible_clause_internal(root, (Node *) rinfo->clause,
-                                              relid, attnums))
+                                              relid, attnums, exprs))
         return false;
  
     /*
-    * Check that the user has permission to read all these attributes.  Use
+    * Check that the user has permission to read all required attributes. Use
      * checkAsUser if it's set, in case we're accessing the table via a view.
      */
     userid = rte->checkAsUser ? rte->checkAsUser : GetUserId();
  
     if (pg_class_aclcheck(rte->relid, userid, ACL_SELECT) != ACLCHECK_OK)
     {
+       Bitmapset  *clause_attnums;
+
         /* Don't have table privilege, must check individual columns */
-       if (bms_is_member(InvalidAttrNumber, *attnums))
+       if (*exprs != NIL)
+       {
+           pull_varattnos((Node *) exprs, relid, &clause_attnums);
+           clause_attnums = bms_add_members(clause_attnums, *attnums);
+       }
+       else
+           clause_attnums = *attnums;
+
+       if (bms_is_member(InvalidAttrNumber, clause_attnums))
         {
             /* Have a whole-row reference, must have access to all columns */
             if (pg_attribute_aclcheck_all(rte->relid, userid, ACL_SELECT,
@@ -1205,7 +1628,7 @@ statext_is_compatible_clause(PlannerInfo *root, Node *clause, Index relid,
             /* Check the columns referenced by the clause */
             int         attnum = -1;
  
-           while ((attnum = bms_next_member(*attnums, attnum)) >= 0)
+           while ((attnum = bms_next_member(clause_attnums, attnum)) >= 0)
             {
                 if (pg_attribute_aclcheck(rte->relid, attnum, userid,
                                           ACL_SELECT) != ACLCHECK_OK)
@@ -1259,7 +1682,8 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli
                                    bool is_or)
  {
     ListCell   *l;
-   Bitmapset **list_attnums;
+   Bitmapset **list_attnums;   /* attnums extracted from the clause */
+   List      **list_exprs;     /* expressions matched to any statistic */
     int         listidx;
     Selectivity sel = (is_or) ? 0.0 : 1.0;
  
@@ -1270,13 +1694,16 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli
     list_attnums = (Bitmapset **) palloc(sizeof(Bitmapset *) *
                                          list_length(clauses));
  
+   /* expressions extracted from complex expressions */
+   list_exprs = (List **) palloc(sizeof(Node *) * list_length(clauses));
+
     /*
-    * Pre-process the clauses list to extract the attnums seen in each item.
-    * We need to determine if there's any clauses which will be useful for
-    * selectivity estimations with extended stats. Along the way we'll record
-    * all of the attnums for each clause in a list which we'll reference
-    * later so we don't need to repeat the same work again. We'll also keep
-    * track of all attnums seen.
+    * Pre-process the clauses list to extract the attnums and expressions
+    * seen in each item.  We need to determine if there are any clauses which
+    * will be useful for selectivity estimations with extended stats.  Along
+    * the way we'll record all of the attnums and expressions for each clause
+    * in lists which we'll reference later so we don't need to repeat the
+    * same work again.
      *
      * We also skip clauses that we already estimated using different types of
      * statistics (we treat them as incompatible).
@@ -1286,12 +1713,19 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli
     {
         Node       *clause = (Node *) lfirst(l);
         Bitmapset  *attnums = NULL;
+       List       *exprs = NIL;
  
         if (!bms_is_member(listidx, *estimatedclauses) &&
-           statext_is_compatible_clause(root, clause, rel->relid, &attnums))
+           statext_is_compatible_clause(root, clause, rel->relid, &attnums, &exprs))
+       {
             list_attnums[listidx] = attnums;
+           list_exprs[listidx] = exprs;
+       }
         else
+       {
             list_attnums[listidx] = NULL;
+           list_exprs[listidx] = NIL;
+       }
  
         listidx++;
     }
@@ -1305,7 +1739,8 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli
  
         /* find the best suited statistics object for these attnums */
         stat = choose_best_statistics(rel->statlist, STATS_EXT_MCV,
-                                     list_attnums, list_length(clauses));
+                                     list_attnums, list_exprs,
+                                     list_length(clauses));
  
         /*
          * if no (additional) matching stats could be found then we've nothing
@@ -1320,28 +1755,39 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli
         /* now filter the clauses to be estimated using the selected MCV */
         stat_clauses = NIL;
  
-       /* record which clauses are simple (single column) */
+       /* record which clauses are simple (single column or expression) */
         simple_clauses = NULL;
  
         listidx = 0;
         foreach(l, clauses)
         {
             /*
-            * If the clause is compatible with the selected statistics, mark
-            * it as estimated and add it to the list to estimate.
+            * If the clause is not already estimated and is compatible with
+            * the selected statistics object (all attributes and expressions
+            * covered), mark it as estimated and add it to the list to
+            * estimate.
              */
-           if (list_attnums[listidx] != NULL &&
-               bms_is_subset(list_attnums[listidx], stat->keys))
+           if (!bms_is_member(listidx, *estimatedclauses) &&
+               bms_is_subset(list_attnums[listidx], stat->keys) &&
+               stat_covers_expressions(stat, list_exprs[listidx], NULL))
             {
-               if (bms_membership(list_attnums[listidx]) == BMS_SINGLETON)
+               /* record simple clauses (single column or expression) */
+               if ((list_attnums[listidx] == NULL &&
+                    list_length(list_exprs[listidx]) == 1) ||
+                   (list_exprs[listidx] == NIL &&
+                    bms_membership(list_attnums[listidx]) == BMS_SINGLETON))
                     simple_clauses = bms_add_member(simple_clauses,
                                                     list_length(stat_clauses));
  
+               /* add clause to list and mark as estimated */
                 stat_clauses = lappend(stat_clauses, (Node *) lfirst(l));
                 *estimatedclauses = bms_add_member(*estimatedclauses, listidx);
  
                 bms_free(list_attnums[listidx]);
                 list_attnums[listidx] = NULL;
+
+               list_free(list_exprs[listidx]);
+               list_exprs[listidx] = NULL;
             }
  
             listidx++;
@@ -1530,23 +1976,24 @@ statext_clauselist_selectivity(PlannerInfo *root, List *clauses, int varRelid,
  }
  
  /*
- * examine_opclause_expression
- *     Split expression into Var and Const parts.
+ * examine_opclause_args
+ *     Split an operator expression's arguments into Expr and Const parts.
   *
- * Attempts to match the arguments to either (Var op Const) or (Const op Var),
- * possibly with a RelabelType on top. When the expression matches this form,
- * returns true, otherwise returns false.
+ * Attempts to match the arguments to either (Expr op Const) or (Const op
+ * Expr), possibly with a RelabelType on top. When the expression matches this
+ * form, returns true, otherwise returns false.
   *
- * Optionally returns pointers to the extracted Var/Const nodes, when passed
- * non-null pointers (varp, cstp and varonleftp). The varonleftp flag specifies
- * on which side of the operator we found the Var node.
+ * Optionally returns pointers to the extracted Expr/Const nodes, when passed
+ * non-null pointers (exprp, cstp and expronleftp). The expronleftp flag
+ * specifies on which side of the operator we found the expression node.
   */
  bool
-examine_clause_args(List *args, Var **varp, Const **cstp, bool *varonleftp)
+examine_opclause_args(List *args, Node **exprp, Const **cstp,
+                     bool *expronleftp)
  {
-   Var        *var;
+   Node       *expr;
     Const      *cst;
-   bool        varonleft;
+   bool        expronleft;
     Node       *leftop,
                *rightop;
  
@@ -1563,30 +2010,564 @@ examine_clause_args(List *args, Var **varp, Const **cstp, bool *varonleftp)
     if (IsA(rightop, RelabelType))
         rightop = (Node *) ((RelabelType *) rightop)->arg;
  
-   if (IsA(leftop, Var) && IsA(rightop, Const))
+   if (IsA(rightop, Const))
     {
-       var = (Var *) leftop;
+       expr = (Node *) leftop;
         cst = (Const *) rightop;
-       varonleft = true;
+       expronleft = true;
     }
-   else if (IsA(leftop, Const) && IsA(rightop, Var))
+   else if (IsA(leftop, Const))
     {
-       var = (Var *) rightop;
+       expr = (Node *) rightop;
         cst = (Const *) leftop;
-       varonleft = false;
+       expronleft = false;
     }
     else
         return false;
  
     /* return pointers to the extracted parts if requested */
-   if (varp)
-       *varp = var;
+   if (exprp)
+       *exprp = expr;
  
     if (cstp)
         *cstp = cst;
  
-   if (varonleftp)
-       *varonleftp = varonleft;
+   if (expronleftp)
+       *expronleftp = expronleft;
  
     return true;
  }
+
+
+/*
+ * Compute statistics about expressions of a relation.
+ */
+static void
+compute_expr_stats(Relation onerel, double totalrows,
+                  AnlExprData *exprdata, int nexprs,
+                  HeapTuple *rows, int numrows)
+{
+   MemoryContext expr_context,
+               old_context;
+   int         ind,
+               i;
+
+   expr_context = AllocSetContextCreate(CurrentMemoryContext,
+                                        "Analyze Expression",
+                                        ALLOCSET_DEFAULT_SIZES);
+   old_context = MemoryContextSwitchTo(expr_context);
+
+   for (ind = 0; ind < nexprs; ind++)
+   {
+       AnlExprData *thisdata = &exprdata[ind];
+       VacAttrStats *stats = thisdata->vacattrstat;
+       Node       *expr = thisdata->expr;
+       TupleTableSlot *slot;
+       EState     *estate;
+       ExprContext *econtext;
+       Datum      *exprvals;
+       bool       *exprnulls;
+       ExprState  *exprstate;
+       int         tcnt;
+
+       /* Are we still in the main context? */
+       Assert(CurrentMemoryContext == expr_context);
+
+       /*
+        * Need an EState for evaluation of expressions.  Create it in the
+        * per-expression context to be sure it gets cleaned up at the bottom
+        * of the loop.
+        */
+       estate = CreateExecutorState();
+       econtext = GetPerTupleExprContext(estate);
+
+       /* Set up expression evaluation state */
+       exprstate = ExecPrepareExpr((Expr *) expr, estate);
+
+       /* Need a slot to hold the current heap tuple, too */
+       slot = MakeSingleTupleTableSlot(RelationGetDescr(onerel),
+                                       &TTSOpsHeapTuple);
+
+       /* Arrange for econtext's scan tuple to be the tuple under test */
+       econtext->ecxt_scantuple = slot;
+
+       /* Compute and save expression values */
+       exprvals = (Datum *) palloc(numrows * sizeof(Datum));
+       exprnulls = (bool *) palloc(numrows * sizeof(bool));
+
+       tcnt = 0;
+       for (i = 0; i < numrows; i++)
+       {
+           Datum       datum;
+           bool        isnull;
+
+           /*
+            * Reset the per-tuple context each time, to reclaim any cruft
+            * left behind by evaluating the statistics expressions.
+            */
+           ResetExprContext(econtext);
+
+           /* Set up for expression evaluation */
+           ExecStoreHeapTuple(rows[i], slot, false);
+
+           /*
+            * Evaluate the expression. We do this in the per-tuple context so
+            * as not to leak memory, and then copy the result into the
+            * context created at the beginning of this function.
+            */
+           datum = ExecEvalExprSwitchContext(exprstate,
+                                             GetPerTupleExprContext(estate),
+                                             &isnull);
+           if (isnull)
+           {
+               exprvals[tcnt] = (Datum) 0;
+               exprnulls[tcnt] = true;
+           }
+           else
+           {
+               /* Make sure we copy the data into the context. */
+               Assert(CurrentMemoryContext == expr_context);
+
+               exprvals[tcnt] = datumCopy(datum,
+                                          stats->attrtype->typbyval,
+                                          stats->attrtype->typlen);
+               exprnulls[tcnt] = false;
+           }
+
+           tcnt++;
+       }
+
+       /*
+        * Now we can compute the statistics for the expression columns.
+        *
+        * XXX Unlike compute_index_stats we don't need to switch and reset
+        * memory contexts here, because we're only computing stats for a
+        * single expression (and not iterating over many indexes), so we just
+        * do it in expr_context. Note that compute_stats copies the result
+        * into stats->anl_context, so it does not disappear.
+        */
+       if (tcnt > 0)
+       {
+           AttributeOpts *aopt =
+           get_attribute_options(stats->attr->attrelid,
+                                 stats->attr->attnum);
+
+           stats->exprvals = exprvals;
+           stats->exprnulls = exprnulls;
+           stats->rowstride = 1;
+           stats->compute_stats(stats,
+                                expr_fetch_func,
+                                tcnt,
+                                tcnt);
+
+           /*
+            * If the n_distinct option is specified, it overrides the above
+            * computation.
+            */
+           if (aopt != NULL && aopt->n_distinct != 0.0)
+               stats->stadistinct = aopt->n_distinct;
+       }
+
+       /* And clean up */
+       MemoryContextSwitchTo(expr_context);
+
+       ExecDropSingleTupleTableSlot(slot);
+       FreeExecutorState(estate);
+       MemoryContextResetAndDeleteChildren(expr_context);
+   }
+
+   MemoryContextSwitchTo(old_context);
+   MemoryContextDelete(expr_context);
+}
+
+
+/*
+ * Fetch function for analyzing statistics object expressions.
+ *
+ * We have not bothered to construct tuples from the data, instead the data
+ * is just in Datum arrays.
+ */
+static Datum
+expr_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull)
+{
+   int         i;
+
+   /* exprvals and exprnulls are already offset for proper column */
+   i = rownum * stats->rowstride;
+   *isNull = stats->exprnulls[i];
+   return stats->exprvals[i];
+}
+
+/*
+ * Build analyze data for a list of expressions. As this is not tied
+ * directly to a relation (table or index), we have to fake some of
+ * the fields in examine_expression().
+ */
+static AnlExprData *
+build_expr_data(List *exprs, int stattarget)
+{
+   int         idx;
+   int         nexprs = list_length(exprs);
+   AnlExprData *exprdata;
+   ListCell   *lc;
+
+   exprdata = (AnlExprData *) palloc0(nexprs * sizeof(AnlExprData));
+
+   idx = 0;
+   foreach(lc, exprs)
+   {
+       Node       *expr = (Node *) lfirst(lc);
+       AnlExprData *thisdata = &exprdata[idx];
+
+       thisdata->expr = expr;
+       thisdata->vacattrstat = examine_expression(expr, stattarget);
+       idx++;
+   }
+
+   return exprdata;
+}
+
+/* form an array of pg_statistic rows (per update_attstats) */
+static Datum
+serialize_expr_stats(AnlExprData *exprdata, int nexprs)
+{
+   int         exprno;
+   Oid         typOid;
+   Relation    sd;
+
+   ArrayBuildState *astate = NULL;
+
+   sd = table_open(StatisticRelationId, RowExclusiveLock);
+
+   /* lookup OID of composite type for pg_statistic */
+   typOid = get_rel_type_id(StatisticRelationId);
+   if (!OidIsValid(typOid))
+       ereport(ERROR,
+               (errcode(ERRCODE_WRONG_OBJECT_TYPE),
+                errmsg("relation \"pg_statistic\" does not have a composite type")));
+
+   for (exprno = 0; exprno < nexprs; exprno++)
+   {
+       int         i,
+                   k;
+       VacAttrStats *stats = exprdata[exprno].vacattrstat;
+
+       Datum       values[Natts_pg_statistic];
+       bool        nulls[Natts_pg_statistic];
+       HeapTuple   stup;
+
+       if (!stats->stats_valid)
+       {
+           astate = accumArrayResult(astate,
+                                     (Datum) 0,
+                                     true,
+                                     typOid,
+                                     CurrentMemoryContext);
+           continue;
+       }
+
+       /*
+        * Construct a new pg_statistic tuple
+        */
+       for (i = 0; i < Natts_pg_statistic; ++i)
+       {
+           nulls[i] = false;
+       }
+
+       values[Anum_pg_statistic_starelid - 1] = ObjectIdGetDatum(InvalidOid);
+       values[Anum_pg_statistic_staattnum - 1] = Int16GetDatum(InvalidAttrNumber);
+       values[Anum_pg_statistic_stainherit - 1] = BoolGetDatum(false);
+       values[Anum_pg_statistic_stanullfrac - 1] = Float4GetDatum(stats->stanullfrac);
+       values[Anum_pg_statistic_stawidth - 1] = Int32GetDatum(stats->stawidth);
+       values[Anum_pg_statistic_stadistinct - 1] = Float4GetDatum(stats->stadistinct);
+       i = Anum_pg_statistic_stakind1 - 1;
+       for (k = 0; k < STATISTIC_NUM_SLOTS; k++)
+       {
+           values[i++] = Int16GetDatum(stats->stakind[k]); /* stakindN */
+       }
+       i = Anum_pg_statistic_staop1 - 1;
+       for (k = 0; k < STATISTIC_NUM_SLOTS; k++)
+       {
+           values[i++] = ObjectIdGetDatum(stats->staop[k]);    /* staopN */
+       }
+       i = Anum_pg_statistic_stacoll1 - 1;
+       for (k = 0; k < STATISTIC_NUM_SLOTS; k++)
+       {
+           values[i++] = ObjectIdGetDatum(stats->stacoll[k]);  /* stacollN */
+       }
+       i = Anum_pg_statistic_stanumbers1 - 1;
+       for (k = 0; k < STATISTIC_NUM_SLOTS; k++)
+       {
+           int         nnum = stats->numnumbers[k];
+
+           if (nnum > 0)
+           {
+               int         n;
+               Datum      *numdatums = (Datum *) palloc(nnum * sizeof(Datum));
+               ArrayType  *arry;
+
+               for (n = 0; n < nnum; n++)
+                   numdatums[n] = Float4GetDatum(stats->stanumbers[k][n]);
+               /* XXX knows more than it should about type float4: */
+               arry = construct_array(numdatums, nnum,
+                                      FLOAT4OID,
+                                      sizeof(float4), true, TYPALIGN_INT);
+               values[i++] = PointerGetDatum(arry);    /* stanumbersN */
+           }
+           else
+           {
+               nulls[i] = true;
+               values[i++] = (Datum) 0;
+           }
+       }
+       i = Anum_pg_statistic_stavalues1 - 1;
+       for (k = 0; k < STATISTIC_NUM_SLOTS; k++)
+       {
+           if (stats->numvalues[k] > 0)
+           {
+               ArrayType  *arry;
+
+               arry = construct_array(stats->stavalues[k],
+                                      stats->numvalues[k],
+                                      stats->statypid[k],
+                                      stats->statyplen[k],
+                                      stats->statypbyval[k],
+                                      stats->statypalign[k]);
+               values[i++] = PointerGetDatum(arry);    /* stavaluesN */
+           }
+           else
+           {
+               nulls[i] = true;
+               values[i++] = (Datum) 0;
+           }
+       }
+
+       stup = heap_form_tuple(RelationGetDescr(sd), values, nulls);
+
+       astate = accumArrayResult(astate,
+                                 heap_copy_tuple_as_datum(stup, RelationGetDescr(sd)),
+                                 false,
+                                 typOid,
+                                 CurrentMemoryContext);
+   }
+
+   table_close(sd, RowExclusiveLock);
+
+   return makeArrayResult(astate, CurrentMemoryContext);
+}
+
+/*
+ * Loads pg_statistic record from expression statistics for expression
+ * identified by the supplied index.
+ */
+HeapTuple
+statext_expressions_load(Oid stxoid, int idx)
+{
+   bool        isnull;
+   Datum       value;
+   HeapTuple   htup;
+   ExpandedArrayHeader *eah;
+   HeapTupleHeader td;
+   HeapTupleData tmptup;
+   HeapTuple   tup;
+
+   htup = SearchSysCache1(STATEXTDATASTXOID, ObjectIdGetDatum(stxoid));
+   if (!HeapTupleIsValid(htup))
+       elog(ERROR, "cache lookup failed for statistics object %u", stxoid);
+
+   value = SysCacheGetAttr(STATEXTDATASTXOID, htup,
+                           Anum_pg_statistic_ext_data_stxdexpr, &isnull);
+   if (isnull)
+       elog(ERROR,
+            "requested statistics kind \"%c\" is not yet built for statistics object %u",
+            STATS_EXT_DEPENDENCIES, stxoid);
+
+   eah = DatumGetExpandedArray(value);
+
+   deconstruct_expanded_array(eah);
+
+   td = DatumGetHeapTupleHeader(eah->dvalues[idx]);
+
+   /* Build a temporary HeapTuple control structure */
+   tmptup.t_len = HeapTupleHeaderGetDatumLength(td);
+   tmptup.t_data = td;
+
+   tup = heap_copytuple(&tmptup);
+
+   ReleaseSysCache(htup);
+
+   return tup;
+}
+
+/*
+ * Evaluate the expressions, so that we can use the results to build
+ * all the requested statistics types. This matters especially for
+ * expensive expressions, of course.
+ */
+static StatsBuildData *
+make_build_data(Relation rel, StatExtEntry *stat, int numrows, HeapTuple *rows,
+               VacAttrStats **stats, int stattarget)
+{
+   /* evaluated expressions */
+   StatsBuildData *result;
+   char       *ptr;
+   Size        len;
+
+   int         i;
+   int         k;
+   int         idx;
+   TupleTableSlot *slot;
+   EState     *estate;
+   ExprContext *econtext;
+   List       *exprstates = NIL;
+   int         nkeys = bms_num_members(stat->columns) + list_length(stat->exprs);
+   ListCell   *lc;
+
+   /* allocate everything as a single chunk, so we can free it easily */
+   len = MAXALIGN(sizeof(StatsBuildData));
+   len += MAXALIGN(sizeof(AttrNumber) * nkeys);    /* attnums */
+   len += MAXALIGN(sizeof(VacAttrStats *) * nkeys);    /* stats */
+
+   /* values */
+   len += MAXALIGN(sizeof(Datum *) * nkeys);
+   len += nkeys * MAXALIGN(sizeof(Datum) * numrows);
+
+   /* nulls */
+   len += MAXALIGN(sizeof(bool *) * nkeys);
+   len += nkeys * MAXALIGN(sizeof(bool) * numrows);
+
+   ptr = palloc(len);
+
+   /* set the pointers */
+   result = (StatsBuildData *) ptr;
+   ptr += MAXALIGN(sizeof(StatsBuildData));
+
+   /* attnums */
+   result->attnums = (AttrNumber *) ptr;
+   ptr += MAXALIGN(sizeof(AttrNumber) * nkeys);
+
+   /* stats */
+   result->stats = (VacAttrStats **) ptr;
+   ptr += MAXALIGN(sizeof(VacAttrStats *) * nkeys);
+
+   /* values */
+   result->values = (Datum **) ptr;
+   ptr += MAXALIGN(sizeof(Datum *) * nkeys);
+
+   /* nulls */
+   result->nulls = (bool **) ptr;
+   ptr += MAXALIGN(sizeof(bool *) * nkeys);
+
+   for (i = 0; i < nkeys; i++)
+   {
+       result->values[i] = (Datum *) ptr;
+       ptr += MAXALIGN(sizeof(Datum) * numrows);
+
+       result->nulls[i] = (bool *) ptr;
+       ptr += MAXALIGN(sizeof(bool) * numrows);
+   }
+
+   Assert((ptr - (char *) result) == len);
+
+   /* we have it allocated, so let's fill the values */
+   result->nattnums = nkeys;
+   result->numrows = numrows;
+
+   /* fill the attribute info - first attributes, then expressions */
+   idx = 0;
+   k = -1;
+   while ((k = bms_next_member(stat->columns, k)) >= 0)
+   {
+       result->attnums[idx] = k;
+       result->stats[idx] = stats[idx];
+
+       idx++;
+   }
+
+   k = -1;
+   foreach(lc, stat->exprs)
+   {
+       Node       *expr = (Node *) lfirst(lc);
+
+       result->attnums[idx] = k;
+       result->stats[idx] = examine_expression(expr, stattarget);
+
+       idx++;
+       k--;
+   }
+
+   /* first extract values for all the regular attributes */
+   for (i = 0; i < numrows; i++)
+   {
+       idx = 0;
+       k = -1;
+       while ((k = bms_next_member(stat->columns, k)) >= 0)
+       {
+           result->values[idx][i] = heap_getattr(rows[i], k,
+                                                 result->stats[idx]->tupDesc,
+                                                 &result->nulls[idx][i]);
+
+           idx++;
+       }
+   }
+
+   /* Need an EState for evaluation expressions. */
+   estate = CreateExecutorState();
+   econtext = GetPerTupleExprContext(estate);
+
+   /* Need a slot to hold the current heap tuple, too */
+   slot = MakeSingleTupleTableSlot(RelationGetDescr(rel),
+                                   &TTSOpsHeapTuple);
+
+   /* Arrange for econtext's scan tuple to be the tuple under test */
+   econtext->ecxt_scantuple = slot;
+
+   /* Set up expression evaluation state */
+   exprstates = ExecPrepareExprList(stat->exprs, estate);
+
+   for (i = 0; i < numrows; i++)
+   {
+       /*
+        * Reset the per-tuple context each time, to reclaim any cruft left
+        * behind by evaluating the statistics object expressions.
+        */
+       ResetExprContext(econtext);
+
+       /* Set up for expression evaluation */
+       ExecStoreHeapTuple(rows[i], slot, false);
+
+       idx = bms_num_members(stat->columns);
+       foreach(lc, exprstates)
+       {
+           Datum       datum;
+           bool        isnull;
+           ExprState  *exprstate = (ExprState *) lfirst(lc);
+
+           /*
+            * XXX This probably leaks memory. Maybe we should use
+            * ExecEvalExprSwitchContext but then we need to copy the result
+            * somewhere else.
+            */
+           datum = ExecEvalExpr(exprstate,
+                                GetPerTupleExprContext(estate),
+                                &isnull);
+           if (isnull)
+           {
+               result->values[idx][i] = (Datum) 0;
+               result->nulls[idx][i] = true;
+           }
+           else
+           {
+               result->values[idx][i] = (Datum) datum;
+               result->nulls[idx][i] = false;
+           }
+
+           idx++;
+       }
+   }
+
+   ExecDropSingleTupleTableSlot(slot);
+   FreeExecutorState(estate);
+
+   return result;
+}
diff --git a/src/backend/statistics/mcv.c b/src/backend/statistics/mcv.c

index 8335dff2418d5d26f7eade663db26de52256095a..2a00fb48483f477fb328b44c1f1b39fe34e6e11f 100644 (file)
--- a/src/backend/statistics/mcv.c
+++ b/src/backend/statistics/mcv.c
@@ -74,7 +74,7 @@
      ((ndims) * sizeof(DimensionInfo)) + \
      ((nitems) * ITEM_SIZE(ndims)))
  
-static MultiSortSupport build_mss(VacAttrStats **stats, int numattrs);
+static MultiSortSupport build_mss(StatsBuildData *data);
  
  static SortItem *build_distinct_groups(int numrows, SortItem *items,
                                        MultiSortSupport mss, int *ndistinct);
@@ -181,32 +181,33 @@ get_mincount_for_mcv_list(int samplerows, double totalrows)
   *
   */
  MCVList *
-statext_mcv_build(int numrows, HeapTuple *rows, Bitmapset *attrs,
-                 VacAttrStats **stats, double totalrows, int stattarget)
+statext_mcv_build(StatsBuildData *data, double totalrows, int stattarget)
  {
     int         i,
                 numattrs,
+               numrows,
                 ngroups,
                 nitems;
-   AttrNumber *attnums;
     double      mincount;
     SortItem   *items;
     SortItem   *groups;
     MCVList    *mcvlist = NULL;
     MultiSortSupport mss;
  
-   attnums = build_attnums_array(attrs, &numattrs);
-
     /* comparator for all the columns */
-   mss = build_mss(stats, numattrs);
+   mss = build_mss(data);
  
     /* sort the rows */
-   items = build_sorted_items(numrows, &nitems, rows, stats[0]->tupDesc,
-                              mss, numattrs, attnums);
+   items = build_sorted_items(data, &nitems, mss,
+                              data->nattnums, data->attnums);
  
     if (!items)
         return NULL;
  
+   /* for convenience */
+   numattrs = data->nattnums;
+   numrows = data->numrows;
+
     /* transform the sorted rows into groups (sorted by frequency) */
     groups = build_distinct_groups(nitems, items, mss, &ngroups);
  
@@ -289,7 +290,7 @@ statext_mcv_build(int numrows, HeapTuple *rows, Bitmapset *attrs,
  
         /* store info about data type OIDs */
         for (i = 0; i < numattrs; i++)
-           mcvlist->types[i] = stats[i]->attrtypid;
+           mcvlist->types[i] = data->stats[i]->attrtypid;
  
         /* Copy the first chunk of groups into the result. */
         for (i = 0; i < nitems; i++)
@@ -347,9 +348,10 @@ statext_mcv_build(int numrows, HeapTuple *rows, Bitmapset *attrs,
   * build MultiSortSupport for the attributes passed in attrs
   */
  static MultiSortSupport
-build_mss(VacAttrStats **stats, int numattrs)
+build_mss(StatsBuildData *data)
  {
     int         i;
+   int         numattrs = data->nattnums;
  
     /* Sort by multiple columns (using array of SortSupport) */
     MultiSortSupport mss = multi_sort_init(numattrs);
@@ -357,7 +359,7 @@ build_mss(VacAttrStats **stats, int numattrs)
     /* prepare the sort functions for all the attributes */
     for (i = 0; i < numattrs; i++)
     {
-       VacAttrStats *colstat = stats[i];
+       VacAttrStats *colstat = data->stats[i];
         TypeCacheEntry *type;
  
         type = lookup_type_cache(colstat->attrtypid, TYPECACHE_LT_OPR);
@@ -1523,6 +1525,59 @@ pg_mcv_list_send(PG_FUNCTION_ARGS)
     return byteasend(fcinfo);
  }
  
+/*
+ * match the attribute/expression to a dimension of the statistic
+ *
+ * Match the attribute/expression to statistics dimension. Optionally
+ * determine the collation.
+ */
+static int
+mcv_match_expression(Node *expr, Bitmapset *keys, List *exprs, Oid *collid)
+{
+   int         idx = -1;
+
+   if (IsA(expr, Var))
+   {
+       /* simple Var, so just lookup using varattno */
+       Var        *var = (Var *) expr;
+
+       if (collid)
+           *collid = var->varcollid;
+
+       idx = bms_member_index(keys, var->varattno);
+
+       /* make sure the index is valid */
+       Assert((idx >= 0) && (idx <= bms_num_members(keys)));
+   }
+   else
+   {
+       ListCell   *lc;
+
+       /* expressions are stored after the simple columns */
+       idx = bms_num_members(keys);
+
+       if (collid)
+           *collid = exprCollation(expr);
+
+       /* expression - lookup in stats expressions */
+       foreach(lc, exprs)
+       {
+           Node       *stat_expr = (Node *) lfirst(lc);
+
+           if (equal(expr, stat_expr))
+               break;
+
+           idx++;
+       }
+
+       /* make sure the index is valid */
+       Assert((idx >= bms_num_members(keys)) &&
+              (idx <= bms_num_members(keys) + list_length(exprs)));
+   }
+
+   return idx;
+}
+
  /*
   * mcv_get_match_bitmap
   * Evaluate clauses using the MCV list, and update the match bitmap.
@@ -1544,7 +1599,8 @@ pg_mcv_list_send(PG_FUNCTION_ARGS)
   */
  static bool *
  mcv_get_match_bitmap(PlannerInfo *root, List *clauses,
-                    Bitmapset *keys, MCVList *mcvlist, bool is_or)
+                    Bitmapset *keys, List *exprs,
+                    MCVList *mcvlist, bool is_or)
  {
     int         i;
     ListCell   *l;
@@ -1582,77 +1638,78 @@ mcv_get_match_bitmap(PlannerInfo *root, List *clauses,
             OpExpr     *expr = (OpExpr *) clause;
             FmgrInfo    opproc;
  
-           /* valid only after examine_clause_args returns true */
-           Var        *var;
+           /* valid only after examine_opclause_args returns true */
+           Node       *clause_expr;
             Const      *cst;
-           bool        varonleft;
+           bool        expronleft;
+           int         idx;
+           Oid         collid;
  
             fmgr_info(get_opcode(expr->opno), &opproc);
  
-           /* extract the var and const from the expression */
-           if (examine_clause_args(expr->args, &var, &cst, &varonleft))
+           /* extract the var/expr and const from the expression */
+           if (!examine_opclause_args(expr->args, &clause_expr, &cst, &expronleft))
+               elog(ERROR, "incompatible clause");
+
+           /* match the attribute/expression to a dimension of the statistic */
+           idx = mcv_match_expression(clause_expr, keys, exprs, &collid);
+
+           /*
+            * Walk through the MCV items and evaluate the current clause. We
+            * can skip items that were already ruled out, and terminate if
+            * there are no remaining MCV items that might possibly match.
+            */
+           for (i = 0; i < mcvlist->nitems; i++)
             {
-               int         idx;
+               bool        match = true;
+               MCVItem    *item = &mcvlist->items[i];
  
-               /* match the attribute to a dimension of the statistic */
-               idx = bms_member_index(keys, var->varattno);
+               Assert(idx >= 0);
  
                 /*
-                * Walk through the MCV items and evaluate the current clause.
-                * We can skip items that were already ruled out, and
-                * terminate if there are no remaining MCV items that might
-                * possibly match.
+                * When the MCV item or the Const value is NULL we can treat
+                * this as a mismatch. We must not call the operator because
+                * of strictness.
                  */
-               for (i = 0; i < mcvlist->nitems; i++)
+               if (item->isnull[idx] || cst->constisnull)
                 {
-                   bool        match = true;
-                   MCVItem    *item = &mcvlist->items[i];
-
-                   /*
-                    * When the MCV item or the Const value is NULL we can
-                    * treat this as a mismatch. We must not call the operator
-                    * because of strictness.
-                    */
-                   if (item->isnull[idx] || cst->constisnull)
-                   {
-                       matches[i] = RESULT_MERGE(matches[i], is_or, false);
-                       continue;
-                   }
+                   matches[i] = RESULT_MERGE(matches[i], is_or, false);
+                   continue;
+               }
  
-                   /*
-                    * Skip MCV items that can't change result in the bitmap.
-                    * Once the value gets false for AND-lists, or true for
-                    * OR-lists, we don't need to look at more clauses.
-                    */
-                   if (RESULT_IS_FINAL(matches[i], is_or))
-                       continue;
+               /*
+                * Skip MCV items that can't change result in the bitmap. Once
+                * the value gets false for AND-lists, or true for OR-lists,
+                * we don't need to look at more clauses.
+                */
+               if (RESULT_IS_FINAL(matches[i], is_or))
+                   continue;
  
-                   /*
-                    * First check whether the constant is below the lower
-                    * boundary (in that case we can skip the bucket, because
-                    * there's no overlap).
-                    *
-                    * We don't store collations used to build the statistics,
-                    * but we can use the collation for the attribute itself,
-                    * as stored in varcollid. We do reset the statistics
-                    * after a type change (including collation change), so
-                    * this is OK. We may need to relax this after allowing
-                    * extended statistics on expressions.
-                    */
-                   if (varonleft)
-                       match = DatumGetBool(FunctionCall2Coll(&opproc,
-                                                              var->varcollid,
-                                                              item->values[idx],
-                                                              cst->constvalue));
-                   else
-                       match = DatumGetBool(FunctionCall2Coll(&opproc,
-                                                              var->varcollid,
-                                                              cst->constvalue,
-                                                              item->values[idx]));
-
-                   /* update the match bitmap with the result */
-                   matches[i] = RESULT_MERGE(matches[i], is_or, match);
-               }
+               /*
+                * First check whether the constant is below the lower
+                * boundary (in that case we can skip the bucket, because
+                * there's no overlap).
+                *
+                * We don't store collations used to build the statistics, but
+                * we can use the collation for the attribute itself, as
+                * stored in varcollid. We do reset the statistics after a
+                * type change (including collation change), so this is OK.
+                * For expressions we use the collation extracted from the
+                * expression itself.
+                */
+               if (expronleft)
+                   match = DatumGetBool(FunctionCall2Coll(&opproc,
+                                                          collid,
+                                                          item->values[idx],
+                                                          cst->constvalue));
+               else
+                   match = DatumGetBool(FunctionCall2Coll(&opproc,
+                                                          collid,
+                                                          cst->constvalue,
+                                                          item->values[idx]));
+
+               /* update the match bitmap with the result */
+               matches[i] = RESULT_MERGE(matches[i], is_or, match);
             }
         }
         else if (IsA(clause, ScalarArrayOpExpr))
@@ -1660,115 +1717,116 @@ mcv_get_match_bitmap(PlannerInfo *root, List *clauses,
             ScalarArrayOpExpr *expr = (ScalarArrayOpExpr *) clause;
             FmgrInfo    opproc;
  
-           /* valid only after examine_clause_args returns true */
-           Var        *var;
+           /* valid only after examine_opclause_args returns true */
+           Node       *clause_expr;
             Const      *cst;
-           bool        varonleft;
+           bool        expronleft;
+           Oid         collid;
+           int         idx;
+
+           /* array evaluation */
+           ArrayType  *arrayval;
+           int16       elmlen;
+           bool        elmbyval;
+           char        elmalign;
+           int         num_elems;
+           Datum      *elem_values;
+           bool       *elem_nulls;
  
             fmgr_info(get_opcode(expr->opno), &opproc);
  
-           /* extract the var and const from the expression */
-           if (examine_clause_args(expr->args, &var, &cst, &varonleft))
+           /* extract the var/expr and const from the expression */
+           if (!examine_opclause_args(expr->args, &clause_expr, &cst, &expronleft))
+               elog(ERROR, "incompatible clause");
+
+           /* ScalarArrayOpExpr has the Var always on the left */
+           Assert(expronleft);
+
+           /* XXX what if (cst->constisnull == NULL)? */
+           if (!cst->constisnull)
             {
-               int         idx;
+               arrayval = DatumGetArrayTypeP(cst->constvalue);
+               get_typlenbyvalalign(ARR_ELEMTYPE(arrayval),
+                                    &elmlen, &elmbyval, &elmalign);
+               deconstruct_array(arrayval,
+                                 ARR_ELEMTYPE(arrayval),
+                                 elmlen, elmbyval, elmalign,
+                                 &elem_values, &elem_nulls, &num_elems);
+           }
  
-               ArrayType  *arrayval;
-               int16       elmlen;
-               bool        elmbyval;
-               char        elmalign;
-               int         num_elems;
-               Datum      *elem_values;
-               bool       *elem_nulls;
+           /* match the attribute/expression to a dimension of the statistic */
+           idx = mcv_match_expression(clause_expr, keys, exprs, &collid);
  
-               /* ScalarArrayOpExpr has the Var always on the left */
-               Assert(varonleft);
+           /*
+            * Walk through the MCV items and evaluate the current clause. We
+            * can skip items that were already ruled out, and terminate if
+            * there are no remaining MCV items that might possibly match.
+            */
+           for (i = 0; i < mcvlist->nitems; i++)
+           {
+               int         j;
+               bool        match = (expr->useOr ? false : true);
+               MCVItem    *item = &mcvlist->items[i];
  
-               if (!cst->constisnull)
+               /*
+                * When the MCV item or the Const value is NULL we can treat
+                * this as a mismatch. We must not call the operator because
+                * of strictness.
+                */
+               if (item->isnull[idx] || cst->constisnull)
                 {
-                   arrayval = DatumGetArrayTypeP(cst->constvalue);
-                   get_typlenbyvalalign(ARR_ELEMTYPE(arrayval),
-                                        &elmlen, &elmbyval, &elmalign);
-                   deconstruct_array(arrayval,
-                                     ARR_ELEMTYPE(arrayval),
-                                     elmlen, elmbyval, elmalign,
-                                     &elem_values, &elem_nulls, &num_elems);
+                   matches[i] = RESULT_MERGE(matches[i], is_or, false);
+                   continue;
                 }
  
-               /* match the attribute to a dimension of the statistic */
-               idx = bms_member_index(keys, var->varattno);
-
                 /*
-                * Walk through the MCV items and evaluate the current clause.
-                * We can skip items that were already ruled out, and
-                * terminate if there are no remaining MCV items that might
-                * possibly match.
+                * Skip MCV items that can't change result in the bitmap. Once
+                * the value gets false for AND-lists, or true for OR-lists,
+                * we don't need to look at more clauses.
                  */
-               for (i = 0; i < mcvlist->nitems; i++)
+               if (RESULT_IS_FINAL(matches[i], is_or))
+                   continue;
+
+               for (j = 0; j < num_elems; j++)
                 {
-                   int         j;
-                   bool        match = (expr->useOr ? false : true);
-                   MCVItem    *item = &mcvlist->items[i];
+                   Datum       elem_value = elem_values[j];
+                   bool        elem_isnull = elem_nulls[j];
+                   bool        elem_match;
  
-                   /*
-                    * When the MCV item or the Const value is NULL we can
-                    * treat this as a mismatch. We must not call the operator
-                    * because of strictness.
-                    */
-                   if (item->isnull[idx] || cst->constisnull)
+                   /* NULL values always evaluate as not matching. */
+                   if (elem_isnull)
                     {
-                       matches[i] = RESULT_MERGE(matches[i], is_or, false);
+                       match = RESULT_MERGE(match, expr->useOr, false);
                         continue;
                     }
  
                     /*
-                    * Skip MCV items that can't change result in the bitmap.
-                    * Once the value gets false for AND-lists, or true for
-                    * OR-lists, we don't need to look at more clauses.
+                    * Stop evaluating the array elements once we reach match
+                    * value that can't change - ALL() is the same as
+                    * AND-list, ANY() is the same as OR-list.
                      */
-                   if (RESULT_IS_FINAL(matches[i], is_or))
-                       continue;
+                   if (RESULT_IS_FINAL(match, expr->useOr))
+                       break;
  
-                   for (j = 0; j < num_elems; j++)
-                   {
-                       Datum       elem_value = elem_values[j];
-                       bool        elem_isnull = elem_nulls[j];
-                       bool        elem_match;
-
-                       /* NULL values always evaluate as not matching. */
-                       if (elem_isnull)
-                       {
-                           match = RESULT_MERGE(match, expr->useOr, false);
-                           continue;
-                       }
-
-                       /*
-                        * Stop evaluating the array elements once we reach
-                        * match value that can't change - ALL() is the same
-                        * as AND-list, ANY() is the same as OR-list.
-                        */
-                       if (RESULT_IS_FINAL(match, expr->useOr))
-                           break;
-
-                       elem_match = DatumGetBool(FunctionCall2Coll(&opproc,
-                                                                   var->varcollid,
-                                                                   item->values[idx],
-                                                                   elem_value));
-
-                       match = RESULT_MERGE(match, expr->useOr, elem_match);
-                   }
+                   elem_match = DatumGetBool(FunctionCall2Coll(&opproc,
+                                                               collid,
+                                                               item->values[idx],
+                                                               elem_value));
  
-                   /* update the match bitmap with the result */
-                   matches[i] = RESULT_MERGE(matches[i], is_or, match);
+                   match = RESULT_MERGE(match, expr->useOr, elem_match);
                 }
+
+               /* update the match bitmap with the result */
+               matches[i] = RESULT_MERGE(matches[i], is_or, match);
             }
         }
         else if (IsA(clause, NullTest))
         {
             NullTest   *expr = (NullTest *) clause;
-           Var        *var = (Var *) (expr->arg);
+           Node       *clause_expr = (Node *) (expr->arg);
  
-           /* match the attribute to a dimension of the statistic */
-           int         idx = bms_member_index(keys, var->varattno);
+           /* match the attribute/expression to a dimension of the statistic */
+           int         idx = mcv_match_expression(clause_expr, keys, exprs, NULL);
  
             /*
              * Walk through the MCV items and evaluate the current clause. We
@@ -1811,7 +1869,7 @@ mcv_get_match_bitmap(PlannerInfo *root, List *clauses,
             Assert(list_length(bool_clauses) >= 2);
  
             /* build the match bitmap for the OR-clauses */
-           bool_matches = mcv_get_match_bitmap(root, bool_clauses, keys,
+           bool_matches = mcv_get_match_bitmap(root, bool_clauses, keys, exprs,
                                                 mcvlist, is_orclause(clause));
  
             /*
@@ -1839,7 +1897,7 @@ mcv_get_match_bitmap(PlannerInfo *root, List *clauses,
             Assert(list_length(not_args) == 1);
  
             /* build the match bitmap for the NOT-clause */
-           not_matches = mcv_get_match_bitmap(root, not_args, keys,
+           not_matches = mcv_get_match_bitmap(root, not_args, keys, exprs,
                                                mcvlist, false);
  
             /*
@@ -1982,7 +2040,8 @@ mcv_clauselist_selectivity(PlannerInfo *root, StatisticExtInfo *stat,
     mcv = statext_mcv_load(stat->statOid);
  
     /* build a match bitmap for the clauses */
-   matches = mcv_get_match_bitmap(root, clauses, stat->keys, mcv, false);
+   matches = mcv_get_match_bitmap(root, clauses, stat->keys, stat->exprs,
+                                  mcv, false);
  
     /* sum frequencies for all the matching MCV items */
     *basesel = 0.0;
@@ -2056,7 +2115,7 @@ mcv_clause_selectivity_or(PlannerInfo *root, StatisticExtInfo *stat,
  
     /* build the match bitmap for the new clause */
     new_matches = mcv_get_match_bitmap(root, list_make1(clause), stat->keys,
-                                      mcv, false);
+                                      stat->exprs, mcv, false);
  
     /*
      * Sum the frequencies for all the MCV items matching this clause and also
diff --git a/src/backend/statistics/mvdistinct.c b/src/backend/statistics/mvdistinct.c

index e08c001e3f4f1ac5c44649cf2289d0da53410b5b..4481312d61d3eb6ac0676663ab503a40cd9c863c 100644 (file)
--- a/src/backend/statistics/mvdistinct.c
+++ b/src/backend/statistics/mvdistinct.c
@@ -36,8 +36,7 @@
  #include "utils/syscache.h"
  #include "utils/typcache.h"
  
-static double ndistinct_for_combination(double totalrows, int numrows,
-                                       HeapTuple *rows, VacAttrStats **stats,
+static double ndistinct_for_combination(double totalrows, StatsBuildData *data,
                                         int k, int *combination);
  static double estimate_ndistinct(double totalrows, int numrows, int d, int f1);
  static int n_choose_k(int n, int k);
@@ -81,15 +80,18 @@ static void generate_combinations(CombinationGenerator *state);
   *
   * This computes the ndistinct estimate using the same estimator used
   * in analyze.c and then computes the coefficient.
+ *
+ * To handle expressions easily, we treat them as system attributes with
+ * negative attnums, and offset everything by number of expressions to
+ * allow using Bitmapsets.
   */
  MVNDistinct *
-statext_ndistinct_build(double totalrows, int numrows, HeapTuple *rows,
-                       Bitmapset *attrs, VacAttrStats **stats)
+statext_ndistinct_build(double totalrows, StatsBuildData *data)
  {
     MVNDistinct *result;
     int         k;
     int         itemcnt;
-   int         numattrs = bms_num_members(attrs);
+   int         numattrs = data->nattnums;
     int         numcombs = num_combinations(numattrs);
  
     result = palloc(offsetof(MVNDistinct, items) +
@@ -112,13 +114,19 @@ statext_ndistinct_build(double totalrows, int numrows, HeapTuple *rows,
             MVNDistinctItem *item = &result->items[itemcnt];
             int         j;
  
-           item->attrs = NULL;
+           item->attributes = palloc(sizeof(AttrNumber) * k);
+           item->nattributes = k;
+
+           /* translate the indexes to attnums */
             for (j = 0; j < k; j++)
-               item->attrs = bms_add_member(item->attrs,
-                                            stats[combination[j]]->attr->attnum);
+           {
+               item->attributes[j] = data->attnums[combination[j]];
+
+               Assert(AttributeNumberIsValid(item->attributes[j]));
+           }
+
             item->ndistinct =
-               ndistinct_for_combination(totalrows, numrows, rows,
-                                         stats, k, combination);
+               ndistinct_for_combination(totalrows, data, k, combination);
  
             itemcnt++;
             Assert(itemcnt <= result->nitems);
@@ -189,7 +197,7 @@ statext_ndistinct_serialize(MVNDistinct *ndistinct)
     {
         int         nmembers;
  
-       nmembers = bms_num_members(ndistinct->items[i].attrs);
+       nmembers = ndistinct->items[i].nattributes;
         Assert(nmembers >= 2);
  
         len += SizeOfItem(nmembers);
@@ -214,22 +222,15 @@ statext_ndistinct_serialize(MVNDistinct *ndistinct)
     for (i = 0; i < ndistinct->nitems; i++)
     {
         MVNDistinctItem item = ndistinct->items[i];
-       int         nmembers = bms_num_members(item.attrs);
-       int         x;
+       int         nmembers = item.nattributes;
  
         memcpy(tmp, &item.ndistinct, sizeof(double));
         tmp += sizeof(double);
         memcpy(tmp, &nmembers, sizeof(int));
         tmp += sizeof(int);
  
-       x = -1;
-       while ((x = bms_next_member(item.attrs, x)) >= 0)
-       {
-           AttrNumber  value = (AttrNumber) x;
-
-           memcpy(tmp, &value, sizeof(AttrNumber));
-           tmp += sizeof(AttrNumber);
-       }
+       memcpy(tmp, item.attributes, sizeof(AttrNumber) * nmembers);
+       tmp += nmembers * sizeof(AttrNumber);
  
         /* protect against overflows */
         Assert(tmp <= ((char *) output + len));
@@ -301,27 +302,21 @@ statext_ndistinct_deserialize(bytea *data)
     for (i = 0; i < ndistinct->nitems; i++)
     {
         MVNDistinctItem *item = &ndistinct->items[i];
-       int         nelems;
-
-       item->attrs = NULL;
  
         /* ndistinct value */
         memcpy(&item->ndistinct, tmp, sizeof(double));
         tmp += sizeof(double);
  
         /* number of attributes */
-       memcpy(&nelems, tmp, sizeof(int));
+       memcpy(&item->nattributes, tmp, sizeof(int));
         tmp += sizeof(int);
-       Assert((nelems >= 2) && (nelems <= STATS_MAX_DIMENSIONS));
+       Assert((item->nattributes >= 2) && (item->nattributes <= STATS_MAX_DIMENSIONS));
  
-       while (nelems-- > 0)
-       {
-           AttrNumber  attno;
+       item->attributes
+           = (AttrNumber *) palloc(item->nattributes * sizeof(AttrNumber));
  
-           memcpy(&attno, tmp, sizeof(AttrNumber));
-           tmp += sizeof(AttrNumber);
-           item->attrs = bms_add_member(item->attrs, attno);
-       }
+       memcpy(item->attributes, tmp, sizeof(AttrNumber) * item->nattributes);
+       tmp += sizeof(AttrNumber) * item->nattributes;
  
         /* still within the bytea */
         Assert(tmp <= ((char *) data + VARSIZE_ANY(data)));
@@ -369,17 +364,17 @@ pg_ndistinct_out(PG_FUNCTION_ARGS)
  
     for (i = 0; i < ndist->nitems; i++)
     {
+       int         j;
         MVNDistinctItem item = ndist->items[i];
-       int         x = -1;
-       bool        first = true;
  
         if (i > 0)
             appendStringInfoString(&str, ", ");
  
-       while ((x = bms_next_member(item.attrs, x)) >= 0)
+       for (j = 0; j < item.nattributes; j++)
         {
-           appendStringInfo(&str, "%s%d", first ? "\"" : ", ", x);
-           first = false;
+           AttrNumber  attnum = item.attributes[j];
+
+           appendStringInfo(&str, "%s%d", (j == 0) ? "\"" : ", ", attnum);
         }
         appendStringInfo(&str, "\": %d", (int) item.ndistinct);
     }
@@ -427,8 +422,8 @@ pg_ndistinct_send(PG_FUNCTION_ARGS)
   * combination of multiple columns.
   */
  static double
-ndistinct_for_combination(double totalrows, int numrows, HeapTuple *rows,
-                         VacAttrStats **stats, int k, int *combination)
+ndistinct_for_combination(double totalrows, StatsBuildData *data,
+                         int k, int *combination)
  {
     int         i,
                 j;
@@ -439,6 +434,7 @@ ndistinct_for_combination(double totalrows, int numrows, HeapTuple *rows,
     Datum      *values;
     SortItem   *items;
     MultiSortSupport mss;
+   int         numrows = data->numrows;
  
     mss = multi_sort_init(k);
  
@@ -467,25 +463,27 @@ ndistinct_for_combination(double totalrows, int numrows, HeapTuple *rows,
      */
     for (i = 0; i < k; i++)
     {
-       VacAttrStats *colstat = stats[combination[i]];
+       Oid         typid;
         TypeCacheEntry *type;
+       Oid         collid = InvalidOid;
+       VacAttrStats *colstat = data->stats[combination[i]];
+
+       typid = colstat->attrtypid;
+       collid = colstat->attrcollid;
  
-       type = lookup_type_cache(colstat->attrtypid, TYPECACHE_LT_OPR);
+       type = lookup_type_cache(typid, TYPECACHE_LT_OPR);
         if (type->lt_opr == InvalidOid) /* shouldn't happen */
             elog(ERROR, "cache lookup failed for ordering operator for type %u",
-                colstat->attrtypid);
+                typid);
  
         /* prepare the sort function for this dimension */
-       multi_sort_add_dimension(mss, i, type->lt_opr, colstat->attrcollid);
+       multi_sort_add_dimension(mss, i, type->lt_opr, collid);
  
         /* accumulate all the data for this dimension into the arrays */
         for (j = 0; j < numrows; j++)
         {
-           items[j].values[i] =
-               heap_getattr(rows[j],
-                            colstat->attr->attnum,
-                            colstat->tupDesc,
-                            &items[j].isnull[i]);
+           items[j].values[i] = data->values[combination[i]][j];
+           items[j].isnull[i] = data->nulls[combination[i]][j];
         }
     }
  
diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c

index 729274b330f0f1427ec62548f51571bec46be035..16c6f17e2355ae6bd38bee3e18386090e90a96ae 100644 (file)
--- a/src/backend/tcop/utility.c
+++ b/src/backend/tcop/utility.c
@@ -1816,7 +1816,34 @@ ProcessUtilitySlow(ParseState *pstate,
                 break;
  
             case T_CreateStatsStmt:
-               address = CreateStatistics((CreateStatsStmt *) parsetree);
+               {
+                   Oid         relid;
+                   CreateStatsStmt *stmt = (CreateStatsStmt *) parsetree;
+                   RangeVar   *rel = (RangeVar *) linitial(stmt->relations);
+
+                   if (!IsA(rel, RangeVar))
+                       ereport(ERROR,
+                               (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                                errmsg("only a single relation is allowed in CREATE STATISTICS")));
+
+                   /*
+                    * CREATE STATISTICS will influence future execution plans
+                    * but does not interfere with currently executing plans.
+                    * So it should be enough to take ShareUpdateExclusiveLock
+                    * on relation, conflicting with ANALYZE and other DDL
+                    * that sets statistical information, but not with normal
+                    * queries.
+                    *
+                    * XXX RangeVarCallbackOwnsRelation not needed here, to
+                    * keep the same behavior as before.
+                    */
+                   relid = RangeVarGetRelid(rel, ShareUpdateExclusiveLock, false);
+
+                   /* Run parse analysis ... */
+                   stmt = transformStatsStmt(relid, stmt, queryString);
+
+                   address = CreateStatistics(stmt);
+               }
                 break;
  
             case T_AlterStatsStmt:
diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c

index f0de2a25c96c2bf2692287515b05efc667c2705b..3de98d2333e36039107424e78f1ff1213684fcfa 100644 (file)
--- a/src/backend/utils/adt/ruleutils.c
+++ b/src/backend/utils/adt/ruleutils.c
@@ -336,7 +336,8 @@ static char *pg_get_indexdef_worker(Oid indexrelid, int colno,
                                     bool attrsOnly, bool keysOnly,
                                     bool showTblSpc, bool inherits,
                                     int prettyFlags, bool missing_ok);
-static char *pg_get_statisticsobj_worker(Oid statextid, bool missing_ok);
+static char *pg_get_statisticsobj_worker(Oid statextid, bool columns_only,
+                                        bool missing_ok);
  static char *pg_get_partkeydef_worker(Oid relid, int prettyFlags,
                                       bool attrsOnly, bool missing_ok);
  static char *pg_get_constraintdef_worker(Oid constraintId, bool fullCommand,
@@ -1507,7 +1508,36 @@ pg_get_statisticsobjdef(PG_FUNCTION_ARGS)
     Oid         statextid = PG_GETARG_OID(0);
     char       *res;
  
-   res = pg_get_statisticsobj_worker(statextid, true);
+   res = pg_get_statisticsobj_worker(statextid, false, true);
+
+   if (res == NULL)
+       PG_RETURN_NULL();
+
+   PG_RETURN_TEXT_P(string_to_text(res));
+}
+
+/*
+ * Internal version for use by ALTER TABLE.
+ * Includes a tablespace clause in the result.
+ * Returns a palloc'd C string; no pretty-printing.
+ */
+char *
+pg_get_statisticsobjdef_string(Oid statextid)
+{
+   return pg_get_statisticsobj_worker(statextid, false, false);
+}
+
+/*
+ * pg_get_statisticsobjdef_columns
+ *     Get columns and expressions for an extended statistics object
+ */
+Datum
+pg_get_statisticsobjdef_columns(PG_FUNCTION_ARGS)
+{
+   Oid         statextid = PG_GETARG_OID(0);
+   char       *res;
+
+   res = pg_get_statisticsobj_worker(statextid, true, true);
  
     if (res == NULL)
         PG_RETURN_NULL();
@@ -1519,7 +1549,7 @@ pg_get_statisticsobjdef(PG_FUNCTION_ARGS)
   * Internal workhorse to decompile an extended statistics object.
   */
  static char *
-pg_get_statisticsobj_worker(Oid statextid, bool missing_ok)
+pg_get_statisticsobj_worker(Oid statextid, bool columns_only, bool missing_ok)
  {
     Form_pg_statistic_ext statextrec;
     HeapTuple   statexttup;
@@ -1534,6 +1564,11 @@ pg_get_statisticsobj_worker(Oid statextid, bool missing_ok)
     bool        dependencies_enabled;
     bool        mcv_enabled;
     int         i;
+   List       *context;
+   ListCell   *lc;
+   List       *exprs = NIL;
+   bool        has_exprs;
+   int         ncolumns;
  
     statexttup = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(statextid));
  
@@ -1544,75 +1579,114 @@ pg_get_statisticsobj_worker(Oid statextid, bool missing_ok)
         elog(ERROR, "cache lookup failed for statistics object %u", statextid);
     }
  
-   statextrec = (Form_pg_statistic_ext) GETSTRUCT(statexttup);
+   /* has the statistics expressions? */
+   has_exprs = !heap_attisnull(statexttup, Anum_pg_statistic_ext_stxexprs, NULL);
  
-   initStringInfo(&buf);
-
-   nsp = get_namespace_name(statextrec->stxnamespace);
-   appendStringInfo(&buf, "CREATE STATISTICS %s",
-                    quote_qualified_identifier(nsp,
-                                               NameStr(statextrec->stxname)));
+   statextrec = (Form_pg_statistic_ext) GETSTRUCT(statexttup);
  
     /*
-    * Decode the stxkind column so that we know which stats types to print.
+    * Get the statistics expressions, if any.  (NOTE: we do not use the
+    * relcache versions of the expressions, because we want to display
+    * non-const-folded expressions.)
      */
-   datum = SysCacheGetAttr(STATEXTOID, statexttup,
-                           Anum_pg_statistic_ext_stxkind, &isnull);
-   Assert(!isnull);
-   arr = DatumGetArrayTypeP(datum);
-   if (ARR_NDIM(arr) != 1 ||
-       ARR_HASNULL(arr) ||
-       ARR_ELEMTYPE(arr) != CHAROID)
-       elog(ERROR, "stxkind is not a 1-D char array");
-   enabled = (char *) ARR_DATA_PTR(arr);
-
-   ndistinct_enabled = false;
-   dependencies_enabled = false;
-   mcv_enabled = false;
-
-   for (i = 0; i < ARR_DIMS(arr)[0]; i++)
+   if (has_exprs)
     {
-       if (enabled[i] == STATS_EXT_NDISTINCT)
-           ndistinct_enabled = true;
-       if (enabled[i] == STATS_EXT_DEPENDENCIES)
-           dependencies_enabled = true;
-       if (enabled[i] == STATS_EXT_MCV)
-           mcv_enabled = true;
+       Datum       exprsDatum;
+       bool        isnull;
+       char       *exprsString;
+
+       exprsDatum = SysCacheGetAttr(STATEXTOID, statexttup,
+                                    Anum_pg_statistic_ext_stxexprs, &isnull);
+       Assert(!isnull);
+       exprsString = TextDatumGetCString(exprsDatum);
+       exprs = (List *) stringToNode(exprsString);
+       pfree(exprsString);
     }
+   else
+       exprs = NIL;
  
-   /*
-    * If any option is disabled, then we'll need to append the types clause
-    * to show which options are enabled.  We omit the types clause on purpose
-    * when all options are enabled, so a pg_dump/pg_restore will create all
-    * statistics types on a newer postgres version, if the statistics had all
-    * options enabled on the original version.
-    */
-   if (!ndistinct_enabled || !dependencies_enabled || !mcv_enabled)
+   /* count the number of columns (attributes and expressions) */
+   ncolumns = statextrec->stxkeys.dim1 + list_length(exprs);
+
+   initStringInfo(&buf);
+
+   if (!columns_only)
     {
-       bool        gotone = false;
+       nsp = get_namespace_name(statextrec->stxnamespace);
+       appendStringInfo(&buf, "CREATE STATISTICS %s",
+                        quote_qualified_identifier(nsp,
+                                                   NameStr(statextrec->stxname)));
  
-       appendStringInfoString(&buf, " (");
+       /*
+        * Decode the stxkind column so that we know which stats types to
+        * print.
+        */
+       datum = SysCacheGetAttr(STATEXTOID, statexttup,
+                               Anum_pg_statistic_ext_stxkind, &isnull);
+       Assert(!isnull);
+       arr = DatumGetArrayTypeP(datum);
+       if (ARR_NDIM(arr) != 1 ||
+           ARR_HASNULL(arr) ||
+           ARR_ELEMTYPE(arr) != CHAROID)
+           elog(ERROR, "stxkind is not a 1-D char array");
+       enabled = (char *) ARR_DATA_PTR(arr);
  
-       if (ndistinct_enabled)
+       ndistinct_enabled = false;
+       dependencies_enabled = false;
+       mcv_enabled = false;
+
+       for (i = 0; i < ARR_DIMS(arr)[0]; i++)
         {
-           appendStringInfoString(&buf, "ndistinct");
-           gotone = true;
+           if (enabled[i] == STATS_EXT_NDISTINCT)
+               ndistinct_enabled = true;
+           else if (enabled[i] == STATS_EXT_DEPENDENCIES)
+               dependencies_enabled = true;
+           else if (enabled[i] == STATS_EXT_MCV)
+               mcv_enabled = true;
+
+           /* ignore STATS_EXT_EXPRESSIONS (it's built automatically) */
         }
  
-       if (dependencies_enabled)
+       /*
+        * If any option is disabled, then we'll need to append the types
+        * clause to show which options are enabled.  We omit the types clause
+        * on purpose when all options are enabled, so a pg_dump/pg_restore
+        * will create all statistics types on a newer postgres version, if
+        * the statistics had all options enabled on the original version.
+        *
+        * But if the statistics is defined on just a single column, it has to
+        * be an expression statistics. In that case we don't need to specify
+        * kinds.
+        */
+       if ((!ndistinct_enabled || !dependencies_enabled || !mcv_enabled) &&
+           (ncolumns > 1))
         {
-           appendStringInfo(&buf, "%sdependencies", gotone ? ", " : "");
-           gotone = true;
-       }
+           bool        gotone = false;
  
-       if (mcv_enabled)
-           appendStringInfo(&buf, "%smcv", gotone ? ", " : "");
+           appendStringInfoString(&buf, " (");
  
-       appendStringInfoChar(&buf, ')');
-   }
+           if (ndistinct_enabled)
+           {
+               appendStringInfoString(&buf, "ndistinct");
+               gotone = true;
+           }
+
+           if (dependencies_enabled)
+           {
+               appendStringInfo(&buf, "%sdependencies", gotone ? ", " : "");
+               gotone = true;
+           }
  
-   appendStringInfoString(&buf, " ON ");
+           if (mcv_enabled)
+               appendStringInfo(&buf, "%smcv", gotone ? ", " : "");
+
+           appendStringInfoChar(&buf, ')');
+       }
+
+       appendStringInfoString(&buf, " ON ");
+   }
  
+   /* decode simple column references */
     for (colno = 0; colno < statextrec->stxkeys.dim1; colno++)
     {
         AttrNumber  attnum = statextrec->stxkeys.values[colno];
@@ -1626,14 +1700,109 @@ pg_get_statisticsobj_worker(Oid statextid, bool missing_ok)
         appendStringInfoString(&buf, quote_identifier(attname));
     }
  
-   appendStringInfo(&buf, " FROM %s",
-                    generate_relation_name(statextrec->stxrelid, NIL));
+   context = deparse_context_for(get_relation_name(statextrec->stxrelid),
+                                 statextrec->stxrelid);
+
+   foreach(lc, exprs)
+   {
+       Node       *expr = (Node *) lfirst(lc);
+       char       *str;
+       int         prettyFlags = PRETTYFLAG_INDENT;
+
+       str = deparse_expression_pretty(expr, context, false, false,
+                                       prettyFlags, 0);
+
+       if (colno > 0)
+           appendStringInfoString(&buf, ", ");
+
+       /* Need parens if it's not a bare function call */
+       if (looks_like_function(expr))
+           appendStringInfoString(&buf, str);
+       else
+           appendStringInfo(&buf, "(%s)", str);
+
+       colno++;
+   }
+
+   if (!columns_only)
+       appendStringInfo(&buf, " FROM %s",
+                        generate_relation_name(statextrec->stxrelid, NIL));
  
     ReleaseSysCache(statexttup);
  
     return buf.data;
  }
  
+/*
+ * Generate text array of expressions for statistics object.
+ */
+Datum
+pg_get_statisticsobjdef_expressions(PG_FUNCTION_ARGS)
+{
+   Oid         statextid = PG_GETARG_OID(0);
+   Form_pg_statistic_ext statextrec;
+   HeapTuple   statexttup;
+   Datum       datum;
+   bool        isnull;
+   List       *context;
+   ListCell   *lc;
+   List       *exprs = NIL;
+   bool        has_exprs;
+   char       *tmp;
+   ArrayBuildState *astate = NULL;
+
+   statexttup = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(statextid));
+
+   if (!HeapTupleIsValid(statexttup))
+       elog(ERROR, "cache lookup failed for statistics object %u", statextid);
+
+   /* has the statistics expressions? */
+   has_exprs = !heap_attisnull(statexttup, Anum_pg_statistic_ext_stxexprs, NULL);
+
+   /* no expressions? we're done */
+   if (!has_exprs)
+   {
+       ReleaseSysCache(statexttup);
+       PG_RETURN_NULL();
+   }
+
+   statextrec = (Form_pg_statistic_ext) GETSTRUCT(statexttup);
+
+   /*
+    * Get the statistics expressions, and deparse them into text values.
+    */
+   datum = SysCacheGetAttr(STATEXTOID, statexttup,
+                           Anum_pg_statistic_ext_stxexprs, &isnull);
+
+   Assert(!isnull);
+   tmp = TextDatumGetCString(datum);
+   exprs = (List *) stringToNode(tmp);
+   pfree(tmp);
+
+   context = deparse_context_for(get_relation_name(statextrec->stxrelid),
+                                 statextrec->stxrelid);
+
+   foreach(lc, exprs)
+   {
+       Node       *expr = (Node *) lfirst(lc);
+       char       *str;
+       int         prettyFlags = PRETTYFLAG_INDENT;
+
+       str = deparse_expression_pretty(expr, context, false, false,
+                                       prettyFlags, 0);
+
+       astate = accumArrayResult(astate,
+                                 PointerGetDatum(cstring_to_text(str)),
+                                 false,
+                                 TEXTOID,
+                                 CurrentMemoryContext);
+   }
+
+   ReleaseSysCache(statexttup);
+
+   PG_RETURN_DATUM(makeArrayResult(astate, CurrentMemoryContext));
+}
+
  /*
   * pg_get_partkeydef
   *
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c

index 2348d4a772a0a1a9901e59b36a34d70bb532fc0e..7e41bc56418ea0a2c9ac8a4a46e51d8454598461 100644 (file)
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -3430,6 +3430,14 @@ estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows,
          * If examine_variable is able to deduce anything about the GROUP BY
          * expression, treat it as a single variable even if it's really more
          * complicated.
+        *
+        * XXX This has the consequence that if there's a statistics on the
+        * expression, we don't split it into individual Vars. This affects
+        * our selection of statistics in estimate_multivariate_ndistinct,
+        * because it's probably better to use more accurate estimate for
+        * each expression and treat them as independent, than to combine
+        * estimates for the extracted variables when we don't know how that
+        * relates to the expressions.
          */
         examine_variable(root, groupexpr, 0, &vardata);
         if (HeapTupleIsValid(vardata.statsTuple) || vardata.isunique)
@@ -3880,50 +3888,77 @@ estimate_multivariate_ndistinct(PlannerInfo *root, RelOptInfo *rel,
                                 List **varinfos, double *ndistinct)
  {
     ListCell   *lc;
-   Bitmapset  *attnums = NULL;
-   int         nmatches;
+   int         nmatches_vars;
+   int         nmatches_exprs;
     Oid         statOid = InvalidOid;
     MVNDistinct *stats;
-   Bitmapset  *matched = NULL;
+   StatisticExtInfo *matched_info = NULL;
  
     /* bail out immediately if the table has no extended statistics */
     if (!rel->statlist)
         return false;
  
-   /* Determine the attnums we're looking for */
-   foreach(lc, *varinfos)
-   {
-       GroupVarInfo *varinfo = (GroupVarInfo *) lfirst(lc);
-       AttrNumber  attnum;
-
-       Assert(varinfo->rel == rel);
-
-       if (!IsA(varinfo->var, Var))
-           continue;
-
-       attnum = ((Var *) varinfo->var)->varattno;
-
-       if (!AttrNumberIsForUserDefinedAttr(attnum))
-           continue;
-
-       attnums = bms_add_member(attnums, attnum);
-   }
-
     /* look for the ndistinct statistics matching the most vars */
-   nmatches = 1;               /* we require at least two matches */
+   nmatches_vars = 0;          /* we require at least two matches */
+   nmatches_exprs = 0;
     foreach(lc, rel->statlist)
     {
+       ListCell   *lc2;
         StatisticExtInfo *info = (StatisticExtInfo *) lfirst(lc);
-       Bitmapset  *shared;
-       int         nshared;
+       int         nshared_vars = 0;
+       int         nshared_exprs = 0;
  
         /* skip statistics of other kinds */
         if (info->kind != STATS_EXT_NDISTINCT)
             continue;
  
-       /* compute attnums shared by the vars and the statistics object */
-       shared = bms_intersect(info->keys, attnums);
-       nshared = bms_num_members(shared);
+       /*
+        * Determine how many expressions (and variables in non-matched
+        * expressions) match. We'll then use these numbers to pick the
+        * statistics object that best matches the clauses.
+        */
+       foreach(lc2, *varinfos)
+       {
+           ListCell   *lc3;
+           GroupVarInfo *varinfo = (GroupVarInfo *) lfirst(lc2);
+           AttrNumber  attnum;
+
+           Assert(varinfo->rel == rel);
+
+           /* simple Var, search in statistics keys directly */
+           if (IsA(varinfo->var, Var))
+           {
+               attnum = ((Var *) varinfo->var)->varattno;
+
+               /*
+                * Ignore system attributes - we don't support statistics on
+                * them, so can't match them (and it'd fail as the values are
+                * negative).
+                */
+               if (!AttrNumberIsForUserDefinedAttr(attnum))
+                   continue;
+
+               if (bms_is_member(attnum, info->keys))
+                   nshared_vars++;
+
+               continue;
+           }
+
+           /* expression - see if it's in the statistics */
+           foreach(lc3, info->exprs)
+           {
+               Node       *expr = (Node *) lfirst(lc3);
+
+               if (equal(varinfo->var, expr))
+               {
+                   nshared_exprs++;
+                   break;
+               }
+           }
+       }
+
+       if (nshared_vars + nshared_exprs < 2)
+           continue;
  
         /*
          * Does this statistics object match more columns than the currently
@@ -3932,18 +3967,21 @@ estimate_multivariate_ndistinct(PlannerInfo *root, RelOptInfo *rel,
          * XXX This should break ties using name of the object, or something
          * like that, to make the outcome stable.
          */
-       if (nshared > nmatches)
+       if ((nshared_exprs > nmatches_exprs) ||
+           (((nshared_exprs == nmatches_exprs)) && (nshared_vars > nmatches_vars)))
         {
             statOid = info->statOid;
-           nmatches = nshared;
-           matched = shared;
+           nmatches_vars = nshared_vars;
+           nmatches_exprs = nshared_exprs;
+           matched_info = info;
         }
     }
  
     /* No match? */
     if (statOid == InvalidOid)
         return false;
-   Assert(nmatches > 1 && matched != NULL);
+
+   Assert(nmatches_vars + nmatches_exprs > 1);
  
     stats = statext_ndistinct_load(statOid);
  
@@ -3956,20 +3994,135 @@ estimate_multivariate_ndistinct(PlannerInfo *root, RelOptInfo *rel,
         int         i;
         List       *newlist = NIL;
         MVNDistinctItem *item = NULL;
+       ListCell   *lc2;
+       Bitmapset  *matched = NULL;
+       AttrNumber  attnum_offset;
+
+       /*
+        * How much we need to offset the attnums? If there are no
+        * expressions, no offset is needed. Otherwise offset enough to move
+        * the lowest one (which is equal to number of expressions) to 1.
+        */
+       if (matched_info->exprs)
+           attnum_offset = (list_length(matched_info->exprs) + 1);
+       else
+           attnum_offset = 0;
+
+       /* see what actually matched */
+       foreach(lc2, *varinfos)
+       {
+           ListCell   *lc3;
+           int         idx;
+           bool        found = false;
+
+           GroupVarInfo *varinfo = (GroupVarInfo *) lfirst(lc2);
+
+           /*
+            * Process a simple Var expression, by matching it to keys
+            * directly. If there's a matchine expression, we'll try
+            * matching it later.
+            */
+           if (IsA(varinfo->var, Var))
+           {
+               AttrNumber  attnum = ((Var *) varinfo->var)->varattno;
+
+               /*
+                * Ignore expressions on system attributes. Can't rely on
+                * the bms check for negative values.
+                */
+               if (!AttrNumberIsForUserDefinedAttr(attnum))
+                   continue;
+
+               /* Is the variable covered by the statistics? */
+               if (!bms_is_member(attnum, matched_info->keys))
+                   continue;
+
+               attnum = attnum + attnum_offset;
+
+               /* ensure sufficient offset */
+               Assert(AttrNumberIsForUserDefinedAttr(attnum));
+
+               matched = bms_add_member(matched, attnum);
+
+               found = true;
+           }
+
+           /*
+            * XXX Maybe we should allow searching the expressions even if we
+            * found an attribute matching the expression? That would handle
+            * trivial expressions like "(a)" but it seems fairly useless.
+            */
+           if (found)
+               continue;
+
+           /* expression - see if it's in the statistics */
+           idx = 0;
+           foreach(lc3, matched_info->exprs)
+           {
+               Node       *expr = (Node *) lfirst(lc3);
+
+               if (equal(varinfo->var, expr))
+               {
+                   AttrNumber  attnum = -(idx + 1);
+
+                   attnum = attnum + attnum_offset;
+
+                   /* ensure sufficient offset */
+                   Assert(AttrNumberIsForUserDefinedAttr(attnum));
+
+                   matched = bms_add_member(matched, attnum);
+
+                   /* there should be just one matching expression */
+                   break;
+               }
+
+               idx++;
+           }
+       }
  
         /* Find the specific item that exactly matches the combination */
         for (i = 0; i < stats->nitems; i++)
         {
+           int         j;
             MVNDistinctItem *tmpitem = &stats->items[i];
  
-           if (bms_subset_compare(tmpitem->attrs, matched) == BMS_EQUAL)
+           if (tmpitem->nattributes != bms_num_members(matched))
+               continue;
+
+           /* assume it's the right item */
+           item = tmpitem;
+
+           /* check that all item attributes/expressions fit the match */
+           for (j = 0; j < tmpitem->nattributes; j++)
             {
-               item = tmpitem;
-               break;
+               AttrNumber  attnum = tmpitem->attributes[j];
+
+               /*
+                * Thanks to how we constructed the matched bitmap above, we
+                * can just offset all attnums the same way.
+                */
+               attnum = attnum + attnum_offset;
+
+               if (!bms_is_member(attnum, matched))
+               {
+                   /* nah, it's not this item */
+                   item = NULL;
+                   break;
+               }
             }
+
+           /*
+            * If the item has all the matched attributes, we know it's the
+            * right one - there can't be a better one. matching more.
+            */
+           if (item)
+               break;
         }
  
-       /* make sure we found an item */
+       /*
+        * Make sure we found an item. There has to be one, because ndistinct
+        * statistics includes all combinations of attributes.
+        */
         if (!item)
             elog(ERROR, "corrupt MVNDistinct entry");
  
@@ -3977,18 +4130,63 @@ estimate_multivariate_ndistinct(PlannerInfo *root, RelOptInfo *rel,
         foreach(lc, *varinfos)
         {
             GroupVarInfo *varinfo = (GroupVarInfo *) lfirst(lc);
-           AttrNumber  attnum;
+           ListCell   *lc3;
+           bool        found = false;
  
-           if (!IsA(varinfo->var, Var))
+           /*
+            * Let's look at plain variables first, because it's the most
+            * common case and the check is quite cheap. We can simply get the
+            * attnum and check (with an offset) matched bitmap.
+            */
+           if (IsA(varinfo->var, Var))
             {
-               newlist = lappend(newlist, varinfo);
+               AttrNumber  attnum = ((Var *) varinfo->var)->varattno;
+
+               /*
+                * If it's a system attribute, we're done. We don't support
+                * extended statistics on system attributes, so it's clearly
+                * not matched. Just keep the expression and continue.
+                */
+               if (!AttrNumberIsForUserDefinedAttr(attnum))
+               {
+                   newlist = lappend(newlist, varinfo);
+                   continue;
+               }
+
+               /* apply the same offset as above */
+               attnum += attnum_offset;
+
+               /* if it's not matched, keep the varinfo */
+               if (!bms_is_member(attnum, matched))
+                   newlist = lappend(newlist, varinfo);
+
+               /* The rest of the loop deals with complex expressions. */
                 continue;
             }
  
-           attnum = ((Var *) varinfo->var)->varattno;
+           /*
+            * Process complex expressions, not just simple Vars.
+            *
+            * First, we search for an exact match of an expression. If we
+            * find one, we can just discard the whole GroupExprInfo, with all
+            * the variables we extracted from it.
+            *
+            * Otherwise we inspect the individual vars, and try matching it
+            * to variables in the item.
+            */
+           foreach(lc3, matched_info->exprs)
+           {
+               Node       *expr = (Node *) lfirst(lc3);
+
+               if (equal(varinfo->var, expr))
+               {
+                   found = true;
+                   break;
+               }
+           }
  
-           if (AttrNumberIsForUserDefinedAttr(attnum) &&
-               bms_is_member(attnum, matched))
+           /* found exact match, skip */
+           if (found)
                 continue;
  
             newlist = lappend(newlist, varinfo);
@@ -4690,6 +4888,13 @@ get_join_variables(PlannerInfo *root, List *args, SpecialJoinInfo *sjinfo,
         *join_is_reversed = false;
  }
  
+/* statext_expressions_load copies the tuple, so just pfree it. */
+static void
+ReleaseDummy(HeapTuple tuple)
+{
+   pfree(tuple);
+}
+
  /*
   * examine_variable
   *     Try to look up statistical data about an expression.
@@ -4830,6 +5035,7 @@ examine_variable(PlannerInfo *root, Node *node, int varRelid,
          * operator we are estimating for.  FIXME later.
          */
         ListCell   *ilist;
+       ListCell   *slist;
  
         foreach(ilist, onerel->indexlist)
         {
@@ -4986,6 +5192,129 @@ examine_variable(PlannerInfo *root, Node *node, int varRelid,
             if (vardata->statsTuple)
                 break;
         }
+
+       /*
+        * Search extended statistics for one with a matching expression.
+        * There might be multiple ones, so just grab the first one. In the
+        * future, we might consider the statistics target (and pick the most
+        * accurate statistics) and maybe some other parameters.
+        */
+       foreach(slist, onerel->statlist)
+       {
+           StatisticExtInfo *info = (StatisticExtInfo *) lfirst(slist);
+           ListCell   *expr_item;
+           int         pos;
+
+           /*
+            * Stop once we've found statistics for the expression (either
+            * from extended stats, or for an index in the preceding loop).
+            */
+           if (vardata->statsTuple)
+               break;
+
+           /* skip stats without per-expression stats */
+           if (info->kind != STATS_EXT_EXPRESSIONS)
+               continue;
+
+           pos = 0;
+           foreach(expr_item, info->exprs)
+           {
+               Node       *expr = (Node *) lfirst(expr_item);
+
+               Assert(expr);
+
+               /* strip RelabelType before comparing it */
+               if (expr && IsA(expr, RelabelType))
+                   expr = (Node *) ((RelabelType *) expr)->arg;
+
+               /* found a match, see if we can extract pg_statistic row */
+               if (equal(node, expr))
+               {
+                   HeapTuple   t = statext_expressions_load(info->statOid, pos);
+
+                   /* Get index's table for permission check */
+                   RangeTblEntry *rte;
+                   Oid         userid;
+
+                   vardata->statsTuple = t;
+
+                   /*
+                    * XXX Not sure if we should cache the tuple somewhere.
+                    * Now we just create a new copy every time.
+                    */
+                   vardata->freefunc = ReleaseDummy;
+
+                   rte = planner_rt_fetch(onerel->relid, root);
+                   Assert(rte->rtekind == RTE_RELATION);
+
+                   /*
+                    * Use checkAsUser if it's set, in case we're accessing
+                    * the table via a view.
+                    */
+                   userid = rte->checkAsUser ? rte->checkAsUser : GetUserId();
+
+                   /*
+                    * For simplicity, we insist on the whole table being
+                    * selectable, rather than trying to identify which
+                    * column(s) the statistics depends on.  Also require all
+                    * rows to be selectable --- there must be no
+                    * securityQuals from security barrier views or RLS
+                    * policies.
+                    */
+                   vardata->acl_ok =
+                       rte->securityQuals == NIL &&
+                       (pg_class_aclcheck(rte->relid, userid,
+                                          ACL_SELECT) == ACLCHECK_OK);
+
+                   /*
+                    * If the user doesn't have permissions to access an
+                    * inheritance child relation, check the permissions of
+                    * the table actually mentioned in the query, since most
+                    * likely the user does have that permission.  Note that
+                    * whole-table select privilege on the parent doesn't
+                    * quite guarantee that the user could read all columns of
+                    * the child. But in practice it's unlikely that any
+                    * interesting security violation could result from
+                    * allowing access to the expression stats, so we allow it
+                    * anyway.  See similar code in examine_simple_variable()
+                    * for additional comments.
+                    */
+                   if (!vardata->acl_ok &&
+                       root->append_rel_array != NULL)
+                   {
+                       AppendRelInfo *appinfo;
+                       Index       varno = onerel->relid;
+
+                       appinfo = root->append_rel_array[varno];
+                       while (appinfo &&
+                              planner_rt_fetch(appinfo->parent_relid,
+                                               root)->rtekind == RTE_RELATION)
+                       {
+                           varno = appinfo->parent_relid;
+                           appinfo = root->append_rel_array[varno];
+                       }
+                       if (varno != onerel->relid)
+                       {
+                           /* Repeat access check on this rel */
+                           rte = planner_rt_fetch(varno, root);
+                           Assert(rte->rtekind == RTE_RELATION);
+
+                           userid = rte->checkAsUser ? rte->checkAsUser : GetUserId();
+
+                           vardata->acl_ok =
+                               rte->securityQuals == NIL &&
+                               (pg_class_aclcheck(rte->relid,
+                                                  userid,
+                                                  ACL_SELECT) == ACLCHECK_OK);
+                       }
+                   }
+
+                   break;
+               }
+
+               pos++;
+           }
+       }
     }
  }
  
diff --git a/src/bin/pg_dump/t/002_pg_dump.pl b/src/bin/pg_dump/t/002_pg_dump.pl

index 737e46464ab7ea3e917070fe0659a3ce0fd38db7..86113df29c46820a14a04c72c68484e880d85182 100644 (file)
--- a/src/bin/pg_dump/t/002_pg_dump.pl
+++ b/src/bin/pg_dump/t/002_pg_dump.pl
@@ -2637,6 +2637,18 @@ my %tests = (
         unlike => { exclude_dump_test_schema => 1, },
     },
  
+   'CREATE STATISTICS extended_stats_expression' => {
+       create_order => 99,
+       create_sql   => 'CREATE STATISTICS dump_test.test_ext_stats_expr
+                           ON (2 * col1) FROM dump_test.test_fifth_table',
+       regexp => qr/^
+           \QCREATE STATISTICS dump_test.test_ext_stats_expr ON ((2 * col1)) FROM dump_test.test_fifth_table;\E
+           /xms,
+       like =>
+         { %full_runs, %dump_test_schema_runs, section_post_data => 1, },
+       unlike => { exclude_dump_test_schema => 1, },
+   },
+
     'CREATE SEQUENCE test_table_col1_seq' => {
         regexp => qr/^
             \QCREATE SEQUENCE dump_test.test_table_col1_seq\E
diff --git a/src/bin/psql/describe.c b/src/bin/psql/describe.c

index e56cc43e1114f43dd6523d686cb76a71f26a483c..440249ff69df157bf83caf04bea2b500b2d79e24 100644 (file)
--- a/src/bin/psql/describe.c
+++ b/src/bin/psql/describe.c
@@ -2712,7 +2712,104 @@ describeOneTableDetails(const char *schemaname,
         }
  
         /* print any extended statistics */
-       if (pset.sversion >= 100000)
+       if (pset.sversion >= 140000)
+       {
+           printfPQExpBuffer(&buf,
+                             "SELECT oid, "
+                             "stxrelid::pg_catalog.regclass, "
+                             "stxnamespace::pg_catalog.regnamespace AS nsp, "
+                             "stxname,\n"
+                             "pg_get_statisticsobjdef_columns(oid) AS columns,\n"
+                             "  'd' = any(stxkind) AS ndist_enabled,\n"
+                             "  'f' = any(stxkind) AS deps_enabled,\n"
+                             "  'm' = any(stxkind) AS mcv_enabled,\n"
+                             "stxstattarget\n"
+                             "FROM pg_catalog.pg_statistic_ext stat\n"
+                             "WHERE stxrelid = '%s'\n"
+                             "ORDER BY 1;",
+                             oid);
+
+           result = PSQLexec(buf.data);
+           if (!result)
+               goto error_return;
+           else
+               tuples = PQntuples(result);
+
+           if (tuples > 0)
+           {
+               printTableAddFooter(&cont, _("Statistics objects:"));
+
+               for (i = 0; i < tuples; i++)
+               {
+                   bool        gotone = false;
+                   bool        has_ndistinct;
+                   bool        has_dependencies;
+                   bool        has_mcv;
+                   bool        has_all;
+                   bool        has_some;
+
+                   has_ndistinct = (strcmp(PQgetvalue(result, i, 5), "t") == 0);
+                   has_dependencies = (strcmp(PQgetvalue(result, i, 6), "t") == 0);
+                   has_mcv = (strcmp(PQgetvalue(result, i, 7), "t") == 0);
+
+                   printfPQExpBuffer(&buf, "    ");
+
+                   /* statistics object name (qualified with namespace) */
+                   appendPQExpBuffer(&buf, "\"%s\".\"%s\"",
+                                     PQgetvalue(result, i, 2),
+                                     PQgetvalue(result, i, 3));
+
+                   /*
+                    * When printing kinds we ignore expression statistics,
+                    * which is used only internally and can't be specified by
+                    * user. We don't print the kinds when either none are
+                    * specified (in which case it has to be statistics on a
+                    * single expr) or when all are specified (in which case
+                    * we assume it's expanded by CREATE STATISTICS).
+                    */
+                   has_all = (has_ndistinct && has_dependencies && has_mcv);
+                   has_some = (has_ndistinct || has_dependencies || has_mcv);
+
+                   if (has_some && !has_all)
+                   {
+                       appendPQExpBuffer(&buf, " (");
+
+                       /* options */
+                       if (has_ndistinct)
+                       {
+                           appendPQExpBufferStr(&buf, "ndistinct");
+                           gotone = true;
+                       }
+
+                       if (has_dependencies)
+                       {
+                           appendPQExpBuffer(&buf, "%sdependencies", gotone ? ", " : "");
+                           gotone = true;
+                       }
+
+                       if (has_mcv)
+                       {
+                           appendPQExpBuffer(&buf, "%smcv", gotone ? ", " : "");
+                       }
+
+                       appendPQExpBuffer(&buf, ")");
+                   }
+
+                   appendPQExpBuffer(&buf, " ON %s FROM %s",
+                                     PQgetvalue(result, i, 4),
+                                     PQgetvalue(result, i, 1));
+
+                   /* Show the stats target if it's not default */
+                   if (strcmp(PQgetvalue(result, i, 8), "-1") != 0)
+                       appendPQExpBuffer(&buf, "; STATISTICS %s",
+                                         PQgetvalue(result, i, 8));
+
+                   printTableAddFooter(&cont, buf.data);
+               }
+           }
+           PQclear(result);
+       }
+       else if (pset.sversion >= 100000)
         {
             printfPQExpBuffer(&buf,
                               "SELECT oid, "
@@ -4468,18 +4565,27 @@ listExtendedStats(const char *pattern)
     printfPQExpBuffer(&buf,
                       "SELECT \n"
                       "es.stxnamespace::pg_catalog.regnamespace::text AS \"%s\", \n"
-                     "es.stxname AS \"%s\", \n"
-                     "pg_catalog.format('%%s FROM %%s', \n"
-                     "  (SELECT pg_catalog.string_agg(pg_catalog.quote_ident(a.attname),', ') \n"
-                     "   FROM pg_catalog.unnest(es.stxkeys) s(attnum) \n"
-                     "   JOIN pg_catalog.pg_attribute a \n"
-                     "   ON (es.stxrelid = a.attrelid \n"
-                     "   AND a.attnum = s.attnum \n"
-                     "   AND NOT a.attisdropped)), \n"
-                     "es.stxrelid::regclass) AS \"%s\"",
+                     "es.stxname AS \"%s\", \n",
                       gettext_noop("Schema"),
-                     gettext_noop("Name"),
-                     gettext_noop("Definition"));
+                     gettext_noop("Name"));
+
+   if (pset.sversion >= 140000)
+       appendPQExpBuffer(&buf,
+                         "pg_catalog.format('%%s FROM %%s', \n"
+                         "  pg_get_statisticsobjdef_columns(es.oid), \n"
+                         "  es.stxrelid::regclass) AS \"%s\"",
+                         gettext_noop("Definition"));
+   else
+       appendPQExpBuffer(&buf,
+                         "pg_catalog.format('%%s FROM %%s', \n"
+                         "  (SELECT pg_catalog.string_agg(pg_catalog.quote_ident(a.attname),', ') \n"
+                         "   FROM pg_catalog.unnest(es.stxkeys) s(attnum) \n"
+                         "   JOIN pg_catalog.pg_attribute a \n"
+                         "   ON (es.stxrelid = a.attrelid \n"
+                         "   AND a.attnum = s.attnum \n"
+                         "   AND NOT a.attisdropped)), \n"
+                         "es.stxrelid::regclass) AS \"%s\"",
+                         gettext_noop("Definition"));
  
     appendPQExpBuffer(&buf,
                       ",\nCASE WHEN 'd' = any(es.stxkind) THEN 'defined' \n"
diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h

index 474ee2982b89946d611fdc5b49cc9066b073a01e..4a39da3c9d494d09e5e1d05bb722a3ebd3332f22 100644 (file)
--- a/src/include/catalog/catversion.h
+++ b/src/include/catalog/catversion.h
@@ -53,6 +53,6 @@
   */
  
  /*                         yyyymmddN */
-#define CATALOG_VERSION_NO 202103265
+#define CATALOG_VERSION_NO 202103266
  
  #endif
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat

index ecf12f463986026749bf85b99ec90ed0409ce7ac..cc7d90d2b0bd8c5ae126e3e1fdd9c567a7b7e58f 100644 (file)
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -3658,6 +3658,14 @@
    proname => 'pg_get_statisticsobjdef', provolatile => 's',
    prorettype => 'text', proargtypes => 'oid',
    prosrc => 'pg_get_statisticsobjdef' },
+{ oid => '8887', descr => 'extended statistics columns',
+  proname => 'pg_get_statisticsobjdef_columns', provolatile => 's',
+  prorettype => 'text', proargtypes => 'oid',
+  prosrc => 'pg_get_statisticsobjdef_columns' },
+{ oid => '8886', descr => 'extended statistics expressions',
+  proname => 'pg_get_statisticsobjdef_expressions', provolatile => 's',
+  prorettype => '_text', proargtypes => 'oid',
+  prosrc => 'pg_get_statisticsobjdef_expressions' },
  { oid => '3352', descr => 'partition key description',
    proname => 'pg_get_partkeydef', provolatile => 's', prorettype => 'text',
    proargtypes => 'oid', prosrc => 'pg_get_partkeydef' },
diff --git a/src/include/catalog/pg_statistic_ext.h b/src/include/catalog/pg_statistic_ext.h

index 29649f58143b8f431d86291283bd6c4a0f701e78..36912ce528e002ea866f9b574f364d76232cd2ca 100644 (file)
--- a/src/include/catalog/pg_statistic_ext.h
+++ b/src/include/catalog/pg_statistic_ext.h
@@ -54,6 +54,9 @@ CATALOG(pg_statistic_ext,3381,StatisticExtRelationId)
  #ifdef CATALOG_VARLEN
     char        stxkind[1] BKI_FORCE_NOT_NULL;  /* statistics kinds requested
                                                  * to build */
+   pg_node_tree stxexprs;      /* A list of expression trees for stats
+                                * attributes that are not simple column
+                                * references. */
  #endif
  
  } FormData_pg_statistic_ext;
@@ -81,6 +84,7 @@ DECLARE_ARRAY_FOREIGN_KEY((stxrelid, stxkeys), pg_attribute, (attrelid, attnum))
  #define STATS_EXT_NDISTINCT            'd'
  #define STATS_EXT_DEPENDENCIES     'f'
  #define STATS_EXT_MCV              'm'
+#define STATS_EXT_EXPRESSIONS      'e'
  
  #endif                         /* EXPOSE_TO_CLIENT_CODE */
  
diff --git a/src/include/catalog/pg_statistic_ext_data.h b/src/include/catalog/pg_statistic_ext_data.h

index 2f2577c21887754c178c7996c0289769de690ab4..572915438377a08919cc69d7757e0c866c10942f 100644 (file)
--- a/src/include/catalog/pg_statistic_ext_data.h
+++ b/src/include/catalog/pg_statistic_ext_data.h
@@ -38,6 +38,7 @@ CATALOG(pg_statistic_ext_data,3429,StatisticExtDataRelationId)
     pg_ndistinct stxdndistinct; /* ndistinct coefficients (serialized) */
     pg_dependencies stxddependencies;   /* dependencies (serialized) */
     pg_mcv_list stxdmcv;        /* MCV (serialized) */
+   pg_statistic stxdexpr[1];   /* stats for expressions */
  
  #endif
  
diff --git a/src/include/commands/defrem.h b/src/include/commands/defrem.h

index 1a79540c94deba58b998a12363f2792ab0d773ec..339f29f4c852e9e237f546fe0abf09af06516f80 100644 (file)
--- a/src/include/commands/defrem.h
+++ b/src/include/commands/defrem.h
@@ -81,9 +81,7 @@ extern ObjectAddress AlterOperator(AlterOperatorStmt *stmt);
  extern ObjectAddress CreateStatistics(CreateStatsStmt *stmt);
  extern ObjectAddress AlterStatistics(AlterStatsStmt *stmt);
  extern void RemoveStatisticsById(Oid statsOid);
-extern void UpdateStatisticsForTypeChange(Oid statsOid,
-                                         Oid relationOid, int attnum,
-                                         Oid oldColumnType, Oid newColumnType);
+extern Oid StatisticsGetRelation(Oid statId, bool missing_ok);
  
  /* commands/aggregatecmds.c */
  extern ObjectAddress DefineAggregate(ParseState *pstate, List *name, List *args, bool oldstyle,
diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h

index e22df890ef4cf522199d339cc725568494064dd8..299956f32987f75cea18c3019c848749c9670a86 100644 (file)
--- a/src/include/nodes/nodes.h
+++ b/src/include/nodes/nodes.h
@@ -454,6 +454,7 @@ typedef enum NodeTag
     T_TypeName,
     T_ColumnDef,
     T_IndexElem,
+   T_StatsElem,
     T_Constraint,
     T_DefElem,
     T_RangeTblEntry,
diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h

index 0ce19d98ec8147c5239b81bfa7e2fa698b2ebf3f..12e0e026dce178550a81a9236e98b82309804506 100644 (file)
--- a/src/include/nodes/parsenodes.h
+++ b/src/include/nodes/parsenodes.h
@@ -1914,7 +1914,8 @@ typedef enum AlterTableType
     AT_AddIdentity,             /* ADD IDENTITY */
     AT_SetIdentity,             /* SET identity column options */
     AT_DropIdentity,            /* DROP IDENTITY */
-   AT_AlterCollationRefreshVersion /* ALTER COLLATION ... REFRESH VERSION */
+   AT_AlterCollationRefreshVersion, /* ALTER COLLATION ... REFRESH VERSION */
+   AT_ReAddStatistics          /* internal to commands/tablecmds.c */
  } AlterTableType;
  
  typedef struct ReplicaIdentityStmt
@@ -2872,8 +2873,24 @@ typedef struct CreateStatsStmt
     List       *relations;      /* rels to build stats on (list of RangeVar) */
     char       *stxcomment;     /* comment to apply to stats, or NULL */
     bool        if_not_exists;  /* do nothing if stats name already exists */
+   bool        transformed;    /* true when transformStatsStmt is finished */
  } CreateStatsStmt;
  
+/*
+ * StatsElem - statistics parameters (used in CREATE STATISTICS)
+ *
+ * For a plain attribute, 'name' is the name of the referenced table column
+ * and 'expr' is NULL.  For an expression, 'name' is NULL and 'expr' is the
+ * expression tree.
+ */
+typedef struct StatsElem
+{
+   NodeTag     type;
+   char       *name;           /* name of attribute to index, or NULL */
+   Node       *expr;           /* expression to index, or NULL */
+} StatsElem;
+
+
  /* ----------------------
   *     Alter Statistics Statement
   * ----------------------
diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h

index c13642e35eb45e0917c442dc07e2d2911ceacfaf..e4b554f811405746a90316fc2a50ed48749372bf 100644 (file)
--- a/src/include/nodes/pathnodes.h
+++ b/src/include/nodes/pathnodes.h
@@ -923,6 +923,7 @@ typedef struct StatisticExtInfo
     RelOptInfo *rel;            /* back-link to statistic's table */
     char        kind;           /* statistics kind of this entry */
     Bitmapset  *keys;           /* attnums of the columns covered */
+   List       *exprs;          /* expressions */
  } StatisticExtInfo;
  
  /*
diff --git a/src/include/parser/parse_node.h b/src/include/parser/parse_node.h

index 176b9f37c1f8f4e17153d5d63231570a95e9655a..a71d7e1f74fbd0328d73d0973e3c7d663be3b253 100644 (file)
--- a/src/include/parser/parse_node.h
+++ b/src/include/parser/parse_node.h
@@ -69,6 +69,7 @@ typedef enum ParseExprKind
     EXPR_KIND_FUNCTION_DEFAULT, /* default parameter value for function */
     EXPR_KIND_INDEX_EXPRESSION, /* index expression */
     EXPR_KIND_INDEX_PREDICATE,  /* index predicate */
+   EXPR_KIND_STATS_EXPRESSION, /* extended statistics expression */
     EXPR_KIND_ALTER_COL_TRANSFORM,  /* transform expr in ALTER COLUMN TYPE */
     EXPR_KIND_EXECUTE_PARAMETER,    /* parameter value in EXECUTE */
     EXPR_KIND_TRIGGER_WHEN,     /* WHEN condition in CREATE TRIGGER */
diff --git a/src/include/parser/parse_utilcmd.h b/src/include/parser/parse_utilcmd.h

index bfa4a6b0f2099c8c512d5d1529e1b4254f6a9347..1056bf081b1062a3f51092fc69b0db0dcecbdfb9 100644 (file)
--- a/src/include/parser/parse_utilcmd.h
+++ b/src/include/parser/parse_utilcmd.h
@@ -26,6 +26,8 @@ extern AlterTableStmt *transformAlterTableStmt(Oid relid, AlterTableStmt *stmt,
                                                List **afterStmts);
  extern IndexStmt *transformIndexStmt(Oid relid, IndexStmt *stmt,
                                      const char *queryString);
+extern CreateStatsStmt *transformStatsStmt(Oid relid, CreateStatsStmt *stmt,
+                                          const char *queryString);
  extern void transformRuleStmt(RuleStmt *stmt, const char *queryString,
                               List **actions, Node **whereClause);
  extern List *transformCreateSchemaStmt(CreateSchemaStmt *stmt);
diff --git a/src/include/statistics/extended_stats_internal.h b/src/include/statistics/extended_stats_internal.h

index a0a3cf5b0fabc531794dc32e2f8f099ff8b5746e..55cd9252a55b34b7d370359b4c06535f79d001c3 100644 (file)
--- a/src/include/statistics/extended_stats_internal.h
+++ b/src/include/statistics/extended_stats_internal.h
@@ -57,19 +57,27 @@ typedef struct SortItem
     int         count;
  } SortItem;
  
-extern MVNDistinct *statext_ndistinct_build(double totalrows,
-                                           int numrows, HeapTuple *rows,
-                                           Bitmapset *attrs, VacAttrStats **stats);
+/* a unified representation of the data the statistics is built on */
+typedef struct StatsBuildData
+{
+   int         numrows;
+   int         nattnums;
+   AttrNumber *attnums;
+   VacAttrStats **stats;
+   Datum     **values;
+   bool      **nulls;
+} StatsBuildData;
+
+
+extern MVNDistinct *statext_ndistinct_build(double totalrows, StatsBuildData *data);
  extern bytea *statext_ndistinct_serialize(MVNDistinct *ndistinct);
  extern MVNDistinct *statext_ndistinct_deserialize(bytea *data);
  
-extern MVDependencies *statext_dependencies_build(int numrows, HeapTuple *rows,
-                                                 Bitmapset *attrs, VacAttrStats **stats);
+extern MVDependencies *statext_dependencies_build(StatsBuildData *data);
  extern bytea *statext_dependencies_serialize(MVDependencies *dependencies);
  extern MVDependencies *statext_dependencies_deserialize(bytea *data);
  
-extern MCVList *statext_mcv_build(int numrows, HeapTuple *rows,
-                                 Bitmapset *attrs, VacAttrStats **stats,
+extern MCVList *statext_mcv_build(StatsBuildData *data,
                                   double totalrows, int stattarget);
  extern bytea *statext_mcv_serialize(MCVList *mcv, VacAttrStats **stats);
  extern MCVList *statext_mcv_deserialize(bytea *data);
@@ -85,14 +93,14 @@ extern int  multi_sort_compare_dims(int start, int end, const SortItem *a,
  extern int compare_scalars_simple(const void *a, const void *b, void *arg);
  extern int compare_datums_simple(Datum a, Datum b, SortSupport ssup);
  
-extern AttrNumber *build_attnums_array(Bitmapset *attrs, int *numattrs);
+extern AttrNumber *build_attnums_array(Bitmapset *attrs, int nexprs, int *numattrs);
  
-extern SortItem *build_sorted_items(int numrows, int *nitems, HeapTuple *rows,
-                                   TupleDesc tdesc, MultiSortSupport mss,
+extern SortItem *build_sorted_items(StatsBuildData *data, int *nitems,
+                                   MultiSortSupport mss,
                                     int numattrs, AttrNumber *attnums);
  
-extern bool examine_clause_args(List *args, Var **varp,
-                               Const **cstp, bool *varonleftp);
+extern bool examine_opclause_args(List *args, Node **exprp,
+                                 Const **cstp, bool *expronleftp);
  
  extern Selectivity mcv_combine_selectivities(Selectivity simple_sel,
                                              Selectivity mcv_sel,
diff --git a/src/include/statistics/statistics.h b/src/include/statistics/statistics.h

index fec50688ea490c118ab0cd685feecd39028b27b0..326cf26feae7b60b67464d8b2628bf70b80f993d 100644 (file)
--- a/src/include/statistics/statistics.h
+++ b/src/include/statistics/statistics.h
@@ -26,7 +26,8 @@
  typedef struct MVNDistinctItem
  {
     double      ndistinct;      /* ndistinct value for this combination */
-   Bitmapset  *attrs;          /* attr numbers of items */
+   int         nattributes;    /* number of attributes */
+   AttrNumber *attributes;     /* attribute numbers */
  } MVNDistinctItem;
  
  /* A MVNDistinct object, comprising all possible combinations of columns */
@@ -121,6 +122,8 @@ extern Selectivity statext_clauselist_selectivity(PlannerInfo *root,
  extern bool has_stats_of_kind(List *stats, char requiredkind);
  extern StatisticExtInfo *choose_best_statistics(List *stats, char requiredkind,
                                                 Bitmapset **clause_attnums,
+                                               List **clause_exprs,
                                                 int nclauses);
+extern HeapTuple statext_expressions_load(Oid stxoid, int idx);
  
  #endif                         /* STATISTICS_H */
diff --git a/src/include/utils/ruleutils.h b/src/include/utils/ruleutils.h

index ac3d0a6742f2a286383e1ee5b0ed42b2df5ef90c..d333e5e8a56d03f505d1bc7d7a1e60cefdc74046 100644 (file)
--- a/src/include/utils/ruleutils.h
+++ b/src/include/utils/ruleutils.h
@@ -41,4 +41,6 @@ extern char *generate_collation_name(Oid collid);
  extern char *generate_opclass_name(Oid opclass);
  extern char *get_range_partbound_string(List *bound_datums);
  
+extern char *pg_get_statisticsobjdef_string(Oid statextid);
+
  #endif                         /* RULEUTILS_H */
diff --git a/src/test/regress/expected/create_table_like.out b/src/test/regress/expected/create_table_like.out

index 10d17be23cf6680c6a2d7cb984e87f9def902c60..4dc5e6aa5fb899634da29771864225a562b6a791 100644 (file)
--- a/src/test/regress/expected/create_table_like.out
+++ b/src/test/regress/expected/create_table_like.out
@@ -304,7 +304,9 @@ CREATE TABLE ctlt1 (a text CHECK (length(a) > 2) PRIMARY KEY, b text);
  CREATE INDEX ctlt1_b_key ON ctlt1 (b);
  CREATE INDEX ctlt1_fnidx ON ctlt1 ((a || b));
  CREATE STATISTICS ctlt1_a_b_stat ON a,b FROM ctlt1;
+CREATE STATISTICS ctlt1_expr_stat ON (a || b) FROM ctlt1;
  COMMENT ON STATISTICS ctlt1_a_b_stat IS 'ab stats';
+COMMENT ON STATISTICS ctlt1_expr_stat IS 'ab expr stats';
  COMMENT ON COLUMN ctlt1.a IS 'A';
  COMMENT ON COLUMN ctlt1.b IS 'B';
  COMMENT ON CONSTRAINT ctlt1_a_check ON ctlt1 IS 't1_a_check';
@@ -414,7 +416,8 @@ Indexes:
  Check constraints:
      "ctlt1_a_check" CHECK (length(a) > 2)
  Statistics objects:
-    "public"."ctlt_all_a_b_stat" (ndistinct, dependencies, mcv) ON a, b FROM ctlt_all
+    "public"."ctlt_all_a_b_stat" ON a, b FROM ctlt_all
+    "public"."ctlt_all_expr_stat" ON ((a || b)) FROM ctlt_all
  
  SELECT c.relname, objsubid, description FROM pg_description, pg_index i, pg_class c WHERE classoid = 'pg_class'::regclass AND objoid = i.indexrelid AND c.oid = i.indexrelid AND i.indrelid = 'ctlt_all'::regclass ORDER BY c.relname, objsubid;
      relname     | objsubid | description 
@@ -424,10 +427,11 @@ SELECT c.relname, objsubid, description FROM pg_description, pg_index i, pg_clas
  (2 rows)
  
  SELECT s.stxname, objsubid, description FROM pg_description, pg_statistic_ext s WHERE classoid = 'pg_statistic_ext'::regclass AND objoid = s.oid AND s.stxrelid = 'ctlt_all'::regclass ORDER BY s.stxname, objsubid;
-      stxname      | objsubid | description 
--------------------+----------+-------------
- ctlt_all_a_b_stat |        0 | ab stats
-(1 row)
+      stxname       | objsubid |  description  
+--------------------+----------+---------------
+ ctlt_all_a_b_stat  |        0 | ab stats
+ ctlt_all_expr_stat |        0 | ab expr stats
+(2 rows)
  
  CREATE TABLE inh_error1 () INHERITS (ctlt1, ctlt4);
  NOTICE:  merging multiple inherited definitions of column "a"
@@ -452,7 +456,8 @@ Indexes:
  Check constraints:
      "ctlt1_a_check" CHECK (length(a) > 2)
  Statistics objects:
-    "public"."pg_attrdef_a_b_stat" (ndistinct, dependencies, mcv) ON a, b FROM public.pg_attrdef
+    "public"."pg_attrdef_a_b_stat" ON a, b FROM public.pg_attrdef
+    "public"."pg_attrdef_expr_stat" ON ((a || b)) FROM public.pg_attrdef
  
  DROP TABLE public.pg_attrdef;
  -- Check that LIKE isn't confused when new table masks the old, either
@@ -473,7 +478,8 @@ Indexes:
  Check constraints:
      "ctlt1_a_check" CHECK (length(a) > 2)
  Statistics objects:
-    "ctl_schema"."ctlt1_a_b_stat" (ndistinct, dependencies, mcv) ON a, b FROM ctlt1
+    "ctl_schema"."ctlt1_a_b_stat" ON a, b FROM ctlt1
+    "ctl_schema"."ctlt1_expr_stat" ON ((a || b)) FROM ctlt1
  
  ROLLBACK;
  DROP TABLE ctlt1, ctlt2, ctlt3, ctlt4, ctlt12_storage, ctlt12_comments, ctlt1_inh, ctlt13_inh, ctlt13_like, ctlt_all, ctla, ctlb CASCADE;
diff --git a/src/test/regress/expected/oidjoins.out b/src/test/regress/expected/oidjoins.out

index 50d046d3ef1ad5098d1f1dfddbeb59b55b9f9eed..1461e947cdf6db04b57f4c05f265995a551a1874 100644 (file)
--- a/src/test/regress/expected/oidjoins.out
+++ b/src/test/regress/expected/oidjoins.out
@@ -151,11 +151,6 @@ NOTICE:  checking pg_aggregate {aggmfinalfn} => pg_proc {oid}
  NOTICE:  checking pg_aggregate {aggsortop} => pg_operator {oid}
  NOTICE:  checking pg_aggregate {aggtranstype} => pg_type {oid}
  NOTICE:  checking pg_aggregate {aggmtranstype} => pg_type {oid}
-NOTICE:  checking pg_statistic_ext {stxrelid} => pg_class {oid}
-NOTICE:  checking pg_statistic_ext {stxnamespace} => pg_namespace {oid}
-NOTICE:  checking pg_statistic_ext {stxowner} => pg_authid {oid}
-NOTICE:  checking pg_statistic_ext {stxrelid,stxkeys} => pg_attribute {attrelid,attnum}
-NOTICE:  checking pg_statistic_ext_data {stxoid} => pg_statistic_ext {oid}
  NOTICE:  checking pg_statistic {starelid} => pg_class {oid}
  NOTICE:  checking pg_statistic {staop1} => pg_operator {oid}
  NOTICE:  checking pg_statistic {staop2} => pg_operator {oid}
@@ -168,6 +163,11 @@ NOTICE:  checking pg_statistic {stacoll3} => pg_collation {oid}
  NOTICE:  checking pg_statistic {stacoll4} => pg_collation {oid}
  NOTICE:  checking pg_statistic {stacoll5} => pg_collation {oid}
  NOTICE:  checking pg_statistic {starelid,staattnum} => pg_attribute {attrelid,attnum}
+NOTICE:  checking pg_statistic_ext {stxrelid} => pg_class {oid}
+NOTICE:  checking pg_statistic_ext {stxnamespace} => pg_namespace {oid}
+NOTICE:  checking pg_statistic_ext {stxowner} => pg_authid {oid}
+NOTICE:  checking pg_statistic_ext {stxrelid,stxkeys} => pg_attribute {attrelid,attnum}
+NOTICE:  checking pg_statistic_ext_data {stxoid} => pg_statistic_ext {oid}
  NOTICE:  checking pg_rewrite {ev_class} => pg_class {oid}
  NOTICE:  checking pg_trigger {tgrelid} => pg_class {oid}
  NOTICE:  checking pg_trigger {tgparentid} => pg_trigger {oid}
diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out

index 9b12cc122a9f5e642c1251e7a07259df2e9d2eb5..9b59a7b4a5797708e516f03f5ad119b74cae1e99 100644 (file)
--- a/src/test/regress/expected/rules.out
+++ b/src/test/regress/expected/rules.out
@@ -2418,6 +2418,7 @@ pg_stats_ext| SELECT cn.nspname AS schemaname,
      ( SELECT array_agg(a.attname ORDER BY a.attnum) AS array_agg
             FROM (unnest(s.stxkeys) k(k)
               JOIN pg_attribute a ON (((a.attrelid = s.stxrelid) AND (a.attnum = k.k))))) AS attnames,
+    pg_get_statisticsobjdef_expressions(s.oid) AS exprs,
      s.stxkind AS kinds,
      sd.stxdndistinct AS n_distinct,
      sd.stxddependencies AS dependencies,
@@ -2439,6 +2440,78 @@ pg_stats_ext| SELECT cn.nspname AS schemaname,
             FROM (unnest(s.stxkeys) k(k)
               JOIN pg_attribute a ON (((a.attrelid = s.stxrelid) AND (a.attnum = k.k))))
            WHERE (NOT has_column_privilege(c.oid, a.attnum, 'select'::text))))) AND ((c.relrowsecurity = false) OR (NOT row_security_active(c.oid))));
+pg_stats_ext_exprs| SELECT cn.nspname AS schemaname,
+    c.relname AS tablename,
+    sn.nspname AS statistics_schemaname,
+    s.stxname AS statistics_name,
+    pg_get_userbyid(s.stxowner) AS statistics_owner,
+    stat.expr,
+    (stat.a).stanullfrac AS null_frac,
+    (stat.a).stawidth AS avg_width,
+    (stat.a).stadistinct AS n_distinct,
+        CASE
+            WHEN ((stat.a).stakind1 = 1) THEN (stat.a).stavalues1
+            WHEN ((stat.a).stakind2 = 1) THEN (stat.a).stavalues2
+            WHEN ((stat.a).stakind3 = 1) THEN (stat.a).stavalues3
+            WHEN ((stat.a).stakind4 = 1) THEN (stat.a).stavalues4
+            WHEN ((stat.a).stakind5 = 1) THEN (stat.a).stavalues5
+            ELSE NULL::anyarray
+        END AS most_common_vals,
+        CASE
+            WHEN ((stat.a).stakind1 = 1) THEN (stat.a).stanumbers1
+            WHEN ((stat.a).stakind2 = 1) THEN (stat.a).stanumbers2
+            WHEN ((stat.a).stakind3 = 1) THEN (stat.a).stanumbers3
+            WHEN ((stat.a).stakind4 = 1) THEN (stat.a).stanumbers4
+            WHEN ((stat.a).stakind5 = 1) THEN (stat.a).stanumbers5
+            ELSE NULL::real[]
+        END AS most_common_freqs,
+        CASE
+            WHEN ((stat.a).stakind1 = 2) THEN (stat.a).stavalues1
+            WHEN ((stat.a).stakind2 = 2) THEN (stat.a).stavalues2
+            WHEN ((stat.a).stakind3 = 2) THEN (stat.a).stavalues3
+            WHEN ((stat.a).stakind4 = 2) THEN (stat.a).stavalues4
+            WHEN ((stat.a).stakind5 = 2) THEN (stat.a).stavalues5
+            ELSE NULL::anyarray
+        END AS histogram_bounds,
+        CASE
+            WHEN ((stat.a).stakind1 = 3) THEN (stat.a).stanumbers1[1]
+            WHEN ((stat.a).stakind2 = 3) THEN (stat.a).stanumbers2[1]
+            WHEN ((stat.a).stakind3 = 3) THEN (stat.a).stanumbers3[1]
+            WHEN ((stat.a).stakind4 = 3) THEN (stat.a).stanumbers4[1]
+            WHEN ((stat.a).stakind5 = 3) THEN (stat.a).stanumbers5[1]
+            ELSE NULL::real
+        END AS correlation,
+        CASE
+            WHEN ((stat.a).stakind1 = 4) THEN (stat.a).stavalues1
+            WHEN ((stat.a).stakind2 = 4) THEN (stat.a).stavalues2
+            WHEN ((stat.a).stakind3 = 4) THEN (stat.a).stavalues3
+            WHEN ((stat.a).stakind4 = 4) THEN (stat.a).stavalues4
+            WHEN ((stat.a).stakind5 = 4) THEN (stat.a).stavalues5
+            ELSE NULL::anyarray
+        END AS most_common_elems,
+        CASE
+            WHEN ((stat.a).stakind1 = 4) THEN (stat.a).stanumbers1
+            WHEN ((stat.a).stakind2 = 4) THEN (stat.a).stanumbers2
+            WHEN ((stat.a).stakind3 = 4) THEN (stat.a).stanumbers3
+            WHEN ((stat.a).stakind4 = 4) THEN (stat.a).stanumbers4
+            WHEN ((stat.a).stakind5 = 4) THEN (stat.a).stanumbers5
+            ELSE NULL::real[]
+        END AS most_common_elem_freqs,
+        CASE
+            WHEN ((stat.a).stakind1 = 5) THEN (stat.a).stanumbers1
+            WHEN ((stat.a).stakind2 = 5) THEN (stat.a).stanumbers2
+            WHEN ((stat.a).stakind3 = 5) THEN (stat.a).stanumbers3
+            WHEN ((stat.a).stakind4 = 5) THEN (stat.a).stanumbers4
+            WHEN ((stat.a).stakind5 = 5) THEN (stat.a).stanumbers5
+            ELSE NULL::real[]
+        END AS elem_count_histogram
+   FROM (((((pg_statistic_ext s
+     JOIN pg_class c ON ((c.oid = s.stxrelid)))
+     LEFT JOIN pg_statistic_ext_data sd ON ((s.oid = sd.stxoid)))
+     LEFT JOIN pg_namespace cn ON ((cn.oid = c.relnamespace)))
+     LEFT JOIN pg_namespace sn ON ((sn.oid = s.stxnamespace)))
+     JOIN LATERAL ( SELECT unnest(pg_get_statisticsobjdef_expressions(s.oid)) AS expr,
+            unnest(sd.stxdexpr) AS a) stat ON ((stat.expr IS NOT NULL)));
  pg_tables| SELECT n.nspname AS schemaname,
      c.relname AS tablename,
      pg_get_userbyid(c.relowner) AS tableowner,
diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out

index 07af1e29e395cc3d51887cb929da8eeca21b63a0..60e9bfcd783dddb0db99ea5d13d610559ca7512a 100644 (file)
--- a/src/test/regress/expected/stats_ext.out
+++ b/src/test/regress/expected/stats_ext.out
@@ -25,7 +25,7 @@ begin
  end;
  $$;
  -- Verify failures
-CREATE TABLE ext_stats_test (x int, y int, z int);
+CREATE TABLE ext_stats_test (x text, y int, z int);
  CREATE STATISTICS tst;
  ERROR:  syntax error at or near ";"
  LINE 1: CREATE STATISTICS tst;
@@ -44,12 +44,25 @@ CREATE STATISTICS tst ON a, b FROM ext_stats_test;
  ERROR:  column "a" does not exist
  CREATE STATISTICS tst ON x, x, y FROM ext_stats_test;
  ERROR:  duplicate column name in statistics definition
-CREATE STATISTICS tst ON x + y FROM ext_stats_test;
-ERROR:  only simple column references are allowed in CREATE STATISTICS
-CREATE STATISTICS tst ON (x, y) FROM ext_stats_test;
-ERROR:  only simple column references are allowed in CREATE STATISTICS
+CREATE STATISTICS tst ON x, x, y, x, x, y, x, x, y FROM ext_stats_test;
+ERROR:  cannot have more than 8 columns in statistics
+CREATE STATISTICS tst ON x, x, y, x, x, (x || 'x'), (y + 1), (x || 'x'), (x || 'x'), (y + 1) FROM ext_stats_test;
+ERROR:  cannot have more than 8 columns in statistics
+CREATE STATISTICS tst ON (x || 'x'), (x || 'x'), (y + 1), (x || 'x'), (x || 'x'), (y + 1), (x || 'x'), (x || 'x'), (y + 1) FROM ext_stats_test;
+ERROR:  cannot have more than 8 columns in statistics
+CREATE STATISTICS tst ON (x || 'x'), (x || 'x'), y FROM ext_stats_test;
+ERROR:  duplicate expression in statistics definition
  CREATE STATISTICS tst (unrecognized) ON x, y FROM ext_stats_test;
  ERROR:  unrecognized statistics kind "unrecognized"
+-- incorrect expressions
+CREATE STATISTICS tst ON y + z FROM ext_stats_test; -- missing parentheses
+ERROR:  syntax error at or near "+"
+LINE 1: CREATE STATISTICS tst ON y + z FROM ext_stats_test;
+                                   ^
+CREATE STATISTICS tst ON (x, y) FROM ext_stats_test; -- tuple expression
+ERROR:  syntax error at or near ","
+LINE 1: CREATE STATISTICS tst ON (x, y) FROM ext_stats_test;
+                                   ^
  DROP TABLE ext_stats_test;
  -- Ensure stats are dropped sanely, and test IF NOT EXISTS while at it
  CREATE TABLE ab1 (a INTEGER, b INTEGER, c INTEGER);
@@ -79,7 +92,7 @@ ALTER TABLE ab1 DROP COLUMN a;
   b      | integer |           |          | 
   c      | integer |           |          | 
  Statistics objects:
-    "public"."ab1_b_c_stats" (ndistinct, dependencies, mcv) ON b, c FROM ab1
+    "public"."ab1_b_c_stats" ON b, c FROM ab1
  
  -- Ensure statistics are dropped when table is
  SELECT stxname FROM pg_statistic_ext WHERE stxname LIKE 'ab1%';
@@ -111,7 +124,7 @@ ALTER STATISTICS ab1_a_b_stats SET STATISTICS 0;
   a      | integer |           |          | 
   b      | integer |           |          | 
  Statistics objects:
-    "public"."ab1_a_b_stats" (ndistinct, dependencies, mcv) ON a, b FROM ab1; STATISTICS 0
+    "public"."ab1_a_b_stats" ON a, b FROM ab1; STATISTICS 0
  
  ANALYZE ab1;
  SELECT stxname, stxdndistinct, stxddependencies, stxdmcv
@@ -131,7 +144,7 @@ ALTER STATISTICS ab1_a_b_stats SET STATISTICS -1;
   a      | integer |           |          |         | plain   |              | 
   b      | integer |           |          |         | plain   |              | 
  Statistics objects:
-    "public"."ab1_a_b_stats" (ndistinct, dependencies, mcv) ON a, b FROM ab1
+    "public"."ab1_a_b_stats" ON a, b FROM ab1
  
  -- partial analyze doesn't build stats either
  ANALYZE ab1 (a);
@@ -150,6 +163,39 @@ CREATE STATISTICS ab1_a_b_stats ON a, b FROM ab1;
  ANALYZE ab1;
  DROP TABLE ab1 CASCADE;
  NOTICE:  drop cascades to table ab1c
+-- basic test for statistics on expressions
+CREATE TABLE ab1 (a INTEGER, b INTEGER, c TIMESTAMP, d TIMESTAMPTZ);
+-- expression stats may be built on a single expression column
+CREATE STATISTICS ab1_exprstat_1 ON (a+b) FROM ab1;
+-- with a single expression, we only enable expression statistics
+CREATE STATISTICS ab1_exprstat_2 ON (a+b) FROM ab1;
+SELECT stxkind FROM pg_statistic_ext WHERE stxname = 'ab1_exprstat_2';
+ stxkind 
+---------
+ {e}
+(1 row)
+
+-- adding anything to the expression builds all statistics kinds
+CREATE STATISTICS ab1_exprstat_3 ON (a+b), a FROM ab1;
+SELECT stxkind FROM pg_statistic_ext WHERE stxname = 'ab1_exprstat_3';
+  stxkind  
+-----------
+ {d,f,m,e}
+(1 row)
+
+-- date_trunc on timestamptz is not immutable, but that should not matter
+CREATE STATISTICS ab1_exprstat_4 ON date_trunc('day', d) FROM ab1;
+-- date_trunc on timestamp is immutable
+CREATE STATISTICS ab1_exprstat_5 ON date_trunc('day', c) FROM ab1;
+-- insert some data and run analyze, to test that these cases build properly
+INSERT INTO ab1
+SELECT
+    generate_series(1,10),
+    generate_series(1,10),
+    generate_series('2020-10-01'::timestamp, '2020-10-10'::timestamp, interval '1 day'),
+    generate_series('2020-10-01'::timestamptz, '2020-10-10'::timestamptz, interval '1 day');
+ANALYZE ab1;
+DROP TABLE ab1;
  -- Verify supported object types for extended statistics
  CREATE schema tststats;
  CREATE TABLE tststats.t (a int, b int, c text);
@@ -244,6 +290,30 @@ SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY b, c
         200 |     11
  (1 row)
  
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (a+1)');
+ estimated | actual 
+-----------+--------
+       100 |     11
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100)');
+ estimated | actual 
+-----------+--------
+       100 |     11
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100), (2*c)');
+ estimated | actual 
+-----------+--------
+       100 |     11
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (a+1), (b+100)');
+ estimated | actual 
+-----------+--------
+       100 |     11
+(1 row)
+
  -- correct command
  CREATE STATISTICS s10 ON a, b, c FROM ndistinct;
  ANALYZE ndistinct;
@@ -282,6 +352,32 @@ SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b
          11 |     11
  (1 row)
  
+-- partial improvement (match on attributes)
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (a+1)');
+ estimated | actual 
+-----------+--------
+        11 |     11
+(1 row)
+
+-- expressions - no improvement
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100)');
+ estimated | actual 
+-----------+--------
+        11 |     11
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100), (2*c)');
+ estimated | actual 
+-----------+--------
+        11 |     11
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (a+1), (b+100)');
+ estimated | actual 
+-----------+--------
+        11 |     11
+(1 row)
+
  -- last two plans keep using Group Aggregate, because 'd' is not covered
  -- by the statistic and while it's NULL-only we assume 200 values for it
  SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d');
@@ -343,6 +439,30 @@ SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, d
         200 |     13
  (1 row)
  
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (a+1)');
+ estimated | actual 
+-----------+--------
+       221 |    221
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100)');
+ estimated | actual 
+-----------+--------
+       221 |    221
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100), (2*c)');
+ estimated | actual 
+-----------+--------
+      1000 |   1000
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (a+1), (b+100)');
+ estimated | actual 
+-----------+--------
+       221 |    221
+(1 row)
+
  DROP STATISTICS s10;
  SELECT s.stxkind, d.stxdndistinct
    FROM pg_statistic_ext s, pg_statistic_ext_data d
@@ -383,393 +503,1160 @@ SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, d
         200 |     13
  (1 row)
  
--- functional dependencies tests
-CREATE TABLE functional_dependencies (
-    filler1 TEXT,
-    filler2 NUMERIC,
-    a INT,
-    b TEXT,
-    filler3 DATE,
-    c INT,
-    d TEXT
-)
-WITH (autovacuum_enabled = off);
-CREATE INDEX fdeps_ab_idx ON functional_dependencies (a, b);
-CREATE INDEX fdeps_abc_idx ON functional_dependencies (a, b, c);
--- random data (no functional dependencies)
-INSERT INTO functional_dependencies (a, b, c, filler1)
-     SELECT mod(i, 5), mod(i, 7), mod(i, 11), i FROM generate_series(1,1000) s(i);
-ANALYZE functional_dependencies;
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1''');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (a+1)');
   estimated | actual 
  -----------+--------
-        29 |     29
+       100 |    221
  (1 row)
  
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1'' AND c = 1');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100)');
   estimated | actual 
  -----------+--------
-         3 |      3
+       100 |    221
  (1 row)
  
--- create statistics
-CREATE STATISTICS func_deps_stat (dependencies) ON a, b, c FROM functional_dependencies;
-ANALYZE functional_dependencies;
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1''');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100), (2*c)');
   estimated | actual 
  -----------+--------
-        29 |     29
+       100 |   1000
  (1 row)
  
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1'' AND c = 1');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (a+1), (b+100)');
   estimated | actual 
  -----------+--------
-         3 |      3
+       100 |    221
  (1 row)
  
--- a => b, a => c, b => c
-TRUNCATE functional_dependencies;
-DROP STATISTICS func_deps_stat;
-INSERT INTO functional_dependencies (a, b, c, filler1)
-     SELECT mod(i,100), mod(i,50), mod(i,25), i FROM generate_series(1,5000) s(i);
-ANALYZE functional_dependencies;
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1''');
+-- ndistinct estimates with statistics on expressions
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100)');
   estimated | actual 
  -----------+--------
-         1 |     50
+       100 |    221
  (1 row)
  
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1'' AND c = 1');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100), (2*c)');
   estimated | actual 
  -----------+--------
-         1 |     50
+       100 |   1000
  (1 row)
  
--- IN
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 51) AND b = ''1''');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (a+1), (b+100)');
   estimated | actual 
  -----------+--------
-         2 |    100
+       100 |    221
  (1 row)
  
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 51) AND b IN (''1'', ''2'')');
+CREATE STATISTICS s10 (ndistinct) ON (a+1), (b+100), (2*c) FROM ndistinct;
+ANALYZE ndistinct;
+SELECT s.stxkind, d.stxdndistinct
+  FROM pg_statistic_ext s, pg_statistic_ext_data d
+ WHERE s.stxrelid = 'ndistinct'::regclass
+   AND d.stxoid = s.oid;
+ stxkind |                           stxdndistinct                           
+---------+-------------------------------------------------------------------
+ {d,e}   | {"-1, -2": 221, "-1, -3": 247, "-2, -3": 323, "-1, -2, -3": 1000}
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100)');
   estimated | actual 
  -----------+--------
-         4 |    100
+       221 |    221
  (1 row)
  
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 51, 52) AND b IN (''1'', ''2'')');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100), (2*c)');
   estimated | actual 
  -----------+--------
-         8 |    200
+      1000 |   1000
  (1 row)
  
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 51, 52) AND b = ''1''');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (a+1), (b+100)');
   estimated | actual 
  -----------+--------
-         4 |    100
+       221 |    221
  (1 row)
  
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 26, 51, 76) AND b IN (''1'', ''26'') AND c = 1');
+DROP STATISTICS s10;
+-- a mix of attributes and expressions
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b');
   estimated | actual 
  -----------+--------
-         1 |    200
+       100 |    221
  (1 row)
  
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 26, 51, 76) AND b IN (''1'', ''26'') AND c IN (1)');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (2*c)');
   estimated | actual 
  -----------+--------
-         1 |    200
+       100 |    247
  (1 row)
  
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 26, 27, 51, 52, 76, 77) AND b IN (''1'', ''2'', ''26'', ''27'') AND c IN (1, 2)');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (2*c)');
   estimated | actual 
  -----------+--------
-         3 |    400
+       100 |   1000
  (1 row)
  
--- OR clauses referencing the same attribute
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a = 1 OR a = 51) AND b = ''1''');
+CREATE STATISTICS s10 (ndistinct) ON a, b, (2*c) FROM ndistinct;
+ANALYZE ndistinct;
+SELECT s.stxkind, d.stxdndistinct
+  FROM pg_statistic_ext s, pg_statistic_ext_data d
+ WHERE s.stxrelid = 'ndistinct'::regclass
+   AND d.stxoid = s.oid;
+ stxkind |                        stxdndistinct                        
+---------+-------------------------------------------------------------
+ {d,e}   | {"3, 4": 221, "3, -1": 247, "4, -1": 323, "3, 4, -1": 1000}
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b');
   estimated | actual 
  -----------+--------
-         2 |    100
+       221 |    221
  (1 row)
  
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a = 1 OR a = 51) AND (b = ''1'' OR b = ''2'')');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (2*c)');
   estimated | actual 
  -----------+--------
-         4 |    100
+       247 |    247
  (1 row)
  
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a = 1 OR a = 2 OR a = 51 OR a = 52) AND (b = ''1'' OR b = ''2'')');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (2*c)');
   estimated | actual 
  -----------+--------
-         8 |    200
+      1000 |   1000
  (1 row)
  
--- OR clauses referencing different attributes
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a = 1 OR b = ''1'') AND b = ''1''');
+DROP STATISTICS s10;
+-- combination of multiple ndistinct statistics, with/without expressions
+TRUNCATE ndistinct;
+-- two mostly independent groups of columns
+INSERT INTO ndistinct (a, b, c, d)
+     SELECT mod(i,3), mod(i,9), mod(i,5), mod(i,20)
+       FROM generate_series(1,1000) s(i);
+ANALYZE ndistinct;
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b');
   estimated | actual 
  -----------+--------
-         3 |    100
+        27 |      9
  (1 row)
  
--- ANY
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = ANY (ARRAY[1, 51]) AND b = ''1''');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1)');
   estimated | actual 
  -----------+--------
-         2 |    100
+        27 |      9
  (1 row)
  
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = ANY (ARRAY[1, 51]) AND b = ANY (ARRAY[''1'', ''2''])');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), b');
   estimated | actual 
  -----------+--------
-         4 |    100
+        27 |      9
  (1 row)
  
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = ANY (ARRAY[1, 2, 51, 52]) AND b = ANY (ARRAY[''1'', ''2''])');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1)');
   estimated | actual 
  -----------+--------
-         8 |    200
+        27 |      9
  (1 row)
  
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = ANY (ARRAY[1, 26, 51, 76]) AND b = ANY (ARRAY[''1'', ''26'']) AND c = 1');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1), c');
   estimated | actual 
  -----------+--------
-         1 |    200
+       100 |     45
  (1 row)
  
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = ANY (ARRAY[1, 26, 51, 76]) AND b = ANY (ARRAY[''1'', ''26'']) AND c = ANY (ARRAY[1])');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (c*10)');
   estimated | actual 
  -----------+--------
-         1 |    200
+       100 |     45
  (1 row)
  
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = ANY (ARRAY[1, 2, 26, 27, 51, 52, 76, 77]) AND b = ANY (ARRAY[''1'', ''2'', ''26'', ''27'']) AND c = ANY (ARRAY[1, 2])');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1), c, (d - 1)');
   estimated | actual 
  -----------+--------
-         3 |    400
+       100 |    180
  (1 row)
  
--- ANY with inequalities should not benefit from functional dependencies
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a < ANY (ARRAY[1, 51]) AND b > ''1''');
+-- basic statistics on both attributes (no expressions)
+CREATE STATISTICS s11 (ndistinct) ON a, b FROM ndistinct;
+CREATE STATISTICS s12 (ndistinct) ON c, d FROM ndistinct;
+ANALYZE ndistinct;
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b');
   estimated | actual 
  -----------+--------
-      2472 |   2400
+         9 |      9
  (1 row)
  
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a >= ANY (ARRAY[1, 51]) AND b <= ANY (ARRAY[''1'', ''2''])');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1)');
   estimated | actual 
  -----------+--------
-      1441 |   1250
+         9 |      9
  (1 row)
  
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a <= ANY (ARRAY[1, 2, 51, 52]) AND b >= ANY (ARRAY[''1'', ''2''])');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), b');
   estimated | actual 
  -----------+--------
-      3909 |   2550
+         9 |      9
  (1 row)
  
--- ALL (should not benefit from functional dependencies)
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 51) AND b = ALL (ARRAY[''1''])');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1)');
   estimated | actual 
  -----------+--------
-         2 |    100
+         9 |      9
  (1 row)
  
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 51) AND b = ALL (ARRAY[''1'', ''2''])');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1), c');
   estimated | actual 
  -----------+--------
-         1 |      0
+        45 |     45
  (1 row)
  
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 51, 52) AND b = ALL (ARRAY[''1'', ''2''])');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (c*10)');
   estimated | actual 
  -----------+--------
-         1 |      0
+        45 |     45
  (1 row)
  
--- create statistics
-CREATE STATISTICS func_deps_stat (dependencies) ON a, b, c FROM functional_dependencies;
-ANALYZE functional_dependencies;
--- print the detected dependencies
-SELECT dependencies FROM pg_stats_ext WHERE statistics_name = 'func_deps_stat';
-                                                dependencies                                                
-------------------------------------------------------------------------------------------------------------
- {"3 => 4": 1.000000, "3 => 6": 1.000000, "4 => 6": 1.000000, "3, 4 => 6": 1.000000, "3, 6 => 4": 1.000000}
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1), c, (d - 1)');
+ estimated | actual 
+-----------+--------
+       100 |    180
  (1 row)
  
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1''');
+-- replace the second statistics by statistics on expressions
+DROP STATISTICS s12;
+CREATE STATISTICS s12 (ndistinct) ON (c * 10), (d - 1) FROM ndistinct;
+ANALYZE ndistinct;
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b');
   estimated | actual 
  -----------+--------
-        50 |     50
+         9 |      9
  (1 row)
  
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1'' AND c = 1');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1)');
   estimated | actual 
  -----------+--------
-        50 |     50
+         9 |      9
  (1 row)
  
--- IN
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 51) AND b = ''1''');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), b');
   estimated | actual 
  -----------+--------
-       100 |    100
+         9 |      9
  (1 row)
  
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 51) AND b IN (''1'', ''2'')');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1)');
   estimated | actual 
  -----------+--------
-       100 |    100
+         9 |      9
  (1 row)
  
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 51, 52) AND b IN (''1'', ''2'')');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1), c');
   estimated | actual 
  -----------+--------
-       200 |    200
+        45 |     45
  (1 row)
  
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 51, 52) AND b = ''1''');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (c*10)');
   estimated | actual 
  -----------+--------
-       100 |    100
+        45 |     45
  (1 row)
  
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 26, 51, 76) AND b IN (''1'', ''26'') AND c = 1');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1), c, (d - 1)');
   estimated | actual 
  -----------+--------
-       200 |    200
+       100 |    180
  (1 row)
  
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 26, 51, 76) AND b IN (''1'', ''26'') AND c IN (1)');
+-- replace the second statistics by statistics on both attributes and expressions
+DROP STATISTICS s12;
+CREATE STATISTICS s12 (ndistinct) ON c, d, (c * 10), (d - 1) FROM ndistinct;
+ANALYZE ndistinct;
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b');
   estimated | actual 
  -----------+--------
-       200 |    200
+         9 |      9
  (1 row)
  
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 26, 27, 51, 52, 76, 77) AND b IN (''1'', ''2'', ''26'', ''27'') AND c IN (1, 2)');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1)');
   estimated | actual 
  -----------+--------
-       400 |    400
+         9 |      9
  (1 row)
  
--- OR clauses referencing the same attribute
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a = 1 OR a = 51) AND b = ''1''');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), b');
   estimated | actual 
  -----------+--------
-        99 |    100
+         9 |      9
  (1 row)
  
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a = 1 OR a = 51) AND (b = ''1'' OR b = ''2'')');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1)');
   estimated | actual 
  -----------+--------
-        99 |    100
+         9 |      9
  (1 row)
  
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a = 1 OR a = 2 OR a = 51 OR a = 52) AND (b = ''1'' OR b = ''2'')');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1), c');
   estimated | actual 
  -----------+--------
-       197 |    200
+        45 |     45
  (1 row)
  
--- OR clauses referencing different attributes are incompatible
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a = 1 OR b = ''1'') AND b = ''1''');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (c*10)');
   estimated | actual 
  -----------+--------
-         3 |    100
+        45 |     45
  (1 row)
  
--- ANY
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = ANY (ARRAY[1, 51]) AND b = ''1''');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1), c, (d - 1)');
   estimated | actual 
  -----------+--------
-       100 |    100
+       100 |    180
  (1 row)
  
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = ANY (ARRAY[1, 51]) AND b = ANY (ARRAY[''1'', ''2''])');
+-- replace the other statistics by statistics on both attributes and expressions
+DROP STATISTICS s11;
+CREATE STATISTICS s11 (ndistinct) ON a, b, (a*5), (b+1) FROM ndistinct;
+ANALYZE ndistinct;
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b');
   estimated | actual 
  -----------+--------
-       100 |    100
+         9 |      9
  (1 row)
  
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = ANY (ARRAY[1, 2, 51, 52]) AND b = ANY (ARRAY[''1'', ''2''])');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1)');
   estimated | actual 
  -----------+--------
-       200 |    200
+         9 |      9
  (1 row)
  
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = ANY (ARRAY[1, 26, 51, 76]) AND b = ANY (ARRAY[''1'', ''26'']) AND c = 1');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), b');
   estimated | actual 
  -----------+--------
-       200 |    200
+         9 |      9
  (1 row)
  
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = ANY (ARRAY[1, 26, 51, 76]) AND b = ANY (ARRAY[''1'', ''26'']) AND c = ANY (ARRAY[1])');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1)');
   estimated | actual 
  -----------+--------
-       200 |    200
+         9 |      9
  (1 row)
  
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = ANY (ARRAY[1, 2, 26, 27, 51, 52, 76, 77]) AND b = ANY (ARRAY[''1'', ''2'', ''26'', ''27'']) AND c = ANY (ARRAY[1, 2])');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1), c');
   estimated | actual 
  -----------+--------
-       400 |    400
+        45 |     45
  (1 row)
  
--- ANY with inequalities should not benefit from functional dependencies
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a < ANY (ARRAY[1, 51]) AND b > ''1''');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (c*10)');
   estimated | actual 
  -----------+--------
-      2472 |   2400
+        45 |     45
  (1 row)
  
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a >= ANY (ARRAY[1, 51]) AND b <= ANY (ARRAY[''1'', ''2''])');
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1), c, (d - 1)');
   estimated | actual 
  -----------+--------
-      1441 |   1250
+       100 |    180
+(1 row)
+
+-- replace statistics by somewhat overlapping ones (this expected to get worse estimate
+-- because the first statistics shall be applied to 3 columns, and the second one can't
+-- be really applied)
+DROP STATISTICS s11;
+DROP STATISTICS s12;
+CREATE STATISTICS s11 (ndistinct) ON a, b, (a*5), (b+1) FROM ndistinct;
+CREATE STATISTICS s12 (ndistinct) ON a, (b+1), (c * 10) FROM ndistinct;
+ANALYZE ndistinct;
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b');
+ estimated | actual 
+-----------+--------
+         9 |      9
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1)');
+ estimated | actual 
+-----------+--------
+         9 |      9
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), b');
+ estimated | actual 
+-----------+--------
+         9 |      9
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1)');
+ estimated | actual 
+-----------+--------
+         9 |      9
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1), c');
+ estimated | actual 
+-----------+--------
+        45 |     45
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (c*10)');
+ estimated | actual 
+-----------+--------
+       100 |     45
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1), c, (d - 1)');
+ estimated | actual 
+-----------+--------
+       100 |    180
+(1 row)
+
+DROP STATISTICS s11;
+DROP STATISTICS s12;
+-- functional dependencies tests
+CREATE TABLE functional_dependencies (
+    filler1 TEXT,
+    filler2 NUMERIC,
+    a INT,
+    b TEXT,
+    filler3 DATE,
+    c INT,
+    d TEXT
+)
+WITH (autovacuum_enabled = off);
+CREATE INDEX fdeps_ab_idx ON functional_dependencies (a, b);
+CREATE INDEX fdeps_abc_idx ON functional_dependencies (a, b, c);
+-- random data (no functional dependencies)
+INSERT INTO functional_dependencies (a, b, c, filler1)
+     SELECT mod(i, 5), mod(i, 7), mod(i, 11), i FROM generate_series(1,1000) s(i);
+ANALYZE functional_dependencies;
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1''');
+ estimated | actual 
+-----------+--------
+        29 |     29
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1'' AND c = 1');
+ estimated | actual 
+-----------+--------
+         3 |      3
+(1 row)
+
+-- create statistics
+CREATE STATISTICS func_deps_stat (dependencies) ON a, b, c FROM functional_dependencies;
+ANALYZE functional_dependencies;
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1''');
+ estimated | actual 
+-----------+--------
+        29 |     29
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1'' AND c = 1');
+ estimated | actual 
+-----------+--------
+         3 |      3
+(1 row)
+
+-- a => b, a => c, b => c
+TRUNCATE functional_dependencies;
+DROP STATISTICS func_deps_stat;
+-- now do the same thing, but with expressions
+INSERT INTO functional_dependencies (a, b, c, filler1)
+     SELECT i, i, i, i FROM generate_series(1,5000) s(i);
+ANALYZE functional_dependencies;
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE mod(a, 11) = 1 AND mod(b::int, 13) = 1');
+ estimated | actual 
+-----------+--------
+         1 |     35
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE mod(a, 11) = 1 AND mod(b::int, 13) = 1 AND mod(c, 7) = 1');
+ estimated | actual 
+-----------+--------
+         1 |      5
+(1 row)
+
+-- create statistics
+CREATE STATISTICS func_deps_stat (dependencies) ON (mod(a,11)), (mod(b::int, 13)), (mod(c, 7)) FROM functional_dependencies;
+ANALYZE functional_dependencies;
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE mod(a, 11) = 1 AND mod(b::int, 13) = 1');
+ estimated | actual 
+-----------+--------
+        35 |     35
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE mod(a, 11) = 1 AND mod(b::int, 13) = 1 AND mod(c, 7) = 1');
+ estimated | actual 
+-----------+--------
+         5 |      5
+(1 row)
+
+-- a => b, a => c, b => c
+TRUNCATE functional_dependencies;
+DROP STATISTICS func_deps_stat;
+INSERT INTO functional_dependencies (a, b, c, filler1)
+     SELECT mod(i,100), mod(i,50), mod(i,25), i FROM generate_series(1,5000) s(i);
+ANALYZE functional_dependencies;
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1''');
+ estimated | actual 
+-----------+--------
+         1 |     50
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1'' AND c = 1');
+ estimated | actual 
+-----------+--------
+         1 |     50
+(1 row)
+
+-- IN
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 51) AND b = ''1''');
+ estimated | actual 
+-----------+--------
+         2 |    100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 51) AND b IN (''1'', ''2'')');
+ estimated | actual 
+-----------+--------
+         4 |    100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 51, 52) AND b IN (''1'', ''2'')');
+ estimated | actual 
+-----------+--------
+         8 |    200
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 51, 52) AND b = ''1''');
+ estimated | actual 
+-----------+--------
+         4 |    100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 26, 51, 76) AND b IN (''1'', ''26'') AND c = 1');
+ estimated | actual 
+-----------+--------
+         1 |    200
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 26, 51, 76) AND b IN (''1'', ''26'') AND c IN (1)');
+ estimated | actual 
+-----------+--------
+         1 |    200
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 26, 27, 51, 52, 76, 77) AND b IN (''1'', ''2'', ''26'', ''27'') AND c IN (1, 2)');
+ estimated | actual 
+-----------+--------
+         3 |    400
+(1 row)
+
+-- OR clauses referencing the same attribute
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a = 1 OR a = 51) AND b = ''1''');
+ estimated | actual 
+-----------+--------
+         2 |    100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a = 1 OR a = 51) AND (b = ''1'' OR b = ''2'')');
+ estimated | actual 
+-----------+--------
+         4 |    100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a = 1 OR a = 2 OR a = 51 OR a = 52) AND (b = ''1'' OR b = ''2'')');
+ estimated | actual 
+-----------+--------
+         8 |    200
+(1 row)
+
+-- OR clauses referencing different attributes
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a = 1 OR b = ''1'') AND b = ''1''');
+ estimated | actual 
+-----------+--------
+         3 |    100
+(1 row)
+
+-- ANY
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = ANY (ARRAY[1, 51]) AND b = ''1''');
+ estimated | actual 
+-----------+--------
+         2 |    100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = ANY (ARRAY[1, 51]) AND b = ANY (ARRAY[''1'', ''2''])');
+ estimated | actual 
+-----------+--------
+         4 |    100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = ANY (ARRAY[1, 2, 51, 52]) AND b = ANY (ARRAY[''1'', ''2''])');
+ estimated | actual 
+-----------+--------
+         8 |    200
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = ANY (ARRAY[1, 26, 51, 76]) AND b = ANY (ARRAY[''1'', ''26'']) AND c = 1');
+ estimated | actual 
+-----------+--------
+         1 |    200
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = ANY (ARRAY[1, 26, 51, 76]) AND b = ANY (ARRAY[''1'', ''26'']) AND c = ANY (ARRAY[1])');
+ estimated | actual 
+-----------+--------
+         1 |    200
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = ANY (ARRAY[1, 2, 26, 27, 51, 52, 76, 77]) AND b = ANY (ARRAY[''1'', ''2'', ''26'', ''27'']) AND c = ANY (ARRAY[1, 2])');
+ estimated | actual 
+-----------+--------
+         3 |    400
+(1 row)
+
+-- ANY with inequalities should not benefit from functional dependencies
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a < ANY (ARRAY[1, 51]) AND b > ''1''');
+ estimated | actual 
+-----------+--------
+      2472 |   2400
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a >= ANY (ARRAY[1, 51]) AND b <= ANY (ARRAY[''1'', ''2''])');
+ estimated | actual 
+-----------+--------
+      1441 |   1250
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a <= ANY (ARRAY[1, 2, 51, 52]) AND b >= ANY (ARRAY[''1'', ''2''])');
+ estimated | actual 
+-----------+--------
+      3909 |   2550
+(1 row)
+
+-- ALL (should not benefit from functional dependencies)
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 51) AND b = ALL (ARRAY[''1''])');
+ estimated | actual 
+-----------+--------
+         2 |    100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 51) AND b = ALL (ARRAY[''1'', ''2''])');
+ estimated | actual 
+-----------+--------
+         1 |      0
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 51, 52) AND b = ALL (ARRAY[''1'', ''2''])');
+ estimated | actual 
+-----------+--------
+         1 |      0
+(1 row)
+
+-- create statistics
+CREATE STATISTICS func_deps_stat (dependencies) ON a, b, c FROM functional_dependencies;
+ANALYZE functional_dependencies;
+-- print the detected dependencies
+SELECT dependencies FROM pg_stats_ext WHERE statistics_name = 'func_deps_stat';
+                                                dependencies                                                
+------------------------------------------------------------------------------------------------------------
+ {"3 => 4": 1.000000, "3 => 6": 1.000000, "4 => 6": 1.000000, "3, 4 => 6": 1.000000, "3, 6 => 4": 1.000000}
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1''');
+ estimated | actual 
+-----------+--------
+        50 |     50
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1'' AND c = 1');
+ estimated | actual 
+-----------+--------
+        50 |     50
+(1 row)
+
+-- IN
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 51) AND b = ''1''');
+ estimated | actual 
+-----------+--------
+       100 |    100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 51) AND b IN (''1'', ''2'')');
+ estimated | actual 
+-----------+--------
+       100 |    100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 51, 52) AND b IN (''1'', ''2'')');
+ estimated | actual 
+-----------+--------
+       200 |    200
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 51, 52) AND b = ''1''');
+ estimated | actual 
+-----------+--------
+       100 |    100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 26, 51, 76) AND b IN (''1'', ''26'') AND c = 1');
+ estimated | actual 
+-----------+--------
+       200 |    200
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 26, 51, 76) AND b IN (''1'', ''26'') AND c IN (1)');
+ estimated | actual 
+-----------+--------
+       200 |    200
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 26, 27, 51, 52, 76, 77) AND b IN (''1'', ''2'', ''26'', ''27'') AND c IN (1, 2)');
+ estimated | actual 
+-----------+--------
+       400 |    400
+(1 row)
+
+-- OR clauses referencing the same attribute
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a = 1 OR a = 51) AND b = ''1''');
+ estimated | actual 
+-----------+--------
+        99 |    100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a = 1 OR a = 51) AND (b = ''1'' OR b = ''2'')');
+ estimated | actual 
+-----------+--------
+        99 |    100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a = 1 OR a = 2 OR a = 51 OR a = 52) AND (b = ''1'' OR b = ''2'')');
+ estimated | actual 
+-----------+--------
+       197 |    200
+(1 row)
+
+-- OR clauses referencing different attributes are incompatible
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a = 1 OR b = ''1'') AND b = ''1''');
+ estimated | actual 
+-----------+--------
+         3 |    100
+(1 row)
+
+-- ANY
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = ANY (ARRAY[1, 51]) AND b = ''1''');
+ estimated | actual 
+-----------+--------
+       100 |    100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = ANY (ARRAY[1, 51]) AND b = ANY (ARRAY[''1'', ''2''])');
+ estimated | actual 
+-----------+--------
+       100 |    100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = ANY (ARRAY[1, 2, 51, 52]) AND b = ANY (ARRAY[''1'', ''2''])');
+ estimated | actual 
+-----------+--------
+       200 |    200
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = ANY (ARRAY[1, 26, 51, 76]) AND b = ANY (ARRAY[''1'', ''26'']) AND c = 1');
+ estimated | actual 
+-----------+--------
+       200 |    200
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = ANY (ARRAY[1, 26, 51, 76]) AND b = ANY (ARRAY[''1'', ''26'']) AND c = ANY (ARRAY[1])');
+ estimated | actual 
+-----------+--------
+       200 |    200
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = ANY (ARRAY[1, 2, 26, 27, 51, 52, 76, 77]) AND b = ANY (ARRAY[''1'', ''2'', ''26'', ''27'']) AND c = ANY (ARRAY[1, 2])');
+ estimated | actual 
+-----------+--------
+       400 |    400
+(1 row)
+
+-- ANY with inequalities should not benefit from functional dependencies
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a < ANY (ARRAY[1, 51]) AND b > ''1''');
+ estimated | actual 
+-----------+--------
+      2472 |   2400
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a >= ANY (ARRAY[1, 51]) AND b <= ANY (ARRAY[''1'', ''2''])');
+ estimated | actual 
+-----------+--------
+      1441 |   1250
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a <= ANY (ARRAY[1, 2, 51, 52]) AND b >= ANY (ARRAY[''1'', ''2''])');
+ estimated | actual 
+-----------+--------
+      3909 |   2550
+(1 row)
+
+-- ALL (should not benefit from functional dependencies)
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 51) AND b = ALL (ARRAY[''1''])');
+ estimated | actual 
+-----------+--------
+         2 |    100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 51) AND b = ALL (ARRAY[''1'', ''2''])');
+ estimated | actual 
+-----------+--------
+         1 |      0
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 51, 52) AND b = ALL (ARRAY[''1'', ''2''])');
+ estimated | actual 
+-----------+--------
+         1 |      0
+(1 row)
+
+-- changing the type of column c causes all its stats to be dropped, reverting
+-- to default estimates without any statistics, i.e. 0.5% selectivity for each
+-- condition
+ALTER TABLE functional_dependencies ALTER COLUMN c TYPE numeric;
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1'' AND c = 1');
+ estimated | actual 
+-----------+--------
+         1 |     50
+(1 row)
+
+ANALYZE functional_dependencies;
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1'' AND c = 1');
+ estimated | actual 
+-----------+--------
+        50 |     50
+(1 row)
+
+DROP STATISTICS func_deps_stat;
+-- now try functional dependencies with expressions
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = 2 AND (b || ''X'') = ''1X''');
+ estimated | actual 
+-----------+--------
+         1 |     50
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = 2 AND (b || ''X'') = ''1X'' AND (c + 1) = 2');
+ estimated | actual 
+-----------+--------
+         1 |     50
+(1 row)
+
+-- IN
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') = ''1X''');
+ estimated | actual 
+-----------+--------
+         1 |    100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') IN (''1X'', ''2X'')');
+ estimated | actual 
+-----------+--------
+         1 |    100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 102, 104) AND (b || ''X'') IN (''1X'', ''2X'')');
+ estimated | actual 
+-----------+--------
+         1 |    200
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 102, 104) AND (b || ''X'') = ''1X''');
+ estimated | actual 
+-----------+--------
+         1 |    100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 52, 102, 152) AND (b || ''X'') IN (''1X'', ''26X'') AND (c + 1) = 2');
+ estimated | actual 
+-----------+--------
+         1 |    200
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 52, 102, 152) AND (b || ''X'') IN (''1X'', ''26X'') AND (c + 1) IN (2)');
+ estimated | actual 
+-----------+--------
+         1 |    200
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 52, 54, 102, 104, 152, 154) AND (b || ''X'') IN (''1X'', ''2X'', ''26X'', ''27X'') AND (c + 1) IN (2, 3)');
+ estimated | actual 
+-----------+--------
+         1 |    400
+(1 row)
+
+-- OR clauses referencing the same attribute
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (a * 2) = 102) AND (b || ''X'') = ''1X''');
+ estimated | actual 
+-----------+--------
+         1 |    100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (a * 2) = 102) AND ((b || ''X'') = ''1X'' OR (b || ''X'') = ''2X'')');
+ estimated | actual 
+-----------+--------
+         1 |    100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (a * 2) = 4 OR (a * 2) = 102 OR (a * 2) = 104) AND ((b || ''X'') = ''1X'' OR (b || ''X'') = ''2X'')');
+ estimated | actual 
+-----------+--------
+         1 |    200
+(1 row)
+
+-- OR clauses referencing different attributes
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (b || ''X'') = ''1X'') AND (b || ''X'') = ''1X''');
+ estimated | actual 
+-----------+--------
+         1 |    100
+(1 row)
+
+-- ANY
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 102]) AND (b || ''X'') = ''1X''');
+ estimated | actual 
+-----------+--------
+         1 |    100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 102]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''2X''])');
+ estimated | actual 
+-----------+--------
+         1 |    100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 4, 102, 104]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''2X''])');
+ estimated | actual 
+-----------+--------
+         1 |    200
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 52, 102, 152]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''26X'']) AND (c + 1) = 2');
+ estimated | actual 
+-----------+--------
+         1 |    200
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 52, 102, 152]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''26X'']) AND (c + 1) = ANY (ARRAY[2])');
+ estimated | actual 
+-----------+--------
+         1 |    200
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 4, 52, 54, 102, 104, 152, 154]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''2X'', ''26X'', ''27X'']) AND (c + 1) = ANY (ARRAY[2, 3])');
+ estimated | actual 
+-----------+--------
+         1 |    400
+(1 row)
+
+-- ANY with inequalities should not benefit from functional dependencies
+-- the estimates however improve thanks to having expression statistics
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) < ANY (ARRAY[2, 102]) AND (b || ''X'') > ''1X''');
+ estimated | actual 
+-----------+--------
+       926 |   1900
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) >= ANY (ARRAY[2, 102]) AND (b || ''X'') <= ANY (ARRAY[''1X'', ''2X''])');
+ estimated | actual 
+-----------+--------
+      1543 |   2250
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) <= ANY (ARRAY[2, 4, 102, 104]) AND (b || ''X'') >= ANY (ARRAY[''1X'', ''2X''])');
+ estimated | actual 
+-----------+--------
+      2229 |   2050
+(1 row)
+
+-- ALL (should not benefit from functional dependencies)
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') = ALL (ARRAY[''1X''])');
+ estimated | actual 
+-----------+--------
+         1 |    100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') = ALL (ARRAY[''1X'', ''2X''])');
+ estimated | actual 
+-----------+--------
+         1 |      0
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 102, 104) AND (b || ''X'') = ALL (ARRAY[''1X'', ''2X''])');
+ estimated | actual 
+-----------+--------
+         1 |      0
+(1 row)
+
+-- create statistics on expressions
+CREATE STATISTICS func_deps_stat (dependencies) ON (a * 2), (b || 'X'), (c + 1) FROM functional_dependencies;
+ANALYZE functional_dependencies;
+-- print the detected dependencies
+SELECT dependencies FROM pg_stats_ext WHERE statistics_name = 'func_deps_stat';
+                                                      dependencies                                                      
+------------------------------------------------------------------------------------------------------------------------
+ {"-1 => -2": 1.000000, "-1 => -3": 1.000000, "-2 => -3": 1.000000, "-1, -2 => -3": 1.000000, "-1, -3 => -2": 1.000000}
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = 2 AND (b || ''X'') = ''1X''');
+ estimated | actual 
+-----------+--------
+        50 |     50
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = 2 AND (b || ''X'') = ''1X'' AND (c + 1) = 2');
+ estimated | actual 
+-----------+--------
+        50 |     50
+(1 row)
+
+-- IN
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') = ''1X''');
+ estimated | actual 
+-----------+--------
+       100 |    100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') IN (''1X'', ''2X'')');
+ estimated | actual 
+-----------+--------
+       100 |    100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 102, 104) AND (b || ''X'') IN (''1X'', ''2X'')');
+ estimated | actual 
+-----------+--------
+       200 |    200
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 102, 104) AND (b || ''X'') = ''1X''');
+ estimated | actual 
+-----------+--------
+       100 |    100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 52, 102, 152) AND (b || ''X'') IN (''1X'', ''26X'') AND (c + 1) = 2');
+ estimated | actual 
+-----------+--------
+       200 |    200
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 52, 102, 152) AND (b || ''X'') IN (''1X'', ''26X'') AND (c + 1) IN (2)');
+ estimated | actual 
+-----------+--------
+       200 |    200
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 52, 54, 102, 104, 152, 154) AND (b || ''X'') IN (''1X'', ''2X'', ''26X'', ''27X'') AND (c + 1) IN (2, 3)');
+ estimated | actual 
+-----------+--------
+       400 |    400
+(1 row)
+
+-- OR clauses referencing the same attribute
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (a * 2) = 102) AND (b || ''X'') = ''1X''');
+ estimated | actual 
+-----------+--------
+        99 |    100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (a * 2) = 102) AND ((b || ''X'') = ''1X'' OR (b || ''X'') = ''2X'')');
+ estimated | actual 
+-----------+--------
+        99 |    100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (a * 2) = 4 OR (a * 2) = 102 OR (a * 2) = 104) AND ((b || ''X'') = ''1X'' OR (b || ''X'') = ''2X'')');
+ estimated | actual 
+-----------+--------
+       197 |    200
+(1 row)
+
+-- OR clauses referencing different attributes
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (b || ''X'') = ''1X'') AND (b || ''X'') = ''1X''');
+ estimated | actual 
+-----------+--------
+         3 |    100
+(1 row)
+
+-- ANY
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 102]) AND (b || ''X'') = ''1X''');
+ estimated | actual 
+-----------+--------
+       100 |    100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 102]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''2X''])');
+ estimated | actual 
+-----------+--------
+       100 |    100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 4, 102, 104]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''2X''])');
+ estimated | actual 
+-----------+--------
+       200 |    200
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 52, 102, 152]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''26X'']) AND (c + 1) = 2');
+ estimated | actual 
+-----------+--------
+       200 |    200
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 52, 102, 152]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''26X'']) AND (c + 1) = ANY (ARRAY[2])');
+ estimated | actual 
+-----------+--------
+       200 |    200
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 4, 52, 54, 102, 104, 152, 154]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''2X'', ''26X'', ''27X'']) AND (c + 1) = ANY (ARRAY[2, 3])');
+ estimated | actual 
+-----------+--------
+       400 |    400
  (1 row)
  
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a <= ANY (ARRAY[1, 2, 51, 52]) AND b >= ANY (ARRAY[''1'', ''2''])');
+-- ANY with inequalities should not benefit from functional dependencies
+-- the estimates however improve thanks to having expression statistics
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) < ANY (ARRAY[2, 102]) AND (b || ''X'') > ''1X''');
   estimated | actual 
  -----------+--------
-      3909 |   2550
+      1957 |   1900
  (1 row)
  
--- ALL (should not benefit from functional dependencies)
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 51) AND b = ALL (ARRAY[''1''])');
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) >= ANY (ARRAY[2, 102]) AND (b || ''X'') <= ANY (ARRAY[''1X'', ''2X''])');
   estimated | actual 
  -----------+--------
-         2 |    100
+      2933 |   2250
  (1 row)
  
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 51) AND b = ALL (ARRAY[''1'', ''2''])');
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) <= ANY (ARRAY[2, 4, 102, 104]) AND (b || ''X'') >= ANY (ARRAY[''1X'', ''2X''])');
   estimated | actual 
  -----------+--------
-         1 |      0
+      3548 |   2050
  (1 row)
  
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 51, 52) AND b = ALL (ARRAY[''1'', ''2''])');
+-- ALL (should not benefit from functional dependencies)
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') = ALL (ARRAY[''1X''])');
   estimated | actual 
  -----------+--------
-         1 |      0
+         2 |    100
  (1 row)
  
--- changing the type of column c causes its single-column stats to be dropped,
--- giving a default estimate of 0.005 * 5000 = 25 for (c = 1); check multiple
--- clauses estimated with functional dependencies does not exceed this
-ALTER TABLE functional_dependencies ALTER COLUMN c TYPE numeric;
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1'' AND c = 1');
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') = ALL (ARRAY[''1X'', ''2X''])');
   estimated | actual 
  -----------+--------
-        25 |     50
+         1 |      0
  (1 row)
  
-ANALYZE functional_dependencies;
-SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1'' AND c = 1');
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 102, 104) AND (b || ''X'') = ALL (ARRAY[''1X'', ''2X''])');
   estimated | actual 
  -----------+--------
-        50 |     50
+         1 |      0
  (1 row)
  
  -- check the ability to use multiple functional dependencies
@@ -896,6 +1783,39 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 AND b =
           1 |      1
  (1 row)
  
+TRUNCATE mcv_lists;
+DROP STATISTICS mcv_lists_stats;
+-- random data (no MCV list), but with expression
+INSERT INTO mcv_lists (a, b, c, filler1)
+     SELECT i, i, i, i FROM generate_series(1,1000) s(i);
+ANALYZE mcv_lists;
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,7) = 1 AND mod(b::int,11) = 1');
+ estimated | actual 
+-----------+--------
+         1 |     13
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,7) = 1 AND mod(b::int,11) = 1 AND mod(c,13) = 1');
+ estimated | actual 
+-----------+--------
+         1 |      1
+(1 row)
+
+-- create statistics
+CREATE STATISTICS mcv_lists_stats (mcv) ON (mod(a,7)), (mod(b::int,11)), (mod(c,13)) FROM mcv_lists;
+ANALYZE mcv_lists;
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,7) = 1 AND mod(b::int,11) = 1');
+ estimated | actual 
+-----------+--------
+        13 |     13
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,7) = 1 AND mod(b::int,11) = 1 AND mod(c,13) = 1');
+ estimated | actual 
+-----------+--------
+         1 |      1
+(1 row)
+
  -- 100 distinct combinations, all in the MCV list
  TRUNCATE mcv_lists;
  DROP STATISTICS mcv_lists_stats;
@@ -1121,6 +2041,12 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 OR b = '
         200 |    200
  (1 row)
  
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 OR b = ''1'' OR c = 1 OR d IS NOT NULL');
+ estimated | actual 
+-----------+--------
+       200 |    200
+(1 row)
+
  SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a IN (1, 2, 51, 52) AND b IN ( ''1'', ''2'')');
   estimated | actual 
  -----------+--------
@@ -1207,6 +2133,212 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 AND b =
          50 |     50
  (1 row)
  
+-- 100 distinct combinations, all in the MCV list, but with expressions
+TRUNCATE mcv_lists;
+DROP STATISTICS mcv_lists_stats;
+INSERT INTO mcv_lists (a, b, c, filler1)
+     SELECT i, i, i, i FROM generate_series(1,1000) s(i);
+ANALYZE mcv_lists;
+-- without any stats on the expressions, we have to use default selectivities, which
+-- is why the estimates here are different from the pre-computed case above
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 AND mod(b::int,10) = 1');
+ estimated | actual 
+-----------+--------
+         1 |     50
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 1 = mod(a,20) AND 1 = mod(b::int,10)');
+ estimated | actual 
+-----------+--------
+         1 |     50
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) < 1 AND mod(b::int,10) < 1');
+ estimated | actual 
+-----------+--------
+       111 |     50
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 1 > mod(a,20) AND 1 > mod(b::int,10)');
+ estimated | actual 
+-----------+--------
+       111 |     50
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 AND mod(b::int,10) = 1 AND mod(c,5) = 1');
+ estimated | actual 
+-----------+--------
+         1 |     50
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 OR mod(b::int,10) = 1 OR mod(c,25) = 1 OR d IS NOT NULL');
+ estimated | actual 
+-----------+--------
+        15 |    120
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) IN (1, 2, 51, 52, NULL) AND mod(b::int,10) IN ( 1, 2, NULL)');
+ estimated | actual 
+-----------+--------
+         1 |    100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = ANY (ARRAY[1, 2, 51, 52]) AND mod(b::int,10) = ANY (ARRAY[1, 2])');
+ estimated | actual 
+-----------+--------
+         1 |    100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) <= ANY (ARRAY[1, NULL, 2, 3]) AND mod(b::int,10) IN (1, 2, NULL, 3)');
+ estimated | actual 
+-----------+--------
+        11 |    150
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) < ALL (ARRAY[4, 5]) AND mod(b::int,10) IN (1, 2, 3) AND mod(c,5) > ANY (ARRAY[1, 2, 3])');
+ estimated | actual 
+-----------+--------
+         1 |    100
+(1 row)
+
+-- create statistics with expressions only (we create three separate stats, in order not to build more complex extended stats)
+CREATE STATISTICS mcv_lists_stats_1 ON (mod(a,20)) FROM mcv_lists;
+CREATE STATISTICS mcv_lists_stats_2 ON (mod(b::int,10)) FROM mcv_lists;
+CREATE STATISTICS mcv_lists_stats_3 ON (mod(c,5)) FROM mcv_lists;
+ANALYZE mcv_lists;
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 AND mod(b::int,10) = 1');
+ estimated | actual 
+-----------+--------
+         5 |     50
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 1 = mod(a,20) AND 1 = mod(b::int,10)');
+ estimated | actual 
+-----------+--------
+         5 |     50
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) < 1 AND mod(b::int,10) < 1');
+ estimated | actual 
+-----------+--------
+         5 |     50
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 1 > mod(a,20) AND 1 > mod(b::int,10)');
+ estimated | actual 
+-----------+--------
+         5 |     50
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 AND mod(b::int,10) = 1 AND mod(c,5) = 1');
+ estimated | actual 
+-----------+--------
+         1 |     50
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 OR mod(b::int,10) = 1 OR mod(c,25) = 1 OR d IS NOT NULL');
+ estimated | actual 
+-----------+--------
+       149 |    120
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) IN (1, 2, 51, 52, NULL) AND mod(b::int,10) IN ( 1, 2, NULL)');
+ estimated | actual 
+-----------+--------
+        20 |    100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = ANY (ARRAY[1, 2, 51, 52]) AND mod(b::int,10) = ANY (ARRAY[1, 2])');
+ estimated | actual 
+-----------+--------
+        20 |    100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) <= ANY (ARRAY[1, NULL, 2, 3]) AND mod(b::int,10) IN (1, 2, NULL, 3)');
+ estimated | actual 
+-----------+--------
+       116 |    150
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) < ALL (ARRAY[4, 5]) AND mod(b::int,10) IN (1, 2, 3) AND mod(c,5) > ANY (ARRAY[1, 2, 3])');
+ estimated | actual 
+-----------+--------
+        12 |    100
+(1 row)
+
+DROP STATISTICS mcv_lists_stats_1;
+DROP STATISTICS mcv_lists_stats_2;
+DROP STATISTICS mcv_lists_stats_3;
+-- create statistics with both MCV and expressions
+CREATE STATISTICS mcv_lists_stats (mcv) ON (mod(a,20)), (mod(b::int,10)), (mod(c,5)) FROM mcv_lists;
+ANALYZE mcv_lists;
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 AND mod(b::int,10) = 1');
+ estimated | actual 
+-----------+--------
+        50 |     50
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 1 = mod(a,20) AND 1 = mod(b::int,10)');
+ estimated | actual 
+-----------+--------
+        50 |     50
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) < 1 AND mod(b::int,10) < 1');
+ estimated | actual 
+-----------+--------
+        50 |     50
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 1 > mod(a,20) AND 1 > mod(b::int,10)');
+ estimated | actual 
+-----------+--------
+        50 |     50
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 AND mod(b::int,10) = 1 AND mod(c,5) = 1');
+ estimated | actual 
+-----------+--------
+        50 |     50
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 OR mod(b::int,10) = 1 OR mod(c,25) = 1 OR d IS NOT NULL');
+ estimated | actual 
+-----------+--------
+       105 |    120
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) IN (1, 2, 51, 52, NULL) AND mod(b::int,10) IN ( 1, 2, NULL)');
+ estimated | actual 
+-----------+--------
+       100 |    100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = ANY (ARRAY[1, 2, 51, 52]) AND mod(b::int,10) = ANY (ARRAY[1, 2])');
+ estimated | actual 
+-----------+--------
+       100 |    100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) <= ANY (ARRAY[1, NULL, 2, 3]) AND mod(b::int,10) IN (1, 2, NULL, 3)');
+ estimated | actual 
+-----------+--------
+       150 |    150
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) < ALL (ARRAY[4, 5]) AND mod(b::int,10) IN (1, 2, 3) AND mod(c,5) > ANY (ARRAY[1, 2, 3])');
+ estimated | actual 
+-----------+--------
+       100 |    100
+(1 row)
+
+-- we can't use the statistic for OR clauses that are not fully covered (missing 'd' attribute)
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 OR mod(b::int,10) = 1 OR mod(c,5) = 1 OR d IS NOT NULL');
+ estimated | actual 
+-----------+--------
+       200 |    200
+(1 row)
+
  -- 100 distinct combinations with NULL values, all in the MCV list
  TRUNCATE mcv_lists;
  DROP STATISTICS mcv_lists_stats;
@@ -1712,6 +2844,100 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 OR
  (1 row)
  
  DROP TABLE mcv_lists_multi;
+-- statistics on integer expressions
+CREATE TABLE expr_stats (a int, b int, c int);
+INSERT INTO expr_stats SELECT mod(i,10), mod(i,10), mod(i,10) FROM generate_series(1,1000) s(i);
+ANALYZE expr_stats;
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE (2*a) = 0 AND (3*b) = 0');
+ estimated | actual 
+-----------+--------
+         1 |    100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE (a+b) = 0 AND (a-b) = 0');
+ estimated | actual 
+-----------+--------
+         1 |    100
+(1 row)
+
+CREATE STATISTICS expr_stats_1 (mcv) ON (a+b), (a-b), (2*a), (3*b) FROM expr_stats;
+ANALYZE expr_stats;
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE (2*a) = 0 AND (3*b) = 0');
+ estimated | actual 
+-----------+--------
+       100 |    100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE (a+b) = 0 AND (a-b) = 0');
+ estimated | actual 
+-----------+--------
+       100 |    100
+(1 row)
+
+DROP STATISTICS expr_stats_1;
+DROP TABLE expr_stats;
+-- statistics on a mix columns and expressions
+CREATE TABLE expr_stats (a int, b int, c int);
+INSERT INTO expr_stats SELECT mod(i,10), mod(i,10), mod(i,10) FROM generate_series(1,1000) s(i);
+ANALYZE expr_stats;
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 0 AND (2*a) = 0 AND (3*b) = 0');
+ estimated | actual 
+-----------+--------
+         1 |    100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 3 AND b = 3 AND (a-b) = 0');
+ estimated | actual 
+-----------+--------
+         1 |    100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 0 AND b = 1 AND (a-b) = 0');
+ estimated | actual 
+-----------+--------
+         1 |      0
+(1 row)
+
+CREATE STATISTICS expr_stats_1 (mcv) ON a, b, (2*a), (3*b), (a+b), (a-b) FROM expr_stats;
+ANALYZE expr_stats;
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 0 AND (2*a) = 0 AND (3*b) = 0');
+ estimated | actual 
+-----------+--------
+       100 |    100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 3 AND b = 3 AND (a-b) = 0');
+ estimated | actual 
+-----------+--------
+       100 |    100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 0 AND b = 1 AND (a-b) = 0');
+ estimated | actual 
+-----------+--------
+         1 |      0
+(1 row)
+
+DROP TABLE expr_stats;
+-- statistics on expressions with different data types
+CREATE TABLE expr_stats (a int, b name, c text);
+INSERT INTO expr_stats SELECT mod(i,10), md5(mod(i,10)::text), md5(mod(i,10)::text) FROM generate_series(1,1000) s(i);
+ANALYZE expr_stats;
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 0 AND (b || c) <= ''z'' AND (c || b) >= ''0''');
+ estimated | actual 
+-----------+--------
+        11 |    100
+(1 row)
+
+CREATE STATISTICS expr_stats_1 (mcv) ON a, b, (b || c), (c || b) FROM expr_stats;
+ANALYZE expr_stats;
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 0 AND (b || c) <= ''z'' AND (c || b) >= ''0''');
+ estimated | actual 
+-----------+--------
+       100 |    100
+(1 row)
+
+DROP TABLE expr_stats;
  -- Permission tests. Users should not be able to see specific data values in
  -- the extended statistics, if they lack permission to see those values in
  -- the underlying table.
@@ -1743,21 +2969,21 @@ create statistics stts_s2.stts_yama (dependencies, mcv) on col1, col3 from stts_
  insert into stts_t1 select i,i from generate_series(1,100) i;
  analyze stts_t1;
  \dX
-                                          List of extended statistics
-  Schema  |          Name          |              Definition              | Ndistinct | Dependencies |   MCV   
-----------+------------------------+--------------------------------------+-----------+--------------+---------
- public   | func_deps_stat         | a, b, c FROM functional_dependencies |           | defined      | 
- public   | mcv_lists_arrays_stats | a, b, c FROM mcv_lists_arrays        |           |              | defined
- public   | mcv_lists_bool_stats   | a, b, c FROM mcv_lists_bool          |           |              | defined
- public   | mcv_lists_stats        | a, b, d FROM mcv_lists               |           |              | defined
- public   | stts_1                 | a, b FROM stts_t1                    | defined   |              | 
- public   | stts_2                 | a, b FROM stts_t1                    | defined   | defined      | 
- public   | stts_3                 | a, b FROM stts_t1                    | defined   | defined      | defined
- public   | stts_4                 | b, c FROM stts_t2                    | defined   | defined      | defined
- public   | stts_hoge              | col1, col2, col3 FROM stts_t3        | defined   | defined      | defined
- stts_s1  | stts_foo               | col1, col2 FROM stts_t3              | defined   | defined      | defined
- stts_s2  | stts_yama              | col1, col3 FROM stts_t3              |           | defined      | defined
- tststats | priv_test_stats        | a, b FROM tststats.priv_test_tbl     |           |              | defined
+                                                                List of extended statistics
+  Schema  |          Name          |                                    Definition                                    | Ndistinct | Dependencies |   MCV   
+----------+------------------------+----------------------------------------------------------------------------------+-----------+--------------+---------
+ public   | func_deps_stat         | ((a * 2)), ((b || 'X'::text)), ((c + (1)::numeric)) FROM functional_dependencies |           | defined      | 
+ public   | mcv_lists_arrays_stats | a, b, c FROM mcv_lists_arrays                                                    |           |              | defined
+ public   | mcv_lists_bool_stats   | a, b, c FROM mcv_lists_bool                                                      |           |              | defined
+ public   | mcv_lists_stats        | a, b, d FROM mcv_lists                                                           |           |              | defined
+ public   | stts_1                 | a, b FROM stts_t1                                                                | defined   |              | 
+ public   | stts_2                 | a, b FROM stts_t1                                                                | defined   | defined      | 
+ public   | stts_3                 | a, b FROM stts_t1                                                                | defined   | defined      | defined
+ public   | stts_4                 | b, c FROM stts_t2                                                                | defined   | defined      | defined
+ public   | stts_hoge              | col1, col2, col3 FROM stts_t3                                                    | defined   | defined      | defined
+ stts_s1  | stts_foo               | col1, col2 FROM stts_t3                                                          | defined   | defined      | defined
+ stts_s2  | stts_yama              | col1, col3 FROM stts_t3                                                          |           | defined      | defined
+ tststats | priv_test_stats        | a, b FROM tststats.priv_test_tbl                                                 |           |              | defined
  (12 rows)
  
  \dX stts_?
@@ -1778,21 +3004,21 @@ analyze stts_t1;
  (1 row)
  
  \dX+
-                                          List of extended statistics
-  Schema  |          Name          |              Definition              | Ndistinct | Dependencies |   MCV   
-----------+------------------------+--------------------------------------+-----------+--------------+---------
- public   | func_deps_stat         | a, b, c FROM functional_dependencies |           | defined      | 
- public   | mcv_lists_arrays_stats | a, b, c FROM mcv_lists_arrays        |           |              | defined
- public   | mcv_lists_bool_stats   | a, b, c FROM mcv_lists_bool          |           |              | defined
- public   | mcv_lists_stats        | a, b, d FROM mcv_lists               |           |              | defined
- public   | stts_1                 | a, b FROM stts_t1                    | defined   |              | 
- public   | stts_2                 | a, b FROM stts_t1                    | defined   | defined      | 
- public   | stts_3                 | a, b FROM stts_t1                    | defined   | defined      | defined
- public   | stts_4                 | b, c FROM stts_t2                    | defined   | defined      | defined
- public   | stts_hoge              | col1, col2, col3 FROM stts_t3        | defined   | defined      | defined
- stts_s1  | stts_foo               | col1, col2 FROM stts_t3              | defined   | defined      | defined
- stts_s2  | stts_yama              | col1, col3 FROM stts_t3              |           | defined      | defined
- tststats | priv_test_stats        | a, b FROM tststats.priv_test_tbl     |           |              | defined
+                                                                List of extended statistics
+  Schema  |          Name          |                                    Definition                                    | Ndistinct | Dependencies |   MCV   
+----------+------------------------+----------------------------------------------------------------------------------+-----------+--------------+---------
+ public   | func_deps_stat         | ((a * 2)), ((b || 'X'::text)), ((c + (1)::numeric)) FROM functional_dependencies |           | defined      | 
+ public   | mcv_lists_arrays_stats | a, b, c FROM mcv_lists_arrays                                                    |           |              | defined
+ public   | mcv_lists_bool_stats   | a, b, c FROM mcv_lists_bool                                                      |           |              | defined
+ public   | mcv_lists_stats        | a, b, d FROM mcv_lists                                                           |           |              | defined
+ public   | stts_1                 | a, b FROM stts_t1                                                                | defined   |              | 
+ public   | stts_2                 | a, b FROM stts_t1                                                                | defined   | defined      | 
+ public   | stts_3                 | a, b FROM stts_t1                                                                | defined   | defined      | defined
+ public   | stts_4                 | b, c FROM stts_t2                                                                | defined   | defined      | defined
+ public   | stts_hoge              | col1, col2, col3 FROM stts_t3                                                    | defined   | defined      | defined
+ stts_s1  | stts_foo               | col1, col2 FROM stts_t3                                                          | defined   | defined      | defined
+ stts_s2  | stts_yama              | col1, col3 FROM stts_t3                                                          |           | defined      | defined
+ tststats | priv_test_stats        | a, b FROM tststats.priv_test_tbl                                                 |           |              | defined
  (12 rows)
  
  \dX+ stts_?
@@ -1822,21 +3048,21 @@ analyze stts_t1;
  create role regress_stats_ext nosuperuser;
  set role regress_stats_ext;
  \dX
-                                          List of extended statistics
-  Schema  |          Name          |              Definition              | Ndistinct | Dependencies |   MCV   
-----------+------------------------+--------------------------------------+-----------+--------------+---------
- public   | func_deps_stat         | a, b, c FROM functional_dependencies |           | defined      | 
- public   | mcv_lists_arrays_stats | a, b, c FROM mcv_lists_arrays        |           |              | defined
- public   | mcv_lists_bool_stats   | a, b, c FROM mcv_lists_bool          |           |              | defined
- public   | mcv_lists_stats        | a, b, d FROM mcv_lists               |           |              | defined
- public   | stts_1                 | a, b FROM stts_t1                    | defined   |              | 
- public   | stts_2                 | a, b FROM stts_t1                    | defined   | defined      | 
- public   | stts_3                 | a, b FROM stts_t1                    | defined   | defined      | defined
- public   | stts_4                 | b, c FROM stts_t2                    | defined   | defined      | defined
- public   | stts_hoge              | col1, col2, col3 FROM stts_t3        | defined   | defined      | defined
- stts_s1  | stts_foo               | col1, col2 FROM stts_t3              | defined   | defined      | defined
- stts_s2  | stts_yama              | col1, col3 FROM stts_t3              |           | defined      | defined
- tststats | priv_test_stats        | a, b FROM tststats.priv_test_tbl     |           |              | defined
+                                                                List of extended statistics
+  Schema  |          Name          |                                    Definition                                    | Ndistinct | Dependencies |   MCV   
+----------+------------------------+----------------------------------------------------------------------------------+-----------+--------------+---------
+ public   | func_deps_stat         | ((a * 2)), ((b || 'X'::text)), ((c + (1)::numeric)) FROM functional_dependencies |           | defined      | 
+ public   | mcv_lists_arrays_stats | a, b, c FROM mcv_lists_arrays                                                    |           |              | defined
+ public   | mcv_lists_bool_stats   | a, b, c FROM mcv_lists_bool                                                      |           |              | defined
+ public   | mcv_lists_stats        | a, b, d FROM mcv_lists                                                           |           |              | defined
+ public   | stts_1                 | a, b FROM stts_t1                                                                | defined   |              | 
+ public   | stts_2                 | a, b FROM stts_t1                                                                | defined   | defined      | 
+ public   | stts_3                 | a, b FROM stts_t1                                                                | defined   | defined      | defined
+ public   | stts_4                 | b, c FROM stts_t2                                                                | defined   | defined      | defined
+ public   | stts_hoge              | col1, col2, col3 FROM stts_t3                                                    | defined   | defined      | defined
+ stts_s1  | stts_foo               | col1, col2 FROM stts_t3                                                          | defined   | defined      | defined
+ stts_s2  | stts_yama              | col1, col3 FROM stts_t3                                                          |           | defined      | defined
+ tststats | priv_test_stats        | a, b FROM tststats.priv_test_tbl                                                 |           |              | defined
  (12 rows)
  
  reset role;
diff --git a/src/test/regress/sql/create_table_like.sql b/src/test/regress/sql/create_table_like.sql

index 06b76f949d6ea08da5e8e7cf424d08077e5e4aa5..4929d373a2f114d8a6d3c01c8f348fbe210822ba 100644 (file)
--- a/src/test/regress/sql/create_table_like.sql
+++ b/src/test/regress/sql/create_table_like.sql
@@ -124,7 +124,9 @@ CREATE TABLE ctlt1 (a text CHECK (length(a) > 2) PRIMARY KEY, b text);
  CREATE INDEX ctlt1_b_key ON ctlt1 (b);
  CREATE INDEX ctlt1_fnidx ON ctlt1 ((a || b));
  CREATE STATISTICS ctlt1_a_b_stat ON a,b FROM ctlt1;
+CREATE STATISTICS ctlt1_expr_stat ON (a || b) FROM ctlt1;
  COMMENT ON STATISTICS ctlt1_a_b_stat IS 'ab stats';
+COMMENT ON STATISTICS ctlt1_expr_stat IS 'ab expr stats';
  COMMENT ON COLUMN ctlt1.a IS 'A';
  COMMENT ON COLUMN ctlt1.b IS 'B';
  COMMENT ON CONSTRAINT ctlt1_a_check ON ctlt1 IS 't1_a_check';
diff --git a/src/test/regress/sql/stats_ext.sql b/src/test/regress/sql/stats_ext.sql

index f9f12769e4e710c4c662373db25b64f003ec060b..fb3af6e523601db67bf3b0003af76011f994ae21 100644 (file)
--- a/src/test/regress/sql/stats_ext.sql
+++ b/src/test/regress/sql/stats_ext.sql
@@ -28,16 +28,21 @@ end;
  $$;
  
  -- Verify failures
-CREATE TABLE ext_stats_test (x int, y int, z int);
+CREATE TABLE ext_stats_test (x text, y int, z int);
  CREATE STATISTICS tst;
  CREATE STATISTICS tst ON a, b;
  CREATE STATISTICS tst FROM sometab;
  CREATE STATISTICS tst ON a, b FROM nonexistent;
  CREATE STATISTICS tst ON a, b FROM ext_stats_test;
  CREATE STATISTICS tst ON x, x, y FROM ext_stats_test;
-CREATE STATISTICS tst ON x + y FROM ext_stats_test;
-CREATE STATISTICS tst ON (x, y) FROM ext_stats_test;
+CREATE STATISTICS tst ON x, x, y, x, x, y, x, x, y FROM ext_stats_test;
+CREATE STATISTICS tst ON x, x, y, x, x, (x || 'x'), (y + 1), (x || 'x'), (x || 'x'), (y + 1) FROM ext_stats_test;
+CREATE STATISTICS tst ON (x || 'x'), (x || 'x'), (y + 1), (x || 'x'), (x || 'x'), (y + 1), (x || 'x'), (x || 'x'), (y + 1) FROM ext_stats_test;
+CREATE STATISTICS tst ON (x || 'x'), (x || 'x'), y FROM ext_stats_test;
  CREATE STATISTICS tst (unrecognized) ON x, y FROM ext_stats_test;
+-- incorrect expressions
+CREATE STATISTICS tst ON y + z FROM ext_stats_test; -- missing parentheses
+CREATE STATISTICS tst ON (x, y) FROM ext_stats_test; -- tuple expression
  DROP TABLE ext_stats_test;
  
  -- Ensure stats are dropped sanely, and test IF NOT EXISTS while at it
@@ -97,6 +102,36 @@ CREATE STATISTICS ab1_a_b_stats ON a, b FROM ab1;
  ANALYZE ab1;
  DROP TABLE ab1 CASCADE;
  
+-- basic test for statistics on expressions
+CREATE TABLE ab1 (a INTEGER, b INTEGER, c TIMESTAMP, d TIMESTAMPTZ);
+
+-- expression stats may be built on a single expression column
+CREATE STATISTICS ab1_exprstat_1 ON (a+b) FROM ab1;
+
+-- with a single expression, we only enable expression statistics
+CREATE STATISTICS ab1_exprstat_2 ON (a+b) FROM ab1;
+SELECT stxkind FROM pg_statistic_ext WHERE stxname = 'ab1_exprstat_2';
+
+-- adding anything to the expression builds all statistics kinds
+CREATE STATISTICS ab1_exprstat_3 ON (a+b), a FROM ab1;
+SELECT stxkind FROM pg_statistic_ext WHERE stxname = 'ab1_exprstat_3';
+
+-- date_trunc on timestamptz is not immutable, but that should not matter
+CREATE STATISTICS ab1_exprstat_4 ON date_trunc('day', d) FROM ab1;
+
+-- date_trunc on timestamp is immutable
+CREATE STATISTICS ab1_exprstat_5 ON date_trunc('day', c) FROM ab1;
+
+-- insert some data and run analyze, to test that these cases build properly
+INSERT INTO ab1
+SELECT
+    generate_series(1,10),
+    generate_series(1,10),
+    generate_series('2020-10-01'::timestamp, '2020-10-10'::timestamp, interval '1 day'),
+    generate_series('2020-10-01'::timestamptz, '2020-10-10'::timestamptz, interval '1 day');
+ANALYZE ab1;
+DROP TABLE ab1;
+
  -- Verify supported object types for extended statistics
  CREATE schema tststats;
  
@@ -164,6 +199,14 @@ SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b
  
  SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d');
  
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (a+1)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100), (2*c)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (a+1), (b+100)');
+
  -- correct command
  CREATE STATISTICS s10 ON a, b, c FROM ndistinct;
  
@@ -184,6 +227,16 @@ SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY b, c
  
  SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c');
  
+-- partial improvement (match on attributes)
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (a+1)');
+
+-- expressions - no improvement
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100), (2*c)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (a+1), (b+100)');
+
  -- last two plans keep using Group Aggregate, because 'd' is not covered
  -- by the statistic and while it's NULL-only we assume 200 values for it
  SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d');
@@ -216,6 +269,14 @@ SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY b, c
  
  SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, d');
  
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (a+1)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100), (2*c)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (a+1), (b+100)');
+
  DROP STATISTICS s10;
  
  SELECT s.stxkind, d.stxdndistinct
@@ -234,6 +295,206 @@ SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY b, c
  
  SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, d');
  
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (a+1)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100), (2*c)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (a+1), (b+100)');
+
+-- ndistinct estimates with statistics on expressions
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100), (2*c)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (a+1), (b+100)');
+
+CREATE STATISTICS s10 (ndistinct) ON (a+1), (b+100), (2*c) FROM ndistinct;
+
+ANALYZE ndistinct;
+
+SELECT s.stxkind, d.stxdndistinct
+  FROM pg_statistic_ext s, pg_statistic_ext_data d
+ WHERE s.stxrelid = 'ndistinct'::regclass
+   AND d.stxoid = s.oid;
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a+1), (b+100), (2*c)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (a+1), (b+100)');
+
+DROP STATISTICS s10;
+
+-- a mix of attributes and expressions
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (2*c)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (2*c)');
+
+CREATE STATISTICS s10 (ndistinct) ON a, b, (2*c) FROM ndistinct;
+
+ANALYZE ndistinct;
+
+SELECT s.stxkind, d.stxdndistinct
+  FROM pg_statistic_ext s, pg_statistic_ext_data d
+ WHERE s.stxrelid = 'ndistinct'::regclass
+   AND d.stxoid = s.oid;
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (2*c)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (2*c)');
+
+DROP STATISTICS s10;
+
+-- combination of multiple ndistinct statistics, with/without expressions
+TRUNCATE ndistinct;
+
+-- two mostly independent groups of columns
+INSERT INTO ndistinct (a, b, c, d)
+     SELECT mod(i,3), mod(i,9), mod(i,5), mod(i,20)
+       FROM generate_series(1,1000) s(i);
+
+ANALYZE ndistinct;
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), b');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1), c');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (c*10)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1), c, (d - 1)');
+
+-- basic statistics on both attributes (no expressions)
+CREATE STATISTICS s11 (ndistinct) ON a, b FROM ndistinct;
+
+CREATE STATISTICS s12 (ndistinct) ON c, d FROM ndistinct;
+
+ANALYZE ndistinct;
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), b');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1), c');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (c*10)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1), c, (d - 1)');
+
+
+-- replace the second statistics by statistics on expressions
+
+DROP STATISTICS s12;
+
+CREATE STATISTICS s12 (ndistinct) ON (c * 10), (d - 1) FROM ndistinct;
+
+ANALYZE ndistinct;
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), b');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1), c');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (c*10)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1), c, (d - 1)');
+
+
+-- replace the second statistics by statistics on both attributes and expressions
+
+DROP STATISTICS s12;
+
+CREATE STATISTICS s12 (ndistinct) ON c, d, (c * 10), (d - 1) FROM ndistinct;
+
+ANALYZE ndistinct;
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), b');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1), c');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (c*10)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1), c, (d - 1)');
+
+
+-- replace the other statistics by statistics on both attributes and expressions
+
+DROP STATISTICS s11;
+
+CREATE STATISTICS s11 (ndistinct) ON a, b, (a*5), (b+1) FROM ndistinct;
+
+ANALYZE ndistinct;
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), b');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1), c');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (c*10)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1), c, (d - 1)');
+
+
+-- replace statistics by somewhat overlapping ones (this expected to get worse estimate
+-- because the first statistics shall be applied to 3 columns, and the second one can't
+-- be really applied)
+
+DROP STATISTICS s11;
+DROP STATISTICS s12;
+
+CREATE STATISTICS s11 (ndistinct) ON a, b, (a*5), (b+1) FROM ndistinct;
+CREATE STATISTICS s12 (ndistinct) ON a, (b+1), (c * 10) FROM ndistinct;
+
+ANALYZE ndistinct;
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), b');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY (a*5), (b+1), c');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, (c*10)');
+
+SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, (b+1), c, (d - 1)');
+
+DROP STATISTICS s11;
+DROP STATISTICS s12;
+
  -- functional dependencies tests
  CREATE TABLE functional_dependencies (
      filler1 TEXT,
@@ -272,6 +533,29 @@ SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE
  TRUNCATE functional_dependencies;
  DROP STATISTICS func_deps_stat;
  
+-- now do the same thing, but with expressions
+INSERT INTO functional_dependencies (a, b, c, filler1)
+     SELECT i, i, i, i FROM generate_series(1,5000) s(i);
+
+ANALYZE functional_dependencies;
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE mod(a, 11) = 1 AND mod(b::int, 13) = 1');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE mod(a, 11) = 1 AND mod(b::int, 13) = 1 AND mod(c, 7) = 1');
+
+-- create statistics
+CREATE STATISTICS func_deps_stat (dependencies) ON (mod(a,11)), (mod(b::int, 13)), (mod(c, 7)) FROM functional_dependencies;
+
+ANALYZE functional_dependencies;
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE mod(a, 11) = 1 AND mod(b::int, 13) = 1');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE mod(a, 11) = 1 AND mod(b::int, 13) = 1 AND mod(c, 7) = 1');
+
+-- a => b, a => c, b => c
+TRUNCATE functional_dependencies;
+DROP STATISTICS func_deps_stat;
+
  INSERT INTO functional_dependencies (a, b, c, filler1)
       SELECT mod(i,100), mod(i,50), mod(i,25), i FROM generate_series(1,5000) s(i);
  
@@ -397,9 +681,9 @@ SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE
  
  SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a IN (1, 2, 51, 52) AND b = ALL (ARRAY[''1'', ''2''])');
  
--- changing the type of column c causes its single-column stats to be dropped,
--- giving a default estimate of 0.005 * 5000 = 25 for (c = 1); check multiple
--- clauses estimated with functional dependencies does not exceed this
+-- changing the type of column c causes all its stats to be dropped, reverting
+-- to default estimates without any statistics, i.e. 0.5% selectivity for each
+-- condition
  ALTER TABLE functional_dependencies ALTER COLUMN c TYPE numeric;
  
  SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1'' AND c = 1');
@@ -408,6 +692,132 @@ ANALYZE functional_dependencies;
  
  SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE a = 1 AND b = ''1'' AND c = 1');
  
+DROP STATISTICS func_deps_stat;
+
+-- now try functional dependencies with expressions
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = 2 AND (b || ''X'') = ''1X''');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = 2 AND (b || ''X'') = ''1X'' AND (c + 1) = 2');
+
+-- IN
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') = ''1X''');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') IN (''1X'', ''2X'')');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 102, 104) AND (b || ''X'') IN (''1X'', ''2X'')');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 102, 104) AND (b || ''X'') = ''1X''');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 52, 102, 152) AND (b || ''X'') IN (''1X'', ''26X'') AND (c + 1) = 2');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 52, 102, 152) AND (b || ''X'') IN (''1X'', ''26X'') AND (c + 1) IN (2)');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 52, 54, 102, 104, 152, 154) AND (b || ''X'') IN (''1X'', ''2X'', ''26X'', ''27X'') AND (c + 1) IN (2, 3)');
+
+-- OR clauses referencing the same attribute
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (a * 2) = 102) AND (b || ''X'') = ''1X''');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (a * 2) = 102) AND ((b || ''X'') = ''1X'' OR (b || ''X'') = ''2X'')');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (a * 2) = 4 OR (a * 2) = 102 OR (a * 2) = 104) AND ((b || ''X'') = ''1X'' OR (b || ''X'') = ''2X'')');
+
+-- OR clauses referencing different attributes
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (b || ''X'') = ''1X'') AND (b || ''X'') = ''1X''');
+
+-- ANY
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 102]) AND (b || ''X'') = ''1X''');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 102]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''2X''])');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 4, 102, 104]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''2X''])');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 52, 102, 152]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''26X'']) AND (c + 1) = 2');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 52, 102, 152]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''26X'']) AND (c + 1) = ANY (ARRAY[2])');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 4, 52, 54, 102, 104, 152, 154]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''2X'', ''26X'', ''27X'']) AND (c + 1) = ANY (ARRAY[2, 3])');
+
+-- ANY with inequalities should not benefit from functional dependencies
+-- the estimates however improve thanks to having expression statistics
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) < ANY (ARRAY[2, 102]) AND (b || ''X'') > ''1X''');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) >= ANY (ARRAY[2, 102]) AND (b || ''X'') <= ANY (ARRAY[''1X'', ''2X''])');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) <= ANY (ARRAY[2, 4, 102, 104]) AND (b || ''X'') >= ANY (ARRAY[''1X'', ''2X''])');
+
+-- ALL (should not benefit from functional dependencies)
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') = ALL (ARRAY[''1X''])');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') = ALL (ARRAY[''1X'', ''2X''])');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 102, 104) AND (b || ''X'') = ALL (ARRAY[''1X'', ''2X''])');
+
+-- create statistics on expressions
+CREATE STATISTICS func_deps_stat (dependencies) ON (a * 2), (b || 'X'), (c + 1) FROM functional_dependencies;
+
+ANALYZE functional_dependencies;
+
+-- print the detected dependencies
+SELECT dependencies FROM pg_stats_ext WHERE statistics_name = 'func_deps_stat';
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = 2 AND (b || ''X'') = ''1X''');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = 2 AND (b || ''X'') = ''1X'' AND (c + 1) = 2');
+
+-- IN
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') = ''1X''');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') IN (''1X'', ''2X'')');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 102, 104) AND (b || ''X'') IN (''1X'', ''2X'')');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 102, 104) AND (b || ''X'') = ''1X''');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 52, 102, 152) AND (b || ''X'') IN (''1X'', ''26X'') AND (c + 1) = 2');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 52, 102, 152) AND (b || ''X'') IN (''1X'', ''26X'') AND (c + 1) IN (2)');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 52, 54, 102, 104, 152, 154) AND (b || ''X'') IN (''1X'', ''2X'', ''26X'', ''27X'') AND (c + 1) IN (2, 3)');
+
+-- OR clauses referencing the same attribute
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (a * 2) = 102) AND (b || ''X'') = ''1X''');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (a * 2) = 102) AND ((b || ''X'') = ''1X'' OR (b || ''X'') = ''2X'')');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (a * 2) = 4 OR (a * 2) = 102 OR (a * 2) = 104) AND ((b || ''X'') = ''1X'' OR (b || ''X'') = ''2X'')');
+
+-- OR clauses referencing different attributes
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE ((a * 2) = 2 OR (b || ''X'') = ''1X'') AND (b || ''X'') = ''1X''');
+
+-- ANY
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 102]) AND (b || ''X'') = ''1X''');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 102]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''2X''])');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 4, 102, 104]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''2X''])');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 52, 102, 152]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''26X'']) AND (c + 1) = 2');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 52, 102, 152]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''26X'']) AND (c + 1) = ANY (ARRAY[2])');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) = ANY (ARRAY[2, 4, 52, 54, 102, 104, 152, 154]) AND (b || ''X'') = ANY (ARRAY[''1X'', ''2X'', ''26X'', ''27X'']) AND (c + 1) = ANY (ARRAY[2, 3])');
+
+-- ANY with inequalities should not benefit from functional dependencies
+-- the estimates however improve thanks to having expression statistics
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) < ANY (ARRAY[2, 102]) AND (b || ''X'') > ''1X''');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) >= ANY (ARRAY[2, 102]) AND (b || ''X'') <= ANY (ARRAY[''1X'', ''2X''])');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) <= ANY (ARRAY[2, 4, 102, 104]) AND (b || ''X'') >= ANY (ARRAY[''1X'', ''2X''])');
+
+-- ALL (should not benefit from functional dependencies)
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') = ALL (ARRAY[''1X''])');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 102) AND (b || ''X'') = ALL (ARRAY[''1X'', ''2X''])');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM functional_dependencies WHERE (a * 2) IN (2, 4, 102, 104) AND (b || ''X'') = ALL (ARRAY[''1X'', ''2X''])');
+
  -- check the ability to use multiple functional dependencies
  CREATE TABLE functional_dependencies_multi (
     a INTEGER,
@@ -479,6 +889,28 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 AND b =
  
  SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 AND b = ''1'' AND c = 1');
  
+TRUNCATE mcv_lists;
+DROP STATISTICS mcv_lists_stats;
+
+-- random data (no MCV list), but with expression
+INSERT INTO mcv_lists (a, b, c, filler1)
+     SELECT i, i, i, i FROM generate_series(1,1000) s(i);
+
+ANALYZE mcv_lists;
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,7) = 1 AND mod(b::int,11) = 1');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,7) = 1 AND mod(b::int,11) = 1 AND mod(c,13) = 1');
+
+-- create statistics
+CREATE STATISTICS mcv_lists_stats (mcv) ON (mod(a,7)), (mod(b::int,11)), (mod(c,13)) FROM mcv_lists;
+
+ANALYZE mcv_lists;
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,7) = 1 AND mod(b::int,11) = 1');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,7) = 1 AND mod(b::int,11) = 1 AND mod(c,13) = 1');
+
  -- 100 distinct combinations, all in the MCV list
  TRUNCATE mcv_lists;
  DROP STATISTICS mcv_lists_stats;
@@ -565,6 +997,8 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 OR b = '
  
  SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 OR b = ''1'' OR c = 1 OR d IS NOT NULL');
  
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 OR b = ''1'' OR c = 1 OR d IS NOT NULL');
+
  SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a IN (1, 2, 51, 52) AND b IN ( ''1'', ''2'')');
  
  SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a IN (1, 2, 51, 52, NULL) AND b IN ( ''1'', ''2'', NULL)');
@@ -602,6 +1036,98 @@ ANALYZE mcv_lists;
  
  SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 AND b = ''1''');
  
+
+-- 100 distinct combinations, all in the MCV list, but with expressions
+TRUNCATE mcv_lists;
+DROP STATISTICS mcv_lists_stats;
+
+INSERT INTO mcv_lists (a, b, c, filler1)
+     SELECT i, i, i, i FROM generate_series(1,1000) s(i);
+
+ANALYZE mcv_lists;
+
+-- without any stats on the expressions, we have to use default selectivities, which
+-- is why the estimates here are different from the pre-computed case above
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 AND mod(b::int,10) = 1');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 1 = mod(a,20) AND 1 = mod(b::int,10)');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) < 1 AND mod(b::int,10) < 1');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 1 > mod(a,20) AND 1 > mod(b::int,10)');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 AND mod(b::int,10) = 1 AND mod(c,5) = 1');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 OR mod(b::int,10) = 1 OR mod(c,25) = 1 OR d IS NOT NULL');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) IN (1, 2, 51, 52, NULL) AND mod(b::int,10) IN ( 1, 2, NULL)');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = ANY (ARRAY[1, 2, 51, 52]) AND mod(b::int,10) = ANY (ARRAY[1, 2])');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) <= ANY (ARRAY[1, NULL, 2, 3]) AND mod(b::int,10) IN (1, 2, NULL, 3)');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) < ALL (ARRAY[4, 5]) AND mod(b::int,10) IN (1, 2, 3) AND mod(c,5) > ANY (ARRAY[1, 2, 3])');
+
+-- create statistics with expressions only (we create three separate stats, in order not to build more complex extended stats)
+CREATE STATISTICS mcv_lists_stats_1 ON (mod(a,20)) FROM mcv_lists;
+CREATE STATISTICS mcv_lists_stats_2 ON (mod(b::int,10)) FROM mcv_lists;
+CREATE STATISTICS mcv_lists_stats_3 ON (mod(c,5)) FROM mcv_lists;
+
+ANALYZE mcv_lists;
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 AND mod(b::int,10) = 1');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 1 = mod(a,20) AND 1 = mod(b::int,10)');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) < 1 AND mod(b::int,10) < 1');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 1 > mod(a,20) AND 1 > mod(b::int,10)');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 AND mod(b::int,10) = 1 AND mod(c,5) = 1');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 OR mod(b::int,10) = 1 OR mod(c,25) = 1 OR d IS NOT NULL');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) IN (1, 2, 51, 52, NULL) AND mod(b::int,10) IN ( 1, 2, NULL)');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = ANY (ARRAY[1, 2, 51, 52]) AND mod(b::int,10) = ANY (ARRAY[1, 2])');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) <= ANY (ARRAY[1, NULL, 2, 3]) AND mod(b::int,10) IN (1, 2, NULL, 3)');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) < ALL (ARRAY[4, 5]) AND mod(b::int,10) IN (1, 2, 3) AND mod(c,5) > ANY (ARRAY[1, 2, 3])');
+
+DROP STATISTICS mcv_lists_stats_1;
+DROP STATISTICS mcv_lists_stats_2;
+DROP STATISTICS mcv_lists_stats_3;
+
+-- create statistics with both MCV and expressions
+CREATE STATISTICS mcv_lists_stats (mcv) ON (mod(a,20)), (mod(b::int,10)), (mod(c,5)) FROM mcv_lists;
+
+ANALYZE mcv_lists;
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 AND mod(b::int,10) = 1');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 1 = mod(a,20) AND 1 = mod(b::int,10)');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) < 1 AND mod(b::int,10) < 1');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 1 > mod(a,20) AND 1 > mod(b::int,10)');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 AND mod(b::int,10) = 1 AND mod(c,5) = 1');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 OR mod(b::int,10) = 1 OR mod(c,25) = 1 OR d IS NOT NULL');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) IN (1, 2, 51, 52, NULL) AND mod(b::int,10) IN ( 1, 2, NULL)');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = ANY (ARRAY[1, 2, 51, 52]) AND mod(b::int,10) = ANY (ARRAY[1, 2])');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) <= ANY (ARRAY[1, NULL, 2, 3]) AND mod(b::int,10) IN (1, 2, NULL, 3)');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) < ALL (ARRAY[4, 5]) AND mod(b::int,10) IN (1, 2, 3) AND mod(c,5) > ANY (ARRAY[1, 2, 3])');
+
+-- we can't use the statistic for OR clauses that are not fully covered (missing 'd' attribute)
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE mod(a,20) = 1 OR mod(b::int,10) = 1 OR mod(c,5) = 1 OR d IS NOT NULL');
+
  -- 100 distinct combinations with NULL values, all in the MCV list
  TRUNCATE mcv_lists;
  DROP STATISTICS mcv_lists_stats;
@@ -894,6 +1420,57 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 OR
  
  DROP TABLE mcv_lists_multi;
  
+
+-- statistics on integer expressions
+CREATE TABLE expr_stats (a int, b int, c int);
+INSERT INTO expr_stats SELECT mod(i,10), mod(i,10), mod(i,10) FROM generate_series(1,1000) s(i);
+ANALYZE expr_stats;
+
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE (2*a) = 0 AND (3*b) = 0');
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE (a+b) = 0 AND (a-b) = 0');
+
+CREATE STATISTICS expr_stats_1 (mcv) ON (a+b), (a-b), (2*a), (3*b) FROM expr_stats;
+ANALYZE expr_stats;
+
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE (2*a) = 0 AND (3*b) = 0');
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE (a+b) = 0 AND (a-b) = 0');
+
+DROP STATISTICS expr_stats_1;
+DROP TABLE expr_stats;
+
+-- statistics on a mix columns and expressions
+CREATE TABLE expr_stats (a int, b int, c int);
+INSERT INTO expr_stats SELECT mod(i,10), mod(i,10), mod(i,10) FROM generate_series(1,1000) s(i);
+ANALYZE expr_stats;
+
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 0 AND (2*a) = 0 AND (3*b) = 0');
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 3 AND b = 3 AND (a-b) = 0');
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 0 AND b = 1 AND (a-b) = 0');
+
+CREATE STATISTICS expr_stats_1 (mcv) ON a, b, (2*a), (3*b), (a+b), (a-b) FROM expr_stats;
+ANALYZE expr_stats;
+
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 0 AND (2*a) = 0 AND (3*b) = 0');
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 3 AND b = 3 AND (a-b) = 0');
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 0 AND b = 1 AND (a-b) = 0');
+
+DROP TABLE expr_stats;
+
+-- statistics on expressions with different data types
+CREATE TABLE expr_stats (a int, b name, c text);
+INSERT INTO expr_stats SELECT mod(i,10), md5(mod(i,10)::text), md5(mod(i,10)::text) FROM generate_series(1,1000) s(i);
+ANALYZE expr_stats;
+
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 0 AND (b || c) <= ''z'' AND (c || b) >= ''0''');
+
+CREATE STATISTICS expr_stats_1 (mcv) ON a, b, (b || c), (c || b) FROM expr_stats;
+ANALYZE expr_stats;
+
+SELECT * FROM check_estimated_rows('SELECT * FROM expr_stats WHERE a = 0 AND (b || c) <= ''z'' AND (c || b) >= ''0''');
+
+DROP TABLE expr_stats;
+
+
  -- Permission tests. Users should not be able to see specific data values in
  -- the extended statistics, if they lack permission to see those values in
  -- the underlying table.
author	Tomas Vondra <[email protected]>
	Fri, 26 Mar 2021 22:22:01 +0000 (23:22 +0100)
committer	Tomas Vondra <[email protected]>
	Fri, 26 Mar 2021 23:01:11 +0000 (00:01 +0100)
doc/src/sgml/catalogs.sgml		patch \| blob \| blame \| history
doc/src/sgml/ref/create_statistics.sgml		patch \| blob \| blame \| history
src/backend/catalog/Makefile		patch \| blob \| blame \| history
src/backend/catalog/system_views.sql		patch \| blob \| blame \| history
src/backend/commands/statscmds.c		patch \| blob \| blame \| history
src/backend/commands/tablecmds.c		patch \| blob \| blame \| history
src/backend/nodes/copyfuncs.c		patch \| blob \| blame \| history
src/backend/nodes/equalfuncs.c		patch \| blob \| blame \| history
src/backend/nodes/outfuncs.c		patch \| blob \| blame \| history
src/backend/optimizer/util/plancat.c		patch \| blob \| blame \| history
src/backend/parser/gram.y		patch \| blob \| blame \| history
src/backend/parser/parse_agg.c		patch \| blob \| blame \| history
src/backend/parser/parse_expr.c		patch \| blob \| blame \| history
src/backend/parser/parse_func.c		patch \| blob \| blame \| history
src/backend/parser/parse_utilcmd.c		patch \| blob \| blame \| history
src/backend/statistics/dependencies.c		patch \| blob \| blame \| history
src/backend/statistics/extended_stats.c		patch \| blob \| blame \| history
src/backend/statistics/mcv.c		patch \| blob \| blame \| history
src/backend/statistics/mvdistinct.c		patch \| blob \| blame \| history
src/backend/tcop/utility.c		patch \| blob \| blame \| history
src/backend/utils/adt/ruleutils.c		patch \| blob \| blame \| history
src/backend/utils/adt/selfuncs.c		patch \| blob \| blame \| history
src/bin/pg_dump/t/002_pg_dump.pl		patch \| blob \| blame \| history
src/bin/psql/describe.c		patch \| blob \| blame \| history
src/include/catalog/catversion.h		patch \| blob \| blame \| history
src/include/catalog/pg_proc.dat		patch \| blob \| blame \| history
src/include/catalog/pg_statistic_ext.h		patch \| blob \| blame \| history
src/include/catalog/pg_statistic_ext_data.h		patch \| blob \| blame \| history
src/include/commands/defrem.h		patch \| blob \| blame \| history
src/include/nodes/nodes.h		patch \| blob \| blame \| history
src/include/nodes/parsenodes.h		patch \| blob \| blame \| history
src/include/nodes/pathnodes.h		patch \| blob \| blame \| history
src/include/parser/parse_node.h		patch \| blob \| blame \| history
src/include/parser/parse_utilcmd.h		patch \| blob \| blame \| history
src/include/statistics/extended_stats_internal.h		patch \| blob \| blame \| history
src/include/statistics/statistics.h		patch \| blob \| blame \| history
src/include/utils/ruleutils.h		patch \| blob \| blame \| history
src/test/regress/expected/create_table_like.out		patch \| blob \| blame \| history
src/test/regress/expected/oidjoins.out		patch \| blob \| blame \| history
src/test/regress/expected/rules.out		patch \| blob \| blame \| history
src/test/regress/expected/stats_ext.out		patch \| blob \| blame \| history
src/test/regress/sql/create_table_like.sql		patch \| blob \| blame \| history
src/test/regress/sql/stats_ext.sql		patch \| blob \| blame \| history