Add hash_mem_multiplier GUC.

author Peter Geoghegan <[email protected]>

Wed, 29 Jul 2020 21:14:58 +0000 (14:14 -0700)

committer Peter Geoghegan <[email protected]>

Wed, 29 Jul 2020 21:14:58 +0000 (14:14 -0700)
author Peter Geoghegan <[email protected]>
Wed, 29 Jul 2020 21:14:58 +0000 (14:14 -0700)
committer Peter Geoghegan <[email protected]>
Wed, 29 Jul 2020 21:14:58 +0000 (14:14 -0700)
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml

index 822bbf1f27269fa13c0cc46bd80d4a338554fffd..427947cf4962efbfc3452a0759203f99ab408037 100644 (file)
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -1690,22 +1690,64 @@ include_dir 'conf.d'
        </term>
        <listitem>
         <para>
-        Sets the maximum amount of memory to be used by a query operation
+        Sets the base maximum amount of memory to be used by a query operation
          (such as a sort or hash table) before writing to temporary disk files.
          If this value is specified without units, it is taken as kilobytes.
          The default value is four megabytes (<literal>4MB</literal>).
          Note that for a complex query, several sort or hash operations might be
-        running in parallel; each operation will be allowed to use as much memory
-        as this value specifies before it starts to write data into temporary
-        files. Also, several running sessions could be doing such operations
-        concurrently.  Therefore, the total memory used could be many
-        times the value of <varname>work_mem</varname>; it is necessary to
-        keep this fact in mind when choosing the value. Sort operations are
-        used for <literal>ORDER BY</literal>, <literal>DISTINCT</literal>, and
-        merge joins.
+        running in parallel; each operation will generally be allowed
+        to use as much memory as this value specifies before it starts
+        to write data into temporary files.  Also, several running
+        sessions could be doing such operations concurrently.
+        Therefore, the total memory used could be many times the value
+        of <varname>work_mem</varname>; it is necessary to keep this
+        fact in mind when choosing the value.  Sort operations are used
+        for <literal>ORDER BY</literal>, <literal>DISTINCT</literal>,
+        and merge joins.
          Hash tables are used in hash joins, hash-based aggregation, and
          hash-based processing of <literal>IN</literal> subqueries.
         </para>
+       <para>
+        Hash-based operations are generally more sensitive to memory
+        availability than equivalent sort-based operations.  The
+        memory available for hash tables is computed by multiplying
+        <varname>work_mem</varname> by
+        <varname>hash_mem_multiplier</varname>.  This makes it
+        possible for hash-based operations to use an amount of memory
+        that exceeds the usual <varname>work_mem</varname> base
+        amount.
+       </para>
+      </listitem>
+     </varlistentry>
+
+     <varlistentry id="guc-hash-mem-multiplier" xreflabel="hash_mem_multiplier">
+      <term><varname>hash_mem_multiplier</varname> (<type>floating point</type>)
+      <indexterm>
+       <primary><varname>hash_mem_multiplier</varname> configuration parameter</primary>
+      </indexterm>
+      </term>
+      <listitem>
+       <para>
+        Used to compute the maximum amount of memory that hash-based
+        operations can use.  The final limit is determined by
+        multiplying <varname>work_mem</varname> by
+        <varname>hash_mem_multiplier</varname>.  The default value is
+        1.0, which makes hash-based operations subject to the same
+        simple <varname>work_mem</varname> maximum as sort-based
+        operations.
+       </para>
+       <para>
+        Consider increasing <varname>hash_mem_multiplier</varname> in
+        environments where spilling by query operations is a regular
+        occurrence, especially when simply increasing
+        <varname>work_mem</varname> results in memory pressure (memory
+        pressure typically takes the form of intermittent out of
+        memory errors).  A setting of 1.5 or 2.0 may be effective with
+        mixed workloads.  Higher settings in the range of 2.0 - 8.0 or
+        more may be effective in environments where
+        <varname>work_mem</varname> has already been increased to 40MB
+        or more.
+       </para>
        </listitem>
       </varlistentry>
  
diff --git a/doc/src/sgml/ref/postgres-ref.sgml b/doc/src/sgml/ref/postgres-ref.sgml

index 5e5794bc90d8bb59f7117ecc6803b8bc6f8e2996..6e62f54c597c06acc82cc6f974bbb881c75742ac 100644 (file)
--- a/doc/src/sgml/ref/postgres-ref.sgml
+++ b/doc/src/sgml/ref/postgres-ref.sgml
@@ -338,10 +338,10 @@ PostgreSQL documentation
        <term><option>-S</option> <replaceable class="parameter">work-mem</replaceable></term>
        <listitem>
         <para>
-        Specifies the amount of memory to be used by internal sorts and hashes
-        before resorting to temporary disk files.  See the description of the
-        <varname>work_mem</varname> configuration parameter in <xref
-        linkend="runtime-config-resource-memory"/>.
+        Specifies the base amount of memory to be used by sorts and
+        hash tables before resorting to temporary disk files.  See the
+        description of the <varname>work_mem</varname> configuration
+        parameter in <xref linkend="runtime-config-resource-memory"/>.
         </para>
        </listitem>
       </varlistentry>
diff --git a/doc/src/sgml/runtime.sgml b/doc/src/sgml/runtime.sgml

index e09cb55efcd6c365b4fc21c15ec2e4a2ef31074f..c8698898f3256eb5dc5c5113ff30c4c7f2b037e0 100644 (file)
--- a/doc/src/sgml/runtime.sgml
+++ b/doc/src/sgml/runtime.sgml
@@ -1326,10 +1326,12 @@ Out of Memory: Killed process 12345 (postgres).
      system running out of memory, you can avoid the problem by changing
      your configuration.  In some cases, it may help to lower memory-related
      configuration parameters, particularly
-    <link linkend="guc-shared-buffers"><varname>shared_buffers</varname></link>
-    and <link linkend="guc-work-mem"><varname>work_mem</varname></link>.  In
-    other cases, the problem may be caused by allowing too many connections
-    to the database server itself.  In many cases, it may be better to reduce
+    <link linkend="guc-shared-buffers"><varname>shared_buffers</varname></link>,
+    <link linkend="guc-work-mem"><varname>work_mem</varname></link>, and
+    <link linkend="guc-hash-mem-multiplier"><varname>hash_mem_multiplier</varname></link>.
+    In other cases, the problem may be caused by allowing too many
+    connections to the database server itself.  In many cases, it may
+    be better to reduce
      <link linkend="guc-max-connections"><varname>max_connections</varname></link>
      and instead make use of external connection-pooling software.
     </para>
diff --git a/src/backend/executor/execGrouping.c b/src/backend/executor/execGrouping.c

index 321f427e478fcf3748f910b4655882c8302080c9..90d04f9228aa4598d1cceb80ed4d8a714fb09bb3 100644 (file)
--- a/src/backend/executor/execGrouping.c
+++ b/src/backend/executor/execGrouping.c
@@ -165,13 +165,14 @@ BuildTupleHashTableExt(PlanState *parent,
  {
     TupleHashTable hashtable;
     Size        entrysize = sizeof(TupleHashEntryData) + additionalsize;
+   int         hash_mem = get_hash_mem();
     MemoryContext oldcontext;
     bool        allow_jit;
  
     Assert(nbuckets > 0);
  
-   /* Limit initial table size request to not more than work_mem */
-   nbuckets = Min(nbuckets, (long) ((work_mem * 1024L) / entrysize));
+   /* Limit initial table size request to not more than hash_mem */
+   nbuckets = Min(nbuckets, (long) ((hash_mem * 1024L) / entrysize));
  
     oldcontext = MemoryContextSwitchTo(metacxt);
  
diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c

index 02a9165c6941e6bb191d61452889d6318d8675bb..9776263ae75ac10068624981786be8391d0ebd24 100644 (file)
--- a/src/backend/executor/nodeAgg.c
+++ b/src/backend/executor/nodeAgg.c
@@ -203,7 +203,7 @@
   *   entries (and initialize new transition states), we instead spill them to
   *   disk to be processed later. The tuples are spilled in a partitioned
   *   manner, so that subsequent batches are smaller and less likely to exceed
- *   work_mem (if a batch does exceed work_mem, it must be spilled
+ *   hash_mem (if a batch does exceed hash_mem, it must be spilled
   *   recursively).
   *
   *   Spilled data is written to logical tapes. These provide better control
@@ -212,7 +212,7 @@
   *
   *   Note that it's possible for transition states to start small but then
   *   grow very large; for instance in the case of ARRAY_AGG. In such cases,
- *   it's still possible to significantly exceed work_mem. We try to avoid
+ *   it's still possible to significantly exceed hash_mem. We try to avoid
   *   this situation by estimating what will fit in the available memory, and
   *   imposing a limit on the number of groups separately from the amount of
   *   memory consumed.
@@ -1516,7 +1516,7 @@ build_hash_table(AggState *aggstate, int setno, long nbuckets)
  
     /*
      * Used to make sure initial hash table allocation does not exceed
-    * work_mem. Note that the estimate does not include space for
+    * hash_mem. Note that the estimate does not include space for
      * pass-by-reference transition data values, nor for the representative
      * tuple of each group.
      */
@@ -1782,7 +1782,7 @@ hashagg_recompile_expressions(AggState *aggstate, bool minslot, bool nullcheck)
  }
  
  /*
- * Set limits that trigger spilling to avoid exceeding work_mem. Consider the
+ * Set limits that trigger spilling to avoid exceeding hash_mem. Consider the
   * number of partitions we expect to create (if we do spill).
   *
   * There are two limits: a memory limit, and also an ngroups limit. The
@@ -1796,13 +1796,14 @@ hash_agg_set_limits(double hashentrysize, double input_groups, int used_bits,
  {
     int         npartitions;
     Size        partition_mem;
+   int         hash_mem = get_hash_mem();
  
-   /* if not expected to spill, use all of work_mem */
-   if (input_groups * hashentrysize < work_mem * 1024L)
+   /* if not expected to spill, use all of hash_mem */
+   if (input_groups * hashentrysize < hash_mem * 1024L)
     {
         if (num_partitions != NULL)
             *num_partitions = 0;
-       *mem_limit = work_mem * 1024L;
+       *mem_limit = hash_mem * 1024L;
         *ngroups_limit = *mem_limit / hashentrysize;
         return;
     }
@@ -1824,14 +1825,14 @@ hash_agg_set_limits(double hashentrysize, double input_groups, int used_bits,
         HASHAGG_WRITE_BUFFER_SIZE * npartitions;
  
     /*
-    * Don't set the limit below 3/4 of work_mem. In that case, we are at the
+    * Don't set the limit below 3/4 of hash_mem. In that case, we are at the
      * minimum number of partitions, so we aren't going to dramatically exceed
      * work mem anyway.
      */
-   if (work_mem * 1024L > 4 * partition_mem)
-       *mem_limit = work_mem * 1024L - partition_mem;
+   if (hash_mem * 1024L > 4 * partition_mem)
+       *mem_limit = hash_mem * 1024L - partition_mem;
     else
-       *mem_limit = work_mem * 1024L * 0.75;
+       *mem_limit = hash_mem * 1024L * 0.75;
  
     if (*mem_limit > hashentrysize)
         *ngroups_limit = *mem_limit / hashentrysize;
@@ -1989,19 +1990,20 @@ hash_choose_num_partitions(double input_groups, double hashentrysize,
     int         partition_limit;
     int         npartitions;
     int         partition_bits;
+   int         hash_mem = get_hash_mem();
  
     /*
      * Avoid creating so many partitions that the memory requirements of the
-    * open partition files are greater than 1/4 of work_mem.
+    * open partition files are greater than 1/4 of hash_mem.
      */
     partition_limit =
-       (work_mem * 1024L * 0.25 - HASHAGG_READ_BUFFER_SIZE) /
+       (hash_mem * 1024L * 0.25 - HASHAGG_READ_BUFFER_SIZE) /
         HASHAGG_WRITE_BUFFER_SIZE;
  
     mem_wanted = HASHAGG_PARTITION_FACTOR * input_groups * hashentrysize;
  
     /* make enough partitions so that each one is likely to fit in memory */
-   npartitions = 1 + (mem_wanted / (work_mem * 1024L));
+   npartitions = 1 + (mem_wanted / (hash_mem * 1024L));
  
     if (npartitions > partition_limit)
         npartitions = partition_limit;
diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c

index 45b342011fe5ee1db764581350ad0addd026c7c3..ea69eeb2a1e4ba6121607bf15dd8b3b68a87b5c6 100644 (file)
--- a/src/backend/executor/nodeHash.c
+++ b/src/backend/executor/nodeHash.c
@@ -39,6 +39,7 @@
  #include "port/atomics.h"
  #include "port/pg_bitutils.h"
  #include "utils/dynahash.h"
+#include "utils/guc.h"
  #include "utils/lsyscache.h"
  #include "utils/memutils.h"
  #include "utils/syscache.h"
@@ -506,7 +507,7 @@ ExecHashTableCreate(HashState *state, List *hashOperators, List *hashCollations,
     hashtable->spaceAllowed = space_allowed;
     hashtable->spaceUsedSkew = 0;
     hashtable->spaceAllowedSkew =
-       hashtable->spaceAllowed * SKEW_WORK_MEM_PERCENT / 100;
+       hashtable->spaceAllowed * SKEW_HASH_MEM_PERCENT / 100;
     hashtable->chunks = NULL;
     hashtable->current_chunk = NULL;
     hashtable->parallel_state = state->parallel_state;
@@ -665,7 +666,7 @@ ExecHashTableCreate(HashState *state, List *hashOperators, List *hashCollations,
  
  void
  ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
-                       bool try_combined_work_mem,
+                       bool try_combined_hash_mem,
                         int parallel_workers,
                         size_t *space_allowed,
                         int *numbuckets,
@@ -682,6 +683,7 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
     int         nbatch = 1;
     int         nbuckets;
     double      dbuckets;
+   int         hash_mem = get_hash_mem();
  
     /* Force a plausible relation size if no info */
     if (ntuples <= 0.0)
@@ -698,16 +700,16 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
     inner_rel_bytes = ntuples * tupsize;
  
     /*
-    * Target in-memory hashtable size is work_mem kilobytes.
+    * Target in-memory hashtable size is hash_mem kilobytes.
      */
-   hash_table_bytes = work_mem * 1024L;
+   hash_table_bytes = hash_mem * 1024L;
  
     /*
-    * Parallel Hash tries to use the combined work_mem of all workers to
-    * avoid the need to batch.  If that won't work, it falls back to work_mem
+    * Parallel Hash tries to use the combined hash_mem of all workers to
+    * avoid the need to batch.  If that won't work, it falls back to hash_mem
      * per worker and tries to process batches in parallel.
      */
-   if (try_combined_work_mem)
+   if (try_combined_hash_mem)
         hash_table_bytes += hash_table_bytes * parallel_workers;
  
     *space_allowed = hash_table_bytes;
@@ -728,7 +730,7 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
      */
     if (useskew)
     {
-       skew_table_bytes = hash_table_bytes * SKEW_WORK_MEM_PERCENT / 100;
+       skew_table_bytes = hash_table_bytes * SKEW_HASH_MEM_PERCENT / 100;
  
         /*----------
          * Divisor is:
@@ -751,7 +753,7 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
     /*
      * Set nbuckets to achieve an average bucket load of NTUP_PER_BUCKET when
      * memory is filled, assuming a single batch; but limit the value so that
-    * the pointer arrays we'll try to allocate do not exceed work_mem nor
+    * the pointer arrays we'll try to allocate do not exceed hash_mem nor
      * MaxAllocSize.
      *
      * Note that both nbuckets and nbatch must be powers of 2 to make
@@ -790,10 +792,10 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
         long        bucket_size;
  
         /*
-        * If Parallel Hash with combined work_mem would still need multiple
-        * batches, we'll have to fall back to regular work_mem budget.
+        * If Parallel Hash with combined hash_mem would still need multiple
+        * batches, we'll have to fall back to regular hash_mem budget.
          */
-       if (try_combined_work_mem)
+       if (try_combined_hash_mem)
         {
             ExecChooseHashTableSize(ntuples, tupwidth, useskew,
                                     false, parallel_workers,
@@ -805,7 +807,7 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
         }
  
         /*
-        * Estimate the number of buckets we'll want to have when work_mem is
+        * Estimate the number of buckets we'll want to have when hash_mem is
          * entirely full.  Each bucket will contain a bucket pointer plus
          * NTUP_PER_BUCKET tuples, whose projected size already includes
          * overhead for the hash code, pointer to the next tuple, etc.
@@ -820,8 +822,8 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
         /*
          * Buckets are simple pointers to hashjoin tuples, while tupsize
          * includes the pointer, hash code, and MinimalTupleData.  So buckets
-        * should never really exceed 25% of work_mem (even for
-        * NTUP_PER_BUCKET=1); except maybe for work_mem values that are not
+        * should never really exceed 25% of hash_mem (even for
+        * NTUP_PER_BUCKET=1); except maybe for hash_mem values that are not
          * 2^N bytes, where we might get more because of doubling. So let's
          * look for 50% here.
          */
@@ -1095,15 +1097,17 @@ ExecParallelHashIncreaseNumBatches(HashJoinTable hashtable)
                 /* Figure out how many batches to use. */
                 if (hashtable->nbatch == 1)
                 {
+                   int         hash_mem = get_hash_mem();
+
                     /*
                      * We are going from single-batch to multi-batch.  We need
                      * to switch from one large combined memory budget to the
-                    * regular work_mem budget.
+                    * regular hash_mem budget.
                      */
-                   pstate->space_allowed = work_mem * 1024L;
+                   pstate->space_allowed = hash_mem * 1024L;
  
                     /*
-                    * The combined work_mem of all participants wasn't
+                    * The combined hash_mem of all participants wasn't
                      * enough. Therefore one batch per participant would be
                      * approximately equivalent and would probably also be
                      * insufficient.  So try two batches per participant,
@@ -2855,7 +2859,7 @@ ExecParallelHashTupleAlloc(HashJoinTable hashtable, size_t size,
  
         /*
          * Check if our space limit would be exceeded.  To avoid choking on
-        * very large tuples or very low work_mem setting, we'll always allow
+        * very large tuples or very low hash_mem setting, we'll always allow
          * each backend to allocate at least one chunk.
          */
         if (hashtable->batches[0].at_least_one_chunk &&
@@ -3366,3 +3370,41 @@ ExecParallelHashTuplePrealloc(HashJoinTable hashtable, int batchno, size_t size)
  
     return true;
  }
+
+/*
+ * Get a hash_mem value by multiplying the work_mem GUC's value by the
+ * hash_mem_multiplier GUC's value.
+ *
+ * Returns a work_mem style KB value that hash-based nodes (including but not
+ * limited to hash join) use in place of work_mem.  The product is computed
+ * in floating point and clamped to MAX_KILOBYTES, so the result is subject
+ * to the same restrictions as work_mem itself.  (There is no such thing as
+ * the hash_mem GUC, but it's convenient for callers to pretend there is.)
+ *
+ * Exported for use by the planner, as well as other hash-based executor
+ * nodes.  This is a rather random place for this, but there is no better
+ * place.
+ */
+int
+get_hash_mem(void)
+{
+   double      hash_mem;
+
+   Assert(hash_mem_multiplier >= 1.0);
+
+   hash_mem = (double) work_mem * hash_mem_multiplier;
+
+   /*
+    * guc.c enforces a MAX_KILOBYTES limitation on work_mem in order to
+    * support the assumption that raw derived byte values can be stored in
+    * 'long' variables.  The returned hash_mem value must also meet this
+    * assumption.
+    *
+    * We clamp the final value rather than throw an error because it should
+    * be possible to set work_mem and hash_mem_multiplier independently.
+    */
+   if (hash_mem < MAX_KILOBYTES)
+       return (int) hash_mem;
+
+   return MAX_KILOBYTES;
+}
diff --git a/src/backend/executor/nodeHashjoin.c b/src/backend/executor/nodeHashjoin.c

index 9bb23fef1a6e90a7b1ae8f810c50aace81f96340..5532b91a71dca14b8b982d9242bd07748eb55df9 100644 (file)
--- a/src/backend/executor/nodeHashjoin.c
+++ b/src/backend/executor/nodeHashjoin.c
@@ -89,9 +89,9 @@
   * PHJ_BUILD_HASHING_INNER so we can skip loading.
   *
   * Initially we try to plan for a single-batch hash join using the combined
- * work_mem of all participants to create a large shared hash table.  If that
+ * hash_mem of all participants to create a large shared hash table.  If that
   * turns out either at planning or execution time to be impossible then we
- * fall back to regular work_mem sized hash tables.
+ * fall back to regular hash_mem sized hash tables.
   *
   * To avoid deadlocks, we never wait for any barrier unless it is known that
   * all other backends attached to it are actively executing the node or have
diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c

index 27ce4cc8069b255512123a55d3110802db78a2fd..fda4b2c6e875f513c8bf78c87d878e184664bc95 100644 (file)
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -3525,7 +3525,7 @@ initial_cost_hashjoin(PlannerInfo *root, JoinCostWorkspace *workspace,
      * Get hash table size that executor would use for inner relation.
      *
      * XXX for the moment, always assume that skew optimization will be
-    * performed.  As long as SKEW_WORK_MEM_PERCENT is small, it's not worth
+    * performed.  As long as SKEW_HASH_MEM_PERCENT is small, it's not worth
      * trying to determine that for sure.
      *
      * XXX at some point it might be interesting to try to account for skew
@@ -3534,7 +3534,7 @@ initial_cost_hashjoin(PlannerInfo *root, JoinCostWorkspace *workspace,
     ExecChooseHashTableSize(inner_path_rows_total,
                             inner_path->pathtarget->width,
                             true,   /* useskew */
-                           parallel_hash,  /* try_combined_work_mem */
+                           parallel_hash,  /* try_combined_hash_mem */
                             outer_path->parallel_workers,
                             &space_allowed,
                             &numbuckets,
@@ -3597,6 +3597,7 @@ final_cost_hashjoin(PlannerInfo *root, HashPath *path,
     Cost        run_cost = workspace->run_cost;
     int         numbuckets = workspace->numbuckets;
     int         numbatches = workspace->numbatches;
+   int         hash_mem;
     Cost        cpu_per_tuple;
     QualCost    hash_qual_cost;
     QualCost    qp_qual_cost;
@@ -3715,16 +3716,17 @@ final_cost_hashjoin(PlannerInfo *root, HashPath *path,
     }
  
     /*
-    * If the bucket holding the inner MCV would exceed work_mem, we don't
+    * If the bucket holding the inner MCV would exceed hash_mem, we don't
      * want to hash unless there is really no other alternative, so apply
      * disable_cost.  (The executor normally copes with excessive memory usage
      * by splitting batches, but obviously it cannot separate equal values
-    * that way, so it will be unable to drive the batch size below work_mem
+    * that way, so it will be unable to drive the batch size below hash_mem
      * when this is true.)
      */
+   hash_mem = get_hash_mem();
     if (relation_byte_size(clamp_row_est(inner_path_rows * innermcvfreq),
                            inner_path->pathtarget->width) >
-       (work_mem * 1024L))
+       (hash_mem * 1024L))
         startup_cost += disable_cost;
  
     /*
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c

index 1345e522dcf5ec8cb1fbbd09cb65adf31b69a405..b40a112c25b2324a0530c187756be1cc9feb6be2 100644 (file)
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -4196,16 +4196,17 @@ consider_groupingsets_paths(PlannerInfo *root,
                             double dNumGroups)
  {
     Query      *parse = root->parse;
+   int         hash_mem = get_hash_mem();
  
     /*
      * If we're not being offered sorted input, then only consider plans that
      * can be done entirely by hashing.
      *
-    * We can hash everything if it looks like it'll fit in work_mem. But if
+    * We can hash everything if it looks like it'll fit in hash_mem. But if
      * the input is actually sorted despite not being advertised as such, we
      * prefer to make use of that in order to use less memory.
      *
-    * If none of the grouping sets are sortable, then ignore the work_mem
+    * If none of the grouping sets are sortable, then ignore the hash_mem
      * limit and generate a path anyway, since otherwise we'll just fail.
      */
     if (!is_sorted)
@@ -4257,10 +4258,10 @@ consider_groupingsets_paths(PlannerInfo *root,
  
         /*
          * gd->rollups is empty if we have only unsortable columns to work
-        * with.  Override work_mem in that case; otherwise, we'll rely on the
+        * with.  Override hash_mem in that case; otherwise, we'll rely on the
          * sorted-input case to generate usable mixed paths.
          */
-       if (hashsize > work_mem * 1024L && gd->rollups)
+       if (hashsize > hash_mem * 1024L && gd->rollups)
             return;             /* nope, won't fit */
  
         /*
@@ -4379,7 +4380,7 @@ consider_groupingsets_paths(PlannerInfo *root,
     {
         List       *rollups = NIL;
         List       *hash_sets = list_copy(gd->unsortable_sets);
-       double      availspace = (work_mem * 1024.0);
+       double      availspace = (hash_mem * 1024.0);
         ListCell   *lc;
  
         /*
@@ -4400,7 +4401,7 @@ consider_groupingsets_paths(PlannerInfo *root,
  
             /*
              * We treat this as a knapsack problem: the knapsack capacity
-            * represents work_mem, the item weights are the estimated memory
+            * represents hash_mem, the item weights are the estimated memory
              * usage of the hashtables needed to implement a single rollup,
              * and we really ought to use the cost saving as the item value;
              * however, currently the costs assigned to sort nodes don't
@@ -4441,7 +4442,7 @@ consider_groupingsets_paths(PlannerInfo *root,
                                                                 rollup->numGroups);
  
                     /*
-                    * If sz is enormous, but work_mem (and hence scale) is
+                    * If sz is enormous, but hash_mem (and hence scale) is
                      * small, avoid integer overflow here.
                      */
                     k_weights[i] = (int) Min(floor(sz / scale),
diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c

index b02fcb9bfe7ac25a9efb84faff85654218b78254..9a8f738c9d05b1884c41d1789c49a72c858aaaac 100644 (file)
--- a/src/backend/optimizer/plan/subselect.c
+++ b/src/backend/optimizer/plan/subselect.c
@@ -200,7 +200,7 @@ make_subplan(PlannerInfo *root, Query *orig_subquery,
      * XXX If an ANY subplan is uncorrelated, build_subplan may decide to hash
      * its output.  In that case it would've been better to specify full
      * retrieval.  At present, however, we can only check hashability after
-    * we've made the subplan :-(.  (Determining whether it'll fit in work_mem
+    * we've made the subplan :-(.  (Determining whether it'll fit in hash_mem
      * is the really hard part.)  Therefore, we don't want to be too
      * optimistic about the percentage of tuples retrieved, for fear of
      * selecting a plan that's bad for the materialization case.
@@ -278,7 +278,7 @@ make_subplan(PlannerInfo *root, Query *orig_subquery,
  
             plan = create_plan(subroot, best_path);
  
-           /* Now we can check if it'll fit in work_mem */
+           /* Now we can check if it'll fit in hash_mem */
             /* XXX can we check this at the Path stage? */
             if (subplan_is_hashable(plan))
             {
@@ -716,16 +716,17 @@ static bool
  subplan_is_hashable(Plan *plan)
  {
     double      subquery_size;
+   int         hash_mem = get_hash_mem();
  
     /*
-    * The estimated size of the subquery result must fit in work_mem. (Note:
+    * The estimated size of the subquery result must fit in hash_mem. (Note:
      * we use heap tuple overhead here even though the tuples will actually be
      * stored as MinimalTuples; this provides some fudge factor for hashtable
      * overhead.)
      */
     subquery_size = plan->plan_rows *
         (MAXALIGN(plan->plan_width) + MAXALIGN(SizeofHeapTupleHeader));
-   if (subquery_size > work_mem * 1024L)
+   if (subquery_size > hash_mem * 1024L)
         return false;
  
     return true;
diff --git a/src/backend/optimizer/prep/prepunion.c b/src/backend/optimizer/prep/prepunion.c

index 6588f83d5ec6fde6491db599b12565ac736ec8bc..2ebd4ea332071cff60eb34c0b02bd78b1e192baa 100644 (file)
--- a/src/backend/optimizer/prep/prepunion.c
+++ b/src/backend/optimizer/prep/prepunion.c
@@ -1018,6 +1018,7 @@ choose_hashed_setop(PlannerInfo *root, List *groupClauses,
                     const char *construct)
  {
     int         numGroupCols = list_length(groupClauses);
+   int         hash_mem = get_hash_mem();
     bool        can_sort;
     bool        can_hash;
     Size        hashentrysize;
@@ -1049,15 +1050,17 @@ choose_hashed_setop(PlannerInfo *root, List *groupClauses,
  
     /*
      * Don't do it if it doesn't look like the hashtable will fit into
-    * work_mem.
+    * hash_mem.
      */
     hashentrysize = MAXALIGN(input_path->pathtarget->width) + MAXALIGN(SizeofMinimalTupleHeader);
  
-   if (hashentrysize * dNumGroups > work_mem * 1024L)
+   if (hashentrysize * dNumGroups > hash_mem * 1024L)
         return false;
  
     /*
-    * See if the estimated cost is no more than doing it the other way.
+    * See if the estimated cost is no more than doing it the other way.  We
+    * deliberately give the hash case more memory when hash_mem exceeds
+    * standard work mem (i.e. when hash_mem_multiplier exceeds 1.0).
      *
      * We need to consider input_plan + hashagg versus input_plan + sort +
      * group.  Note that the actual result plan might involve a SetOp or
diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c

index 5110a6b806017e1d82688a236243debaf3250c1c..c1fc866cbf911049639c83be78b560bdc9b5a8fc 100644 (file)
--- a/src/backend/optimizer/util/pathnode.c
+++ b/src/backend/optimizer/util/pathnode.c
@@ -1720,8 +1720,9 @@ create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath,
          * planner.c).
          */
         int         hashentrysize = subpath->pathtarget->width + 64;
+       int         hash_mem = get_hash_mem();
  
-       if (hashentrysize * pathnode->path.rows > work_mem * 1024L)
+       if (hashentrysize * pathnode->path.rows > hash_mem * 1024L)
         {
             /*
              * We should not try to hash.  Hack the SpecialJoinInfo to
diff --git a/src/backend/utils/adt/ri_triggers.c b/src/backend/utils/adt/ri_triggers.c

index bb49e80d1665b46c234cb3dbe7babbf6fbde41c8..06cf16d9d71649bd8adb20410a7e5134f5d5a2a6 100644 (file)
--- a/src/backend/utils/adt/ri_triggers.c
+++ b/src/backend/utils/adt/ri_triggers.c
@@ -1450,7 +1450,9 @@ RI_Initial_Check(Trigger *trigger, Relation fk_rel, Relation pk_rel)
      * enough to not use a multiple of work_mem, and one typically would not
      * have many large foreign-key validations happening concurrently.  So
      * this seems to meet the criteria for being considered a "maintenance"
-    * operation, and accordingly we use maintenance_work_mem.
+    * operation, and accordingly we use maintenance_work_mem.  However, we
+    * must also set hash_mem_multiplier to 1, since it is surely not okay to
+    * let that get applied to the maintenance_work_mem value.
      *
      * We use the equivalent of a function SET option to allow the setting to
      * persist for exactly the duration of the check query.  guc.c also takes
@@ -1462,6 +1464,9 @@ RI_Initial_Check(Trigger *trigger, Relation fk_rel, Relation pk_rel)
     (void) set_config_option("work_mem", workmembuf,
                              PGC_USERSET, PGC_S_SESSION,
                              GUC_ACTION_SAVE, true, 0, false);
+   (void) set_config_option("hash_mem_multiplier", "1",
+                            PGC_USERSET, PGC_S_SESSION,
+                            GUC_ACTION_SAVE, true, 0, false);
  
     if (SPI_connect() != SPI_OK_CONNECT)
         elog(ERROR, "SPI_connect failed");
@@ -1553,7 +1558,7 @@ RI_Initial_Check(Trigger *trigger, Relation fk_rel, Relation pk_rel)
         elog(ERROR, "SPI_finish failed");
  
     /*
-    * Restore work_mem.
+    * Restore work_mem and hash_mem_multiplier.
      */
     AtEOXact_GUC(true, save_nestlevel);
  
@@ -1685,7 +1690,9 @@ RI_PartitionRemove_Check(Trigger *trigger, Relation fk_rel, Relation pk_rel)
      * enough to not use a multiple of work_mem, and one typically would not
      * have many large foreign-key validations happening concurrently.  So
      * this seems to meet the criteria for being considered a "maintenance"
-    * operation, and accordingly we use maintenance_work_mem.
+    * operation, and accordingly we use maintenance_work_mem.  However, we
+    * must also set hash_mem_multiplier to 1, since it is surely not okay to
+    * let that get applied to the maintenance_work_mem value.
      *
      * We use the equivalent of a function SET option to allow the setting to
      * persist for exactly the duration of the check query.  guc.c also takes
@@ -1697,6 +1704,9 @@ RI_PartitionRemove_Check(Trigger *trigger, Relation fk_rel, Relation pk_rel)
     (void) set_config_option("work_mem", workmembuf,
                              PGC_USERSET, PGC_S_SESSION,
                              GUC_ACTION_SAVE, true, 0, false);
+   (void) set_config_option("hash_mem_multiplier", "1",
+                            PGC_USERSET, PGC_S_SESSION,
+                            GUC_ACTION_SAVE, true, 0, false);
  
     if (SPI_connect() != SPI_OK_CONNECT)
         elog(ERROR, "SPI_connect failed");
@@ -1763,7 +1773,7 @@ RI_PartitionRemove_Check(Trigger *trigger, Relation fk_rel, Relation pk_rel)
         elog(ERROR, "SPI_finish failed");
  
     /*
-    * Restore work_mem.
+    * Restore work_mem and hash_mem_multiplier.
      */
     AtEOXact_GUC(true, save_nestlevel);
  }
diff --git a/src/backend/utils/init/globals.c b/src/backend/utils/init/globals.c

index 497d7c38ae6fdd779a75a4f0156c1c3996319afb..6ab8216839891ef92684d63742dfce93a537e5dd 100644 (file)
--- a/src/backend/utils/init/globals.c
+++ b/src/backend/utils/init/globals.c
@@ -119,6 +119,7 @@ int         IntervalStyle = INTSTYLE_POSTGRES;
  bool       enableFsync = true;
  bool       allowSystemTableMods = false;
  int            work_mem = 4096;
+double     hash_mem_multiplier = 1.0;
  int            maintenance_work_mem = 65536;
  int            max_parallel_maintenance_workers = 2;
  
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c

index abfa95a2314bf04cc5140dd05ac8a9f75c8fc77c..c20885e97b2035d684b4f9794115058f7f521d97 100644 (file)
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -3542,6 +3542,17 @@ static struct config_real ConfigureNamesReal[] =
         NULL, NULL, NULL
     },
  
+   {
+       {"hash_mem_multiplier", PGC_USERSET, RESOURCES_MEM,
+           gettext_noop("Multiple of work_mem to use for hash tables."),
+           NULL,
+           GUC_EXPLAIN
+       },
+       &hash_mem_multiplier,
+       1.0, 1.0, 1000.0,
+       NULL, NULL, NULL
+   },
+
     {
         {"bgwriter_lru_multiplier", PGC_SIGHUP, RESOURCES_BGWRITER,
             gettext_noop("Multiple of the average buffer usage to free per round."),
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample

index 5a0b8e982179eae7a742ec919a608c140b75ffef..aa30291ea3964a060f8a40617899aac4a0204d64 100644 (file)
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -130,6 +130,7 @@
  # Caution: it is not advisable to set max_prepared_transactions nonzero unless
  # you actively intend to use prepared transactions.
  #work_mem = 4MB                # min 64kB
+#hash_mem_multiplier = 1.0     # 1-1000.0 multiplier on hash table work_mem
  #maintenance_work_mem = 64MB       # min 1MB
  #autovacuum_work_mem = -1      # min 1MB, or -1 to use maintenance_work_mem
  #logical_decoding_work_mem = 64MB  # min 64kB
diff --git a/src/include/executor/hashjoin.h b/src/include/executor/hashjoin.h

index 79b634e8ed10e060b7dfc29fcf3c040517a015d0..eb5daba36b0ff08dd43278f81c12a01b2e73d1d2 100644 (file)
--- a/src/include/executor/hashjoin.h
+++ b/src/include/executor/hashjoin.h
@@ -88,7 +88,7 @@ typedef struct HashJoinTupleData
   * outer relation tuples with these hash values are matched against that
   * table instead of the main one.  Thus, tuples with these hash values are
   * effectively handled as part of the first batch and will never go to disk.
- * The skew hashtable is limited to SKEW_WORK_MEM_PERCENT of the total memory
+ * The skew hashtable is limited to SKEW_HASH_MEM_PERCENT of the total memory
   * allowed for the join; while building the hashtables, we decrease the number
   * of MCVs being specially treated if needed to stay under this limit.
   *
@@ -107,7 +107,7 @@ typedef struct HashSkewBucket
  
  #define SKEW_BUCKET_OVERHEAD  MAXALIGN(sizeof(HashSkewBucket))
  #define INVALID_SKEW_BUCKET_NO (-1)
-#define SKEW_WORK_MEM_PERCENT  2
+#define SKEW_HASH_MEM_PERCENT  2
  #define SKEW_MIN_OUTER_FRACTION  0.01
  
  /*
diff --git a/src/include/executor/nodeHash.h b/src/include/executor/nodeHash.h

index 64d2ce693ca7d40e4a02e7d20241e3ee16e9b5f7..2db4e2f67267bf9c62701d5aa72ed8e21b50ba54 100644 (file)
--- a/src/include/executor/nodeHash.h
+++ b/src/include/executor/nodeHash.h
@@ -61,7 +61,7 @@ extern bool ExecScanHashTableForUnmatched(HashJoinState *hjstate,
  extern void ExecHashTableReset(HashJoinTable hashtable);
  extern void ExecHashTableResetMatchFlags(HashJoinTable hashtable);
  extern void ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
-                                   bool try_combined_work_mem,
+                                   bool try_combined_hash_mem,
                                     int parallel_workers,
                                     size_t *space_allowed,
                                     int *numbuckets,
diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h

index 18bc8a7b9045ca80802ae740a4e122e87e1abdda..72e33523984f4833d7aea01d9ee6a111cd9fbd0c 100644 (file)
--- a/src/include/miscadmin.h
+++ b/src/include/miscadmin.h
@@ -243,6 +243,7 @@ extern PGDLLIMPORT int IntervalStyle;
  extern bool enableFsync;
  extern PGDLLIMPORT bool allowSystemTableMods;
  extern PGDLLIMPORT int work_mem;
+extern PGDLLIMPORT double hash_mem_multiplier;
  extern PGDLLIMPORT int maintenance_work_mem;
  extern PGDLLIMPORT int max_parallel_maintenance_workers;
  
@@ -469,4 +470,7 @@ extern bool has_rolreplication(Oid roleid);
  extern bool BackupInProgress(void);
  extern void CancelBackup(void);
  
+/* in executor/nodeHash.c */
+extern int get_hash_mem(void);
+
  #endif                         /* MISCADMIN_H */
author	Peter Geoghegan <[email protected]>
	Wed, 29 Jul 2020 21:14:58 +0000 (14:14 -0700)
committer	Peter Geoghegan <[email protected]>
	Wed, 29 Jul 2020 21:14:58 +0000 (14:14 -0700)
doc/src/sgml/config.sgml		patch | blob | blame | history
doc/src/sgml/ref/postgres-ref.sgml		patch | blob | blame | history
doc/src/sgml/runtime.sgml		patch | blob | blame | history
src/backend/executor/execGrouping.c		patch | blob | blame | history
src/backend/executor/nodeAgg.c		patch | blob | blame | history
src/backend/executor/nodeHash.c		patch | blob | blame | history
src/backend/executor/nodeHashjoin.c		patch | blob | blame | history
src/backend/optimizer/path/costsize.c		patch | blob | blame | history
src/backend/optimizer/plan/planner.c		patch | blob | blame | history
src/backend/optimizer/plan/subselect.c		patch | blob | blame | history
src/backend/optimizer/prep/prepunion.c		patch | blob | blame | history
src/backend/optimizer/util/pathnode.c		patch | blob | blame | history
src/backend/utils/adt/ri_triggers.c		patch | blob | blame | history
src/backend/utils/init/globals.c		patch | blob | blame | history
src/backend/utils/misc/guc.c		patch | blob | blame | history
src/backend/utils/misc/postgresql.conf.sample		patch | blob | blame | history
src/include/executor/hashjoin.h		patch | blob | blame | history
src/include/executor/nodeHash.h		patch | blob | blame | history
src/include/miscadmin.h		patch | blob | blame | history