Change the way we decide whether to give up on abbreviated text keys.
authorRobert Haas <[email protected]>
Fri, 3 Apr 2015 12:32:05 +0000 (08:32 -0400)
committerRobert Haas <[email protected]>
Fri, 3 Apr 2015 12:32:05 +0000 (08:32 -0400)
Be more aggressive about aborting early on if it looks like it's not
helping, but be less aggressive about aborting later on, since it's
more expensive at that point, and also since we're currently aborting
in some cases where abbreviation can still deliver a substantial win.

Peter Geoghegan. Extensive testing by Tomas Vondra.

src/backend/utils/adt/varlena.c

index 02e994972c342b3fce3b36fa380681cfff702c25..d874a1aec095b4165ce79b2e26c81cdbdca4a91a 100644 (file)
@@ -66,6 +66,7 @@ typedef struct
        bool                            collate_c;
        hyperLogLogState        abbr_card;      /* Abbreviated key cardinality state */
        hyperLogLogState        full_card;      /* Full key cardinality state */
+       double                          prop_card;      /* Required cardinality proportion */
 #ifdef HAVE_LOCALE_T
        pg_locale_t locale;
 #endif
@@ -1845,6 +1846,7 @@ btsortsupport_worker(SortSupport ssup, Oid collid)
                 */
                if (abbreviate)
                {
+                       tss->prop_card = 0.20;
                        initHyperLogLog(&tss->abbr_card, 10);
                        initHyperLogLog(&tss->full_card, 10);
                        ssup->abbrev_full_comparator = ssup->comparator;
@@ -2125,7 +2127,7 @@ bttext_abbrev_abort(int memtupcount, SortSupport ssup)
        Assert(ssup->abbreviate);
 
        /* Have a little patience */
-       if (memtupcount < 20)
+       if (memtupcount < 100)
                return false;
 
        abbrev_distinct = estimateHyperLogLog(&tss->abbr_card);
@@ -2151,8 +2153,9 @@ bttext_abbrev_abort(int memtupcount, SortSupport ssup)
        {
                double norm_abbrev_card = abbrev_distinct / (double) memtupcount;
 
-               elog(DEBUG_elog_output, "abbrev_distinct after %d: %f (key_distinct: %f, norm_abbrev_card: %f)",
-                        memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card);
+               elog(DEBUG_elog_output, "abbrev_distinct after %d: %f (key_distinct: %f, norm_abbrev_card: %f, prop_card: %f)",
+                        memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card,
+                        tss->prop_card);
        }
 #endif
 
@@ -2172,8 +2175,38 @@ bttext_abbrev_abort(int memtupcount, SortSupport ssup)
         * abbreviated comparison with a cheap memcmp()-based authoritative
         * resolution are equivalent.
         */
-       if (abbrev_distinct > key_distinct * 0.05)
+       if (abbrev_distinct > key_distinct * tss->prop_card)
+       {
+               /*
+                * When we have exceeded 10,000 tuples, decay required cardinality
+                * aggressively for next call.
+                *
+                * This is useful because the number of comparisons required on average
+                * increases at a linearithmic rate, and at roughly 10,000 tuples that
+                * factor will start to dominate over the linear costs of string
+                * transformation (this is a conservative estimate).  The decay rate is
+                * chosen to be a little less aggressive than halving -- which (since
+                * we're called at points at which memtupcount has doubled) would never
+                * see the cost model actually abort past the first call following a
+                * decay.  This decay rate is mostly a precaution against a sudden,
+                * violent swing in how well abbreviated cardinality tracks full key
+                * cardinality.  The decay also serves to prevent a marginal case from
+                * being aborted too late, when too much has already been invested in
+                * string transformation.
+                *
+                * It's possible for sets of several million distinct strings with mere
+                * tens of thousands of distinct abbreviated keys to still benefit very
+                * significantly.  This will generally occur provided each abbreviated
+                * key is a proxy for a roughly uniform number of the set's full keys.
+                * If it isn't so, we hope to catch that early and abort.  If it isn't
+                * caught early, by the time the problem is apparent it's probably not
+                * worth aborting.
+                */
+               if (memtupcount > 10000)
+                       tss->prop_card *= 0.65;
+
                return false;
+       }
 
        /*
         * Abort abbreviation strategy.
@@ -2187,8 +2220,8 @@ bttext_abbrev_abort(int memtupcount, SortSupport ssup)
         * lose but much to gain, which our strategy reflects.
         */
 #ifdef DEBUG_ABBREV_KEYS
-       elog(DEBUG_elog_output, "would have aborted abbreviation due to worst-case at %d. abbrev_distinct: %f, key_distinct: %f",
-                memtupcount, abbrev_distinct, key_distinct);
+       elog(DEBUG_elog_output, "would have aborted abbreviation due to worst-case at %d. abbrev_distinct: %f, key_distinct: %f, prop_card: %f",
+                memtupcount, abbrev_distinct, key_distinct, tss->prop_card);
        /* Actually abort only when debugging is disabled */
        return false;
 #endif