* These fields do not change after initialization.
*/
CHashDescriptor desc; /* descriptor for this hash table */
- uint32 nbuckets; /* # of buckets; must be a power of two */
uint32 bucket_mask; /* # of buckets, minus one */
- uint32 garbage_shift; /* log2(nbuckets/ngarbage) */
- uint32 ngarbage; /* # of garbage lists, a power of two */
- uint32 nfreelists; /* # of freelists */
+ uint8 garbage_shift; /* log2(# of buckets/# of garbage lists) */
+ uint8 freelist_shift; /* log2(# of garbage lists/# freelists) */
+ uint16 nfreelists; /* # of freelists */
uint32 arena_limit; /* # of arena elements */
uint32 arena_stride; /* bytes allocated per arena element */
- CHashPtr *bucket; /* array of size nbuckets */
- CHashPtr *garbage; /* array of size ngarbage */
- CHashPtr *freelist; /* array of size nfreelists */
+ CHashPtr *bucket; /* array with 1 entry per bucket */
+ CHashPtr *extra; /* entries for garbage and free lists */
char *arena; /* arena */
/*
uint64 stats[CHS_NumberOfStatistics]; /* statistics */
} CHashTableData;
+/*
+ * Compute # of buckets, garbage lists, or free lists.
+ *
+ * The bucket and garbage-list counts are not stored in the table; they are
+ * derived from bucket_mask (# of buckets, minus one) and garbage_shift.
+ * Only the free list count is read directly from a field.
+ *
+ * CHashTableNGarbage expands its argument twice, so pass an expression
+ * without side effects.
+ */
+#define CHashTableNBuckets(table) \
+ ((table)->bucket_mask + 1)
+#define CHashTableNGarbage(table) \
+ (CHashTableNBuckets((table)) >> (table)->garbage_shift)
+#define CHashTableNFreeLists(table) \
+ ((table)->nfreelists)
+
+/*
+ * Garbage lists and free lists are interleaved to reduce cache line
+ * contention on the free lists, so the calculation of where an individual
+ * list is located is a bit complex.
+ *
+ * Both kinds of list live in the single "extra" array.  Garbage list n is
+ * found at index n + (n >> freelist_shift), and free list n is found at
+ * index n + ((n + 1) << freelist_shift); in other words, each run of
+ * 2^freelist_shift garbage lists is followed by one free list.
+ *
+ * CHashTableGetGarbageByBucket maps a bucket number to its garbage list by
+ * shifting the bucket number right by garbage_shift before doing the
+ * garbage list lookup.
+ *
+ * Every macro here yields a pointer into "extra" and expands (n) more than
+ * once, so pass an expression without side effects.
+ */
+#define CHashTableGetGarbageList(table, n) \
+ (&(table)->extra[(n) + ((n) >> (table)->freelist_shift)])
+#define CHashTableGetGarbageByBucket(table, n) \
+ (CHashTableGetGarbageList((table), (n) >> (table)->garbage_shift))
+#define CHashTableGetFreeList(table, n) \
+ (&(table)->extra[(n) + (((n) + 1) << (table)->freelist_shift)])
+
+/*
+ * Access macros for arena nodes.
+ *
+ * CHashTableGetRaw converts an arena element number into a CHashNode
+ * pointer by stepping arena_stride bytes per element from the start of the
+ * arena; it asserts that the element number is below arena_limit.
+ *
+ * CHashTableGetNode does the same for a CHashPtr: it asserts that the
+ * pointer is not invalid, extracts its offset with CHashPtrGetOffset, and
+ * hands that offset to CHashTableGetRaw.
+ */
#define CHashTableGetRaw(table, offset) \
(AssertMacro((offset) < (table)->arena_limit), \
(CHashNode *) ((table)->arena + (table)->arena_stride * (offset)))
#define CHashTableGetNode(table, ptr) \
(AssertMacro(!CHashPtrIsInvalid(ptr)), \
CHashTableGetRaw((table), CHashPtrGetOffset((ptr))))
+
+/*
+ * Statistics macros.
+ *
+ * CHashTableIncrementStatistic post-increments the entry of the table's
+ * stats[] array selected by stat.
+ */
#define CHashTableIncrementStatistic(table, stat) \
((table)->stats[(stat)]++)
* smallest power of two greater than or equal to the target capacity.
*/
bucket_shift = fls(desc->capacity) - 1;
- table->nbuckets = 1 << bucket_shift;
- table->bucket_mask = table->nbuckets - 1;
+ table->bucket_mask = (1 << bucket_shift) - 1;
/*
- * It's not exactly clear how to determine the optimal number of garbage
- * lists. If there are too few, then garbage collection will have to wait
- * behind concurrent scans excessively frequently. But if there are too
- * many, then garbage collection won't recover very many items.
+ * We choose to have one garbage list for every 64 hash table buckets
+ * (that is, garbage_shift = 6) unless there are fewer than 64 buckets in
+ * total, in which case we still have a minimum of one garbage list.
+ *
+ * Increasing the garbage_shift would reduce the likelihood of a backend
+ * performing garbage collection needing to wait for a backend walking a
+ * bucket to finish its scan. On the other hand, decreasing the garbage
+ * shift would allow more items to be recovered in a single garbage
+ * collection cycle. It's not clear what the optimal value is.
*/
table->garbage_shift = Min(bucket_shift, 6);
- table->ngarbage = table->nbuckets >> table->garbage_shift;
table->gc_next = 0;
table->gc_pid = 0;
/*
- * The number of freelists must be large enough to avoid contention;
- * having extras is fairly harmless. But there seems to be no point in
- * having more free lists than garbage lists; if the garbage lists aren't
- * causing contention, an equal number of free lists shouldn't either.
- */
- table->nfreelists = Min(table->ngarbage, 64);
+ * Experimentation reveals that the free list manipulation is both one of
+ * the slowest parts of this algorithm and the most vulnerable to
+ * contention. Therefore, we want to have as many free lists as possible,
+ * but there's no need to have more than the number of CPU cores, so we
+ * limit the number of freelists to 64. There might be a benefit in some
+ * larger limit on a really big system, but we'll cap it here pending some
+ * actual test results. We're also limited to having no more freelists
+ * than we do garbage lists.
+ */
+#define LOG2_MAX_FREELIST 6
+ if (bucket_shift - table->garbage_shift < LOG2_MAX_FREELIST)
+ table->freelist_shift = 0;
+ else
+ table->freelist_shift =
+ (bucket_shift - table->garbage_shift) - LOG2_MAX_FREELIST;
+ table->nfreelists =
+ 1 << (bucket_shift - (table->garbage_shift + table->freelist_shift));
/*
* To make garbage collection efficient, we overallocate. Normally, we
Size
CHashEstimateSize(CHashTable table)
{
- Size size;
Size total_buckets;
+ Size size;
+ Size nbuckets = CHashTableNBuckets(table);
+ Size ngarbage = CHashTableNGarbage(table);
+ Size nfreelists = CHashTableNFreeLists(table);
- total_buckets = add_size(table->nbuckets, table->ngarbage);
- total_buckets = add_size(total_buckets, table->nfreelists);
+ Assert(nbuckets > 0 && ngarbage > 0 && nfreelists > 0);
+ total_buckets = add_size(nbuckets, ngarbage);
+ total_buckets = add_size(total_buckets, nfreelists);
size = MAXALIGN(sizeof(CHashTableData));
size = add_size(size, mul_size(sizeof(CHashPtr), total_buckets));
bool found;
void *shmem;
uint32 i;
+ uint32 nbuckets;
+ uint32 nfreelists;
+ uint32 ngarbage;
+ uint32 nextra;
/*
* If we're under the postmaster, this must be the EXEC_BACKEND case where
/* Bucket, garbage, and freelist arrays follow table info. */
table->bucket = (CHashPtr *)
(((char *) shmem) + MAXALIGN(sizeof(CHashTableData)));
- table->garbage = &table->bucket[table->nbuckets];
- table->freelist = &table->garbage[table->ngarbage];
+ nbuckets = CHashTableNBuckets(table);
+ table->extra = &table->bucket[nbuckets];
/* Arena follows the various lists. */
- table->arena = (void *) (&table->freelist[table->nfreelists]);
+ ngarbage = CHashTableNGarbage(table);
+ nfreelists = CHashTableNFreeLists(table);
+ nextra = ngarbage + nfreelists;
+ table->arena = (void *) (&table->extra[nextra]);
/* Initialize all three sets of lists to empty. */
- for (i = 0; i < table->nbuckets; ++i)
+ for (i = 0; i < nbuckets; ++i)
table->bucket[i] = InvalidCHashPtr;
- for (i = 0; i < table->ngarbage; ++i)
- table->garbage[i] = InvalidCHashPtr;
- for (i = 0; i < table->nfreelists; ++i)
- table->freelist[i] = InvalidCHashPtr;
+ for (i = 0; i < nextra; ++i)
+ table->extra[i] = InvalidCHashPtr;
/* Put all arena elements on the free lists. */
for (i = 0; i < table->arena_limit; ++i)
{
- CHashPtr *f = &table->freelist[i % table->nfreelists];
+ CHashPtr *f = CHashTableGetFreeList(table, i % nfreelists);
CHashNode *n = CHashTableGetRaw(table, i);
n->un.gcnext = *f;
/* Prevent garbage collection for this bucket. */
Assert(MyProc->hazard[0] == NULL);
- MyProc->hazard[0] = &table->garbage[bucket >> table->garbage_shift];
+ MyProc->hazard[0] = CHashTableGetGarbageByBucket(table, bucket);
pg_memory_barrier();
/* Scan bucket and return data from any matching entry. */
memcpy(CHashNodeGetItem(nnew), entry, table->desc.element_size);
/* Prevent garbage collection for this bucket. */
- MyProc->hazard[0] = &table->garbage[bucket >> table->garbage_shift];
+ MyProc->hazard[0] = CHashTableGetGarbageByBucket(table, bucket);
pg_memory_barrier();
/*
/* Prevent garbage collection for this bucket. */
Assert(MyProc->hazard[0] == NULL);
- MyProc->hazard[0] = &table->garbage[bucket >> table->garbage_shift];
+ MyProc->hazard[0] = CHashTableGetGarbageByBucket(table, bucket);
pg_memory_barrier();
/* Scan bucket. */
static CHashPtr
CHashAllocate(CHashTable table)
{
- uint32 f_current = ((uint32) MyBackendId) % table->nfreelists;
+ uint32 f_current;
CHashPtr new;
+ /* Pick a starting freelist based on our backend ID. */
+ f_current = ((uint32) MyBackendId) % CHashTableNFreeLists(table);
+
/* If this process hasn't initialized gc_next yet, do that now. */
if (table->gc_pid != MyProcPid)
{
table->gc_pid = MyProcPid;
- table->gc_next = ((uint32) MyProcPid) % table->ngarbage;
+ table->gc_next = ((uint32) MyProcPid) % CHashTableNGarbage(table);
}
/* Loop until we allocate a buffer. */
* we're obliged to update our hazard pointers, which is a material
* savings due to the associated memory barrier.
*/
- b = &table->freelist[f_current];
+ b = CHashTableGetFreeList(table, f_current);
MyProc->hazard[0] = b;
pg_memory_barrier();
new = *b;
return new;
/* Advance to next freelist. */
- f_current = (f_current + 1) % table->nfreelists;
+ f_current = (f_current + 1) % CHashTableNFreeLists(table);
}
}
static CHashPtr
CHashAllocateViaGC(CHashTable table)
{
- uint32 f_home = ((uint32) MyBackendId) % table->nfreelists;
+ uint32 f_home;
CHashPtr *b;
- CHashPtr *fh = &table->freelist[f_home];
+ CHashPtr *fh;
CHashPtr fhead;
CHashPtr garbage;
CHashPtr new;
CHashNode *n;
uint32 i;
+ /* Pick a target freelist based on our backend ID. */
+ f_home = ((uint32) MyBackendId) % CHashTableNFreeLists(table);
+ fh = CHashTableGetFreeList(table, f_home);
+
/* Select target garbage list. */
- table->gc_next = (table->gc_next + 1) % table->ngarbage;
- b = &table->garbage[table->gc_next];
+ table->gc_next = (table->gc_next + 1) % CHashTableNGarbage(table);
+ b = CHashTableGetGarbageList(table, table->gc_next);
garbage = *b;
/* If list is empty, fail. */
static void
CHashAddToGarbage(CHashTable table, uint32 bucket, CHashPtr c)
{
- uint32 garbage_bucket;
CHashPtr g;
CHashNode *n;
CHashPtr *garbage;
- garbage_bucket = bucket >> table->garbage_shift;
n = CHashTableGetNode(table, c);
- garbage = &table->garbage[garbage_bucket];
+ garbage = CHashTableGetGarbageByBucket(table, bucket);
while (1)
{