From: Robert Haas Date: Thu, 30 Aug 2012 13:34:59 +0000 (+0000) Subject: Rearrange pointers so that the freelist pointers are as far from X-Git-Url: http://git.postgresql.org/gitweb/?a=commitdiff_plain;h=refs%2Fheads%2Fchash;p=users%2Frhaas%2Fpostgres.git Rearrange pointers so that the freelist pointers are as far from each other as possible, to reduce contention. --- diff --git a/src/backend/utils/hash/chash.c b/src/backend/utils/hash/chash.c index 04a2eab89f..0d4dc78a4e 100644 --- a/src/backend/utils/hash/chash.c +++ b/src/backend/utils/hash/chash.c @@ -144,16 +144,14 @@ typedef struct CHashTableData * These fields do not change after initialization. */ CHashDescriptor desc; /* descriptor for this hash table */ - uint32 nbuckets; /* # of buckets; must be a power of two */ uint32 bucket_mask; /* # of buckets, minus one */ - uint32 garbage_shift; /* log2(nbuckets/ngarbage) */ - uint32 ngarbage; /* # of garbage lists, a power of two */ - uint32 nfreelists; /* # of freelists */ + uint8 garbage_shift; /* log2(# of buckets/# of garbage lists) */ + uint8 freelist_shift; /* log2(# of garbage lists/# freelists) */ + uint16 nfreelists; /* # of freelists */ uint32 arena_limit; /* # of arena elements */ uint32 arena_stride; /* bytes allocated per arena element */ - CHashPtr *bucket; /* array of size nbuckets */ - CHashPtr *garbage; /* array of size ngarbage */ - CHashPtr *freelist; /* array of size nfreelists */ + CHashPtr *bucket; /* array with 1 entry per bucket */ + CHashPtr *extra; /* entries for garbage and free lists */ char *arena; /* arena */ /* @@ -165,12 +163,35 @@ typedef struct CHashTableData uint64 stats[CHS_NumberOfStatistics]; /* statistics */ } CHashTableData; +/* Compute # of buckets, garbage lists, or free lists. 
*/ +#define CHashTableNBuckets(table) \ + ((table)->bucket_mask + 1) +#define CHashTableNGarbage(table) \ + (CHashTableNBuckets((table)) >> (table)->garbage_shift) +#define CHashTableNFreeLists(table) \ + ((table)->nfreelists) + +/* + * Garbage lists and free lists are interleaved to reduce cache line + * contention on the free lists, so the calculation of where an individual + * list is located is a bit complex. + */ +#define CHashTableGetGarbageList(table, n) \ + (&(table)->extra[(n) + ((n) >> (table)->freelist_shift)]) +#define CHashTableGetGarbageByBucket(table, n) \ + (CHashTableGetGarbageList((table), (n) >> (table)->garbage_shift)) +#define CHashTableGetFreeList(table, n) \ + (&(table)->extra[(n) + (((n) + 1) << (table)->freelist_shift)]) + +/* Access macros for arena nodes. */ #define CHashTableGetRaw(table, offset) \ (AssertMacro((offset) < (table)->arena_limit), \ (CHashNode *) ((table)->arena + (table)->arena_stride * (offset))) #define CHashTableGetNode(table, ptr) \ (AssertMacro(!CHashPtrIsInvalid(ptr)), \ CHashTableGetRaw((table), CHashPtrGetOffset((ptr)))) + +/* Statistics macros. */ #define CHashTableIncrementStatistic(table, stat) \ ((table)->stats[(stat)]++) @@ -251,27 +272,41 @@ CHashBootstrap(CHashDescriptor *desc) * smallest power of two greater than or equal to the target capacity. */ bucket_shift = fls(desc->capacity) - 1; - table->nbuckets = 1 << bucket_shift; - table->bucket_mask = table->nbuckets - 1; + table->bucket_mask = (1 << bucket_shift) - 1; /* - * It's not exactly clear how to determine the optimal number of garbage - * lists. If there are too few, then garbage collection will have to wait - * behind concurrent scans excessively frequently. But if there are too - * many, then garbage collection won't recover very many items. 
+ * We choose to have one garbage list for every 64 hash table buckets + * (that is, garbage_shift = 6) unless there are fewer than 64 buckets in + * total, in which case we still have a minimum of one garbage list. + * + * Decreasing the garbage_shift would reduce the likelihood of a backend + * performing garbage collection needing to wait for a backend walking a + * bucket to finish its scan. On the other hand, increasing the garbage + * shift would allow more items to be recovered in a single garbage + * collection cycle. It's not clear what the optimal value is. */ table->garbage_shift = Min(bucket_shift, 6); - table->ngarbage = table->nbuckets >> table->garbage_shift; table->gc_next = 0; table->gc_pid = 0; /* - * The number of freelists must be large enough to avoid contention; - * having extras is fairly harmless. But there seems to be no point in - * having more free lists than garbage lists; if the garbage lists aren't - * causing contention, an equal number of free lists shouldn't either. - */ - table->nfreelists = Min(table->ngarbage, 64); + * Experimentation reveals that the free list manipulation is both one of + * the slowest parts of this algorithm and the most vulnerable to + * contention. Therefore, we want to have as many free lists as possible, + * but there's no need to have more than the number of CPU cores, so we + * limit the number of freelists to 64. There might be a benefit in some + * larger limit on a really big system, but we'll cap it here pending some + * actual test results. We're also limited to having no more freelists + * than we do garbage lists. + */ +#define LOG2_MAX_FREELIST 6 + if (bucket_shift - table->garbage_shift < LOG2_MAX_FREELIST) + table->freelist_shift = 0; + else + table->freelist_shift = + (bucket_shift - table->garbage_shift) - LOG2_MAX_FREELIST; + table->nfreelists = + 1 << (bucket_shift - (table->garbage_shift + table->freelist_shift)); /* * To make garbage collection efficient, we overallocate. 
Normally, we @@ -301,11 +336,15 @@ CHashBootstrap(CHashDescriptor *desc) Size CHashEstimateSize(CHashTable table) { - Size size; Size total_buckets; + Size size; + Size nbuckets = CHashTableNBuckets(table); + Size ngarbage = CHashTableNGarbage(table); + Size nfreelists = CHashTableNFreeLists(table); - total_buckets = add_size(table->nbuckets, table->ngarbage); - total_buckets = add_size(total_buckets, table->nfreelists); + Assert(nbuckets > 0 && ngarbage > 0 && nfreelists > 0); + total_buckets = add_size(nbuckets, ngarbage); + total_buckets = add_size(total_buckets, nfreelists); size = MAXALIGN(sizeof(CHashTableData)); size = add_size(size, mul_size(sizeof(CHashPtr), total_buckets)); @@ -325,6 +364,10 @@ CHashInitialize(CHashTable table, CHashDescriptor *desc) bool found; void *shmem; uint32 i; + uint32 nbuckets; + uint32 nfreelists; + uint32 ngarbage; + uint32 nextra; /* * If we're under the postmaster, this must be the EXEC_BACKEND case where @@ -355,24 +398,25 @@ CHashInitialize(CHashTable table, CHashDescriptor *desc) /* Bucket, garbage, and freelist arrays follow table info. */ table->bucket = (CHashPtr *) (((char *) shmem) + MAXALIGN(sizeof(CHashTableData))); - table->garbage = &table->bucket[table->nbuckets]; - table->freelist = &table->garbage[table->ngarbage]; + nbuckets = CHashTableNBuckets(table); + table->extra = &table->bucket[nbuckets]; /* Arena follows the various lists. */ - table->arena = (void *) (&table->freelist[table->nfreelists]); + ngarbage = CHashTableNGarbage(table); + nfreelists = CHashTableNFreeLists(table); + nextra = ngarbage + nfreelists; + table->arena = (void *) (&table->extra[nextra]); /* Initialize all three sets of lists to empty. 
*/ - for (i = 0; i < table->nbuckets; ++i) + for (i = 0; i < nbuckets; ++i) table->bucket[i] = InvalidCHashPtr; - for (i = 0; i < table->ngarbage; ++i) - table->garbage[i] = InvalidCHashPtr; - for (i = 0; i < table->nfreelists; ++i) - table->freelist[i] = InvalidCHashPtr; + for (i = 0; i < nextra; ++i) + table->extra[i] = InvalidCHashPtr; /* Put all arena elements on the free lists. */ for (i = 0; i < table->arena_limit; ++i) { - CHashPtr *f = &table->freelist[i % table->nfreelists]; + CHashPtr *f = CHashTableGetFreeList(table, i % nfreelists); CHashNode *n = CHashTableGetRaw(table, i); n->un.gcnext = *f; @@ -405,7 +449,7 @@ CHashSearch(CHashTable table, void *entry) /* Prevent garbage collection for this bucket. */ Assert(MyProc->hazard[0] == NULL); - MyProc->hazard[0] = &table->garbage[bucket >> table->garbage_shift]; + MyProc->hazard[0] = CHashTableGetGarbageByBucket(table, bucket); pg_memory_barrier(); /* Scan bucket and return data from any matching entry. */ @@ -460,7 +504,7 @@ CHashInsert(CHashTable table, void *entry) memcpy(CHashNodeGetItem(nnew), entry, table->desc.element_size); /* Prevent garbage collection for this bucket. */ - MyProc->hazard[0] = &table->garbage[bucket >> table->garbage_shift]; + MyProc->hazard[0] = CHashTableGetGarbageByBucket(table, bucket); pg_memory_barrier(); /* @@ -545,7 +589,7 @@ CHashDelete(CHashTable table, void *entry) /* Prevent garbage collection for this bucket. */ Assert(MyProc->hazard[0] == NULL); - MyProc->hazard[0] = &table->garbage[bucket >> table->garbage_shift]; + MyProc->hazard[0] = CHashTableGetGarbageByBucket(table, bucket); pg_memory_barrier(); /* Scan bucket. */ @@ -805,14 +849,17 @@ zap: static CHashPtr CHashAllocate(CHashTable table) { - uint32 f_current = ((uint32) MyBackendId) % table->nfreelists; + uint32 f_current; CHashPtr new; + /* Pick a starting freelist based on our backend ID. 
*/ + f_current = ((uint32) MyBackendId) % CHashTableNFreeLists(table); + /* If this process hasn't initialized gc_next yet, do that now. */ if (table->gc_pid != MyProcPid) { table->gc_pid = MyProcPid; - table->gc_next = ((uint32) MyProcPid) % table->ngarbage; + table->gc_next = ((uint32) MyProcPid) % CHashTableNGarbage(table); } /* Loop until we allocate a buffer. */ @@ -835,7 +882,7 @@ CHashAllocate(CHashTable table) * we're obliged to update our hazard pointers, which is a material * savings due to the associated memory barrier. */ - b = &table->freelist[f_current]; + b = CHashTableGetFreeList(table, f_current); MyProc->hazard[0] = b; pg_memory_barrier(); new = *b; @@ -861,7 +908,7 @@ CHashAllocate(CHashTable table) return new; /* Advance to next freelist. */ - f_current = (f_current + 1) % table->nfreelists; + f_current = (f_current + 1) % CHashTableNFreeLists(table); } } @@ -871,18 +918,22 @@ CHashAllocate(CHashTable table) static CHashPtr CHashAllocateViaGC(CHashTable table) { - uint32 f_home = ((uint32) MyBackendId) % table->nfreelists; + uint32 f_home; CHashPtr *b; - CHashPtr *fh = &table->freelist[f_home]; + CHashPtr *fh; CHashPtr fhead; CHashPtr garbage; CHashPtr new; CHashNode *n; uint32 i; + /* Pick a target freelist based on our backend ID. */ + f_home = ((uint32) MyBackendId) % CHashTableNFreeLists(table); + fh = CHashTableGetFreeList(table, f_home); + /* Select target garbage list. */ - table->gc_next = (table->gc_next + 1) % table->ngarbage; - b = &table->garbage[table->gc_next]; + table->gc_next = (table->gc_next + 1) % CHashTableNGarbage(table); + b = CHashTableGetGarbageList(table, table->gc_next); garbage = *b; /* If list is empty, fail. 
*/ @@ -1006,14 +1057,12 @@ CHashAllocateViaGC(CHashTable table) static void CHashAddToGarbage(CHashTable table, uint32 bucket, CHashPtr c) { - uint32 garbage_bucket; CHashPtr g; CHashNode *n; CHashPtr *garbage; - garbage_bucket = bucket >> table->garbage_shift; n = CHashTableGetNode(table, c); - garbage = &table->garbage[garbage_bucket]; + garbage = CHashTableGetGarbageByBucket(table, bucket); while (1) {