Skip to content

Commit 896ddf9

Browse files
committed
Avoid fragmentation of logical tapes when writing concurrently.
Disk-based HashAgg relies on writing to multiple tapes concurrently. Avoid fragmentation of the tapes' blocks by preallocating many blocks for a tape at once. No file operations are performed during preallocation; only the block numbers are reserved. Reviewed-by: Tomas Vondra. Discussion: https://postgr.es/m/20200519151202.u2p2gpiawoaznsv2%40development
1 parent 49223e1 commit 896ddf9

File tree

1 file changed

+77
-3
lines changed

1 file changed

+77
-3
lines changed

src/backend/utils/sort/logtape.c

Lines changed: 77 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,18 @@ typedef struct TapeBlockTrailer
110110
#define TapeBlockSetNBytes(buf, nbytes) \
111111
(TapeBlockGetTrailer(buf)->next = -(nbytes))
112112

113+
/*
114+
* When multiple tapes are being written to concurrently (as in HashAgg),
115+
* avoid excessive fragmentation by preallocating block numbers to individual
116+
* tapes. Each preallocation doubles in size starting at
117+
* TAPE_WRITE_PREALLOC_MIN blocks up to TAPE_WRITE_PREALLOC_MAX blocks.
118+
*
119+
* No filesystem operations are performed for preallocation; only the block
120+
* numbers are reserved. This may lead to sparse writes, which will cause
121+
* ltsWriteBlock() to fill in holes with zeros.
122+
*/
123+
#define TAPE_WRITE_PREALLOC_MIN 8
124+
#define TAPE_WRITE_PREALLOC_MAX 128
113125

114126
/*
115127
* This data structure represents a single "logical tape" within the set
@@ -151,6 +163,15 @@ typedef struct LogicalTape
151163
int max_size; /* highest useful, safe buffer_size */
152164
int pos; /* next read/write position in buffer */
153165
int nbytes; /* total # of valid bytes in buffer */
166+
167+
/*
168+
* Preallocated block numbers are held in an array sorted in descending
169+
* order; blocks are consumed from the end of the array (lowest block
170+
* numbers first).
171+
*/
172+
long *prealloc;
173+
int nprealloc; /* number of elements in list */
174+
int prealloc_size; /* number of elements list can hold */
154175
} LogicalTape;
155176

156177
/*
@@ -198,6 +219,7 @@ struct LogicalTapeSet
198219
static void ltsWriteBlock(LogicalTapeSet *lts, long blocknum, void *buffer);
199220
static void ltsReadBlock(LogicalTapeSet *lts, long blocknum, void *buffer);
200221
static long ltsGetFreeBlock(LogicalTapeSet *lts);
222+
static long ltsGetPreallocBlock(LogicalTapeSet *lts, LogicalTape *lt);
201223
static void ltsReleaseBlock(LogicalTapeSet *lts, long blocknum);
202224
static void ltsConcatWorkerTapes(LogicalTapeSet *lts, TapeShare *shared,
203225
SharedFileSet *fileset);
@@ -397,6 +419,45 @@ ltsGetFreeBlock(LogicalTapeSet *lts)
397419
return blocknum;
398420
}
399421

422+
/*
423+
* Return the lowest free block number from the tape's preallocation list.
424+
* Refill the preallocation list if necessary.
425+
*/
426+
static long
427+
ltsGetPreallocBlock(LogicalTapeSet *lts, LogicalTape *lt)
428+
{
429+
/* sorted in descending order, so return the last element */
430+
if (lt->nprealloc > 0)
431+
return lt->prealloc[--lt->nprealloc];
432+
433+
if (lt->prealloc == NULL)
434+
{
435+
lt->prealloc_size = TAPE_WRITE_PREALLOC_MIN;
436+
lt->prealloc = (long *) palloc(sizeof(long) * lt->prealloc_size);
437+
}
438+
else if (lt->prealloc_size < TAPE_WRITE_PREALLOC_MAX)
439+
{
440+
/* when the preallocation list runs out, double the size */
441+
lt->prealloc_size *= 2;
442+
if (lt->prealloc_size > TAPE_WRITE_PREALLOC_MAX)
443+
lt->prealloc_size = TAPE_WRITE_PREALLOC_MAX;
444+
lt->prealloc = (long *) repalloc(lt->prealloc,
445+
sizeof(long) * lt->prealloc_size);
446+
}
447+
448+
/* refill preallocation list */
449+
lt->nprealloc = lt->prealloc_size;
450+
for (int i = lt->nprealloc; i > 0; i--)
451+
{
452+
lt->prealloc[i - 1] = ltsGetFreeBlock(lts);
453+
454+
/* verify descending order */
455+
Assert(i == lt->nprealloc || lt->prealloc[i - 1] > lt->prealloc[i]);
456+
}
457+
458+
return lt->prealloc[--lt->nprealloc];
459+
}
460+
400461
/*
401462
* Return a block# to the freelist.
402463
*/
@@ -557,6 +618,9 @@ ltsInitTape(LogicalTape *lt)
557618
lt->max_size = MaxAllocSize;
558619
lt->pos = 0;
559620
lt->nbytes = 0;
621+
lt->prealloc = NULL;
622+
lt->nprealloc = 0;
623+
lt->prealloc_size = 0;
560624
}
561625

562626
/*
@@ -709,7 +773,7 @@ LogicalTapeWrite(LogicalTapeSet *lts, int tapenum,
709773
Assert(lt->firstBlockNumber == -1);
710774
Assert(lt->pos == 0);
711775

712-
lt->curBlockNumber = ltsGetFreeBlock(lts);
776+
lt->curBlockNumber = ltsGetPreallocBlock(lts, lt);
713777
lt->firstBlockNumber = lt->curBlockNumber;
714778

715779
TapeBlockGetTrailer(lt->buffer)->prev = -1L;
@@ -733,7 +797,7 @@ LogicalTapeWrite(LogicalTapeSet *lts, int tapenum,
733797
* First allocate the next block, so that we can store it in the
734798
* 'next' pointer of this block.
735799
*/
736-
nextBlockNumber = ltsGetFreeBlock(lts);
800+
nextBlockNumber = ltsGetPreallocBlock(lts, lt);
737801

738802
/* set the next-pointer and dump the current block. */
739803
TapeBlockGetTrailer(lt->buffer)->next = nextBlockNumber;
@@ -835,13 +899,23 @@ LogicalTapeRewindForRead(LogicalTapeSet *lts, int tapenum, size_t buffer_size)
835899
Assert(lt->frozen);
836900
}
837901

838-
/* Allocate a read buffer (unless the tape is empty) */
839902
if (lt->buffer)
840903
pfree(lt->buffer);
841904

842905
/* the buffer is lazily allocated, but set the size here */
843906
lt->buffer = NULL;
844907
lt->buffer_size = buffer_size;
908+
909+
/* free the preallocation list, and return unused block numbers */
910+
if (lt->prealloc != NULL)
911+
{
912+
for (int i = lt->nprealloc; i > 0; i--)
913+
ltsReleaseBlock(lts, lt->prealloc[i - 1]);
914+
pfree(lt->prealloc);
915+
lt->prealloc = NULL;
916+
lt->nprealloc = 0;
917+
lt->prealloc_size = 0;
918+
}
845919
}
846920

847921
/*

0 commit comments

Comments
 (0)