Skip to content

Commit e94568e

Browse files
committed
Change the way pre-reading in external sort's merge phase works.
Don't pre-read tuples into SortTuple slots during merge. Instead, use the memory for larger read buffers in logtape.c. We're doing the same number of READTUP() calls either way, but managing the pre-read SortTuple slots is much more complicated. Also, the on-tape representation is more compact than SortTuples, so we can fit more pre-read tuples into the same amount of memory this way. And we have better cache-locality, when we use just a small number of SortTuple slots. Now that we only hold one tuple from each tape in the SortTuple slots, we can greatly simplify the "batch memory" management. We now maintain a small set of fixed-sized slots, to hold the tuples, and fall back to palloc() for larger tuples. We use this method during all merge phases, not just the final merge, and also when randomAccess is requested, and also in the TSS_SORTEDONTAPE case. In other words, it's used whenever we do an external sort. Reviewed by Peter Geoghegan and Claudio Freire. Discussion: <CAM3SWZTpaORV=yQGVCG8Q4axcZ3MvF-05xe39ZvORdU9JcD6hQ@mail.gmail.com>
1 parent e8bdee2 commit e94568e

File tree

3 files changed

+574
-797
lines changed

3 files changed

+574
-797
lines changed

src/backend/utils/sort/logtape.c

Lines changed: 132 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -52,12 +52,17 @@
5252
* not clear this helps much, but it can't hurt. (XXX perhaps a LIFO
5353
* policy for free blocks would be better?)
5454
*
55+
* To further make the I/Os more sequential, we can use a larger buffer
56+
* when reading, and read multiple blocks from the same tape in one go,
57+
* whenever the buffer becomes empty. LogicalTapeAssignReadBufferSize()
58+
* can be used to set the size of the read buffer.
59+
*
5560
* To support the above policy of writing to the lowest free block,
5661
* ltsGetFreeBlock sorts the list of free block numbers into decreasing
5762
* order each time it is asked for a block and the list isn't currently
5863
* sorted. This is an efficient way to handle it because we expect cycles
5964
* of releasing many blocks followed by re-using many blocks, due to
60-
* tuplesort.c's "preread" behavior.
65+
* the larger read buffer.
6166
*
6267
* Since all the bookkeeping and buffer memory is allocated with palloc(),
6368
* and the underlying file(s) are made with OpenTemporaryFile, all resources
@@ -79,6 +84,7 @@
7984

8085
#include "storage/buffile.h"
8186
#include "utils/logtape.h"
87+
#include "utils/memutils.h"
8288

8389
/*
8490
* Block indexes are "long"s, so we can fit this many per indirect block.
@@ -131,9 +137,18 @@ typedef struct LogicalTape
131137
* reading.
132138
*/
133139
char *buffer; /* physical buffer (separately palloc'd) */
140+
int buffer_size; /* allocated size of the buffer */
134141
long curBlockNumber; /* this block's logical blk# within tape */
135142
int pos; /* next read/write position in buffer */
136143
int nbytes; /* total # of valid bytes in buffer */
144+
145+
/*
146+
* Desired buffer size to use when reading. To keep things simple, we use
147+
* a single-block buffer when writing, or when reading a frozen tape. But
148+
* when we are reading and will only read forwards, we allocate a larger
149+
* buffer, determined by read_buffer_size.
150+
*/
151+
int read_buffer_size;
137152
} LogicalTape;
138153

139154
/*
@@ -227,6 +242,53 @@ ltsReadBlock(LogicalTapeSet *lts, long blocknum, void *buffer)
227242
blocknum)));
228243
}
229244

245+
/*
246+
* Read as many blocks as we can into the per-tape buffer.
247+
*
248+
* The caller can specify the next physical block number to read, in
249+
* datablocknum, or -1 to fetch the next block number from the internal block.
250+
* If datablocknum == -1, the caller must've already set curBlockNumber.
251+
*
252+
* Returns true if anything was read, 'false' on EOF.
253+
*/
254+
static bool
255+
ltsReadFillBuffer(LogicalTapeSet *lts, LogicalTape *lt, long datablocknum)
256+
{
257+
lt->pos = 0;
258+
lt->nbytes = 0;
259+
260+
do
261+
{
262+
/* Fetch next block number (unless provided by caller) */
263+
if (datablocknum == -1)
264+
{
265+
datablocknum = ltsRecallNextBlockNum(lts, lt->indirect, lt->frozen);
266+
if (datablocknum == -1L)
267+
break; /* EOF */
268+
lt->curBlockNumber++;
269+
}
270+
271+
/* Read the block */
272+
ltsReadBlock(lts, datablocknum, (void *) (lt->buffer + lt->nbytes));
273+
if (!lt->frozen)
274+
ltsReleaseBlock(lts, datablocknum);
275+
276+
if (lt->curBlockNumber < lt->numFullBlocks)
277+
lt->nbytes += BLCKSZ;
278+
else
279+
{
280+
/* EOF */
281+
lt->nbytes += lt->lastBlockBytes;
282+
break;
283+
}
284+
285+
/* Advance to next block, if we have buffer space left */
286+
datablocknum = -1;
287+
} while (lt->nbytes < lt->buffer_size);
288+
289+
return (lt->nbytes > 0);
290+
}
291+
230292
/*
231293
* qsort comparator for sorting freeBlocks[] into decreasing order.
232294
*/
@@ -546,6 +608,8 @@ LogicalTapeSetCreate(int ntapes)
546608
lt->numFullBlocks = 0L;
547609
lt->lastBlockBytes = 0;
548610
lt->buffer = NULL;
611+
lt->buffer_size = 0;
612+
lt->read_buffer_size = BLCKSZ;
549613
lt->curBlockNumber = 0L;
550614
lt->pos = 0;
551615
lt->nbytes = 0;
@@ -628,14 +692,18 @@ LogicalTapeWrite(LogicalTapeSet *lts, int tapenum,
628692

629693
/* Allocate data buffer and first indirect block on first write */
630694
if (lt->buffer == NULL)
695+
{
631696
lt->buffer = (char *) palloc(BLCKSZ);
697+
lt->buffer_size = BLCKSZ;
698+
}
632699
if (lt->indirect == NULL)
633700
{
634701
lt->indirect = (IndirectBlock *) palloc(sizeof(IndirectBlock));
635702
lt->indirect->nextSlot = 0;
636703
lt->indirect->nextup = NULL;
637704
}
638705

706+
Assert(lt->buffer_size == BLCKSZ);
639707
while (size > 0)
640708
{
641709
if (lt->pos >= BLCKSZ)
@@ -709,18 +777,19 @@ LogicalTapeRewind(LogicalTapeSet *lts, int tapenum, bool forWrite)
709777
Assert(lt->frozen);
710778
datablocknum = ltsRewindFrozenIndirectBlock(lts, lt->indirect);
711779
}
780+
781+
/* Allocate a read buffer */
782+
if (lt->buffer)
783+
pfree(lt->buffer);
784+
lt->buffer = palloc(lt->read_buffer_size);
785+
lt->buffer_size = lt->read_buffer_size;
786+
712787
/* Read the first block, or reset if tape is empty */
713788
lt->curBlockNumber = 0L;
714789
lt->pos = 0;
715790
lt->nbytes = 0;
716791
if (datablocknum != -1L)
717-
{
718-
ltsReadBlock(lts, datablocknum, (void *) lt->buffer);
719-
if (!lt->frozen)
720-
ltsReleaseBlock(lts, datablocknum);
721-
lt->nbytes = (lt->curBlockNumber < lt->numFullBlocks) ?
722-
BLCKSZ : lt->lastBlockBytes;
723-
}
792+
ltsReadFillBuffer(lts, lt, datablocknum);
724793
}
725794
else
726795
{
@@ -754,6 +823,13 @@ LogicalTapeRewind(LogicalTapeSet *lts, int tapenum, bool forWrite)
754823
lt->curBlockNumber = 0L;
755824
lt->pos = 0;
756825
lt->nbytes = 0;
826+
827+
if (lt->buffer)
828+
{
829+
pfree(lt->buffer);
830+
lt->buffer = NULL;
831+
lt->buffer_size = 0;
832+
}
757833
}
758834
}
759835

@@ -779,20 +855,8 @@ LogicalTapeRead(LogicalTapeSet *lts, int tapenum,
779855
if (lt->pos >= lt->nbytes)
780856
{
781857
/* Try to load more data into buffer. */
782-
long datablocknum = ltsRecallNextBlockNum(lts, lt->indirect,
783-
lt->frozen);
784-
785-
if (datablocknum == -1L)
858+
if (!ltsReadFillBuffer(lts, lt, -1))
786859
break; /* EOF */
787-
lt->curBlockNumber++;
788-
lt->pos = 0;
789-
ltsReadBlock(lts, datablocknum, (void *) lt->buffer);
790-
if (!lt->frozen)
791-
ltsReleaseBlock(lts, datablocknum);
792-
lt->nbytes = (lt->curBlockNumber < lt->numFullBlocks) ?
793-
BLCKSZ : lt->lastBlockBytes;
794-
if (lt->nbytes <= 0)
795-
break; /* EOF (possible here?) */
796860
}
797861

798862
nthistime = lt->nbytes - lt->pos;
@@ -842,6 +906,22 @@ LogicalTapeFreeze(LogicalTapeSet *lts, int tapenum)
842906
lt->writing = false;
843907
lt->frozen = true;
844908
datablocknum = ltsRewindIndirectBlock(lts, lt->indirect, true);
909+
910+
/*
911+
* The seek and backspace functions assume a single block read buffer.
912+
* That's OK with current usage. A larger buffer is helpful to make the
913+
* read pattern of the backing file look more sequential to the OS, when
914+
* we're reading from multiple tapes. But at the end of a sort, when a
915+
* tape is frozen, we only read from a single tape anyway.
916+
*/
917+
if (!lt->buffer || lt->buffer_size != BLCKSZ)
918+
{
919+
if (lt->buffer)
920+
pfree(lt->buffer);
921+
lt->buffer = palloc(BLCKSZ);
922+
lt->buffer_size = BLCKSZ;
923+
}
924+
845925
/* Read the first block, or reset if tape is empty */
846926
lt->curBlockNumber = 0L;
847927
lt->pos = 0;
@@ -875,6 +955,7 @@ LogicalTapeBackspace(LogicalTapeSet *lts, int tapenum, size_t size)
875955
Assert(tapenum >= 0 && tapenum < lts->nTapes);
876956
lt = &lts->tapes[tapenum];
877957
Assert(lt->frozen);
958+
Assert(lt->buffer_size == BLCKSZ);
878959

879960
/*
880961
* Easy case for seek within current block.
@@ -941,6 +1022,7 @@ LogicalTapeSeek(LogicalTapeSet *lts, int tapenum,
9411022
lt = &lts->tapes[tapenum];
9421023
Assert(lt->frozen);
9431024
Assert(offset >= 0 && offset <= BLCKSZ);
1025+
Assert(lt->buffer_size == BLCKSZ);
9441026

9451027
/*
9461028
* Easy case for seek within current block.
@@ -1002,6 +1084,10 @@ LogicalTapeTell(LogicalTapeSet *lts, int tapenum,
10021084

10031085
Assert(tapenum >= 0 && tapenum < lts->nTapes);
10041086
lt = &lts->tapes[tapenum];
1087+
1088+
/* With a larger buffer, 'pos' wouldn't be the same as offset within page */
1089+
Assert(lt->buffer_size == BLCKSZ);
1090+
10051091
*blocknum = lt->curBlockNumber;
10061092
*offset = lt->pos;
10071093
}
@@ -1014,3 +1100,28 @@ LogicalTapeSetBlocks(LogicalTapeSet *lts)
10141100
{
10151101
return lts->nFileBlocks;
10161102
}
1103+
1104+
/*
1105+
* Set buffer size to use, when reading from given tape.
1106+
*/
1107+
void
1108+
LogicalTapeAssignReadBufferSize(LogicalTapeSet *lts, int tapenum, size_t avail_mem)
1109+
{
1110+
LogicalTape *lt;
1111+
1112+
Assert(tapenum >= 0 && tapenum < lts->nTapes);
1113+
lt = &lts->tapes[tapenum];
1114+
1115+
/*
1116+
* The buffer size must be a multiple of BLCKSZ in size, so round the
1117+
* given value down to nearest BLCKSZ. Make sure we have at least one
1118+
* page. Also, don't go above MaxAllocSize, to avoid erroring out. A
1119+
* multi-gigabyte buffer is unlikely to be helpful, anyway.
1120+
*/
1121+
if (avail_mem < BLCKSZ)
1122+
avail_mem = BLCKSZ;
1123+
if (avail_mem > MaxAllocSize)
1124+
avail_mem = MaxAllocSize;
1125+
avail_mem -= avail_mem % BLCKSZ;
1126+
lt->read_buffer_size = avail_mem;
1127+
}

0 commit comments

Comments
 (0)