52
52
* not clear this helps much, but it can't hurt. (XXX perhaps a LIFO
53
53
* policy for free blocks would be better?)
54
54
*
55
+ * To further make the I/Os more sequential, we can use a larger buffer
56
+ * when reading, and read multiple blocks from the same tape in one go,
57
+ * whenever the buffer becomes empty. LogicalTapeAssignReadBufferSize()
58
+ * can be used to set the size of the read buffer.
59
+ *
55
60
* To support the above policy of writing to the lowest free block,
56
61
* ltsGetFreeBlock sorts the list of free block numbers into decreasing
57
62
* order each time it is asked for a block and the list isn't currently
58
63
* sorted. This is an efficient way to handle it because we expect cycles
59
64
* of releasing many blocks followed by re-using many blocks, due to
60
- * tuplesort.c's "preread" behavior .
65
+ * the larger read buffer.
61
66
*
62
67
* Since all the bookkeeping and buffer memory is allocated with palloc(),
63
68
* and the underlying file(s) are made with OpenTemporaryFile, all resources
79
84
80
85
#include "storage/buffile.h"
81
86
#include "utils/logtape.h"
87
+ #include "utils/memutils.h"
82
88
83
89
/*
84
90
* Block indexes are "long"s, so we can fit this many per indirect block.
@@ -131,9 +137,18 @@ typedef struct LogicalTape
131
137
* reading.
132
138
*/
133
139
char * buffer ; /* physical buffer (separately palloc'd) */
140
+ int buffer_size ; /* allocated size of the buffer */
134
141
long curBlockNumber ; /* this block's logical blk# within tape */
135
142
int pos ; /* next read/write position in buffer */
136
143
int nbytes ; /* total # of valid bytes in buffer */
144
+
145
+ /*
146
+ * Desired buffer size to use when reading. To keep things simple, we use
147
+ * a single-block buffer when writing, or when reading a frozen tape. But
148
+ * when we are reading and will only read forwards, we allocate a larger
149
+ * buffer, determined by read_buffer_size.
150
+ */
151
+ int read_buffer_size ;
137
152
} LogicalTape ;
138
153
139
154
/*
@@ -227,6 +242,53 @@ ltsReadBlock(LogicalTapeSet *lts, long blocknum, void *buffer)
227
242
blocknum )));
228
243
}
229
244
245
+ /*
246
+ * Read as many blocks as we can into the per-tape buffer.
247
+ *
248
+ * The caller can specify the next physical block number to read, in
249
+ * datablocknum, or -1 to fetch the next block number from the internal block.
250
+ * If datablocknum == -1, the caller must've already set curBlockNumber.
251
+ *
252
+ * Returns true if anything was read, 'false' on EOF.
253
+ */
254
+ static bool
255
+ ltsReadFillBuffer (LogicalTapeSet * lts , LogicalTape * lt , long datablocknum )
256
+ {
257
+ lt -> pos = 0 ;
258
+ lt -> nbytes = 0 ;
259
+
260
+ do
261
+ {
262
+ /* Fetch next block number (unless provided by caller) */
263
+ if (datablocknum == -1 )
264
+ {
265
+ datablocknum = ltsRecallNextBlockNum (lts , lt -> indirect , lt -> frozen );
266
+ if (datablocknum == -1L )
267
+ break ; /* EOF */
268
+ lt -> curBlockNumber ++ ;
269
+ }
270
+
271
+ /* Read the block */
272
+ ltsReadBlock (lts , datablocknum , (void * ) (lt -> buffer + lt -> nbytes ));
273
+ if (!lt -> frozen )
274
+ ltsReleaseBlock (lts , datablocknum );
275
+
276
+ if (lt -> curBlockNumber < lt -> numFullBlocks )
277
+ lt -> nbytes += BLCKSZ ;
278
+ else
279
+ {
280
+ /* EOF */
281
+ lt -> nbytes += lt -> lastBlockBytes ;
282
+ break ;
283
+ }
284
+
285
+ /* Advance to next block, if we have buffer space left */
286
+ datablocknum = -1 ;
287
+ } while (lt -> nbytes < lt -> buffer_size );
288
+
289
+ return (lt -> nbytes > 0 );
290
+ }
291
+
230
292
/*
231
293
* qsort comparator for sorting freeBlocks[] into decreasing order.
232
294
*/
@@ -546,6 +608,8 @@ LogicalTapeSetCreate(int ntapes)
546
608
lt -> numFullBlocks = 0L ;
547
609
lt -> lastBlockBytes = 0 ;
548
610
lt -> buffer = NULL ;
611
+ lt -> buffer_size = 0 ;
612
+ lt -> read_buffer_size = BLCKSZ ;
549
613
lt -> curBlockNumber = 0L ;
550
614
lt -> pos = 0 ;
551
615
lt -> nbytes = 0 ;
@@ -628,14 +692,18 @@ LogicalTapeWrite(LogicalTapeSet *lts, int tapenum,
628
692
629
693
/* Allocate data buffer and first indirect block on first write */
630
694
if (lt -> buffer == NULL )
695
+ {
631
696
lt -> buffer = (char * ) palloc (BLCKSZ );
697
+ lt -> buffer_size = BLCKSZ ;
698
+ }
632
699
if (lt -> indirect == NULL )
633
700
{
634
701
lt -> indirect = (IndirectBlock * ) palloc (sizeof (IndirectBlock ));
635
702
lt -> indirect -> nextSlot = 0 ;
636
703
lt -> indirect -> nextup = NULL ;
637
704
}
638
705
706
+ Assert (lt -> buffer_size == BLCKSZ );
639
707
while (size > 0 )
640
708
{
641
709
if (lt -> pos >= BLCKSZ )
@@ -709,18 +777,19 @@ LogicalTapeRewind(LogicalTapeSet *lts, int tapenum, bool forWrite)
709
777
Assert (lt -> frozen );
710
778
datablocknum = ltsRewindFrozenIndirectBlock (lts , lt -> indirect );
711
779
}
780
+
781
+ /* Allocate a read buffer */
782
+ if (lt -> buffer )
783
+ pfree (lt -> buffer );
784
+ lt -> buffer = palloc (lt -> read_buffer_size );
785
+ lt -> buffer_size = lt -> read_buffer_size ;
786
+
712
787
/* Read the first block, or reset if tape is empty */
713
788
lt -> curBlockNumber = 0L ;
714
789
lt -> pos = 0 ;
715
790
lt -> nbytes = 0 ;
716
791
if (datablocknum != -1L )
717
- {
718
- ltsReadBlock (lts , datablocknum , (void * ) lt -> buffer );
719
- if (!lt -> frozen )
720
- ltsReleaseBlock (lts , datablocknum );
721
- lt -> nbytes = (lt -> curBlockNumber < lt -> numFullBlocks ) ?
722
- BLCKSZ : lt -> lastBlockBytes ;
723
- }
792
+ ltsReadFillBuffer (lts , lt , datablocknum );
724
793
}
725
794
else
726
795
{
@@ -754,6 +823,13 @@ LogicalTapeRewind(LogicalTapeSet *lts, int tapenum, bool forWrite)
754
823
lt -> curBlockNumber = 0L ;
755
824
lt -> pos = 0 ;
756
825
lt -> nbytes = 0 ;
826
+
827
+ if (lt -> buffer )
828
+ {
829
+ pfree (lt -> buffer );
830
+ lt -> buffer = NULL ;
831
+ lt -> buffer_size = 0 ;
832
+ }
757
833
}
758
834
}
759
835
@@ -779,20 +855,8 @@ LogicalTapeRead(LogicalTapeSet *lts, int tapenum,
779
855
if (lt -> pos >= lt -> nbytes )
780
856
{
781
857
/* Try to load more data into buffer. */
782
- long datablocknum = ltsRecallNextBlockNum (lts , lt -> indirect ,
783
- lt -> frozen );
784
-
785
- if (datablocknum == -1L )
858
+ if (!ltsReadFillBuffer (lts , lt , -1 ))
786
859
break ; /* EOF */
787
- lt -> curBlockNumber ++ ;
788
- lt -> pos = 0 ;
789
- ltsReadBlock (lts , datablocknum , (void * ) lt -> buffer );
790
- if (!lt -> frozen )
791
- ltsReleaseBlock (lts , datablocknum );
792
- lt -> nbytes = (lt -> curBlockNumber < lt -> numFullBlocks ) ?
793
- BLCKSZ : lt -> lastBlockBytes ;
794
- if (lt -> nbytes <= 0 )
795
- break ; /* EOF (possible here?) */
796
860
}
797
861
798
862
nthistime = lt -> nbytes - lt -> pos ;
@@ -842,6 +906,22 @@ LogicalTapeFreeze(LogicalTapeSet *lts, int tapenum)
842
906
lt -> writing = false;
843
907
lt -> frozen = true;
844
908
datablocknum = ltsRewindIndirectBlock (lts , lt -> indirect , true);
909
+
910
+ /*
911
+ * The seek and backspace functions assume a single block read buffer.
912
+ * That's OK with current usage. A larger buffer is helpful to make the
913
+ * read pattern of the backing file look more sequential to the OS, when
914
+ * we're reading from multiple tapes. But at the end of a sort, when a
915
+ * tape is frozen, we only read from a single tape anyway.
916
+ */
917
+ if (!lt -> buffer || lt -> buffer_size != BLCKSZ )
918
+ {
919
+ if (lt -> buffer )
920
+ pfree (lt -> buffer );
921
+ lt -> buffer = palloc (BLCKSZ );
922
+ lt -> buffer_size = BLCKSZ ;
923
+ }
924
+
845
925
/* Read the first block, or reset if tape is empty */
846
926
lt -> curBlockNumber = 0L ;
847
927
lt -> pos = 0 ;
@@ -875,6 +955,7 @@ LogicalTapeBackspace(LogicalTapeSet *lts, int tapenum, size_t size)
875
955
Assert (tapenum >= 0 && tapenum < lts -> nTapes );
876
956
lt = & lts -> tapes [tapenum ];
877
957
Assert (lt -> frozen );
958
+ Assert (lt -> buffer_size == BLCKSZ );
878
959
879
960
/*
880
961
* Easy case for seek within current block.
@@ -941,6 +1022,7 @@ LogicalTapeSeek(LogicalTapeSet *lts, int tapenum,
941
1022
lt = & lts -> tapes [tapenum ];
942
1023
Assert (lt -> frozen );
943
1024
Assert (offset >= 0 && offset <= BLCKSZ );
1025
+ Assert (lt -> buffer_size == BLCKSZ );
944
1026
945
1027
/*
946
1028
* Easy case for seek within current block.
@@ -1002,6 +1084,10 @@ LogicalTapeTell(LogicalTapeSet *lts, int tapenum,
1002
1084
1003
1085
Assert (tapenum >= 0 && tapenum < lts -> nTapes );
1004
1086
lt = & lts -> tapes [tapenum ];
1087
+
1088
+ /* With a larger buffer, 'pos' wouldn't be the same as offset within page */
1089
+ Assert (lt -> buffer_size == BLCKSZ );
1090
+
1005
1091
* blocknum = lt -> curBlockNumber ;
1006
1092
* offset = lt -> pos ;
1007
1093
}
@@ -1014,3 +1100,28 @@ LogicalTapeSetBlocks(LogicalTapeSet *lts)
1014
1100
{
1015
1101
return lts -> nFileBlocks ;
1016
1102
}
1103
+
1104
+ /*
1105
+ * Set buffer size to use, when reading from given tape.
1106
+ */
1107
+ void
1108
+ LogicalTapeAssignReadBufferSize (LogicalTapeSet * lts , int tapenum , size_t avail_mem )
1109
+ {
1110
+ LogicalTape * lt ;
1111
+
1112
+ Assert (tapenum >= 0 && tapenum < lts -> nTapes );
1113
+ lt = & lts -> tapes [tapenum ];
1114
+
1115
+ /*
1116
+ * The buffer size must be a multiple of BLCKSZ in size, so round the
1117
+ * given value down to nearest BLCKSZ. Make sure we have at least one
1118
+ * page. Also, don't go above MaxAllocSize, to avoid erroring out. A
1119
+ * multi-gigabyte buffer is unlikely to be helpful, anyway.
1120
+ */
1121
+ if (avail_mem < BLCKSZ )
1122
+ avail_mem = BLCKSZ ;
1123
+ if (avail_mem > MaxAllocSize )
1124
+ avail_mem = MaxAllocSize ;
1125
+ avail_mem -= avail_mem % BLCKSZ ;
1126
+ lt -> read_buffer_size = avail_mem ;
1127
+ }
0 commit comments