Skip to content

Commit b7b8f0b

Browse files
committed
Implement prefetching via posix_fadvise() for bitmap index scans. A new GUC variable effective_io_concurrency controls how many concurrent block prefetch requests will be issued. (The best way to handle this for plain index scans is still under debate, so that part is not applied yet --- tgl) Greg Stark
1 parent 1a37056 commit b7b8f0b

File tree

15 files changed

+422
-17
lines changed

15 files changed

+422
-17
lines changed

doc/src/sgml/config.sgml

Lines changed: 50 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/config.sgml,v 1.204 2009/01/09 10:13:18 mha Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/config.sgml,v 1.205 2009/01/12 05:10:44 tgl Exp $ -->
22

33
<chapter Id="runtime-config">
44
<title>Server Configuration</title>
@@ -1203,6 +1203,55 @@ SET ENABLE_SEQSCAN TO OFF;
        queries.
       </para>
      </sect2>
+
+     <sect2 id="runtime-config-resource-async-behavior">
+     <title>Asynchronous Behavior</title>
+
+     <variablelist>
+      <varlistentry id="guc-effective-io-concurrency" xreflabel="effective_io_concurrency">
+       <term><varname>effective_io_concurrency</varname> (<type>integer</type>)</term>
+       <indexterm>
+        <primary><varname>effective_io_concurrency</> configuration parameter</primary>
+       </indexterm>
+       <listitem>
+        <para>
+         Sets the number of concurrent disk I/O operations that
+         <productname>PostgreSQL</> expects can be executed
+         simultaneously.  Raising this value will increase the number of I/O
+         operations that any individual <productname>PostgreSQL</> session
+         attempts to initiate in parallel.  The allowed range is 1 to 1000,
+         or zero to disable issuance of asynchronous I/O requests.
+        </para>
+
+        <para>
+         A good starting point for this setting is the number of separate
+         drives comprising a RAID 0 stripe or RAID 1 mirror being used for the
+         database.  (For RAID 5 the parity drive should not be counted.)
+         However, if the database is often busy with multiple queries issued in
+         concurrent sessions, lower values may be sufficient to keep the disk
+         array busy.  A value higher than needed to keep the disks busy will
+         only result in extra CPU overhead.
+        </para>
+
+        <para>
+         For more exotic systems, such as memory-based storage or a RAID array
+         that is limited by bus bandwidth, the correct value might be the
+         number of I/O paths available.  Some experimentation may be needed
+         to find the best value.
+        </para>
+
+        <para>
+         Asynchronous I/O depends on an effective <function>posix_fadvise</>
+         function, which some operating systems lack.  If the function is not
+         present then setting this parameter to anything but zero will result
+         in an error.  On some operating systems the function is present but
+         does not actually do anything.  On such systems setting a nonzero
+         value will add CPU overhead without improving performance.
+        </para>
+       </listitem>
+      </varlistentry>
+     </variablelist>
+     </sect2>
     </sect1>

    <sect1 id="runtime-config-wal">

src/backend/executor/nodeBitmapHeapscan.c

Lines changed: 95 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/executor/nodeBitmapHeapscan.c,v 1.32 2009/01/10 21:08:36 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/executor/nodeBitmapHeapscan.c,v 1.33 2009/01/12 05:10:44 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -67,6 +67,7 @@ BitmapHeapNext(BitmapHeapScanState *node)
 	TIDBitmap  *tbm;
 	TBMIterator *tbmiterator;
 	TBMIterateResult *tbmres;
+	TBMIterator *prefetch_iterator;
 	OffsetNumber targoffset;
 	TupleTableSlot *slot;

@@ -81,6 +82,7 @@ BitmapHeapNext(BitmapHeapScanState *node)
 	tbm = node->tbm;
 	tbmiterator = node->tbmiterator;
 	tbmres = node->tbmres;
+	prefetch_iterator = node->prefetch_iterator;

 	/*
 	 * Check if we are evaluating PlanQual for tuple of this relation.
@@ -114,6 +116,15 @@ BitmapHeapNext(BitmapHeapScanState *node)
 	/*
 	 * If we haven't yet performed the underlying index scan, do it, and
 	 * begin the iteration over the bitmap.
+	 *
+	 * For prefetching, we use *two* iterators, one for the pages we are
+	 * actually scanning and another that runs ahead of the first for
+	 * prefetching.  node->prefetch_pages tracks exactly how many pages
+	 * ahead the prefetch iterator is.  Also, node->prefetch_target tracks
+	 * the desired prefetch distance, which starts small and increases up
+	 * to the GUC-controlled maximum, target_prefetch_pages.  This is to
+	 * avoid doing a lot of prefetching in a scan that stops after a few
+	 * tuples because of a LIMIT.
 	 */
 	if (tbm == NULL)
 	{
@@ -125,6 +136,15 @@ BitmapHeapNext(BitmapHeapScanState *node)
 		node->tbm = tbm;
 		node->tbmiterator = tbmiterator = tbm_begin_iterate(tbm);
 		node->tbmres = tbmres = NULL;
+
+#ifdef USE_PREFETCH
+		if (target_prefetch_pages > 0)
+		{
+			node->prefetch_iterator = prefetch_iterator = tbm_begin_iterate(tbm);
+			node->prefetch_pages = 0;
+			node->prefetch_target = -1;
+		}
+#endif   /* USE_PREFETCH */
 	}

 	for (;;)
@@ -144,6 +164,22 @@ BitmapHeapNext(BitmapHeapScanState *node)
 			break;
 		}

+#ifdef USE_PREFETCH
+		if (node->prefetch_pages > 0)
+		{
+			/* The main iterator has closed the distance by one page */
+			node->prefetch_pages--;
+		}
+		else if (prefetch_iterator)
+		{
+			/* Do not let the prefetch iterator get behind the main one */
+			TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator);
+
+			if (tbmpre == NULL || tbmpre->blockno != tbmres->blockno)
+				elog(ERROR, "prefetch and main iterators are out of sync");
+		}
+#endif   /* USE_PREFETCH */
+
 		/*
 		 * Ignore any claimed entries past what we think is the end of the
 		 * relation.  (This is probably not necessary given that we got at
@@ -165,14 +201,64 @@ BitmapHeapNext(BitmapHeapScanState *node)
 			 * Set rs_cindex to first slot to examine
 			 */
 			scan->rs_cindex = 0;
+
+#ifdef USE_PREFETCH
+			/*
+			 * Increase prefetch target if it's not yet at the max.  Note
+			 * that we will increase it to zero after fetching the very
+			 * first page/tuple, then to one after the second tuple is
+			 * fetched, then it doubles as later pages are fetched.
+			 */
+			if (node->prefetch_target >= target_prefetch_pages)
+				 /* don't increase any further */ ;
+			else if (node->prefetch_target >= target_prefetch_pages / 2)
+				node->prefetch_target = target_prefetch_pages;
+			else if (node->prefetch_target > 0)
+				node->prefetch_target *= 2;
+			else
+				node->prefetch_target++;
+#endif   /* USE_PREFETCH */
 		}
 		else
 		{
 			/*
 			 * Continuing in previously obtained page; advance rs_cindex
 			 */
 			scan->rs_cindex++;
+
+#ifdef USE_PREFETCH
+			/*
+			 * Try to prefetch at least a few pages even before we get to the
+			 * second page if we don't stop reading after the first tuple.
+			 */
+			if (node->prefetch_target < target_prefetch_pages)
+				node->prefetch_target++;
+#endif   /* USE_PREFETCH */
+		}
+
+#ifdef USE_PREFETCH
+		/*
+		 * We issue prefetch requests *after* fetching the current page
+		 * to try to avoid having prefetching interfere with the main I/O.
+		 */
+		if (prefetch_iterator)
+		{
+			while (node->prefetch_pages < node->prefetch_target)
+			{
+				TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator);
+
+				if (tbmpre == NULL)
+				{
+					/* No more pages to prefetch */
+					tbm_end_iterate(prefetch_iterator);
+					node->prefetch_iterator = prefetch_iterator = NULL;
+					break;
+				}
+				node->prefetch_pages++;
+				PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno);
+			}
 		}
+#endif   /* USE_PREFETCH */

 		/*
 		 * Out of range?  If so, nothing more to look at on this page
@@ -379,11 +465,14 @@ ExecBitmapHeapReScan(BitmapHeapScanState *node, ExprContext *exprCtxt)

 	if (node->tbmiterator)
 		tbm_end_iterate(node->tbmiterator);
+	if (node->prefetch_iterator)
+		tbm_end_iterate(node->prefetch_iterator);
 	if (node->tbm)
 		tbm_free(node->tbm);
 	node->tbm = NULL;
 	node->tbmiterator = NULL;
 	node->tbmres = NULL;
+	node->prefetch_iterator = NULL;

 	/*
 	 * Always rescan the input immediately, to ensure we can pass down any
@@ -429,6 +518,8 @@ ExecEndBitmapHeapScan(BitmapHeapScanState *node)
 	 */
 	if (node->tbmiterator)
 		tbm_end_iterate(node->tbmiterator);
+	if (node->prefetch_iterator)
+		tbm_end_iterate(node->prefetch_iterator);
 	if (node->tbm)
 		tbm_free(node->tbm);

@@ -474,6 +565,9 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags)
 	scanstate->tbm = NULL;
 	scanstate->tbmiterator = NULL;
 	scanstate->tbmres = NULL;
+	scanstate->prefetch_iterator = NULL;
+	scanstate->prefetch_pages = 0;
+	scanstate->prefetch_target = 0;

 	/*
 	 * Miscellaneous initialization

src/backend/storage/buffer/bufmgr.c

Lines changed: 58 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.244 2009/01/01 17:23:47 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.245 2009/01/12 05:10:44 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -65,6 +65,13 @@ bool zero_damaged_pages = false;
 int			bgwriter_lru_maxpages = 100;
 double		bgwriter_lru_multiplier = 2.0;

+/*
+ * How many buffers PrefetchBuffer callers should try to stay ahead of their
+ * ReadBuffer calls by.  This is maintained by the assign hook for
+ * effective_io_concurrency.  Zero means "never prefetch".
+ */
+int			target_prefetch_pages = 0;
+
 /* local state for StartBufferIO and related functions */
 static volatile BufferDesc *InProgressBuf = NULL;
 static bool IsForInput;
@@ -95,6 +102,56 @@ static void FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln);
 static void AtProcExit_Buffers(int code, Datum arg);


+/*
+ * PrefetchBuffer -- initiate asynchronous read of a block of a relation
+ *
+ * This is named by analogy to ReadBuffer but doesn't actually allocate a
+ * buffer.  Instead it tries to ensure that a future ReadBuffer for the given
+ * block will not be delayed by the I/O.  Prefetching is optional.
+ * No-op if prefetching isn't compiled in.
+ */
+void
+PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
+{
+#ifdef USE_PREFETCH
+	Assert(RelationIsValid(reln));
+	Assert(BlockNumberIsValid(blockNum));
+
+	/* Open it at the smgr level if not already done */
+	RelationOpenSmgr(reln);
+
+	if (reln->rd_istemp)
+	{
+		/* pass it off to localbuf.c */
+		LocalPrefetchBuffer(reln->rd_smgr, forkNum, blockNum);
+	}
+	else
+	{
+		BufferTag	newTag;		/* identity of requested block */
+		uint32		newHash;	/* hash value for newTag */
+		LWLockId	newPartitionLock;	/* buffer partition lock for it */
+		int			buf_id;
+
+		/* create a tag so we can lookup the buffer */
+		INIT_BUFFERTAG(newTag, reln->rd_smgr->smgr_rnode, forkNum, blockNum);
+
+		/* determine its hash code and partition lock ID */
+		newHash = BufTableHashCode(&newTag);
+		newPartitionLock = BufMappingPartitionLock(newHash);
+
+		/* see if the block is in the buffer pool already */
+		LWLockAcquire(newPartitionLock, LW_SHARED);
+		buf_id = BufTableLookup(&newTag, newHash);
+		LWLockRelease(newPartitionLock);
+
+		/* If not in buffers, initiate prefetch */
+		if (buf_id < 0)
+			smgrprefetch(reln->rd_smgr, forkNum, blockNum);
+	}
+#endif   /* USE_PREFETCH */
+}
+
+
 /*
  * ReadBuffer -- a shorthand for ReadBufferExtended, for reading from main
  *		fork with RBM_NORMAL mode and default strategy.

src/backend/storage/buffer/localbuf.c

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/storage/buffer/localbuf.c,v 1.85 2009/01/01 17:23:47 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/storage/buffer/localbuf.c,v 1.86 2009/01/12 05:10:44 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -52,6 +52,43 @@ static void InitLocalBuffers(void);
 static Block GetLocalBufferStorage(void);


+/*
+ * LocalPrefetchBuffer -
+ *	  initiate asynchronous read of a block of a relation
+ *
+ * Do PrefetchBuffer's work for temporary relations.
+ * No-op if prefetching isn't compiled in.
+ */
+void
+LocalPrefetchBuffer(SMgrRelation smgr, ForkNumber forkNum,
+					BlockNumber blockNum)
+{
+#ifdef USE_PREFETCH
+	BufferTag	newTag;			/* identity of requested block */
+	LocalBufferLookupEnt *hresult;
+
+	INIT_BUFFERTAG(newTag, smgr->smgr_rnode, forkNum, blockNum);
+
+	/* Initialize local buffers if first request in this session */
+	if (LocalBufHash == NULL)
+		InitLocalBuffers();
+
+	/* See if the desired buffer already exists */
+	hresult = (LocalBufferLookupEnt *)
+		hash_search(LocalBufHash, (void *) &newTag, HASH_FIND, NULL);
+
+	if (hresult)
+	{
+		/* Yes, so nothing to do */
+		return;
+	}
+
+	/* Not in buffers, so initiate prefetch */
+	smgrprefetch(smgr, forkNum, blockNum);
+#endif   /* USE_PREFETCH */
+}
+
+
 /*
  * LocalBufferAlloc -
  *	  Find or create a local buffer for the given page of the given relation.

0 commit comments

Comments
 (0)