PostgreSQL Source Code  git master
syncscan.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * syncscan.c
4  * scan synchronization support
5  *
6  * When multiple backends run a sequential scan on the same table, we try
7  * to keep them synchronized to reduce the overall I/O needed. The goal is
8  * to read each page into shared buffer cache only once, and let all backends
9  * that take part in the shared scan process the page before it falls out of
10  * the cache.
11  *
12  * Since the "leader" in a pack of backends doing a seqscan will have to wait
13  * for I/O, while the "followers" don't, there is a strong self-synchronizing
14  * effect once we can get the backends examining approximately the same part
15  * of the table at the same time. Hence all that is really needed is to get
16  * a new backend beginning a seqscan to begin it close to where other backends
17  * are reading. We can scan the table circularly, from block X up to the
18  * end and then from block 0 to X-1, to ensure we visit all rows while still
19  * participating in the common scan.
20  *
21  * To accomplish that, we keep track of the scan position of each table, and
22  * start new scans close to where the previous scan(s) are. We don't try to
23  * do any extra synchronization to keep the scans together afterwards; some
24  * scans might progress much more slowly than others, for example if the
25  * results need to be transferred to the client over a slow network, and we
26  * don't want such queries to slow down others.
27  *
28  * There can realistically only be a few large sequential scans on different
29  * tables in progress at any time. Therefore we just keep the scan positions
30  * in a small LRU list which we scan every time we need to look up or update a
31  * scan position. The whole mechanism is only applied for tables exceeding
32  * a threshold size (but that is not the concern of this module).
33  *
34  * INTERFACE ROUTINES
35  * ss_get_location - return current scan location of a relation
36  * ss_report_location - update current scan location
37  *
38  *
39  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
40  * Portions Copyright (c) 1994, Regents of the University of California
41  *
42  * IDENTIFICATION
43  * src/backend/access/common/syncscan.c
44  *
45  *-------------------------------------------------------------------------
46  */
47 #include "postgres.h"
48 
49 #include "access/syncscan.h"
50 #include "miscadmin.h"
51 #include "storage/lwlock.h"
52 #include "storage/shmem.h"
53 #include "utils/rel.h"
54 
55 
56 /* GUC variables */
57 #ifdef TRACE_SYNCSCAN
58 bool trace_syncscan = false;
59 #endif
60 
61 
62 /*
63  * Size of the LRU list.
64  *
65  * Note: the code assumes that SYNC_SCAN_NELEM > 1.
66  *
67  * XXX: What's a good value? It should be large enough to hold the
68  * maximum number of large tables scanned simultaneously. But a larger value
69  * means more traversing of the LRU list when starting a new scan.
70  */
71 #define SYNC_SCAN_NELEM 20
72 
73 /*
74  * Interval between reports of the location of the current scan, in pages.
75  *
76  * Note: This should be smaller than the ring size (see buffer/freelist.c)
77  * we use for bulk reads. Otherwise a scan joining other scans might start
78  * from a page that's no longer in the buffer cache. This is a bit fuzzy;
79  * there's no guarantee that the new scan will read the page before it leaves
80  * the buffer cache anyway, and on the other hand the page is most likely
81  * still in the OS cache.
82  */
83 #define SYNC_SCAN_REPORT_INTERVAL (128 * 1024 / BLCKSZ)
84 
85 
86 /*
87  * The scan locations structure is essentially a doubly-linked LRU with head
88  * and tail pointer, but designed to hold a fixed maximum number of elements in
89  * fixed-size shared memory.
90  */
91 typedef struct ss_scan_location_t
92 {
93  RelFileLocator relfilelocator; /* identity of a relation */
94  BlockNumber location; /* last-reported location in the relation */
96 
97 typedef struct ss_lru_item_t
98 {
103 
104 typedef struct ss_scan_locations_t
105 {
108  ss_lru_item_t items[FLEXIBLE_ARRAY_MEMBER]; /* SYNC_SCAN_NELEM items */
110 
111 #define SizeOfScanLocations(N) \
112  (offsetof(ss_scan_locations_t, items) + (N) * sizeof(ss_lru_item_t))
113 
114 /* Pointer to struct in shared memory */
116 
117 /* prototypes for internal functions */
118 static BlockNumber ss_search(RelFileLocator relfilelocator,
119  BlockNumber location, bool set);
120 
121 
122 /*
123  * SyncScanShmemSize --- report amount of shared memory space needed
124  */
125 Size
127 {
129 }
130 
131 /*
132  * SyncScanShmemInit --- initialize this module's shared memory
133  */
134 void
136 {
137  int i;
138  bool found;
139 
141  ShmemInitStruct("Sync Scan Locations List",
143  &found);
144 
145  if (!IsUnderPostmaster)
146  {
147  /* Initialize shared memory area */
148  Assert(!found);
149 
152 
153  for (i = 0; i < SYNC_SCAN_NELEM; i++)
154  {
155  ss_lru_item_t *item = &scan_locations->items[i];
156 
157  /*
158  * Initialize all slots with invalid values. As scans are started,
159  * these invalid entries will fall off the LRU list and get
160  * replaced with real entries.
161  */
166 
167  item->prev = (i > 0) ?
168  (&scan_locations->items[i - 1]) : NULL;
169  item->next = (i < SYNC_SCAN_NELEM - 1) ?
170  (&scan_locations->items[i + 1]) : NULL;
171  }
172  }
173  else
174  Assert(found);
175 }
176 
177 /*
178  * ss_search --- search the scan_locations structure for an entry with the
179  * given relfilelocator.
180  *
181  * If "set" is true, the location is updated to the given location. If no
182  * entry for the given relfilelocator is found, it will be created at the head
183  * of the list with the given location, even if "set" is false.
184  *
185  * In any case, the location after possible update is returned.
186  *
187  * Caller is responsible for having acquired suitable lock on the shared
188  * data structure.
189  */
190 static BlockNumber
191 ss_search(RelFileLocator relfilelocator, BlockNumber location, bool set)
192 {
193  ss_lru_item_t *item;
194 
195  item = scan_locations->head;
196  for (;;)
197  {
198  bool match;
199 
201  relfilelocator);
202 
203  if (match || item->next == NULL)
204  {
205  /*
206  * If we reached the end of list and no match was found, take over
207  * the last entry
208  */
209  if (!match)
210  {
211  item->location.relfilelocator = relfilelocator;
212  item->location.location = location;
213  }
214  else if (set)
215  item->location.location = location;
216 
217  /* Move the entry to the front of the LRU list */
218  if (item != scan_locations->head)
219  {
220  /* unlink */
221  if (item == scan_locations->tail)
222  scan_locations->tail = item->prev;
223  item->prev->next = item->next;
224  if (item->next)
225  item->next->prev = item->prev;
226 
227  /* link */
228  item->prev = NULL;
229  item->next = scan_locations->head;
230  scan_locations->head->prev = item;
231  scan_locations->head = item;
232  }
233 
234  return item->location.location;
235  }
236 
237  item = item->next;
238  }
239 
240  /* not reached */
241 }
242 
243 /*
244  * ss_get_location --- get the optimal starting location for scan
245  *
246  * Returns the last-reported location of a sequential scan on the
247  * relation, or 0 if no valid location is found.
248  *
249  * We expect the caller has just done RelationGetNumberOfBlocks(), and
250  * so that number is passed in rather than computing it again. The result
251  * is guaranteed less than relnblocks (assuming that's > 0).
252  */
255 {
256  BlockNumber startloc;
257 
258  LWLockAcquire(SyncScanLock, LW_EXCLUSIVE);
259  startloc = ss_search(rel->rd_locator, 0, false);
260  LWLockRelease(SyncScanLock);
261 
262  /*
263  * If the location is not a valid block number for this scan, start at 0.
264  *
265  * This can happen if for instance a VACUUM truncated the table since the
266  * location was saved.
267  */
268  if (startloc >= relnblocks)
269  startloc = 0;
270 
271 #ifdef TRACE_SYNCSCAN
272  if (trace_syncscan)
273  elog(LOG,
274  "SYNC_SCAN: start \"%s\" (size %u) at %u",
275  RelationGetRelationName(rel), relnblocks, startloc);
276 #endif
277 
278  return startloc;
279 }
280 
281 /*
282  * ss_report_location --- update the current scan location
283  *
284  * Writes an entry into the shared Sync Scan state of the form
285  * (relfilelocator, blocknumber), overwriting any existing entry for the
286  * same relfilelocator.
287  */
288 void
290 {
291 #ifdef TRACE_SYNCSCAN
292  if (trace_syncscan)
293  {
294  if ((location % 1024) == 0)
295  elog(LOG,
296  "SYNC_SCAN: scanning \"%s\" at %u",
297  RelationGetRelationName(rel), location);
298  }
299 #endif
300 
301  /*
302  * To reduce lock contention, only report scan progress every N pages. For
303  * the same reason, don't block if the lock isn't immediately available.
304  * Missing a few updates isn't critical, it just means that a new scan
305  * that wants to join the pack will start a little bit behind the head of
306  * the scan. Hopefully the pages are still in OS cache and the scan
307  * catches up quickly.
308  */
309  if ((location % SYNC_SCAN_REPORT_INTERVAL) == 0)
310  {
311  if (LWLockConditionalAcquire(SyncScanLock, LW_EXCLUSIVE))
312  {
313  (void) ss_search(rel->rd_locator, location, true);
314  LWLockRelease(SyncScanLock);
315  }
316 #ifdef TRACE_SYNCSCAN
317  else if (trace_syncscan)
318  elog(LOG,
319  "SYNC_SCAN: missed update for \"%s\" at %u",
320  RelationGetRelationName(rel), location);
321 #endif
322  }
323 }
uint32 BlockNumber
Definition: block.h:31
#define InvalidBlockNumber
Definition: block.h:33
#define Assert(condition)
Definition: c.h:858
#define FLEXIBLE_ARRAY_MEMBER
Definition: c.h:398
size_t Size
Definition: c.h:605
#define LOG
Definition: elog.h:31
#define elog(elevel,...)
Definition: elog.h:225
bool IsUnderPostmaster
Definition: globals.c:119
int i
Definition: isn.c:73
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1168
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1781
bool LWLockConditionalAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1339
@ LW_EXCLUSIVE
Definition: lwlock.h:114
#define InvalidOid
Definition: postgres_ext.h:36
#define RelationGetRelationName(relation)
Definition: rel.h:539
#define RelFileLocatorEquals(locator1, locator2)
#define InvalidRelFileNumber
Definition: relpath.h:26
void * ShmemInitStruct(const char *name, Size size, bool *foundPtr)
Definition: shmem.c:387
RelFileNumber relNumber
RelFileLocator rd_locator
Definition: rel.h:57
ss_scan_location_t location
Definition: syncscan.c:101
struct ss_lru_item_t * next
Definition: syncscan.c:100
struct ss_lru_item_t * prev
Definition: syncscan.c:99
RelFileLocator relfilelocator
Definition: syncscan.c:93
BlockNumber location
Definition: syncscan.c:94
ss_lru_item_t * head
Definition: syncscan.c:106
ss_lru_item_t items[FLEXIBLE_ARRAY_MEMBER]
Definition: syncscan.c:108
ss_lru_item_t * tail
Definition: syncscan.c:107
struct ss_lru_item_t ss_lru_item_t
void ss_report_location(Relation rel, BlockNumber location)
Definition: syncscan.c:289
void SyncScanShmemInit(void)
Definition: syncscan.c:135
static ss_scan_locations_t * scan_locations
Definition: syncscan.c:115
BlockNumber ss_get_location(Relation rel, BlockNumber relnblocks)
Definition: syncscan.c:254
struct ss_scan_locations_t ss_scan_locations_t
Size SyncScanShmemSize(void)
Definition: syncscan.c:126
static BlockNumber ss_search(RelFileLocator relfilelocator, BlockNumber location, bool set)
Definition: syncscan.c:191
struct ss_scan_location_t ss_scan_location_t
#define SYNC_SCAN_NELEM
Definition: syncscan.c:71
#define SizeOfScanLocations(N)
Definition: syncscan.c:111
#define SYNC_SCAN_REPORT_INTERVAL
Definition: syncscan.c:83