PostgreSQL Source Code git master
Loading...
Searching...
No Matches
bufmgr.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * bufmgr.c
4 * buffer manager interface routines
5 *
6 * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 *
10 * IDENTIFICATION
11 * src/backend/storage/buffer/bufmgr.c
12 *
13 *-------------------------------------------------------------------------
14 */
15/*
16 * Principal entry points:
17 *
18 * ReadBuffer() -- find or create a buffer holding the requested page,
19 * and pin it so that no one can destroy it while this process
20 * is using it.
21 *
22 * StartReadBuffer() -- as above, with separate wait step
23 * StartReadBuffers() -- multiple block version
24 * WaitReadBuffers() -- second step of above
25 *
26 * ReleaseBuffer() -- unpin a buffer
27 *
28 * MarkBufferDirty() -- mark a pinned buffer's contents as "dirty".
29 * The disk write is delayed until buffer replacement or checkpoint.
30 *
31 * See also these files:
32 * freelist.c -- chooses victim for buffer replacement
33 * buf_table.c -- manages the buffer lookup table
34 */
35#include "postgres.h"
36
37#include <sys/file.h>
38#include <unistd.h>
39
40#include "access/tableam.h"
41#include "access/xloginsert.h"
42#include "access/xlogutils.h"
43#ifdef USE_ASSERT_CHECKING
44#include "catalog/pg_tablespace_d.h"
45#endif
46#include "catalog/storage.h"
48#include "common/hashfn.h"
49#include "executor/instrument.h"
50#include "lib/binaryheap.h"
51#include "miscadmin.h"
52#include "pg_trace.h"
53#include "pgstat.h"
54#include "postmaster/bgwriter.h"
55#include "storage/aio.h"
57#include "storage/bufmgr.h"
58#include "storage/fd.h"
59#include "storage/ipc.h"
60#include "storage/lmgr.h"
61#include "storage/proc.h"
62#include "storage/proclist.h"
63#include "storage/procsignal.h"
64#include "storage/read_stream.h"
65#include "storage/smgr.h"
66#include "storage/standby.h"
67#include "utils/memdebug.h"
68#include "utils/ps_status.h"
69#include "utils/rel.h"
70#include "utils/resowner.h"
71#include "utils/timestamp.h"
72#include "utils/wait_event.h"
73
74
75/* Note: these two macros only work on shared buffers, not local ones! */
76#define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
77#define BufferGetLSN(bufHdr) (PageGetLSN(BufHdrGetBlock(bufHdr)))
78
79/* Note: this macro only works on local buffers, not shared ones! */
80#define LocalBufHdrGetBlock(bufHdr) \
81 LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
82
83/* Bits in SyncOneBuffer's return value */
84#define BUF_WRITTEN 0x01
85#define BUF_REUSABLE 0x02
86
87#define RELS_BSEARCH_THRESHOLD 20
88
89/*
90 * Relation size (in number of blocks) above which we scan the entire buffer
91 * pool to remove the buffers of a relation being dropped. For relations
92 * below this threshold, we find the buffers by doing lookups in the
93 * BufMapping table. The expansion is parenthesized as a whole so that the
94 * cast cannot be split from its operand by the surrounding expression.
95 */
95#define BUF_DROP_FULL_SCAN_THRESHOLD ((uint64) (NBuffers / 32))
96
97/*
98 * This is separated out from PrivateRefCountEntry to allow for copying all
99 * the data members via struct assignment.
100 */
102{
103 /*
104 * How many times has the buffer been pinned by this backend.
105 */
107
108 /*
109 * Is the buffer locked by this backend? BUFFER_LOCK_UNLOCK indicates that
110 * the buffer is not locked.
111 */
114
116{
117 /*
118 * Note that this needs to be same as the entry's corresponding
119 * PrivateRefCountArrayKeys[i], if the entry is stored in the array. We
120 * store it in both places as this is used for the hashtable key and
121 * because it is more convenient (passing around a PrivateRefCountEntry
122 * suffices to identify the buffer) and faster (checking the keys array is
123 * faster when checking many entries, checking the entry is faster if just
124 * checking a single entry).
125 */
127
128 char status;
129
132
133#define SH_PREFIX refcount
134#define SH_ELEMENT_TYPE PrivateRefCountEntry
135#define SH_KEY_TYPE Buffer
136#define SH_KEY buffer
137#define SH_HASH_KEY(tb, key) murmurhash32((uint32) (key))
138#define SH_EQUAL(tb, a, b) ((a) == (b))
139#define SH_SCOPE static inline
140#define SH_DECLARE
141#define SH_DEFINE
142#include "lib/simplehash.h"
143
144/* 64 bytes, about the size of a cache line on common systems */
145#define REFCOUNT_ARRAY_ENTRIES 8
146
147/*
148 * Status of buffers to checkpoint for a particular tablespace, used
149 * internally in BufferSync.
150 */
151typedef struct CkptTsStatus
152{
153 /* oid of the tablespace */
155
156 /*
157 * Checkpoint progress for this tablespace. To make progress comparable
158 * between tablespaces the progress is, for each tablespace, measured as a
159 * number between 0 and the total number of to-be-checkpointed pages. Each
160 * page checkpointed in this tablespace increments this space's progress
161 * by progress_slice.
162 */
165
166 /* number of to-be checkpointed pages in this tablespace */
168 /* already processed pages in this tablespace */
170
171 /* current offset in CkptBufferIds for this tablespace */
172 int index;
174
175/*
176 * Type for array used to sort SMgrRelations
177 *
178 * FlushRelationsAllBuffers shares the same comparator function with
179 * DropRelationsAllBuffers. Pointer to this struct and RelFileLocator must be
180 * compatible.
181 */
182typedef struct SMgrSortArray
183{
184 RelFileLocator rlocator; /* This must be the first member */
187
188/* GUC variables */
192bool track_io_timing = false;
193
194/*
195 * How many buffers PrefetchBuffer callers should try to stay ahead of their
196 * ReadBuffer calls by. Zero means "never prefetch". This value is only used
197 * for buffers not belonging to tablespaces that have their
198 * effective_io_concurrency parameter set.
199 */
201
202/*
203 * Like effective_io_concurrency, but used by maintenance code paths that might
204 * benefit from a higher setting because they work on behalf of many sessions.
205 * Overridden by the tablespace setting of the same name.
206 */
208
209/*
210 * Limit on how many blocks should be handled in single I/O operations.
211 * StartReadBuffers() callers should respect it, as should other operations
212 * that call smgr APIs directly. It is computed as the minimum of underlying
213 * GUCs io_combine_limit_guc and io_max_combine_limit.
214 */
218
219/*
220 * GUC variables about triggering kernel writeback for buffers written; OS
221 * dependent defaults are set via the GUC mechanism.
222 */
226
227/* local state for LockBufferForCleanup */
229
230/*
231 * Backend-Private refcount management:
232 *
233 * Each buffer also has a private refcount that keeps track of the number of
234 * times the buffer is pinned in the current process. This is so that the
235 * shared refcount needs to be modified only once if a buffer is pinned more
236 * than once by an individual backend. It's also used to check that no
237 * buffers are still pinned at the end of transactions and when exiting. We
238 * also use this mechanism to track whether this backend has a buffer locked,
239 * and, if so, in what mode.
240 *
241 *
242 * To avoid - as we used to - requiring an array with NBuffers entries to keep
243 * track of local buffers, we use a small sequentially searched array
244 * (PrivateRefCountArrayKeys, with the corresponding data stored in
245 * PrivateRefCountArray) and an overflow hash table (PrivateRefCountHash) to
246 * keep track of backend local pins.
247 *
248 * While no more than REFCOUNT_ARRAY_ENTRIES buffers are pinned at once, all
249 * refcounts are kept track of in the array; after that, new array entries
250 * displace old ones into the hash table. That way a frequently used entry
251 * can't get "stuck" in the hashtable while infrequent ones clog the array.
252 *
253 * Note that in most scenarios the number of pinned buffers will not exceed
254 * REFCOUNT_ARRAY_ENTRIES.
255 *
256 *
257 * To enter a buffer into the refcount tracking mechanism first reserve a free
258 * entry using ReservePrivateRefCountEntry() and then later, if necessary,
259 * fill it with NewPrivateRefCountEntry(). That split lets us avoid doing
260 * memory allocations in NewPrivateRefCountEntry() which can be important
261 * because in some scenarios it's called with a spinlock held...
262 */
268static int ReservedRefCountSlot = -1;
270
272
273static void ReservePrivateRefCountEntry(void);
278
279/* ResourceOwner callbacks to hold in-progress I/Os and buffer pins */
280static void ResOwnerReleaseBufferIO(Datum res);
281static char *ResOwnerPrintBufferIO(Datum res);
282static void ResOwnerReleaseBuffer(Datum res);
283static char *ResOwnerPrintBuffer(Datum res);
284
286{
287 .name = "buffer io",
288 .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
289 .release_priority = RELEASE_PRIO_BUFFER_IOS,
290 .ReleaseResource = ResOwnerReleaseBufferIO,
291 .DebugPrint = ResOwnerPrintBufferIO
292};
293
295{
296 .name = "buffer",
297 .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
298 .release_priority = RELEASE_PRIO_BUFFER_PINS,
299 .ReleaseResource = ResOwnerReleaseBuffer,
300 .DebugPrint = ResOwnerPrintBuffer
301};
302
303/*
304 * Ensure that the PrivateRefCountArray has sufficient space to store one more
305 * entry. This has to be called before using NewPrivateRefCountEntry() to fill
306 * a new entry - but it's perfectly fine to not use a reserved entry.
307 */
308static void
310{
311 /* Already reserved (or freed), nothing to do */
312 if (ReservedRefCountSlot != -1)
313 return;
314
315 /*
316 * First search for a free entry in the array, that'll be sufficient in the
317 * majority of cases.
318 */
319 {
320 int i;
321
322 for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
323 {
325 {
327
328 /*
329 * We could return immediately, but iterating till the end of
330 * the array allows compiler-autovectorization.
331 */
332 }
333 }
334
335 if (ReservedRefCountSlot != -1)
336 return;
337 }
338
339 /*
340 * No luck. All array entries are full. Move one array entry into the hash
341 * table.
342 */
343 {
344 /*
345 * Move entry from the current clock position in the array into the
346 * hashtable. Use that slot.
347 */
348 int victim_slot;
351 bool found;
352
353 /* select victim slot */
357
358 /* Better be used, otherwise we shouldn't get here. */
362
363 /* enter victim array entry into hashtable */
366 &found);
367 Assert(!found);
368 /* move data from the entry in the array to the hash entry */
369 hashent->data = victim_entry->data;
370
371 /* clear the now free array slot */
373 victim_entry->buffer = InvalidBuffer;
374
375 /* clear the whole data member, just for future proofing */
376 memset(&victim_entry->data, 0, sizeof(victim_entry->data));
377 victim_entry->data.refcount = 0;
378 victim_entry->data.lockmode = BUFFER_LOCK_UNLOCK;
379
381 }
382}
383
384/*
385 * Fill a previously reserved refcount entry.
386 */
389{
391
392 /* only allowed to be called when a reservation has been made */
394
395 /* use up the reserved entry */
397
398 /* and fill it */
400 res->buffer = buffer;
401 res->data.refcount = 0;
403
404 /* update cache for the next lookup */
406
408
409 return res;
410}
411
412/*
413 * Slow-path for GetPrivateRefCountEntry(). This is big enough to not be worth
414 * inlining. This particularly seems to be true if the compiler is capable of
415 * auto-vectorizing the code, as that imposes additional stack-alignment
416 * requirements etc.
417 */
420{
422 int match = -1;
423 int i;
424
425 /*
426 * First search for references in the array, that'll be sufficient in the
427 * majority of cases.
428 */
429 for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
430 {
432 {
433 match = i;
434 /* see ReservePrivateRefCountEntry() for why we don't return */
435 }
436 }
437
438 if (likely(match != -1))
439 {
440 /* update cache for the next lookup */
442
443 return &PrivateRefCountArray[match];
444 }
445
446 /*
447 * By here we know that the buffer, if already pinned, isn't residing in
448 * the array.
449 *
450 * Only look up the buffer in the hashtable if we've previously overflowed
451 * into it.
452 */
454 return NULL;
455
457
458 if (res == NULL)
459 return NULL;
460 else if (!do_move)
461 {
462 /* caller doesn't want us to move the hash entry into the array */
463 return res;
464 }
465 else
466 {
467 /* move buffer from hashtable into the free array slot */
470
471 /* Save data and delete from hashtable while res is still valid */
472 data = res->data;
476
477 /* Ensure there's a free array slot */
479
480 /* Use up the reserved slot */
484 Assert(free->buffer == InvalidBuffer);
485
486 /* and fill it */
487 free->buffer = buffer;
488 free->data = data;
490 /* update cache for the next lookup */
492
494
495 return free;
496 }
497}
498
499/*
500 * Return the PrivateRefCount entry for the passed buffer.
501 *
502 * Returns NULL if a buffer doesn't have a refcount entry. Otherwise, if
503 * do_move is true, and the entry resides in the hashtable the entry is
504 * optimized for frequent access by moving it to the array.
505 */
506static inline PrivateRefCountEntry *
508{
511
512 /*
513 * It's very common to look up the same buffer repeatedly. To make that
514 * fast, we have a one-entry cache.
515 *
516 * In contrast to the loop in GetPrivateRefCountEntrySlow(), here it is
517 * faster to check PrivateRefCountArray[].buffer, as in the case of a hit
518 * fewer addresses are computed and fewer cachelines are accessed. Whereas
519 * in GetPrivateRefCountEntrySlow()'s case, checking
520 * PrivateRefCountArrayKeys saves a lot of memory accesses.
521 */
522 if (likely(PrivateRefCountEntryLast != -1) &&
524 {
526 }
527
528 /*
529 * The code for the cached lookup is small enough to be worth inlining
530 * into the caller. In the miss case however, that empirically doesn't
531 * seem worth it.
532 */
534}
535
536/*
537 * Returns how many times the passed buffer is pinned by this backend.
538 *
539 * Only works for shared memory buffers!
540 */
541static inline int32
543{
545
548
549 /*
550 * Not moving the entry - that's ok for the current users, but we might
551 * want to change this one day.
552 */
554
555 if (ref == NULL)
556 return 0;
557 return ref->data.refcount;
558}
559
560/*
561 * Release resources used to track the reference count of a buffer which we no
562 * longer have pinned and don't want to pin again immediately.
563 */
564static void
566{
567 Assert(ref->data.refcount == 0);
568 Assert(ref->data.lockmode == BUFFER_LOCK_UNLOCK);
569
570 if (ref >= &PrivateRefCountArray[0] &&
572 {
573 ref->buffer = InvalidBuffer;
575
576
577 /*
578 * Mark the just used entry as reserved - in many scenarios that
579 * allows us to avoid ever having to search the array/hash for free
580 * entries.
581 */
583 }
584 else
585 {
589 }
590}
591
592/*
593 * BufferIsPinned
594 * True iff the buffer is pinned (also checks for valid buffer number).
595 *
596 * NOTE: what we check here is that *this* backend holds a pin on
597 * the buffer. We do not care whether some other backend does.
598 */
599#define BufferIsPinned(bufnum) \
600( \
601 !BufferIsValid(bufnum) ? \
602 false \
603 : \
604 BufferIsLocal(bufnum) ? \
605 (LocalRefCount[-(bufnum) - 1] > 0) \
606 : \
607 (GetPrivateRefCount(bufnum) > 0) \
608)
609
610
613 ForkNumber forkNum, BlockNumber blockNum,
617 BufferAccessStrategy strategy,
618 uint32 flags,
621 Buffer *buffers,
625 BufferAccessStrategy strategy,
626 uint32 flags,
629 Buffer *buffers,
631static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy,
632 bool skip_if_not_valid);
633static void PinBuffer_Locked(BufferDesc *buf);
634static void UnpinBuffer(BufferDesc *buf);
635static void UnpinBufferNoOwner(BufferDesc *buf);
636static void BufferSync(int flags);
637static int SyncOneBuffer(int buf_id, bool skip_recently_used,
639static void WaitIO(BufferDesc *buf);
640static void AbortBufferIO(Buffer buffer);
641static void shared_buffer_write_error_callback(void *arg);
642static void local_buffer_write_error_callback(void *arg);
643static inline BufferDesc *BufferAlloc(SMgrRelation smgr,
644 char relpersistence,
645 ForkNumber forkNum,
646 BlockNumber blockNum,
647 BufferAccessStrategy strategy,
650static void CheckReadBuffersOperation(ReadBuffersOperation *operation, bool is_complete);
651
654 Relation rel, char persistence, SMgrRelation smgr,
655 ForkNumber forknum, BlockNumber blocknum);
661static void FindAndDropRelationBuffers(RelFileLocator rlocator,
662 ForkNumber forkNum,
667 ForkNumber forkNum, bool permanent);
668static void AtProcExit_Buffers(int code, Datum arg);
669static void CheckForBufferLeaks(void);
670#ifdef USE_ASSERT_CHECKING
672#endif
673static int rlocator_comparator(const void *p1, const void *p2);
674static inline int buffertag_comparator(const BufferTag *ba, const BufferTag *bb);
675static inline int ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b);
676static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg);
677
683static inline void BufferLockDisown(Buffer buffer, BufferDesc *buf_hdr);
691
692
693/*
694 * Implementation of PrefetchBuffer() for shared buffers.
695 */
698 ForkNumber forkNum,
699 BlockNumber blockNum)
700{
702 BufferTag newTag; /* identity of requested block */
703 uint32 newHash; /* hash value for newTag */
704 LWLock *newPartitionLock; /* buffer partition lock for it */
705 int buf_id;
706
707 Assert(BlockNumberIsValid(blockNum));
708
709 /* create a tag so we can lookup the buffer */
710 InitBufferTag(&newTag, &smgr_reln->smgr_rlocator.locator,
711 forkNum, blockNum);
712
713 /* determine its hash code and partition lock ID */
716
717 /* see if the block is in the buffer pool already */
719 buf_id = BufTableLookup(&newTag, newHash);
721
722 /* If not in buffers, initiate prefetch */
723 if (buf_id < 0)
724 {
725#ifdef USE_PREFETCH
726 /*
727 * Try to initiate an asynchronous read. This returns false in
728 * recovery if the relation file doesn't exist.
729 */
730 if ((io_direct_flags & IO_DIRECT_DATA) == 0 &&
731 smgrprefetch(smgr_reln, forkNum, blockNum, 1))
732 {
733 result.initiated_io = true;
734 }
735#endif /* USE_PREFETCH */
736 }
737 else
738 {
739 /*
740 * Report the buffer it was in at that time. The caller may be able
741 * to avoid a buffer table lookup, but it's not pinned and it must be
742 * rechecked!
743 */
744 result.recent_buffer = buf_id + 1;
745 }
746
747 /*
748 * If the block *is* in buffers, we do nothing. This is not really ideal:
749 * the block might be just about to be evicted, which would be stupid
750 * since we know we are going to need it soon. But the only easy answer
751 * is to bump the usage_count, which does not seem like a great solution:
752 * when the caller does ultimately touch the block, usage_count would get
753 * bumped again, resulting in too much favoritism for blocks that are
754 * involved in a prefetch sequence. A real fix would involve some
755 * additional per-buffer state, and it's not clear that there's enough of
756 * a problem to justify that.
757 */
758
759 return result;
760}
761
762/*
763 * PrefetchBuffer -- initiate asynchronous read of a block of a relation
764 *
765 * This is named by analogy to ReadBuffer but doesn't actually allocate a
766 * buffer. Instead it tries to ensure that a future ReadBuffer for the given
767 * block will not be delayed by the I/O. Prefetching is optional.
768 *
769 * There are three possible outcomes:
770 *
771 * 1. If the block is already cached, the result includes a valid buffer that
772 * could be used by the caller to avoid the need for a later buffer lookup, but
773 * it's not pinned, so the caller must recheck it.
774 *
775 * 2. If the kernel has been asked to initiate I/O, the initiated_io member is
776 * true. Currently there is no way to know if the data was already cached by
777 * the kernel and therefore didn't really initiate I/O, and no way to know when
778 * the I/O completes other than using synchronous ReadBuffer().
779 *
780 * 3. Otherwise, the buffer wasn't already cached by PostgreSQL, and either
781 * USE_PREFETCH is not defined (this build doesn't support prefetching due to
782 * lack of a kernel facility), direct I/O is enabled, or the underlying
783 * relation file wasn't found and we are in recovery. (If the relation file
784 * wasn't found and we are not in recovery, an error is raised).
785 */
788{
790 Assert(BlockNumberIsValid(blockNum));
791
793 {
794 /* see comments in ReadBuffer_common */
798 errmsg("cannot access temporary tables of other sessions")));
799
800 /* pass it off to localbuf.c */
801 return PrefetchLocalBuffer(RelationGetSmgr(reln), forkNum, blockNum);
802 }
803 else
804 {
805 /* pass it to the shared buffer version */
806 return PrefetchSharedBuffer(RelationGetSmgr(reln), forkNum, blockNum);
807 }
808}
809
810/*
811 * ReadRecentBuffer -- try to pin a block in a recently observed buffer
812 *
813 * Compared to ReadBuffer(), this avoids a buffer mapping lookup when it's
814 * successful. Return true if the buffer is valid and still has the expected
815 * tag. In that case, the buffer is pinned and the usage count is bumped.
816 */
817bool
819 Buffer recent_buffer)
820{
822 BufferTag tag;
824
825 Assert(BufferIsValid(recent_buffer));
826
829 InitBufferTag(&tag, &rlocator, forkNum, blockNum);
830
831 if (BufferIsLocal(recent_buffer))
832 {
833 int b = -recent_buffer - 1;
834
837
838 /* Is it still valid and holding the right tag? */
839 if ((buf_state & BM_VALID) && BufferTagsEqual(&tag, &bufHdr->tag))
840 {
841 PinLocalBuffer(bufHdr, true);
842
844
845 return true;
846 }
847 }
848 else
849 {
850 bufHdr = GetBufferDescriptor(recent_buffer - 1);
851
852 /*
853 * Is it still valid and holding the right tag? We do an unlocked tag
854 * comparison first, to make it unlikely that we'll increment the
855 * usage counter of the wrong buffer, if someone calls us with a very
856 * out of date recent_buffer. Then we'll check it again if we get the
857 * pin.
858 */
859 if (BufferTagsEqual(&tag, &bufHdr->tag) &&
860 PinBuffer(bufHdr, NULL, true))
861 {
862 if (BufferTagsEqual(&tag, &bufHdr->tag))
863 {
865 return true;
866 }
868 }
869 }
870
871 return false;
872}
873
874/*
875 * ReadBuffer -- a shorthand for ReadBufferExtended, for reading from main
876 * fork with RBM_NORMAL mode and default strategy.
877 */
878Buffer
883
884/*
885 * ReadBufferExtended -- returns a buffer containing the requested
886 * block of the requested relation. If the blknum
887 * requested is P_NEW, extend the relation file and
888 * allocate a new block. (Caller is responsible for
889 * ensuring that only one backend tries to extend a
890 * relation at the same time!)
891 *
892 * Returns: the buffer number for the buffer containing
893 * the block read. The returned buffer has been pinned.
894 * Does not return on error --- elog's instead.
895 *
896 * Assume when this function is called, that reln has been opened already.
897 *
898 * In RBM_NORMAL mode, the page is read from disk, and the page header is
899 * validated. An error is thrown if the page header is not valid. (But
900 * note that an all-zero page is considered "valid"; see
901 * PageIsVerified().)
902 *
903 * RBM_ZERO_ON_ERROR is like the normal mode, but if the page header is not
904 * valid, the page is zeroed instead of throwing an error. This is intended
905 * for non-critical data, where the caller is prepared to repair errors.
906 *
907 * In RBM_ZERO_AND_LOCK mode, if the page isn't in buffer cache already, it's
908 * filled with zeros instead of reading it from disk. Useful when the caller
909 * is going to fill the page from scratch, since this saves I/O and avoids
910 * unnecessary failure if the page-on-disk has corrupt page headers.
911 * The page is returned locked to ensure that the caller has a chance to
912 * initialize the page before it's made visible to others.
913 * Caution: do not use this mode to read a page that is beyond the relation's
914 * current physical EOF; that is likely to cause problems in md.c when
915 * the page is modified and written out. P_NEW is OK, though.
916 *
917 * RBM_ZERO_AND_CLEANUP_LOCK is the same as RBM_ZERO_AND_LOCK, but acquires
918 * a cleanup-strength lock on the page.
919 *
920 * RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here.
921 *
922 * If strategy is not NULL, a nondefault buffer access strategy is used.
923 * See buffer/README for details.
924 */
925inline Buffer
928{
929 Buffer buf;
930
931 /*
932 * Read the buffer, and update pgstat counters to reflect a cache hit or
933 * miss. The other-session temp-relation check is enforced by
934 * ReadBuffer_common().
935 */
937 forkNum, blockNum, mode, strategy);
938
939 return buf;
940}
941
942
943/*
944 * ReadBufferWithoutRelcache -- like ReadBufferExtended, but doesn't require
945 * a relcache entry for the relation.
946 *
947 * Pass permanent = true for a RELPERSISTENCE_PERMANENT relation, and
948 * permanent = false for a RELPERSISTENCE_UNLOGGED relation. This function
949 * cannot be used for temporary relations (and making that work might be
950 * difficult, unless we only want to read temporary relations for our own
951 * ProcNumber).
952 */
953Buffer
956 BufferAccessStrategy strategy, bool permanent)
957{
958 SMgrRelation smgr = smgropen(rlocator, INVALID_PROC_NUMBER);
959
960 return ReadBuffer_common(NULL, smgr,
962 forkNum, blockNum,
963 mode, strategy);
964}
965
966/*
967 * Convenience wrapper around ExtendBufferedRelBy() extending by one block.
968 */
969Buffer
971 ForkNumber forkNum,
972 BufferAccessStrategy strategy,
973 uint32 flags)
974{
975 Buffer buf;
976 uint32 extend_by = 1;
977
978 ExtendBufferedRelBy(bmr, forkNum, strategy, flags, extend_by,
979 &buf, &extend_by);
980
981 return buf;
982}
983
984/*
985 * Extend relation by multiple blocks.
986 *
987 * Tries to extend the relation by extend_by blocks. Depending on the
988 * availability of resources the relation may end up being extended by a
989 * smaller number of pages (unless an error is thrown, always by at least one
990 * page). *extended_by is updated to the number of pages the relation has been
991 * extended by.
992 *
993 * buffers needs to be an array that is at least extend_by long. Upon
994 * completion, the first *extended_by array elements will each hold a pinned
995 * buffer.
996 *
997 * If EB_LOCK_FIRST is part of flags, the first returned buffer is
998 * locked. This is useful for callers that want a buffer that is guaranteed to
999 * be empty.
1000 */
1004 BufferAccessStrategy strategy,
1005 uint32 flags,
1007 Buffer *buffers,
1009{
1010 Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
1011 Assert(bmr.smgr == NULL || bmr.relpersistence != '\0');
1012 Assert(extend_by > 0);
1013
1014 if (bmr.relpersistence == '\0')
1015 bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
1016
1017 return ExtendBufferedRelCommon(bmr, fork, strategy, flags,
1019 buffers, extended_by);
1020}
1021
1022/*
1023 * Extend the relation so it is at least extend_to blocks large, return buffer
1024 * (extend_to - 1).
1025 *
1026 * This is useful for callers that want to write a specific page, regardless
1027 * of the current size of the relation (e.g. useful for visibilitymap and for
1028 * crash recovery).
1029 */
1030Buffer
1033 BufferAccessStrategy strategy,
1034 uint32 flags,
1037{
1039 uint32 extended_by = 0;
1041 Buffer buffers[64];
1042
1043 Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
1044 Assert(bmr.smgr == NULL || bmr.relpersistence != '\0');
1046
1047 if (bmr.relpersistence == '\0')
1048 bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
1049
1050 /*
1051 * If desired, create the file if it doesn't exist. If
1052 * smgr_cached_nblocks[fork] is positive then it must exist, no need for
1053 * an smgrexists call.
1054 */
1055 if ((flags & EB_CREATE_FORK_IF_NEEDED) &&
1056 (BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] == 0 ||
1057 BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] == InvalidBlockNumber) &&
1059 {
1061
1062 /* recheck, fork might have been created concurrently */
1065
1067 }
1068
1069 /*
1070 * If requested, invalidate size cache, so that smgrnblocks asks the
1071 * kernel.
1072 */
1073 if (flags & EB_CLEAR_SIZE_CACHE)
1074 BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] = InvalidBlockNumber;
1075
1076 /*
1077 * Estimate how many pages we'll need to extend by. This avoids acquiring
1078 * unnecessarily many victim buffers.
1079 */
1081
1082 /*
1083 * Since no-one else can be looking at the page contents yet, there is no
1084 * difference between an exclusive lock and a cleanup-strength lock. Note
1085 * that we pass the original mode to ReadBuffer_common() below, when
1086 * falling back to reading the buffer due to a concurrent relation extension.
1087 */
1089 flags |= EB_LOCK_TARGET;
1090
1091 while (current_size < extend_to)
1092 {
1093 uint32 num_pages = lengthof(buffers);
1095
1096 if ((uint64) current_size + num_pages > extend_to)
1097 num_pages = extend_to - current_size;
1098
1099 first_block = ExtendBufferedRelCommon(bmr, fork, strategy, flags,
1100 num_pages, extend_to,
1101 buffers, &extended_by);
1102
1104 Assert(num_pages != 0 || current_size >= extend_to);
1105
1106 for (uint32 i = 0; i < extended_by; i++)
1107 {
1108 if (first_block + i != extend_to - 1)
1109 ReleaseBuffer(buffers[i]);
1110 else
1111 buffer = buffers[i];
1112 }
1113 }
1114
1115 /*
1116 * It's possible that another backend concurrently extended the relation.
1117 * In that case read the buffer.
1118 *
1119 * XXX: Should we control this via a flag?
1120 */
1121 if (buffer == InvalidBuffer)
1122 {
1123 Assert(extended_by == 0);
1124 buffer = ReadBuffer_common(bmr.rel, BMR_GET_SMGR(bmr), bmr.relpersistence,
1125 fork, extend_to - 1, mode, strategy);
1126 }
1127
1128 return buffer;
1129}
1130
1131/*
1132 * Lock and optionally zero a buffer, as part of the implementation of
1133 * RBM_ZERO_AND_LOCK or RBM_ZERO_AND_CLEANUP_LOCK. The buffer must be already
1134 * pinned. If the buffer is not already valid, it is zeroed and made valid.
1135 */
1136static void
/*
 * NOTE(review): the line naming this function and its parameters (listing
 * line 1137) is missing from this copy.  Judging by the call in
 * ReadBuffer_common(), this is presumably
 * ZeroAndLockBuffer(buffer, mode, already_valid) -- confirm against the full
 * file.  The declarations of bufHdr, isLocalBuf and sbres are not shown
 * either.
 */
1138{
1140 bool need_to_zero;
1143
1145
1146 if (already_valid)
1147 {
1148 /*
1149 * If the caller already knew the buffer was valid, we can skip some
1150 * header interaction. The caller just wants to lock the buffer.
1151 */
1152 need_to_zero = false;
1153 }
1154 else
1155 {
1156 if (isLocalBuf)
1157 {
1158 /* Simple case for non-shared buffers. */
1160 sbres = StartLocalBufferIO(bufHdr, true, true, NULL);
1161 }
1162 else
1163 {
1164 /*
1165 * Take BM_IO_IN_PROGRESS, or discover that BM_VALID has been set
1166 * concurrently. Even though we aren't doing I/O, that ensures
1167 * that we don't zero a page that someone else has pinned. An
1168 * exclusive content lock wouldn't be enough, because readers are
1169 * allowed to drop the content lock after determining that a tuple
1170 * is visible (see buffer access rules in README).
1171 */
1173 sbres = StartSharedBufferIO(bufHdr, true, true, NULL);
1174 }
1175
/*
 * NOTE(review): listing lines 1176-1177 are missing here.  Nothing visible
 * on this path assigns need_to_zero or consumes sbres, so presumably those
 * lines derive need_to_zero from sbres -- confirm against the full file.
 */
1178 }
1179
1180 if (need_to_zero)
1181 {
/*
 * NOTE(review): listing line 1182 is missing; presumably it is the statement
 * that actually zeroes the page.
 */
1183
1184 /*
1185 * Grab the buffer content lock before marking the page as valid, to
1186 * make sure that no other backend sees the zeroed page before the
1187 * caller has had a chance to initialize it.
1188 *
1189 * Since no-one else can be looking at the page contents yet, there is
1190 * no difference between an exclusive lock and a cleanup-strength
1191 * lock. (Note that we cannot use LockBuffer() or
1192 * LockBufferForCleanup() here, because they assert that the buffer is
1193 * already valid.)
1194 */
1195 if (!isLocalBuf)
/*
 * NOTE(review): the statement guarded by the test above (listing line 1196)
 * is missing; per the preceding comment it takes the buffer content lock.
 * Do not read the if/else that follows as the body of this test.
 */
1197
1198 /* Set BM_VALID, terminate IO, and wake up any waiters */
1199 if (isLocalBuf)
1200 TerminateLocalBufferIO(bufHdr, false, BM_VALID, false);
1201 else
1202 TerminateBufferIO(bufHdr, false, BM_VALID, true, false);
1203 }
1204 else if (!isLocalBuf)
1205 {
1206 /*
1207 * The buffer is valid, so we can't zero it. The caller still expects
1208 * the page to be locked on return.
1209 */
/*
 * NOTE(review): the statement belonging to each arm below (listing lines
 * 1211 and 1213) is missing.  Presumably one arm takes an exclusive lock and
 * the other a cleanup-strength lock, depending on mode.
 */
1210 if (mode == RBM_ZERO_AND_LOCK)
1212 else
1214 }
1215}
1216
1217/*
1218 * Pin a buffer for a given block. *foundPtr is set to true if the block was
1219 * already present, or false if more work is required to either read it in or
1220 * zero it.
1221 */
/*
 * Temporary relations are handled by LocalBufferAlloc(), all others by
 * BufferAlloc().  When the block was already present, the hit is recorded
 * via TrackBufferHit().  blockNum must not be P_NEW.
 *
 * NOTE(review): several lines are missing from this copy: the start of the
 * signature (listing lines 1222-1223, which name the function and presumably
 * declare rel), the parameters on lines 1229-1230 (presumably io_object and
 * io_context, both of which are used below), the declaration of bufHdr, the
 * statement inside "if (rel)" and the return statement.  The function name
 * is taken from the callers in this file.
 */
1224 SMgrRelation smgr,
1225 char persistence,
1226 ForkNumber forkNum,
1227 BlockNumber blockNum,
1228 BufferAccessStrategy strategy,
1231 bool *foundPtr)
1232{
1234
1235 Assert(blockNum != P_NEW);
1236
1237 /* The caller must already have resolved the relation's persistence */
1238 Assert((persistence == RELPERSISTENCE_TEMP ||
1239 persistence == RELPERSISTENCE_PERMANENT ||
1240 persistence == RELPERSISTENCE_UNLOGGED));
1241
1242 TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
1246 smgr->smgr_rlocator.backend);
1247
1248 if (persistence == RELPERSISTENCE_TEMP)
1249 bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, foundPtr);
1250 else
1251 bufHdr = BufferAlloc(smgr, persistence, forkNum, blockNum,
1252 strategy, foundPtr, io_context);
1253
1254 if (*foundPtr)
1255 TrackBufferHit(io_object, io_context, rel, persistence, smgr, forkNum, blockNum);
1256
1257 if (rel)
1258 {
1259 /*
1260 * While pgBufferUsage's "read" counter isn't bumped unless we reach
1261 * WaitReadBuffers() (so, not for hits, and not for buffers that are
1262 * zeroed instead), the per-relation stats always count them.
1263 */
1265 }
1266
1268}
1269
1270/*
1271 * ReadBuffer_common -- common logic for all ReadBuffer variants
1272 *
1273 * smgr is required, rel is optional unless using P_NEW.
1274 */
/*
 * Three paths are visible below:
 *  - blockNum == P_NEW: the request is handed to ExtendBufferedRel();
 *  - a block that pins the buffer with PinBufferForBlock() and then calls
 *    ZeroAndLockBuffer() (its opening condition is missing from this copy;
 *    presumably it selects the zero-and-lock read modes);
 *  - otherwise a ReadBuffersOperation is filled in and, judging by the
 *    surviving fragments, a single-block read is started and waited for.
 *
 * An ERROR is raised if rel is a temporary relation of another session.
 *
 * NOTE(review): the start of the signature (listing lines 1275-1276), the
 * parameter on line 1278 and the declaration on line 1281 are missing.  The
 * body uses rel, smgr, smgr_persistence, blockNum, mode and operation, none
 * of which are declared in the visible text.
 */
1277 ForkNumber forkNum,
1279 BufferAccessStrategy strategy)
1280{
1282 Buffer buffer;
1283 int flags;
1284 char persistence;
1285
1286 /*
1287 * Reject attempts to read non-local temporary relations; we would be
1288 * likely to get wrong data since we have no visibility into the owning
1289 * session's local buffers. This is the canonical place for the check,
1290 * covering the ReadBufferExtended() entry point and any other caller that
1291 * supplies a Relation.
1292 */
1293 if (rel && RELATION_IS_OTHER_TEMP(rel))
1294 ereport(ERROR,
1296 errmsg("cannot access temporary tables of other sessions")));
1297
1298 /*
1299 * Backward compatibility path, most code should use ExtendBufferedRel()
1300 * instead, as acquiring the extension lock inside ExtendBufferedRel()
1301 * scales a lot better.
1302 */
1303 if (unlikely(blockNum == P_NEW))
1304 {
/*
 * NOTE(review): the initial assignment to flags (listing line 1305) and the
 * condition guarding "flags |= EB_LOCK_FIRST" (listing line 1312) are missing
 * from this copy.
 */
1306
1307 /*
1308 * Since no-one else can be looking at the page contents yet, there is
1309 * no difference between an exclusive lock and a cleanup-strength
1310 * lock.
1311 */
1313 flags |= EB_LOCK_FIRST;
1314
1315 return ExtendBufferedRel(BMR_REL(rel), forkNum, strategy, flags);
1316 }
1317
/* Prefer the relation's own persistence; fall back to the caller's value. */
1318 if (rel)
1319 persistence = rel->rd_rel->relpersistence;
1320 else
1321 persistence = smgr_persistence;
1322
/*
 * NOTE(review): the condition that opens the following block (listing lines
 * 1323-1324) is missing, as are the declarations of io_context and io_object
 * and most of the assignments to them.
 */
1325 {
1326 bool found;
1329
1330 if (persistence == RELPERSISTENCE_TEMP)
1331 {
1334 }
1335 else
1336 {
1337 io_context = IOContextForStrategy(strategy);
1339 }
1340
1341 buffer = PinBufferForBlock(rel, smgr, persistence,
1342 forkNum, blockNum, strategy,
1343 io_object, io_context, &found);
1344 ZeroAndLockBuffer(buffer, mode, found);
1345 return buffer;
1346 }
1347
1348 /*
1349 * Signal that we are going to immediately wait. If we're immediately
1350 * waiting, there is no benefit in actually executing the IO
1351 * asynchronously, it would just add dispatch overhead.
1352 */
/*
 * NOTE(review): the assignment to flags that the comment above refers to
 * (listing line 1353) and the statement guarded by the test below (listing
 * line 1355) are missing.  Do not read "operation.smgr = smgr" as the body
 * of that test.
 */
1354 if (mode == RBM_ZERO_ON_ERROR)
1356 operation.smgr = smgr;
1357 operation.rel = rel;
1358 operation.persistence = persistence;
1359 operation.forknum = forkNum;
1360 operation.strategy = strategy;
/*
 * NOTE(review): the next three lines are the tail of a call whose first line
 * (listing line 1361) is missing, and the statement it guards (listing line
 * 1365) is missing too.  Presumably this starts the read and then waits for
 * it -- confirm against the full file.
 */
1362 &buffer,
1363 blockNum,
1364 flags))
1366
1367 return buffer;
1368}
1369
/*
 * Shared implementation behind StartReadBuffers() and StartReadBuffer().
 *
 * Pins buffers for up to *nblocks consecutive blocks starting at blockNum.
 * If the first block is already valid, *nblocks is set to 1 and false is
 * returned.  If a later block is already valid, the range is cut short just
 * before it and that buffer stays pinned for the next call.  Otherwise the
 * operation is populated, *nblocks is set to the number of blocks accepted,
 * and the return value tells the caller whether WaitReadBuffers() must be
 * called.
 *
 * allow_forwarding says whether entries of buffers[] other than
 * InvalidBuffer are to be treated as buffers pinned by an earlier, split
 * call.  An ERROR is raised if operation->rel is a temporary relation of
 * another session.
 *
 * NOTE(review): the first lines of the signature (listing lines 1370-1371,
 * naming the function and presumably declaring operation) and a number of
 * statements below are missing from this copy.  Gaps in the embedded line
 * numbers show where.  The function name is taken from its callers.
 */
1372 Buffer *buffers,
1373 BlockNumber blockNum,
1374 int *nblocks,
1375 int flags,
1376 bool allow_forwarding)
1377{
1378 int actual_nblocks = *nblocks;
1379 int maxcombine = 0;
1380 bool did_start_io;
1383
1384 Assert(*nblocks == 1 || allow_forwarding);
1385 Assert(*nblocks > 0);
1386 Assert(*nblocks <= MAX_IO_COMBINE_LIMIT);
1387
1388 /* see comments in ReadBuffer_common */
1389 if (operation->rel && RELATION_IS_OTHER_TEMP(operation->rel))
1390 ereport(ERROR,
1392 errmsg("cannot access temporary tables of other sessions")));
1393
/*
 * NOTE(review): the statements inside both arms below are missing from this
 * copy.
 */
1394 if (operation->persistence == RELPERSISTENCE_TEMP)
1395 {
1398 }
1399 else
1400 {
1403 }
1404
1405 for (int i = 0; i < actual_nblocks; ++i)
1406 {
1407 bool found;
1408
1409 if (allow_forwarding && buffers[i] != InvalidBuffer)
1410 {
1412
1413 /*
1414 * This is a buffer that was pinned by an earlier call to
1415 * StartReadBuffers(), but couldn't be handled in one operation at
1416 * that time. The operation was split, and the caller has passed
1417 * an already pinned buffer back to us to handle the rest of the
1418 * operation. It must continue at the expected block number.
1419 */
1420 Assert(BufferGetBlockNumber(buffers[i]) == blockNum + i);
1421
1422 /*
1423 * It might be an already valid buffer (a hit) that followed the
1424 * final contiguous block of an earlier I/O (a miss) marking the
1425 * end of it, or a buffer that some other backend has since made
1426 * valid by performing the I/O for us, in which case we can handle
1427 * it as a hit now. It is safe to check for a BM_VALID flag with
1428 * a relaxed load, because we got a fresh view of it while pinning
1429 * it in the previous call.
1430 *
1431 * On the other hand if we don't see BM_VALID yet, it must be an
1432 * I/O that was split by the previous call and we need to try to
1433 * start a new I/O from this block. We're also racing against any
1434 * other backend that might start the I/O or even manage to mark
1435 * it BM_VALID after this check, but StartBufferIO() will handle
1436 * those cases.
1437 */
1438 if (BufferIsLocal(buffers[i]))
1439 bufHdr = GetLocalBufferDescriptor(-buffers[i] - 1);
1440 else
1441 bufHdr = GetBufferDescriptor(buffers[i] - 1);
1443 found = pg_atomic_read_u64(&bufHdr->state) & BM_VALID;
1444 }
1445 else
1446 {
/*
 * NOTE(review): one argument line of this call (listing line 1453) is
 * missing from this copy.
 */
1447 buffers[i] = PinBufferForBlock(operation->rel,
1448 operation->smgr,
1449 operation->persistence,
1450 operation->forknum,
1451 blockNum + i,
1452 operation->strategy,
1454 &found);
1455 }
1456
1457 if (found)
1458 {
1459 /*
1460 * We have a hit. If it's the first block in the requested range,
1461 * we can return it immediately and report that WaitReadBuffers()
1462 * does not need to be called. If the initial value of *nblocks
1463 * was larger, the caller will have to call again for the rest.
1464 */
1465 if (i == 0)
1466 {
1467 *nblocks = 1;
1468
1469#ifdef USE_ASSERT_CHECKING
1470
1471 /*
1472 * Initialize enough of ReadBuffersOperation to make
1473 * CheckReadBuffersOperation() work. Outside of assertions
1474 * that's not necessary when no IO is issued.
1475 */
1476 operation->buffers = buffers;
1477 operation->blocknum = blockNum;
1478 operation->nblocks = 1;
1479 operation->nblocks_done = 1;
1481#endif
1482 return false;
1483 }
1484
1485 /*
1486 * Otherwise we already have an I/O to perform, but this block
1487 * can't be included as it is already valid. Split the I/O here.
1488 * There may or may not be more blocks requiring I/O after this
1489 * one, we haven't checked, but they can't be contiguous with this
1490 * one in the way. We'll leave this buffer pinned, forwarding it
1491 * to the next call, avoiding the need to unpin it here and re-pin
1492 * it in the next call.
1493 */
1494 actual_nblocks = i;
1495 break;
1496 }
1497 else
1498 {
1499 /*
1500 * Check how many blocks we can cover with the same IO. The smgr
1501 * implementation might e.g. be limited due to a segment boundary.
1502 */
1503 if (i == 0 && actual_nblocks > 1)
1504 {
/*
 * NOTE(review): the call that computes maxcombine (listing line 1505), the
 * comparison that opens the inner block (line 1508) and the statement after
 * the elog() (line 1512) are missing.  Presumably actual_nblocks is reduced
 * to maxcombine there, as the log message suggests.
 */
1506 operation->forknum,
1507 blockNum);
1509 {
1510 elog(DEBUG2, "limiting nblocks at %u from %u to %u",
1511 blockNum, actual_nblocks, maxcombine);
1513 }
1514 }
1515 }
1516 }
1517 *nblocks = actual_nblocks;
1518
1519 /* Populate information needed for I/O. */
1520 operation->buffers = buffers;
1521 operation->blocknum = blockNum;
1522 operation->flags = flags;
1523 operation->nblocks = actual_nblocks;
1524 operation->nblocks_done = 0;
1525 pgaio_wref_clear(&operation->io_wref);
1526
1527 /*
1528 * When using AIO, start the IO in the background. If not, issue prefetch
1529 * requests if desired by the caller.
1530 *
1531 * The reason we have a dedicated path for IOMETHOD_SYNC here is to
1532 * de-risk the introduction of AIO somewhat. It's a large architectural
1533 * change, with lots of chances for unanticipated performance effects.
1534 *
1535 * Use of IOMETHOD_SYNC already leads to not actually performing IO
1536 * asynchronously, but without the check here we'd execute IO earlier than
1537 * we used to. Eventually this IOMETHOD_SYNC specific path should go away.
1538 */
1539 if (io_method != IOMETHOD_SYNC)
1540 {
1541 /*
1542 * Try to start IO asynchronously. It's possible that no IO needs to
1543 * be started, if another backend already performed the IO.
1544 *
1545 * Note that if an IO is started, it might not cover the entire
1546 * requested range, e.g. because an intermediary block has been read
1547 * in by another backend. In that case any "trailing" buffers we
1548 * already pinned above will be "forwarded" by read_stream.c to the
1549 * next call to StartReadBuffers().
1550 *
1551 * This is signalled to the caller by decrementing *nblocks *and*
1552 * reducing operation->nblocks. The latter is done here, but not below
1553 * WaitReadBuffers(), as in WaitReadBuffers() we can't "shorten" the
1554 * overall read size anymore, we need to retry until done in its
1555 * entirety or until failed.
1556 */
/*
 * NOTE(review): the statement that starts the IO and sets did_start_io on
 * this path (listing line 1557) is missing from this copy.
 */
1558
1559 operation->nblocks = *nblocks;
1560 }
1561 else
1562 {
1564
1565 if (flags & READ_BUFFERS_ISSUE_ADVICE)
1566 {
1567 /*
1568 * In theory we should only do this if PinBufferForBlock() had to
1569 * allocate new buffers above. That way, if two calls to
1570 * StartReadBuffers() were made for the same blocks before
1571 * WaitReadBuffers(), only the first would issue the advice.
1572 * That'd be a better simulation of true asynchronous I/O, which
1573 * would only start the I/O once, but isn't done here for
1574 * simplicity.
1575 */
1576 smgrprefetch(operation->smgr,
1577 operation->forknum,
1578 blockNum,
1580 }
1581
1582 /*
1583 * Indicate that WaitReadBuffers() should be called. WaitReadBuffers()
1584 * will initiate the necessary IO.
1585 */
1586 did_start_io = true;
1587 }
1588
1590
1591 return did_start_io;
1592}
1593
1594/*
1595 * Begin reading a range of blocks beginning at blockNum and extending for
1596 * *nblocks. *nblocks and the buffers array are in/out parameters. On entry,
1597 * the buffers elements covered by *nblocks must hold either InvalidBuffer or
1598 * buffers forwarded by an earlier call to StartReadBuffers() that was split
1599 * and is now being continued. On return, *nblocks holds the number of blocks
1600 * accepted by this operation. If it is less than the original number then
1601 * this operation has been split, but buffer elements up to the original
1602 * requested size may hold forwarded buffers to be used for a continuing
1603 * operation. The caller must either start a new I/O beginning at the block
1604 * immediately following the blocks accepted by this call and pass those
1605 * buffers back in, or release them if it chooses not to. It shouldn't make
1606 * any other use of or assumptions about forwarded buffers.
1607 *
1608 * If false is returned, no I/O is necessary and the buffers covered by
1609 * *nblocks on exit are valid and ready to be accessed. If true is returned,
1610 * an I/O has been started, and WaitReadBuffers() must be called with the same
1611 * operation object before the buffers covered by *nblocks on exit can be
1612 * accessed. Along with the operation object, the caller-supplied array of
1613 * buffers must remain valid until WaitReadBuffers() is called, and any
1614 * forwarded buffers must also be preserved for a continuing call unless
1615 * they are explicitly released.
1616 */
1617bool
/*
 * NOTE(review): the line naming this function and declaring its first
 * parameter (listing line 1618) is missing from this copy.  Per the comment
 * above this is StartReadBuffers(), and the first parameter is presumably
 * operation.
 *
 * Thin wrapper: all work is done by StartReadBuffersImpl(), called with
 * allow_forwarding set to true.
 */
1619 Buffer *buffers,
1620 BlockNumber blockNum,
1621 int *nblocks,
1622 int flags)
1623{
1624 return StartReadBuffersImpl(operation, buffers, blockNum, nblocks, flags,
1625 true /* expect forwarded buffers */ );
1626}
1627
1628/*
1629 * Single block version of StartReadBuffers(). This might save a few
1630 * instructions when called from another translation unit, because it is
1631 * specialized for nblocks == 1.
1632 *
1633 * This version does not support "forwarded" buffers: they cannot be created
1634 * by reading only one block and *buffer is ignored on entry.
1635 */
1636bool
/*
 * NOTE(review): the line naming this function and declaring its first
 * parameter (listing line 1637) is missing from this copy; presumably the
 * first parameter is operation.
 *
 * The return value is that of StartReadBuffersImpl(), which is called with
 * allow_forwarding set to false.
 */
1638 Buffer *buffer,
1639 BlockNumber blocknum,
1640 int flags)
1641{
1642 int nblocks = 1;
1643 bool result;
1644
1645 result = StartReadBuffersImpl(operation, buffer, blocknum, &nblocks, flags,
1646 false /* single block, no forwarding */ );
1647 Assert(nblocks == 1); /* single block can't be short */
1648
1649 return result;
1650}
1651
1652/*
1653 * Perform sanity checks on the ReadBuffersOperation.
1654 */
/*
 * The checks exist only under USE_ASSERT_CHECKING; otherwise the body is
 * empty.  The visible assertions are: nblocks_done never exceeds nblocks;
 * when is_complete is set, nblocks_done equals nblocks; and buffers[i] holds
 * block blocknum + i.
 *
 * NOTE(review): the line naming this function and its parameters (listing
 * line 1656) and several statements inside the loop are missing from this
 * copy, including the one guarded by "if (i < operation->nblocks_done)".
 * The name is taken from a comment in StartReadBuffersImpl(); the parameters
 * are presumably operation and is_complete.
 */
1655static void
1657{
1658#ifdef USE_ASSERT_CHECKING
1659 Assert(operation->nblocks_done <= operation->nblocks);
1660 Assert(!is_complete || operation->nblocks == operation->nblocks_done);
1661
1662 for (int i = 0; i < operation->nblocks; i++)
1663 {
1664 Buffer buffer = operation->buffers[i];
1668
1669 Assert(BufferGetBlockNumber(buffer) == operation->blocknum + i);
1671
1672 if (i < operation->nblocks_done)
1674 }
1675#endif
1676}
1677
1678/*
1679 * We track various stats related to buffer hits. Because this is done in a
1680 * few separate places, this helper exists for convenience.
1681 */
/*
 * NOTE(review): almost every statement of this function is missing from this
 * copy: the start of the signature (listing lines 1682-1683), most of the
 * call whose trailing arguments appear first in the body, and the statement
 * belonging to each of the tests below.  Only the shape survives: one action
 * that differs between temporary and other relations, one that happens only
 * when VacuumCostActive is set, and one that happens only when rel is given.
 * The function name is taken from its callers, which pass io_object and
 * io_context ahead of the parameters shown here.
 */
1684 Relation rel, char persistence, SMgrRelation smgr,
1685 ForkNumber forknum, BlockNumber blocknum)
1686{
1688 blocknum,
1692 smgr->smgr_rlocator.backend,
1693 true);
1694
1695 if (persistence == RELPERSISTENCE_TEMP)
1697 else
1699
1701
1702 if (VacuumCostActive)
1704
1705 if (rel)
1707}
1708
1709/*
1710 * Helper for WaitReadBuffers() that processes the results of a readv
1711 * operation, raising an error if necessary.
1712 */
/*
 * The result is read from operation->io_return.  An error result is reported
 * at ERROR, a warning result at WARNING, and a partial read at DEBUG1
 * followed by a DEBUG3 message.  Unless the status is an error, the block
 * count reported by the IO is added to operation->nblocks_done.
 *
 * NOTE(review): the line naming this function and its parameter (listing
 * line 1714), the declaration of rs (line 1717) and the statements on lines
 * 1744-1745 are missing from this copy.  rs is presumably
 * aio_ret->result.status.  The function name is taken from a comment in
 * AsyncReadBuffers().
 */
1713static void
1715{
1716 PgAioReturn *aio_ret = &operation->io_return;
1718 int newly_read_blocks = 0;
1719
1720 Assert(pgaio_wref_valid(&operation->io_wref));
1721 Assert(aio_ret->result.status != PGAIO_RS_UNKNOWN);
1722
1723 /*
1724 * SMGR reports the number of blocks successfully read as the result of
1725 * the IO operation. Thus we can simply add that to ->nblocks_done.
1726 */
1727
1728 if (likely(rs != PGAIO_RS_ERROR))
1729 newly_read_blocks = aio_ret->result.result;
1730
1731 if (rs == PGAIO_RS_ERROR || rs == PGAIO_RS_WARNING)
1732 pgaio_result_report(aio_ret->result, &aio_ret->target_data,
1733 rs == PGAIO_RS_ERROR ? ERROR : WARNING);
1734 else if (aio_ret->result.status == PGAIO_RS_PARTIAL)
1735 {
1736 /*
1737 * We'll retry, so we just emit a debug message to the server log (or
1738 * not even that in prod scenarios).
1739 */
1740 pgaio_result_report(aio_ret->result, &aio_ret->target_data, DEBUG1);
1741 elog(DEBUG3, "partial read, will retry");
1742 }
1743
1746
1747 operation->nblocks_done += newly_read_blocks;
1748
1749 Assert(operation->nblocks_done <= operation->nblocks);
1750}
1751
1752/*
1753 * Wait for the IO operation initiated by StartReadBuffers() et al to
1754 * complete.
1755 *
1756 * Returns true if we needed to wait for the IO operation, false otherwise.
1757 */
/*
 * NOTE(review): the line naming this function and its parameter (listing
 * line 1759) and a number of statements below are missing from this copy.
 * Gaps in the embedded line numbers show where.
 */
1758bool
1760{
1761 PgAioReturn *aio_ret = &operation->io_return;
1764 bool needed_wait = false;
1765
/*
 * NOTE(review): the statements inside both arms below are missing from this
 * copy.
 */
1766 if (operation->persistence == RELPERSISTENCE_TEMP)
1767 {
1770 }
1771 else
1772 {
1775 }
1776
1777 /*
1778 * If we get here without an IO operation having been issued, the
1779 * io_method == IOMETHOD_SYNC path must have been used. Otherwise the
1780 * caller should not have called WaitReadBuffers().
1781 *
1782 * In the case of IOMETHOD_SYNC, we start - as we used to before the
1783 * introduction of AIO - the IO in WaitReadBuffers(). This is done as part
1784 * of the retry logic below, no extra code is required.
1785 *
1786 * This path is expected to eventually go away.
1787 */
1788 if (!pgaio_wref_valid(&operation->io_wref) && io_method != IOMETHOD_SYNC)
1789 elog(ERROR, "waiting for read operation that didn't read");
1790
1791 /*
1792 * To handle partial reads, and IOMETHOD_SYNC, we re-issue IO until we're
1793 * done. We may need multiple retries, not just because we could get
1794 * multiple partial reads, but also because some of the remaining
1795 * to-be-read buffers may have been read in by other backends, limiting
1796 * the IO size.
1797 */
1798 while (true)
1799 {
1801
1803
1804 /*
1805 * If there is an IO associated with the operation, we may need to
1806 * wait for it.
1807 */
1808 if (pgaio_wref_valid(&operation->io_wref))
1809 {
1810 /*
1811 * Track the time spent waiting for the IO to complete. As
1812 * tracking a wait even if we don't actually need to wait
1813 *
1814 * a) is not cheap, due to the timestamping overhead
1815 *
1816 * b) reports some time as waiting, even if we never waited
1817 *
1818 * we first check if we already know the IO is complete.
1819 *
1820 * Note that operation->io_return is uninitialized for foreign IO,
1821 * so we cannot use the cheaper PGAIO_RS_UNKNOWN pre-check.
1822 */
1823 if ((operation->foreign_io || aio_ret->result.status == PGAIO_RS_UNKNOWN) &&
1824 !pgaio_wref_check_done(&operation->io_wref))
1825 {
/*
 * NOTE(review): the line that obtains io_start (listing line 1826) and the
 * first line of the stats call whose trailing arguments appear below
 * (listing line 1835) are missing from this copy.
 */
1827
1828 pgaio_wref_wait(&operation->io_wref);
1829 needed_wait = true;
1830
1831 /*
1832 * The IO operation itself was already counted earlier, in
1833 * AsyncReadBuffers(), this just accounts for the wait time.
1834 */
1836 io_start, 0, 0);
1837 }
1838 else
1839 {
1841 }
1842
1843 if (unlikely(operation->foreign_io))
1844 {
1845 Buffer buffer = operation->buffers[operation->nblocks_done];
/*
 * NOTE(review): the lines that look up the buffer descriptor and read
 * buf_state (listing lines 1846-1849) are missing from this copy.
 */
1850
1851 if (buf_state & BM_VALID)
1852 {
1853 BlockNumber blocknum = operation->blocknum + operation->nblocks_done;
1854
1855 operation->nblocks_done += 1;
1856 Assert(operation->nblocks_done <= operation->nblocks);
1857
1858 /*
1859 * Track this as a 'hit' for this backend. The backend
1860 * performing the IO will track it as a 'read'.
1861 */
1863 operation->rel, operation->persistence,
1864 operation->smgr, operation->forknum,
1865 blocknum);
1866 }
1867
1868 /*
1869 * If the foreign IO failed and left the buffer invalid,
1870 * nblocks_done is not incremented. The retry loop below will
1871 * call AsyncReadBuffers() which will attempt the IO itself.
1872 */
1873 }
1874 else
1875 {
1876 /*
1877 * We now are sure the IO completed. Check the results. This
1878 * includes reporting on errors if there were any.
1879 */
1881 }
1882 }
1883
1884 /*
1885 * Most of the time, the one IO we already started will read in
1886 * everything. But we need to deal with partial reads and buffers not
1887 * needing IO anymore.
1888 */
1889 if (operation->nblocks_done == operation->nblocks)
1890 break;
1891
1893
1894 /*
1895 * If the IO completed only partially, we need to perform additional
1896 * work, consider that a form of having had to wait.
1897 */
1898 needed_wait = true;
1899
1900 /*
1901 * This may only complete the IO partially, either because some
1902 * buffers were already valid, or because of a partial read.
1903 *
1904 * NB: In contrast to after the AsyncReadBuffers() call in
1905 * StartReadBuffers(), we do *not* reduce
1906 * ReadBuffersOperation->nblocks here, callers expect the full
1907 * operation to be completed at this point (as more operations may
1908 * have been queued).
1909 */
/*
 * NOTE(review): the statement the comment above describes (listing line
 * 1910) is missing from this copy; presumably it is the AsyncReadBuffers()
 * call.
 */
1911 }
1912
1914
1915 /* NB: READ_DONE tracepoint was already executed in completion callback */
1916 return needed_wait;
1917}
1918
1919/*
1920 * Initiate IO for the ReadBuffersOperation
1921 *
1922 * This function only starts a single IO at a time. The size of the IO may be
1923 * limited to below the to-be-read blocks, if one of the buffers has
1924 * concurrently been read in. If the first to-be-read buffer is already valid,
1925 * no IO will be issued.
1926 *
1927 * To support retries after partial reads, the first operation->nblocks_done
1928 * buffers are skipped.
1929 *
1930 * On return *nblocks_progress is updated to reflect the number of buffers
1931 * affected by the call. If the first buffer is valid, *nblocks_progress is
1932 * set to 1 and operation->nblocks_done is incremented.
1933 *
1934 * Returns true if IO was initiated or is already in progress (foreign IO),
1935 * false if the buffer was already valid.
1936 */
/*
 * NOTE(review): the line naming this function and its parameters (listing
 * line 1938) is missing from this copy, as are several declarations (for
 * example ioh, io_pages and status, all used below) and a number of
 * statements.  Gaps in the embedded line numbers show where.  The function
 * name is taken from comments elsewhere in this file; the parameters are
 * presumably operation and nblocks_progress.
 */
1937static bool
1939{
1940 Buffer *buffers = &operation->buffers[0];
1941 int flags = operation->flags;
1942 ForkNumber forknum = operation->forknum;
1943 char persistence = operation->persistence;
1944 int16 nblocks_done = operation->nblocks_done;
1945 BlockNumber blocknum = operation->blocknum + nblocks_done;
1946 Buffer *io_buffers = &operation->buffers[nblocks_done];
1947 int io_buffers_len = 0;
1949 uint32 ioh_flags = 0;
1955
1956 if (persistence == RELPERSISTENCE_TEMP)
1957 {
1960 }
1961 else
1962 {
1965 }
1966
1967 /*
1968 * When this IO is executed synchronously, either because the caller will
1969 * immediately block waiting for the IO or because IOMETHOD_SYNC is used,
1970 * the AIO subsystem needs to know.
1971 */
/*
 * NOTE(review): the statements guarded by the next two tests (listing lines
 * 1973 and 1976) are missing from this copy.  Do not read the second test as
 * the body of the first.
 */
1972 if (flags & READ_BUFFERS_SYNCHRONOUSLY)
1974
1975 if (persistence == RELPERSISTENCE_TEMP)
1977
1978 /*
1979 * If zero_damaged_pages is enabled, add the READ_BUFFERS_ZERO_ON_ERROR
1980 * flag. The reason for that is that, hopefully, zero_damaged_pages isn't
1981 * set globally, but on a per-session basis. The completion callback,
1982 * which may be run in other processes, e.g. in IO workers, may have a
1983 * different value of the zero_damaged_pages GUC.
1984 *
1985 * XXX: We probably should eventually use a different flag for
1986 * zero_damaged_pages, so we can report different log levels / error codes
1987 * for zero_damaged_pages and ZERO_ON_ERROR.
1988 */
1991
1992 /*
1993 * For the same reason as with zero_damaged_pages we need to use this
1994 * backend's ignore_checksum_failure value.
1995 */
1998
1999
2000 /*
2001 * To be allowed to report stats in the local completion callback we need
2002 * to prepare to report stats now. This ensures we can safely report the
2003 * checksum failure even in a critical section.
2004 */
2005 pgstat_prepare_report_checksum_failure(operation->smgr->smgr_rlocator.locator.dbOid);
2006
2007 /*
2008 * We must get an IO handle before StartBufferIO(), as pgaio_io_acquire()
2009 * might block, which we don't want after setting IO_IN_PROGRESS. If we
2010 * don't need to do the IO, we'll release the handle.
2011 *
2012 * If we need to wait for IO before we can get a handle, submit
2013 * already-staged IO first, so that other backends don't need to wait.
2014 * There wouldn't be a deadlock risk, as pgaio_io_acquire() just needs to
2015 * wait for already submitted IO, which doesn't require additional locks,
2016 * but it could still cause undesirable waits.
2017 *
2018 * A secondary benefit is that this would allow us to measure the time in
2019 * pgaio_io_acquire() without causing undue timer overhead in the common,
2020 * non-blocking, case. However, currently the pgstats infrastructure
2021 * doesn't really allow that, as it a) asserts that an operation can't
2022 * have time without operations b) doesn't have an API to report
2023 * "accumulated" time.
2024 */
2026 if (unlikely(!ioh))
2027 {
2030 }
2031
2032 operation->foreign_io = false;
2033 pgaio_wref_clear(&operation->io_wref);
2034
2035 /*
2036 * Try to start IO on the first buffer in a new run of blocks. If AIO is
2037 * in progress, be it in this backend or another backend, we just
2038 * associate the wait reference with the operation and wait in
2039 * WaitReadBuffers(). This turns out to be important for performance in
2040 * two workloads:
2041 *
2042 * 1) A read stream that has to read the same block multiple times within
2043 * the readahead distance. This can happen e.g. for the table accesses of
2044 * an index scan.
2045 *
2046 * 2) Concurrent scans by multiple backends on the same relation.
2047 *
2048 * If we were to synchronously wait for the in-progress IO, we'd not be
2049 * able to keep enough I/O in flight.
2050 *
2051 * If we do find there is ongoing I/O for the buffer, we set up a 1-block
2052 * ReadBuffersOperation that WaitReadBuffers then can wait on.
2053 *
2054 * It's possible that another backend has started IO on the buffer but not
2055 * yet set its wait reference. In this case, we have no choice but to wait
2056 * for either the wait reference to be valid or the IO to be done.
2057 */
2058 status = StartBufferIO(buffers[nblocks_done], true, true,
2059 &operation->io_wref);
/*
 * NOTE(review): the test on status that opens the following block (listing
 * line 2060) and the test that opens the inner block (listing line 2064) are
 * missing from this copy.  Per the comments inside, the outer block covers
 * the cases where this backend does not start the IO itself, and the inner
 * block the case where the buffer is already valid.
 */
2061 {
2063 *nblocks_progress = 1;
2065 {
2066 /*
2067 * Someone has already completed this block, we're done.
2068 *
2069 * When IO is necessary, ->nblocks_done is updated in
2070 * ProcessReadBuffersResult(), but that is not called if no IO is
2071 * necessary. Thus update here.
2072 */
2073 operation->nblocks_done += 1;
2074 Assert(operation->nblocks_done <= operation->nblocks);
2075
2076 Assert(!pgaio_wref_valid(&operation->io_wref));
2077
2078 /*
2079 * Report and track this as a 'hit' for this backend, even though
2080 * it must have started out as a miss in PinBufferForBlock(). The
2081 * other backend will track this as a 'read'.
2082 */
2084 operation->rel, operation->persistence,
2085 operation->smgr, operation->forknum,
2086 blocknum);
2087 return false;
2088 }
2089
2090 /* The IO is already in-progress */
2092 Assert(pgaio_wref_valid(&operation->io_wref));
2093 operation->foreign_io = true;
2094
2095 return true;
2096 }
2097
2098 Assert(io_buffers[0] == buffers[nblocks_done]);
2099 io_pages[0] = BufferGetBlock(buffers[nblocks_done]);
2100 io_buffers_len = 1;
2101
2102 /*
2103 * NB: As little code as possible should be added between the
2104 * StartBufferIO() above, the further StartBufferIO()s below and the
2105 * smgrstartreadv(), as some of the buffers are now marked as
2106 * IO_IN_PROGRESS and will thus cause other backends to wait.
2107 */
2108
2109 /*
2110 * How many neighboring-on-disk blocks can we scatter-read into other
2111 * buffers at the same time? In this case we don't wait if we see an I/O
2112 * already in progress (see comment above).
2113 */
2114 for (int i = nblocks_done + 1; i < operation->nblocks; i++)
2115 {
2116 /* Must be consecutive block numbers. */
2117 Assert(BufferGetBlockNumber(buffers[i - 1]) ==
2118 BufferGetBlockNumber(buffers[i]) - 1);
2119
2120 status = StartBufferIO(buffers[i], true, false, NULL);
/*
 * NOTE(review): the test on status that guards the break below (listing line
 * 2121) is missing from this copy; the break is not unconditional.
 */
2122 break;
2123
2124 Assert(io_buffers[io_buffers_len] == buffers[i]);
2125
2126 io_pages[io_buffers_len++] = BufferGetBlock(buffers[i]);
2127 }
2128
2129 /* get a reference to wait for in WaitReadBuffers() */
2130 pgaio_io_get_wref(ioh, &operation->io_wref);
2131
2132 /* provide the list of buffers to the completion callbacks */
2134
2136 persistence == RELPERSISTENCE_TEMP ?
2139 flags);
2140
2142
2143 /* ---
2144 * Even though we're trying to issue IO asynchronously, track the time
2145 * in smgrstartreadv():
2146 * - if io_method == IOMETHOD_SYNC, we will always perform the IO
2147 * immediately
2148 * - the io method might not support the IO (e.g. worker IO for a temp
2149 * table)
2150 * ---
2151 */
2153 smgrstartreadv(ioh, operation->smgr, forknum,
2154 blocknum,
2158
2159 if (persistence == RELPERSISTENCE_TEMP)
2161 else
2163
2164 /*
2165 * Track vacuum cost when issuing IO, not after waiting for it. Otherwise
2166 * we could end up issuing a lot of IO in a short timespan, despite a low
2167 * cost limit.
2168 */
2169 if (VacuumCostActive)
2171
2173
2174 return true;
2175}
2176
2177/*
2178 * BufferAlloc -- subroutine for PinBufferForBlock. Handles lookup of a shared
2179 * buffer. If no buffer exists already, selects a replacement victim and
2180 * evicts the old page, but does NOT read in new page.
2181 *
2182 * "strategy" can be a buffer replacement strategy object, or NULL for
2183 * the default strategy. The selected buffer's usage_count is advanced when
2184 * using the default strategy, but otherwise possibly not (see PinBuffer).
2185 *
2186 * The returned buffer is pinned and is already marked as holding the
2187 * desired page. If it already did have the desired page, *foundPtr is
2188 * set true. Otherwise, *foundPtr is set false.
2189 *
2190 * io_context is supplied by the caller, which has already determined it
2191 * (PinBufferForBlock() passes it through by value); it is not an output
2192 * parameter.
2193 *
2194 * No locks are held either at entry or exit.
2195 */
/*
 * NOTE(review): a number of lines are missing from this copy, including the
 * return type, the end of the parameter list, the hash and partition-lock
 * computation, the lookup and insert calls on the buffer table, the
 * partition-lock acquire/release calls, the victim acquisition and the
 * buffer-header lock/unlock.  The surviving comments say where each of these
 * happens; gaps in the embedded line numbers show the exact places.
 */
2197BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
2198 BlockNumber blockNum,
2199 BufferAccessStrategy strategy,
2201{
2202 BufferTag newTag; /* identity of requested block */
2203 uint32 newHash; /* hash value for newTag */
2204 LWLock *newPartitionLock; /* buffer partition lock for it */
2205 int existing_buf_id;
2209 uint64 set_bits = 0;
2210
2211 /* Make sure we will have room to remember the buffer pin */
2214
2215 /* create a tag so we can lookup the buffer */
2216 InitBufferTag(&newTag, &smgr->smgr_rlocator.locator, forkNum, blockNum);
2217
2218 /* determine its hash code and partition lock ID */
2221
2222 /* see if the block is in the buffer pool already */
2225 if (existing_buf_id >= 0)
2226 {
2227 BufferDesc *buf;
2228 bool valid;
2229
2230 /*
2231 * Found it. Now, pin the buffer so no one can steal it from the
2232 * buffer pool, and check to see if the correct data has been loaded
2233 * into the buffer.
2234 */
2236
2237 valid = PinBuffer(buf, strategy, false);
2238
2239 /* Can release the mapping lock as soon as we've pinned it */
2241
2242 *foundPtr = true;
2243
2244 if (!valid)
2245 {
2246 /*
2247 * We can only get here if (a) someone else is still reading in
2248 * the page, (b) a previous read attempt failed, or (c) someone
2249 * called StartReadBuffers() but not yet WaitReadBuffers().
2250 */
2251 *foundPtr = false;
2252 }
2253
2254 return buf;
2255 }
2256
2257 /*
2258 * Didn't find it in the buffer pool. We'll have to initialize a new
2259 * buffer. Remember to unlock the mapping lock while doing the work.
2260 */
2262
2263 /*
2264 * Acquire a victim buffer. Somebody else might try to do the same, we
2265 * don't hold any conflicting locks. If so we'll have to undo our work
2266 * later.
2267 */
2270
2271 /*
2272 * Try to make a hashtable entry for the buffer under its new tag. If
2273 * somebody else inserted another buffer for the tag, we'll release the
2274 * victim buffer we acquired and use the already inserted one.
2275 */
2278 if (existing_buf_id >= 0)
2279 {
2281 bool valid;
2282
2283 /*
2284 * Got a collision. Someone has already done what we were about to do.
2285 * We'll just handle this as if it were found in the buffer pool in
2286 * the first place. First, give up the buffer we were planning to
2287 * use.
2288 *
2289 * We could do this after releasing the partition lock, but then we'd
2290 * have to call ResourceOwnerEnlarge() & ReservePrivateRefCountEntry()
2291 * before acquiring the lock, for the rare case of such a collision.
2292 */
2294
2295 /* remaining code should match code at top of routine */
2296
2298
2299 valid = PinBuffer(existing_buf_hdr, strategy, false);
2300
2301 /* Can release the mapping lock as soon as we've pinned it */
2303
2304 *foundPtr = true;
2305
2306 if (!valid)
2307 {
2308 /*
2309 * We can only get here if (a) someone else is still reading in
2310 * the page, (b) a previous read attempt failed, or (c) someone
2311 * called StartReadBuffers() but not yet WaitReadBuffers().
2312 */
2313 *foundPtr = false;
2314 }
2315
2316 return existing_buf_hdr;
2317 }
2318
2319 /*
2320 * Need to lock the buffer header too in order to change its tag.
2321 */
2323
2324 /* some sanity checks while we hold the buffer header lock */
2327
2328 victim_buf_hdr->tag = newTag;
2329
2330 /*
2331 * Make sure BM_PERMANENT is set for buffers that must be written at every
2332 * checkpoint. Unlogged buffers only need to be written at shutdown
2333 * checkpoints, except for their "init" forks, which need to be treated
2334 * just like permanent relations.
2335 */
/*
 * NOTE(review): the statement guarded by the test below (listing line 2338)
 * and the first line of the call whose trailing arguments follow it (listing
 * line 2340) are missing from this copy.  Per the comment above, the guarded
 * statement presumably adds BM_PERMANENT to set_bits.
 */
2337 if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM)
2339
2341 set_bits, 0, 0);
2342
2344
2345 /*
2346 * Buffer contents are currently invalid.
2347 */
2348 *foundPtr = false;
2349
2350 return victim_buf_hdr;
2351}
2352
2353/*
2354 * InvalidateBuffer -- mark a shared buffer invalid.
2355 *
2356 * The buffer header spinlock must be held at entry. We drop it before
2357 * returning. (This is sane because the caller must have locked the
2358 * buffer in order to be sure it should be dropped.)
2359 *
2360 * This is used only in contexts such as dropping a relation. We assume
2361 * that no other backend could possibly be interested in using the page,
2362 * so the only reason the buffer might be pinned is if someone else is
2363 * trying to write it out. We have to let them finish before we can
2364 * reclaim the buffer.
2365 *
2366 * The buffer could get reclaimed by someone else while we are waiting
2367 * to acquire the necessary locks; if so, don't mess it up.
2368 */
2369static void
2371{
2373 uint32 oldHash; /* hash value for oldTag */
2374 LWLock *oldPartitionLock; /* buffer partition lock for it */
2377
2378 /* Save the original buffer tag before dropping the spinlock */
2379 oldTag = buf->tag;
2380
2382
2383 /*
2384 * Need to compute the old tag's hashcode and partition lock ID. XXX is it
2385 * worth storing the hashcode in BufferDesc so we need not recompute it
2386 * here? Probably not.
2387 */
2390
2391retry:
 /* Control comes back here after waiting for a conflicting pin's I/O (below). */
2392
2393 /*
2394 * Acquire exclusive mapping lock in preparation for changing the buffer's
2395 * association.
2396 */
2398
2399 /* Re-lock the buffer header */
2401
2402 /* If it's changed while we were waiting for lock, do nothing */
2403 if (!BufferTagsEqual(&buf->tag, &oldTag))
2404 {
2407 return;
2408 }
2409
2410 /*
2411 * We assume the reason for it to be pinned is that either we were
2412 * asynchronously reading the page in before erroring out or someone else
2413 * is flushing the page out. Wait for the IO to finish. (This could be
2414 * an infinite loop if the refcount is messed up... it would be nice to
2415 * time out after a while, but there seems no way to be sure how many loops
2416 * may be needed. Note that if the other guy has pinned the buffer but
2417 * not yet done StartBufferIO, WaitIO will fall through and we'll
2418 * effectively be busy-looping here.)
2419 */
2421 {
2424 /* safety check: should definitely not be our *own* pin */
2426 elog(ERROR, "buffer is pinned in InvalidateBuffer");
2427 WaitIO(buf);
2428 goto retry;
2429 }
2430
2431 /*
2432 * An invalidated buffer should not have any backends waiting to lock the
2433 * buffer, therefore BM_LOCK_WAKE_IN_PROGRESS should not be set.
2434 */
2436
2437 /*
2438 * Clear out the buffer's tag and flags. We must do this to ensure that
2439 * linear scans of the buffer array don't think the buffer is valid.
2440 */
2442 ClearBufferTag(&buf->tag);
2443
2445 0,
2447 0);
2448
2449 /*
2450 * Remove the buffer from the lookup hashtable, if it was in there.
2451 */
2452 if (oldFlags & BM_TAG_VALID)
2454
2455 /*
2456 * Done with mapping lock.
2457 */
2459}
2460
2461/*
2462 * Helper routine for GetVictimBuffer()
2463 *
2464 * Needs to be called on a buffer with a valid tag, pinned, but without the
2465 * buffer header spinlock held.
2466 *
2467 * Returns true if the buffer can be reused, in which case the buffer is only
2468 * pinned by this backend and marked as invalid, false otherwise.
2469 */
2470static bool
2472{
2474 uint32 hash;
2476 BufferTag tag;
2477
2479
2480 /* have buffer pinned, so it's safe to read tag without lock */
2481 tag = buf_hdr->tag;
2482
2483 hash = BufTableHashCode(&tag);
2485
2487
2488 /* lock the buffer header */
2490
2491 /*
2492 * We have the buffer pinned, so nobody else should have been able to
2493 * unset this concurrently.
2494 */
2497 Assert(BufferTagsEqual(&buf_hdr->tag, &tag));
2498
2499 /*
2500 * If somebody else pinned the buffer since, or even worse, dirtied it,
2501 * give up on this buffer: It's clearly in use.
2502 */
2504 {
2506
2509
2510 return false;
2511 }
2512
2513 /*
2514 * An invalidated buffer should not have any backends waiting to lock the
2515 * buffer, therefore BM_LOCK_WAKE_IN_PROGRESS should not be set.
2516 */
2518
2519 /*
2520 * Clear out the buffer's tag and flags and usagecount. This is not
2521 * strictly required, as BM_TAG_VALID/BM_VALID needs to be checked before
2522 * doing anything with the buffer. But currently it's beneficial, as the
2523 * cheaper pre-check for several linear scans of shared buffers use the
2524 * tag (see e.g. FlushDatabaseBuffers()).
2525 */
2526 ClearBufferTag(&buf_hdr->tag);
2528 0,
2530 0);
2531
2533
2534 /* finally delete buffer from the buffer mapping table */
2535 BufTableDelete(&tag, hash);
2536
2538
2543
2544 return true;
2545}
2546
/*
 * GetVictimBuffer -- obtain a buffer that can be reused for a new page.
 *
 * Invoked as GetVictimBuffer(strategy, io_context). As the comments in the
 * body describe, a candidate is selected and pinned, written out first if it
 * is dirty, counted as an eviction or a ring reuse if it held a valid page,
 * and finally removed from the buffer mapping table. Whenever the candidate
 * turns out to be in use by someone else, we jump back to "again" and pick a
 * different one.
 *
 * Returns the chosen buffer (the local variable "buf").
 */
2547static Buffer
2549{
2551 Buffer buf;
2553 bool from_ring;
2554
2555 /*
2556 * Ensure, before we pin a victim buffer, that there's a free refcount
2557 * entry and resource owner slot for the pin.
2558 */
2561
2562 /* we return here if a prospective victim buffer gets used concurrently */
2563again:
2564
2565 /*
2566 * Select a victim buffer. The buffer is returned pinned and owned by
2567 * this backend.
2568 */
2571
2572 /*
2573 * We shouldn't have any other pins for this buffer.
2574 */
2576
2577 /*
2578 * If the buffer was dirty, try to write it out. There is a race
2579 * condition here, another backend could dirty the buffer between
2580 * StrategyGetBuffer() checking that it is not in use and invalidating the
2581 * buffer below. That's addressed by InvalidateVictimBuffer() verifying
2582 * that the buffer is not dirty.
2583 */
2584 if (buf_state & BM_DIRTY)
2585 {
2588
2589 /*
2590 * We need a share-exclusive lock on the buffer contents to write it
2591 * out (else we might write invalid data, eg because someone else is
2592 * compacting the page contents while we write). We must use a
2593 * conditional lock acquisition here to avoid deadlock. Even though
2594 * the buffer was not pinned (and therefore surely not locked) when
2595 * StrategyGetBuffer returned it, someone else could have pinned and
2596 * (share-)exclusive-locked it by the time we get here. If we try to
2597 * get the lock unconditionally, we'd block waiting for them; if they
2598 * later block waiting for us, deadlock ensues. (This has been
2599 * observed to happen when two backends are both trying to split btree
2600 * index pages, and the second one just happens to be trying to split
2601 * the page the first one got from StrategyGetBuffer.)
2602 */
2604 {
2605 /*
2606 * Someone else has locked the buffer, so give it up and loop back
2607 * to get another one.
2608 */
2610 goto again;
2611 }
2612
2613 /*
2614 * If using a nondefault strategy, and this victim came from the
2615 * strategy ring, let the strategy decide whether to reject it when
2616 * reusing it would require a WAL flush. This only applies to
2617 * permanent buffers; unlogged buffers can have fake LSNs, so
2618 * XLogNeedsFlush() is not meaningful for them.
2619 *
2620 * We need to hold the content lock in at least share-exclusive mode
2621 * to safely inspect the page LSN, so this couldn't have been done
2622 * inside StrategyGetBuffer().
2623 */
2624 if (strategy && from_ring &&
2628 {
2630 goto again;
2631 }
2632
2633 /* OK, do the I/O */
2636
2638 &buf_hdr->tag);
2639 }
2640
2641
2642 if (buf_state & BM_VALID)
2643 {
2644 /*
2645 * When a BufferAccessStrategy is in use, blocks evicted from shared
2646 * buffers are counted as IOOP_EVICT in the corresponding context
2647 * (e.g. IOCONTEXT_BULKWRITE). Shared buffers are evicted by a
2648 * strategy in two cases: 1) while initially claiming buffers for the
2649 * strategy ring 2) to replace an existing strategy ring buffer
2650 * because it is pinned or in use and cannot be reused.
2651 *
2652 * Blocks evicted from buffers already in the strategy ring are
2653 * counted as IOOP_REUSE in the corresponding strategy context.
2654 *
2655 * At this point, we can accurately count evictions and reuses,
2656 * because we have successfully claimed the valid buffer. Previously,
2657 * we may have been forced to release the buffer due to concurrent
2658 * pinners or erroring out.
2659 */
2661 from_ring ? IOOP_REUSE : IOOP_EVICT, 1, 0);
2662 }
2663
2664 /*
2665 * If the buffer has an entry in the buffer mapping table, delete it. This
2666 * can fail because another backend could have pinned or dirtied the
2667 * buffer.
2668 */
2670 {
2672 goto again;
2673 }
2674
2675 /* a final set of sanity checks */
2676#ifdef USE_ASSERT_CHECKING
2678
2681
2683#endif
2684
2685 return buf;
2686}
2687
2688/*
2689 * Return the maximum number of buffers that a backend should try to pin at
2690 * once, to avoid exceeding its fair share. This is the highest value that
2691 * GetAdditionalPinLimit() could ever return. Note that it may be zero on a
2692 * system with a very small buffer pool relative to max_connections.
2693 */
2694uint32
2696{
2697 return MaxProportionalPins;
2698}
2699
2700/*
2701 * Return the maximum number of additional buffers that this backend should
2702 * pin if it wants to stay under the per-backend limit, considering the number
2703 * of buffers it has already pinned. Unlike LimitAdditionalPins(), the limit
2704 * returned by this function can be zero.
2705 */
2706uint32
2708{
2710
2711 /*
2712 * We get the number of "overflowed" pins for free, but don't know the
2713 * number of pins in PrivateRefCountArray. The cost of calculating that
2714 * exactly doesn't seem worth it, so just assume the max.
2715 */
2717
2718 /* Is this backend already holding more than its fair share? */
2720 return 0;
2721
2723}
2724
2725/*
2726 * Limit the number of pins a batch operation may additionally acquire, to
2727 * avoid running out of pinnable buffers.
2728 *
2729 * One additional pin is always allowed, on the assumption that the operation
2730 * requires at least one to make progress.
2731 *
2732 * *additional_pins is both input and output: it is only ever lowered, never
2733 * raised.
2734 */
2732void
2734{
2735 uint32 limit;
2736
 /* A request for at most one pin is always granted unchanged. */
2737 if (*additional_pins <= 1)
2738 return;
2739
2740 limit = GetAdditionalPinLimit();
 /* Never clamp below one, per the guarantee in the header comment. */
2741 limit = Max(limit, 1);
2742 if (limit < *additional_pins)
2743 *additional_pins = limit;
2744}
2745
2746/*
2747 * Logic shared between ExtendBufferedRelBy() and ExtendBufferedRelTo(). Just
2748 * to avoid duplicating the tracing and relpersistence related logic.
2749 *
2750 * Returns first_block as reported by the persistence-specific implementation.
2751 */
2750static BlockNumber
2753 BufferAccessStrategy strategy,
2754 uint32 flags,
2757 Buffer *buffers,
2759{
2761
2763 BMR_GET_SMGR(bmr)->smgr_rlocator.locator.spcOid,
2764 BMR_GET_SMGR(bmr)->smgr_rlocator.locator.dbOid,
2765 BMR_GET_SMGR(bmr)->smgr_rlocator.locator.relNumber,
2766 BMR_GET_SMGR(bmr)->smgr_rlocator.backend,
2767 extend_by);
2768
 /* Temporary relations take a separate path from shared buffers. */
2769 if (bmr.relpersistence == RELPERSISTENCE_TEMP)
2772 buffers, &extend_by);
2773 else
2774 first_block = ExtendBufferedRelShared(bmr, fork, strategy, flags,
2776 buffers, &extend_by);
2778
2780 BMR_GET_SMGR(bmr)->smgr_rlocator.locator.spcOid,
2781 BMR_GET_SMGR(bmr)->smgr_rlocator.locator.dbOid,
2782 BMR_GET_SMGR(bmr)->smgr_rlocator.locator.relNumber,
2783 BMR_GET_SMGR(bmr)->smgr_rlocator.backend,
2784 *extended_by,
2785 first_block);
2786
2787 return first_block;
2788}
2789
2790/*
2791 * Implementation of ExtendBufferedRelBy() and ExtendBufferedRelTo() for
2792 * shared buffers.
2793 *
2794 * In outline (details are in the comments below): victim buffers are acquired
2795 * and zeroed before the extension lock is taken; the request is trimmed if
2796 * the relation was extended concurrently; the buffers are entered into the
2797 * buffer table before the relation itself is extended; and once the extension
2798 * lock has been released each buffer is marked BM_VALID and waiters are woken.
2799 *
2800 * Returns first_block.
2801 */
2794static BlockNumber
2797 BufferAccessStrategy strategy,
2798 uint32 flags,
2801 Buffer *buffers,
2803{
2807
2809
2810 /*
2811 * Acquire victim buffers for extension without holding extension lock.
2812 * Writing out victim buffers is the most expensive part of extending the
2813 * relation, particularly when doing so requires WAL flushes. Zeroing out
2814 * the buffers is also quite expensive, so do that before holding the
2815 * extension lock as well.
2816 *
2817 * These pages are pinned by us and not valid. While we hold the pin they
2818 * can't be acquired as victim buffers by another backend.
2819 */
2820 for (uint32 i = 0; i < extend_by; i++)
2821 {
2823
2824 buffers[i] = GetVictimBuffer(strategy, io_context);
2826
2827 /* new buffers are zero-filled */
2828 MemSet(buf_block, 0, BLCKSZ);
2829 }
2830
2831 /*
2832 * Lock relation against concurrent extensions, unless requested not to.
2833 *
2834 * We use the same extension lock for all forks. That's unnecessarily
2835 * restrictive, but currently extensions for forks don't happen often
2836 * enough to make it worth locking more granularly.
2837 *
2838 * Note that another backend might have extended the relation by the time
2839 * we get the lock.
2840 */
2841 if (!(flags & EB_SKIP_EXTENSION_LOCK))
2843
2844 /*
2845 * If requested, invalidate size cache, so that smgrnblocks asks the
2846 * kernel.
2847 */
2848 if (flags & EB_CLEAR_SIZE_CACHE)
2849 BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] = InvalidBlockNumber;
2850
2852
2853 /*
2854 * Now that we have the accurate relation size, check if the caller wants
2855 * us to extend to only up to a specific size. If there were concurrent
2856 * extensions, we might have acquired too many buffers and need to release
2857 * them.
2858 */
2860 {
2862
2864 extend_by = 0;
2865 else if ((uint64) first_block + extend_by > extend_upto)
2867
2868 for (uint32 i = extend_by; i < orig_extend_by; i++)
2869 {
2870 BufferDesc *buf_hdr = GetBufferDescriptor(buffers[i] - 1);
2871
2873 }
2874
2875 if (extend_by == 0)
2876 {
2877 if (!(flags & EB_SKIP_EXTENSION_LOCK))
2880 return first_block;
2881 }
2882 }
2883
2884 /* Fail if relation is already at maximum possible length */
2886 ereport(ERROR,
2888 errmsg("cannot extend relation %s beyond %u blocks",
2889 relpath(BMR_GET_SMGR(bmr)->smgr_rlocator, fork).str,
2890 MaxBlockNumber)));
2891
2892 /*
2893 * Insert buffers into buffer table, mark as IO_IN_PROGRESS.
2894 *
2895 * This needs to happen before we extend the relation, because as soon as
2896 * we do, other backends can start to read in those pages.
2897 */
2898 for (uint32 i = 0; i < extend_by; i++)
2899 {
2900 Buffer victim_buf = buffers[i];
2902 BufferTag tag;
2903 uint32 hash;
2905 int existing_id;
2906
2907 /* in case we need to pin an existing buffer below */
2910
2911 InitBufferTag(&tag, &BMR_GET_SMGR(bmr)->smgr_rlocator.locator, fork,
2912 first_block + i);
2913 hash = BufTableHashCode(&tag);
2915
2917
2919
2920 /*
2921 * We get here only in the corner case where we are trying to extend
2922 * the relation but we found a pre-existing buffer. This can happen
2923 * because a prior attempt at extending the relation failed, and
2924 * because mdread doesn't complain about reads beyond EOF (when
2925 * zero_damaged_pages is ON) and so a previous attempt to read a block
2926 * beyond EOF could have left a "valid" zero-filled buffer.
2927 *
2928 * This has also been observed when the relation was overwritten by an
2929 * external process. Since the legitimate cases should always have
2930 * left a zero-filled buffer, complain if not PageIsNew.
2931 */
2932 if (existing_id >= 0)
2933 {
2936 bool valid;
2937
2938 /*
2939 * Pin the existing buffer before releasing the partition lock,
2940 * preventing it from being evicted.
2941 */
2942 valid = PinBuffer(existing_hdr, strategy, false);
2943
2946
2949
2950 if (valid && !PageIsNew((Page) buf_block))
2951 ereport(ERROR,
2952 (errmsg("unexpected data beyond EOF in block %u of relation \"%s\"",
2953 existing_hdr->tag.blockNum,
2954 relpath(BMR_GET_SMGR(bmr)->smgr_rlocator, fork).str)));
2955
2956 /*
2957 * We *must* do smgr[zero]extend before succeeding, else the page
2958 * will not be reserved by the kernel, and the next P_NEW call
2959 * will decide to return the same page. Clear the BM_VALID bit,
2960 * do StartSharedBufferIO() and proceed.
2961 *
2962 * Loop to handle the very small possibility that someone re-sets
2963 * BM_VALID between our clearing it and StartSharedBufferIO
2964 * inspecting it.
2965 */
2966 while (true)
2967 {
2969
2971
2973
2975 break;
2976 }
2977 }
2978 else
2979 {
2981 uint64 set_bits = 0;
2982
2984
2985 /* some sanity checks while we hold the buffer header lock */
2988
2989 victim_buf_hdr->tag = tag;
2990
2992 if (bmr.relpersistence == RELPERSISTENCE_PERMANENT || fork == INIT_FORKNUM)
2994
2996 set_bits, 0,
2997 0);
2998
3000
3001 /* XXX: could combine the locked operations in it with the above */
3003 }
3004 }
3005
3007
3008 /*
3009 * Note: if smgrzeroextend fails, we will end up with buffers that are
3010 * allocated but not marked BM_VALID. The next relation extension will
3011 * still select the same block number (because the relation didn't get any
3012 * longer on disk) and so future attempts to extend the relation will find
3013 * the same buffers (if they have not been recycled) but come right back
3014 * here to try smgrzeroextend again.
3015 *
3016 * We don't need to set a checksum for all-zero pages.
3017 */
3019
3020 /*
3021 * Release the file-extension lock; it's now OK for someone else to extend
3022 * the relation some more.
3023 *
3024 * We remove IO_IN_PROGRESS after this, as waking up waiting backends can
3025 * take noticeable time.
3026 */
3027 if (!(flags & EB_SKIP_EXTENSION_LOCK))
3029
3031 io_start, 1, extend_by * BLCKSZ);
3032
3033 /* Set BM_VALID, terminate IO, and wake up any waiters */
3034 for (uint32 i = 0; i < extend_by; i++)
3035 {
3036 Buffer buf = buffers[i];
3038 bool lock = false;
3039
3040 if (flags & EB_LOCK_FIRST && i == 0)
3041 lock = true;
3042 else if (flags & EB_LOCK_TARGET)
3043 {
3045 if (first_block + i + 1 == extend_upto)
3046 lock = true;
3047 }
3048
3049 if (lock)
3051
3052 TerminateBufferIO(buf_hdr, false, BM_VALID, true, false);
3053 }
3054
3056
3058
3059 return first_block;
3060}
3061
3062/*
3063 * BufferIsLockedByMe
3064 *
3065 * Checks if this backend has the buffer locked in any mode.
3066 *
3067 * Buffer must be pinned.
3068 *
3069 * Local buffers always report true (they have no content locks); shared
3070 * buffers defer to BufferLockHeldByMe().
3071 */
3069bool
3071{
3073
3075
3076 if (BufferIsLocal(buffer))
3077 {
3078 /* Content locks are not maintained for local buffers. */
3079 return true;
3080 }
3081 else
3082 {
3084 return BufferLockHeldByMe(bufHdr);
3085 }
3086}
3087
3088/*
3089 * BufferIsLockedByMeInMode
3090 *
3091 * Checks if this backend has the buffer locked in the specified mode.
3092 *
3093 * Buffer must be pinned.
3094 *
3095 * Local buffers always report true, whatever mode is asked about, because
3096 * no content locks exist for them.
3097 */
3095bool
3097{
3099
3101
3102 if (BufferIsLocal(buffer))
3103 {
3104 /* Content locks are not maintained for local buffers. */
3105 return true;
3106 }
3107 else
3108 {
3111 }
3112}
3113
3114/*
3115 * BufferIsDirty
3116 *
3117 * Checks if buffer is already dirty.
3118 *
3119 * Buffer must be pinned and [share-]exclusive-locked. (Without such a lock,
3120 * the result may be stale before it's returned.)
3121 *
3122 * The answer is the BM_DIRTY bit read from the buffer header's state word,
3123 * for local and shared buffers alike.
3124 */
3122bool
3124{
3126
3128
3129 if (BufferIsLocal(buffer))
3130 {
3131 int bufid = -buffer - 1;
3132
3134 /* Content locks are not maintained for local buffers. */
3135 }
3136 else
3137 {
3141 }
3142
3143 return pg_atomic_read_u64(&bufHdr->state) & BM_DIRTY;
3144}
3145
3146/*
3147 * MarkBufferDirty
3148 *
3149 * Marks buffer contents as dirty (actual write happens later).
3150 *
3151 * Buffer must be pinned and exclusive-locked. (If caller does not hold
3152 * exclusive lock, then somebody could be in process of writing the buffer,
3153 * leading to risk of bad data written to disk.)
3154 */
3155void
3157{
3161
 /* Refuse buffer IDs that are not valid at all. */
3162 if (!BufferIsValid(buffer))
3163 elog(ERROR, "bad buffer ID: %d", buffer);
3164
 /* Local buffers leave this routine early; the rest is for shared ones. */
3165 if (BufferIsLocal(buffer))
3166 {
3168 return;
3169 }
3170
3172
3175
3176 /*
3177 * NB: We have to wait for the buffer header spinlock to be not held, as
3178 * TerminateBufferIO() relies on the spinlock.
3179 */
3181 for (;;)
3182 {
3185
3187
3190
3192 buf_state))
3193 break;
3194 }
3195
3196 /*
3197 * If the buffer was not dirty already, do vacuum accounting.
3198 */
3199 if (!(old_buf_state & BM_DIRTY))
3200 {
3202 if (VacuumCostActive)
3204 }
3205}
3206
3207/*
3208 * ReleaseAndReadBuffer -- combine ReleaseBuffer() and ReadBuffer()
3209 *
3210 * Formerly, this saved one cycle of acquiring/releasing the BufMgrLock
3211 * compared to calling the two routines separately. Now it's mainly just
3212 * a convenience function. However, if the passed buffer is valid and
3213 * already contains the desired block, we just return it as-is; and that
3214 * does save considerable work compared to a full release and reacquire.
3215 *
3216 * Note: it is OK to pass buffer == InvalidBuffer, indicating that no old
3217 * buffer actually needs to be released. This case is the same as ReadBuffer,
3218 * but can save some tests in the caller.
3219 */
3220Buffer
3222 Relation relation,
3223 BlockNumber blockNum)
3224{
 /* The "already holds the desired block" test only considers the main fork. */
3225 ForkNumber forkNum = MAIN_FORKNUM;
3227
3228 if (BufferIsValid(buffer))
3229 {
3231 if (BufferIsLocal(buffer))
3232 {
 /* Already holding exactly the requested block? Hand it straight back. */
3234 if (bufHdr->tag.blockNum == blockNum &&
3235 BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
3236 BufTagGetForkNum(&bufHdr->tag) == forkNum)
3237 return buffer;
3239 }
3240 else
3241 {
3243 /* we have pin, so it's ok to examine tag without spinlock */
3244 if (bufHdr->tag.blockNum == blockNum &&
3245 BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
3246 BufTagGetForkNum(&bufHdr->tag) == forkNum)
3247 return buffer;
3249 }
3250 }
3251
 /* The passed buffer (if any) was not the requested block: read it afresh. */
3252 return ReadBuffer(relation, blockNum);
3253}
3254
3255/*
3256 * PinBuffer -- make buffer unavailable for replacement.
3257 *
3258 * For the default access strategy, the buffer's usage_count is incremented
3259 * when we first pin it; for other strategies we just make sure the usage_count
3260 * isn't zero. (The idea of the latter is that we don't want synchronized
3261 * heap scans to inflate the count, but we need it to not be zero to discourage
3262 * other backends from stealing buffers from our ring. As long as we cycle
3263 * through the ring faster than the global clock-sweep cycles, buffers in
3264 * our ring won't be chosen as victims for replacement by other backends.)
3265 *
3266 * This should be applied only to shared buffers, never local ones.
3267 *
3268 * Since buffers are pinned/unpinned very frequently, pin buffers without
3269 * taking the buffer header lock; instead update the state variable in a loop
3270 * of CAS operations. Hopefully it's just a single CAS.
3271 *
3272 * Note that ResourceOwnerEnlarge() and ReservePrivateRefCountEntry()
3273 * must have been done already.
3274 *
3275 * Returns true if buffer is BM_VALID, else false. This provision allows
3276 * some callers to avoid an extra spinlock cycle. If skip_if_not_valid is
3277 * true, then a false return value also indicates that the buffer was
3278 * (recently) invalid and has not been pinned.
3279 */
3280static bool
3282 bool skip_if_not_valid)
3283{
3285 bool result;
3287
3290
3291 ref = GetPrivateRefCountEntry(b, true);
3292
 /* No backend-local entry yet: this backend has not pinned the buffer before. */
3293 if (ref == NULL)
3294 {
3297
3299 for (;;)
3300 {
3302 return false;
3303
3304 /*
3305 * We're not allowed to increase the refcount while the buffer
3306 * header spinlock is held. Wait for the lock to be released.
3307 */
3309 {
3311
3312 /* perform checks at the top of the loop again */
3313 continue;
3314 }
3315
3317
3318 /* increase refcount */
3320
3321 if (strategy == NULL)
3322 {
3323 /* Default case: increase usagecount unless already max. */
3326 }
3327 else
3328 {
3329 /*
3330 * Ring buffers shouldn't evict others from pool. Thus we
3331 * don't make usagecount more than 1.
3332 */
3335 }
3336
3338 buf_state))
3339 {
3340 result = (buf_state & BM_VALID) != 0;
3341
3343 break;
3344 }
3345 }
3346 }
3347 else
3348 {
3349 /*
3350 * If we previously pinned the buffer, it is likely to be valid, but
3351 * it may not be if StartReadBuffers() was called and
3352 * WaitReadBuffers() hasn't been called yet. We'll check by loading
3353 * the flags without locking. This is racy, but it's OK to return
3354 * false spuriously: when WaitReadBuffers() calls StartBufferIO(),
3355 * it'll see that it's now valid.
3356 *
3357 * Note: We deliberately avoid a Valgrind client request here.
3358 * Individual access methods can optionally superimpose buffer page
3359 * client requests on top of our client requests to enforce that
3360 * buffers are only accessed while locked (and pinned). It's possible
3361 * that the buffer page is legitimately non-accessible here. We
3362 * cannot meddle with that.
3363 */
3364 result = (pg_atomic_read_u64(&buf->state) & BM_VALID) != 0;
3365
3366 Assert(ref->data.refcount > 0);
3367 ref->data.refcount++;
3369 }
3370
3371 return result;
3372}
3373
3374/*
3375 * PinBuffer_Locked -- as above, but caller already locked the buffer header.
3376 * The spinlock is released before return.
3377 *
3378 * As this function is called with the spinlock held, the caller has to
3379 * previously call ReservePrivateRefCountEntry() and
3380 * ResourceOwnerEnlarge(CurrentResourceOwner);
3381 *
3382 * Currently, no callers of this function want to modify the buffer's
3383 * usage_count at all, so there's no need for a strategy parameter.
3384 * Also we don't bother with a BM_VALID test (the caller could check that for
3385 * itself).
3386 *
3387 * Also all callers only ever use this function when it's known that the
3388 * buffer can't have a preexisting pin by this backend. That allows us to skip
3389 * searching the private refcount array & hash, which is a boon, because the
3390 * spinlock is still held.
3391 *
3392 * Note: use of this routine is frequently mandatory, not just an optimization
3393 * to save a spin lock/unlock cycle, because we need to pin a buffer before
3394 * its state can change under us.
3395 */
3396static void
3398{
3400
3401 /*
3402 * As explained above, we don't expect any preexisting pins. That allows
3403 * us to manipulate the PrivateRefCount after releasing the spinlock.
3404 */
3406
3407 /*
3408 * Since we hold the buffer spinlock, we can update the buffer state and
3409 * release the lock in one operation.
3410 */
3412
3414 0, 0, 1);
3415
3417}
3418
3419/*
3420 * Support for waking up another backend that is waiting for the cleanup lock
3421 * to be released using BM_PIN_COUNT_WAITER.
3422 *
3423 * See LockBufferForCleanup().
3424 *
3425 * Expected to be called just after releasing a buffer pin (in a BufferDesc,
3426 * not just reducing the backend-local pincount for the buffer).
3427 */
3428static void
3430{
3431 /*
3432 * Acquire the buffer header lock and re-check that there's a waiter.
3433 * Another backend could have unpinned this buffer, and already woken up
3434 * the waiter.
3435 *
3436 * There's no danger of the buffer being replaced after we unpinned it
3437 * above, as it's pinned by the waiter. The waiter removes
3438 * BM_PIN_COUNT_WAITER if it stops waiting for a reason other than this
3439 * backend waking it up.
3440 */
3442
3445 {
3446 /* we just released the last pin other than the waiter's */
3447 int wait_backend_pgprocno = buf->wait_backend_pgprocno;
3448
3451 0);
3452 ProcSendSignal(wait_backend_pgprocno);
3453 }
3454 else
3456}
3457
3458/*
3459 * UnpinBuffer -- make buffer available for replacement.
3460 *
3461 * This should be applied only to shared buffers, never local ones. This
3462 * always adjusts CurrentResourceOwner.
3463 */
3464static void
3472
/*
 * Release one of this backend's references to a buffer.
 *
 * The backend-local reference count is decremented first. Only when it
 * drops to zero do the steps commented below apply: the Valgrind marking,
 * the check that no content lock is still held, the decrement of the shared
 * reference count, and the support for LockBufferForCleanup().
 */
3473static void
3475{
3478
3480
3481 /* not moving as we're likely deleting it soon anyway */
3482 ref = GetPrivateRefCountEntry(b, false);
3483 Assert(ref != NULL);
3484 Assert(ref->data.refcount > 0);
3485 ref->data.refcount--;
3486 if (ref->data.refcount == 0)
3487 {
3489
3490 /*
3491 * Mark buffer non-accessible to Valgrind.
3492 *
3493 * Note that the buffer may have already been marked non-accessible
3494 * within access method code that enforces that buffers are only
3495 * accessed while a buffer lock is held.
3496 */
3498
3499 /*
3500 * I'd better not still hold the buffer content lock. Can't use
3501 * BufferIsLockedByMe(), as that asserts the buffer is pinned.
3502 */
3504
3505 /* decrement the shared reference count */
3507
3508 /* Support LockBufferForCleanup() */
3511
3513 }
3514}
3515
3516/*
3517 * Set up backend-local tracking of a buffer pinned the first time by this
3518 * backend.
3519 */
3520inline void
3522{
3524
3526 ref->data.refcount++;
3527
3529
3530 /*
3531 * This is the first pin for this page by this backend, so mark its page
3532 * as defined to Valgrind. While the page contents might not actually be
3533 * valid yet, we don't currently guarantee that such pages are marked
3534 * undefined or non-accessible.
3535 *
3536 * It's not necessarily the prettiest to do this here, but otherwise we'd
3537 * need this block of code in multiple places.
3538 */
3540 BLCKSZ);
3541}
3542
/*
 * Instantiate lib/sort_template.h to generate a file-local (static) sort
 * routine named sort_checkpoint_bufferids() that orders CkptSortItem
 * elements using ckpt_buforder_comparator().
 */
3543#define ST_SORT sort_checkpoint_bufferids
3544#define ST_ELEMENT_TYPE CkptSortItem
3545#define ST_COMPARE(a, b) ckpt_buforder_comparator(a, b)
3546#define ST_SCOPE static
3547#define ST_DEFINE
3548#include "lib/sort_template.h"
3549
3550/*
3551 * BufferSync -- Write out all dirty buffers in the pool.
3552 *
3553 * This is called at checkpoint time to write out all dirty shared buffers.
3554 * The checkpoint request flags should be passed in. If CHECKPOINT_FAST is
3555 * set, we disable delays between writes; if CHECKPOINT_IS_SHUTDOWN,
3556 * CHECKPOINT_END_OF_RECOVERY or CHECKPOINT_FLUSH_UNLOGGED is set, we write
3557 * even unlogged buffers, which are otherwise skipped. The remaining flags
3558 * currently have no effect here.
3559 */
3560static void
3561BufferSync(int flags)
3562{
3564 int buf_id;
3565 int num_to_scan;
3566 int num_spaces;
3567 int num_processed;
3568 int num_written;
3570 Oid last_tsid;
3572 int i;
3573 uint64 mask = BM_DIRTY;
3575
3576 /*
3577 * Unless this is a shutdown checkpoint or we have been explicitly told,
3578 * we write only permanent, dirty buffers. But at shutdown or end of
3579 * recovery, we write all dirty buffers.
3580 */
3583 mask |= BM_PERMANENT;
3584
3585 /*
3586 * Loop over all buffers, and mark the ones that need to be written with
3587 * BM_CHECKPOINT_NEEDED. Count them as we go (num_to_scan), so that we
3588 * can estimate how much work needs to be done.
3589 *
3590 * This allows us to write only those pages that were dirty when the
3591 * checkpoint began, and not those that get dirtied while it proceeds.
3592 * Whenever a page with BM_CHECKPOINT_NEEDED is written out, either by us
3593 * later in this function, or by normal backends or the bgwriter cleaning
3594 * scan, the flag is cleared. Any buffer dirtied after this point won't
3595 * have the flag set.
3596 *
3597 * Note that if we fail to write some buffer, we may leave buffers with
3598 * BM_CHECKPOINT_NEEDED still set. This is OK since any such buffer would
3599 * certainly need to be written for the next checkpoint attempt, too.
3600 */
3601 num_to_scan = 0;
3602 for (buf_id = 0; buf_id < NBuffers; buf_id++)
3603 {
3605 uint64 set_bits = 0;
3606
3607 /*
3608 * Header spinlock is enough to examine BM_DIRTY, see comment in
3609 * SyncOneBuffer.
3610 */
3612
3613 if ((buf_state & mask) == mask)
3614 {
3615 CkptSortItem *item;
3616
3618
3619 item = &CkptBufferIds[num_to_scan++];
3620 item->buf_id = buf_id;
3621 item->tsId = bufHdr->tag.spcOid;
3622 item->relNumber = BufTagGetRelNumber(&bufHdr->tag);
3623 item->forkNum = BufTagGetForkNum(&bufHdr->tag);
3624 item->blockNum = bufHdr->tag.blockNum;
3625 }
3626
3628 set_bits, 0,
3629 0);
3630
3631 /* Check for barrier events in case NBuffers is large. */
3634 }
3635
3636 if (num_to_scan == 0)
3637 return; /* nothing to do */
3638
3640
3642
3643 /*
3644 * Sort buffers that need to be written to reduce the likelihood of random
3645 * IO. The sorting is also important for the implementation of balancing
3646 * writes between tablespaces. Without balancing writes we'd potentially
3647 * end up writing to the tablespaces one-by-one; possibly overloading the
3648 * underlying system.
3649 */
3651
3652 num_spaces = 0;
3653
3654 /*
3655 * Allocate progress status for each tablespace with buffers that need to
3656 * be flushed. This requires the to-be-flushed array to be sorted.
3657 */
3659 for (i = 0; i < num_to_scan; i++)
3660 {
3661 CkptTsStatus *s;
3662 Oid cur_tsid;
3663
3665
3666 /*
3667 * Grow array of per-tablespace status structs, every time a new
3668 * tablespace is found.
3669 */
3671 {
3672 Size sz;
3673
3674 num_spaces++;
3675
3676 /*
3677 * Not worth adding grow-by-power-of-2 logic here - even with a
3678 * few hundred tablespaces this should be fine.
3679 */
3680 sz = sizeof(CkptTsStatus) * num_spaces;
3681
3682 if (per_ts_stat == NULL)
3684 else
3686
3687 s = &per_ts_stat[num_spaces - 1];
3688 memset(s, 0, sizeof(*s));
3689 s->tsId = cur_tsid;
3690
3691 /*
3692 * The first buffer in this tablespace. As CkptBufferIds is sorted
3693 * by tablespace all (s->num_to_scan) buffers in this tablespace
3694 * will follow afterwards.
3695 */
3696 s->index = i;
3697
3698 /*
3699 * progress_slice will be determined once we know how many buffers
3700 * are in each tablespace, i.e. after this loop.
3701 */
3702
3704 }
3705 else
3706 {
3707 s = &per_ts_stat[num_spaces - 1];
3708 }
3709
3710 s->num_to_scan++;
3711
3712 /* Check for barrier events. */
3715 }
3716
3717 Assert(num_spaces > 0);
3718
3719 /*
3720 * Build a min-heap over the write-progress in the individual tablespaces,
3721 * and compute how large a portion of the total progress a single
3722 * processed buffer is.
3723 */
3726 NULL);
3727
3728 for (i = 0; i < num_spaces; i++)
3729 {
3731
3732 ts_stat->progress_slice = (float8) num_to_scan / ts_stat->num_to_scan;
3733
3735 }
3736
3738
3739 /*
3740 * Iterate through to-be-checkpointed buffers and write the ones (still)
3741 * marked with BM_CHECKPOINT_NEEDED. The writes are balanced between
3742 * tablespaces; otherwise the sorting would lead to only one tablespace
3743 * receiving writes at a time, making inefficient use of the hardware.
3744 */
3745 num_processed = 0;
3746 num_written = 0;
3747 while (!binaryheap_empty(ts_heap))
3748 {
3752
3753 buf_id = CkptBufferIds[ts_stat->index].buf_id;
3754 Assert(buf_id != -1);
3755
3756 bufHdr = GetBufferDescriptor(buf_id);
3757
3758 num_processed++;
3759
3760 /*
3761 * We don't need to acquire the lock here, because we're only looking
3762 * at a single bit. It's possible that someone else writes the buffer
3763 * and clears the flag right after we check, but that doesn't matter
3764 * since SyncOneBuffer will then do nothing. However, there is a
3765 * further race condition: it's conceivable that between the time we
3766 * examine the bit here and the time SyncOneBuffer acquires the lock,
3767 * someone else not only wrote the buffer but replaced it with another
3768 * page and dirtied it. In that improbable case, SyncOneBuffer will
3769 * write the buffer though we didn't need to. It doesn't seem worth
3770 * guarding against this, though.
3771 */
3773 {
3774 if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
3775 {
3778 num_written++;
3779 }
3780 }
3781
3782 /*
3783 * Measure progress independent of actually having to flush the buffer
3784 * - otherwise writing becomes unbalanced.
3785 */
3786 ts_stat->progress += ts_stat->progress_slice;
3787 ts_stat->num_scanned++;
3788 ts_stat->index++;
3789
3790 /* Have all the buffers from the tablespace been processed? */
3791 if (ts_stat->num_scanned == ts_stat->num_to_scan)
3792 {
3794 }
3795 else
3796 {
3797 /* update heap with the new progress */
3799 }
3800
3801 /*
3802 * Sleep to throttle our I/O rate.
3803 *
3804 * (This will check for barrier events even if it doesn't sleep.)
3805 */
3806 CheckpointWriteDelay(flags, (double) num_processed / num_to_scan);
3807 }
3808
3809 /*
3810 * Issue all pending flushes. Only checkpointer calls BufferSync(), so
3811 * IOContext will always be IOCONTEXT_NORMAL.
3812 */
3814
3816 per_ts_stat = NULL;
3818
3819 /*
3820 * Update checkpoint statistics. As noted above, this doesn't include
3821 * buffers written by other backends or bgwriter scan.
3822 */
3824
3826}
3827
3828/*
3829 * BgBufferSync -- Write out some dirty buffers in the pool.
3830 *
3831 * This is called periodically by the background writer process.
3832 *
3833 * Returns true if it's appropriate for the bgwriter process to go into
3834 * low-power hibernation mode. (This happens if the strategy clock-sweep
3835 * has been "lapped" and no buffer allocations have occurred recently,
3836 * or if the bgwriter has been effectively disabled by setting
3837 * bgwriter_lru_maxpages to 0.)
3838 */
3839bool
3841{
3842 /* info obtained from freelist.c */
3843 int strategy_buf_id;
3846
3847 /*
3848 * Information saved between calls so we can determine the strategy
3849 * point's advance rate and avoid scanning already-cleaned buffers.
3850 */
3851 static bool saved_info_valid = false;
3852 static int prev_strategy_buf_id;
3854 static int next_to_clean;
3855 static uint32 next_passes;
3856
3857 /* Moving averages of allocation rate and clean-buffer density */
3858 static float smoothed_alloc = 0;
3859 static float smoothed_density = 10.0;
3860
3861 /* Potentially these could be tunables, but for now, not */
3862 float smoothing_samples = 16;
3863 float scan_whole_pool_milliseconds = 120000.0;
3864
3865 /* Used to compute how far we scan ahead */
3866 long strategy_delta;
3867 int bufs_to_lap;
3868 int bufs_ahead;
3869 float scans_per_alloc;
3872 int min_scan_buffers;
3873
3874 /* Variables for the scanning loop proper */
3875 int num_to_scan;
3876 int num_written;
3877 int reusable_buffers;
3878
3879 /* Variables for final smoothed_density update */
3880 long new_strategy_delta;
3882
3883 /*
3884 * Find out where the clock-sweep currently is, and how many buffer
3885 * allocations have happened since our last call.
3886 */
3888
3889 /* Report buffer alloc counts to pgstat */
3891
3892 /*
3893 * If we're not running the LRU scan, just stop after doing the stats
3894 * stuff. We mark the saved state invalid so that we can recover sanely
3895 * if LRU scan is turned back on later.
3896 */
3897 if (bgwriter_lru_maxpages <= 0)
3898 {
3899 saved_info_valid = false;
3900 return true;
3901 }
3902
3903 /*
3904 * Compute strategy_delta = how many buffers have been scanned by the
3905 * clock-sweep since last time. If first time through, assume none. Then
3906 * see if we are still ahead of the clock-sweep, and if so, how many
3907 * buffers we could scan before we'd catch up with it and "lap" it. Note:
3908 * weird-looking coding of xxx_passes comparisons is to avoid bogus
3909 * behavior when the passes counts wrap around.
3910 */
3911 if (saved_info_valid)
3912 {
3914
3917
3918 Assert(strategy_delta >= 0);
3919
3920 if ((int32) (next_passes - strategy_passes) > 0)
3921 {
3922 /* we're one pass ahead of the strategy point */
3924#ifdef BGW_DEBUG
3925 elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
3929#endif
3930 }
3931 else if (next_passes == strategy_passes &&
3933 {
3934 /* on same pass, but ahead or at least not behind */
3936#ifdef BGW_DEBUG
3937 elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
3941#endif
3942 }
3943 else
3944 {
3945 /*
3946 * We're behind, so skip forward to the strategy point and start
3947 * cleaning from there.
3948 */
3949#ifdef BGW_DEBUG
3950 elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
3954#endif
3958 }
3959 }
3960 else
3961 {
3962 /*
3963 * Initializing at startup or after LRU scanning had been off. Always
3964 * start at the strategy point.
3965 */
3966#ifdef BGW_DEBUG
3967 elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
3969#endif
3970 strategy_delta = 0;
3974 }
3975
3976 /* Update saved info for next time */
3979 saved_info_valid = true;
3980
3981 /*
3982 * Compute how many buffers had to be scanned for each new allocation, ie,
3983 * 1/density of reusable buffers, and track a moving average of that.
3984 *
3985 * If the strategy point didn't move, we don't update the density estimate.
3986 */
3987 if (strategy_delta > 0 && recent_alloc > 0)
3988 {
3992 }
3993
3994 /*
3995 * Estimate how many reusable buffers there are between the current
3996 * strategy point and where we've scanned ahead to, based on the smoothed
3997 * density estimate.
3998 */
4001
4002 /*
4003 * Track a moving average of recent buffer allocations. Here, rather than
4004 * a true average we want a fast-attack, slow-decline behavior: we
4005 * immediately follow any increase.
4006 */
4007 if (smoothed_alloc <= (float) recent_alloc)
4009 else
4012
4013 /* Scale the estimate by a GUC to allow more aggressive tuning. */
4015
4016 /*
4017 * If recent_alloc remains at zero for many cycles, smoothed_alloc will
4018 * eventually underflow to zero, and the underflows produce annoying
4019 * kernel warnings on some platforms. Once upcoming_alloc_est has gone to
4020 * zero, there's no point in tracking smaller and smaller values of
4021 * smoothed_alloc, so just reset it to exactly zero to avoid this
4022 * syndrome. It will pop back up as soon as recent_alloc increases.
4023 */
4024 if (upcoming_alloc_est == 0)
4025 smoothed_alloc = 0;
4026
4027 /*
4028 * Even in cases where there's been little or no buffer allocation
4029 * activity, we want to make a small amount of progress through the buffer
4030 * cache so that as many reusable buffers as possible are clean after an
4031 * idle period.
4032 *
4033 * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many times
4034 * the BGW will be called during the scan_whole_pool time; slice the
4035 * buffer pool into that many sections.
4036 */
4038
4040 {
4041#ifdef BGW_DEBUG
4042 elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
4044#endif
4046 }
4047
4048 /*
4049 * Now write out dirty reusable buffers, working forward from the
4050 * next_to_clean point, until we have lapped the strategy scan, or cleaned
4051 * enough buffers to match our estimate of the next cycle's allocation
4052 * requirements, or hit the bgwriter_lru_maxpages limit.
4053 */
4054
4055 num_to_scan = bufs_to_lap;
4056 num_written = 0;
4058
4059 /* Execute the LRU scan */
4060 while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
4061 {
4063 wb_context);
4064
4065 if (++next_to_clean >= NBuffers)
4066 {
4067 next_to_clean = 0;
4068 next_passes++;
4069 }
4070 num_to_scan--;
4071
4072 if (sync_state & BUF_WRITTEN)
4073 {
4076 {
4078 break;
4079 }
4080 }
4081 else if (sync_state & BUF_REUSABLE)
4083 }
4084
4086
4087#ifdef BGW_DEBUG
4088 elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
4091 bufs_to_lap - num_to_scan,
4094#endif
4095
4096 /*
4097 * Consider the above scan as being like a new allocation scan.
4098 * Characterize its density and update the smoothed one based on it. This
4099 * effectively halves the moving average period in cases where both the
4100 * strategy and the background writer are doing some useful scanning,
4101 * which is helpful because a long memory isn't as desirable on the
4102 * density estimates.
4103 */
4104 new_strategy_delta = bufs_to_lap - num_to_scan;
4106 if (new_strategy_delta > 0 && new_recent_alloc > 0)
4107 {
4111
4112#ifdef BGW_DEBUG
4113 elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f",
4116#endif
4117 }
4118
4119 /* Return true if OK to hibernate */
4120 return (bufs_to_lap == 0 && recent_alloc == 0);
4121}
4122
4123/*
4124 * SyncOneBuffer -- process a single buffer during syncing.
4125 *
4126 * If skip_recently_used is true, we don't write currently-pinned buffers, nor
4127 * buffers marked recently used, as these are not replacement candidates.
4128 *
4129 * Returns a bitmask containing the following flag bits:
4130 * BUF_WRITTEN: we wrote the buffer.
4131 * BUF_REUSABLE: buffer is available for replacement, ie, it has
4132 * pin count 0 and usage count 0.
4133 *
4134 * (BUF_WRITTEN could be set in error if FlushBuffer finds the buffer clean
4135 * after locking it, but we don't care all that much.)
4136 */
4137static int
4139{
4141 int result = 0;
4143 BufferTag tag;
4144
4145 /* Make sure we can handle the pin */
4148
4149 /*
4150 * Check whether buffer needs writing.
4151 *
4152 * We can make this check without taking the buffer content lock so long
4153 * as we mark pages dirty in access methods *before* logging changes with
4154 * XLogInsert(): if someone marks the buffer dirty just after our check we
4155 * don't worry because our checkpoint.redo points before log record for
4156 * upcoming changes and so we are not required to write such dirty buffer.
4157 */
4159
4162 {
4164 }
4165 else if (skip_recently_used)
4166 {
4167 /* Caller told us not to write recently-used buffers */
4169 return result;
4170 }
4171
4172 if (!(buf_state & BM_VALID) || !(buf_state & BM_DIRTY))
4173 {
4174 /* It's clean, so nothing to do */
4176 return result;
4177 }
4178
4179 /*
4180 * Pin it, share-exclusive-lock it, write it. (FlushBuffer will do
4181 * nothing if the buffer is clean by the time we've locked it.)
4182 */
4184
4186
4187 tag = bufHdr->tag;
4188
4190
4191 /*
4192 * SyncOneBuffer() is only called by checkpointer and bgwriter, so
4193 * IOContext will always be IOCONTEXT_NORMAL.
4194 */
4196
4197 return result | BUF_WRITTEN;
4198}
4199
4200/*
4201 * AtEOXact_Buffers - clean up at end of transaction.
4202 *
4203 * As of PostgreSQL 8.0, buffer pins should get released by the
4204 * ResourceOwner mechanism. This routine is just a debugging
4205 * cross-check that no pins remain.
4206 */
4207void
4216
4217/*
4218 * Initialize access to shared buffer pool
4219 *
4220 * This is called during backend startup (whether standalone or under the
4221 * postmaster). It sets up for this backend's access to the already-existing
4222 * buffer pool.
4223 */
4224void
4226{
4227 /*
4228 * An advisory limit on the number of pins each backend should hold, based
4229 * on shared_buffers and the maximum number of connections possible.
4230 * That's very pessimistic, but outside toy-sized shared_buffers it should
4231 * allow plenty of pins. LimitAdditionalPins() and
4232 * GetAdditionalPinLimit() can be used to check the remaining balance.
4233 */
4235
4238
4240
4241 /*
4242 * AtProcExit_Buffers needs LWLock access, and thereby has to be called at
4243 * the corresponding phase of backend shutdown.
4244 */
4245 Assert(MyProc != NULL);
4247}
4248
4249/*
4250 * During backend exit, ensure that we released all shared-buffer locks and
4251 * assert that we have no remaining pins.
4252 */
4253static void
4255{
4256 UnlockBuffers();
4257
4259
4260 /* localbuf.c needs a chance too */
4262}
4263
4264/*
4265 * CheckForBufferLeaks - ensure this backend holds no buffer pins
4266 *
4267 * As of PostgreSQL 8.0, buffer pins should get released by the
4268 * ResourceOwner mechanism. This routine is just a debugging
4269 * cross-check that no pins remain.
4270 */
4271static void
4273{
4274#ifdef USE_ASSERT_CHECKING
4275 int RefCountErrors = 0;
4277 int i;
4278 char *s;
4279
4280 /* check the array */
4281 for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
4282 {
4284 {
4285 res = &PrivateRefCountArray[i];
4286
4288 elog(WARNING, "buffer refcount leak: %s", s);
4289 pfree(s);
4290
4292 }
4293 }
4294
4295 /* if necessary search the hash */
4297 {
4298 refcount_iterator iter;
4299
4301 while ((res = refcount_iterate(PrivateRefCountHash, &iter)) != NULL)
4302 {
4304 elog(WARNING, "buffer refcount leak: %s", s);
4305 pfree(s);
4307 }
4308 }
4309
4310 Assert(RefCountErrors == 0);
4311#endif
4312}
4313
4314#ifdef USE_ASSERT_CHECKING
4315/*
4316 * Check for exclusive-locked catalog buffers. This is the core of
4317 * AssertCouldGetRelation().
4318 *
4319 * A backend would self-deadlock on the content lock if the catalog scan read
4320 * the exclusive-locked buffer. The main threat is exclusive-locked buffers
4321 * of catalogs used in relcache, because a catcache search on any catalog may
4322 * build that catalog's relcache entry. We don't have an inventory of
4323 * catalogs relcache uses, so just check buffers of most catalogs.
4324 *
4325 * It's better to minimize waits while holding an exclusive buffer lock, so it
4326 * would be nice to broaden this check not to be catalog-specific. However,
4327 * bttextcmp() accesses pg_collation, and non-core opclasses might similarly
4328 * read tables. That is deadlock-free as long as there's no loop in the
4329 * dependency graph: modifying table A may cause an opclass to read table B,
4330 * but it must not cause a read of table A.
4331 */
4332void
4334{
4336
4337 /* check the array */
4338 for (int i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
4339 {
4341 {
4342 res = &PrivateRefCountArray[i];
4343
4344 if (res->buffer == InvalidBuffer)
4345 continue;
4346
4348 }
4349 }
4350
4351 /* if necessary search the hash */
4353 {
4354 refcount_iterator iter;
4355
4357 while ((res = refcount_iterate(PrivateRefCountHash, &iter)) != NULL)
4358 {
4360 }
4361 }
4362}
4363
4364static void
4366{
4368 BufferTag tag;
4369 Oid relid;
4370
4372 return;
4373
4374 tag = bufHdr->tag;
4375
4376 /*
4377 * This relNumber==relid assumption holds until a catalog experiences
4378 * VACUUM FULL or similar. After a command like that, relNumber will be
4379 * in the normal (non-catalog) range, and we lose the ability to detect
4380 * hazardous access to that catalog. Calling RelidByRelfilenumber() would
4381 * close that gap, but RelidByRelfilenumber() might then deadlock with a
4382 * held lock.
4383 */
4384 relid = tag.relNumber;
4385
4386 if (IsCatalogTextUniqueIndexOid(relid)) /* see comments at the callee */
4387 return;
4388
4390}
4391#endif
4392
4393
4394/*
4395 * Helper routine to issue warnings when a buffer is unexpectedly pinned
4396 */
4397char *
4399{
4400 BufferDesc *buf;
4402 char *result;
4403 ProcNumber backend;
4405
4407 if (BufferIsLocal(buffer))
4408 {
4411 backend = MyProcNumber;
4412 }
4413 else
4414 {
4417 backend = INVALID_PROC_NUMBER;
4418 }
4419
4420 /* theoretically we should lock the bufHdr here */
4421 buf_state = pg_atomic_read_u64(&buf->state);
4422
4423 result = psprintf("[%03d] (rel=%s, blockNum=%u, flags=0x%" PRIx64 ", refcount=%u %d)",
4424 buffer,
4426 BufTagGetForkNum(&buf->tag)).str,
4427 buf->tag.blockNum, buf_state & BUF_FLAG_MASK,
4429 return result;
4430}
4431
4432/*
4433 * CheckPointBuffers
4434 *
4435 * Flush all dirty blocks in buffer pool to disk at checkpoint time.
4436 *
4437 * Note: temporary relations do not participate in checkpoints, so they don't
4438 * need to be flushed.
4439 */
4440void
4442{
4443 BufferSync(flags);
4444}
4445
4446/*
4447 * BufferGetBlockNumber
4448 * Returns the block number associated with a buffer.
4449 *
4450 * Note:
4451 * Assumes that the buffer is valid and pinned, else the
4452 * value may be obsolete immediately...
4453 */
4456{
4458
4460
4461 if (BufferIsLocal(buffer))
4463 else
4465
4466 /* pinned, so OK to read tag without spinlock */
4467 return bufHdr->tag.blockNum;
4468}
4469
4470/*
4471 * BufferGetTag
4472 * Returns the relfilelocator, fork number and block number associated with
4473 * a buffer.
4474 */
4475void
4478{
4480
4481 /* Do the same checks as BufferGetBlockNumber. */
4483
4484 if (BufferIsLocal(buffer))
4486 else
4488
4489 /* pinned, so OK to read tag without spinlock */
4490 *rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
4491 *forknum = BufTagGetForkNum(&bufHdr->tag);
4492 *blknum = bufHdr->tag.blockNum;
4493}
4494
4495/*
4496 * FlushBuffer
4497 * Physically write out a shared buffer.
4498 *
4499 * NOTE: this actually just passes the buffer contents to the kernel; the
4500 * real write to disk won't happen until the kernel feels like it. This
4501 * is okay from our point of view since we can redo the changes from WAL.
4502 * However, we will need to force the changes to disk via fsync before
4503 * we can checkpoint WAL.
4504 *
4505 * The caller must hold a pin on the buffer and have
4506 * (share-)exclusively-locked the buffer contents.
4507 *
4508 * If the caller has an smgr reference for the buffer's relation, pass it
4509 * as the second parameter. If not, pass NULL.
4510 */
4511static void
4514{
4516 ErrorContextCallback errcallback;
4519
4522
4523 /*
4524 * Try to start an I/O operation. If StartSharedBufferIO reports
4525 * BUFFER_IO_ALREADY_DONE, then someone else flushed the buffer before we
4526 * could, so we need not do anything.
4527 */
4528 if (StartSharedBufferIO(buf, false, true, NULL) == BUFFER_IO_ALREADY_DONE)
4529 return;
4530
4531 /* Setup error traceback support for ereport() */
4533 errcallback.arg = buf;
4534 errcallback.previous = error_context_stack;
4535 error_context_stack = &errcallback;
4536
4537 /* Find smgr relation for buffer */
4538 if (reln == NULL)
4540
4542 buf->tag.blockNum,
4543 reln->smgr_rlocator.locator.spcOid,
4544 reln->smgr_rlocator.locator.dbOid,
4545 reln->smgr_rlocator.locator.relNumber);
4546
4547 /*
4548 * As we hold at least a share-exclusive lock on the buffer, the LSN
4549 * cannot change during the flush (and thus can't be torn).
4550 */
4552
4553 /*
4554 * Force XLOG flush up to buffer's LSN. This implements the basic WAL
4555 * rule that log updates must hit disk before any of the data-file changes
4556 * they describe do.
4557 *
4558 * However, this rule does not apply to unlogged relations, which will be
4559 * lost after a crash anyway. Most unlogged relation pages do not bear
4560 * LSNs since we never emit WAL records for them, and therefore flushing
4561 * up through the buffer LSN would be useless, but harmless. However,
4562 * some index AMs use LSNs internally to detect concurrent page
4563 * modifications, and therefore unlogged index pages bear "fake" LSNs
4564 * generated by XLogGetFakeLSN. It is unlikely but possible that the fake
4565 * LSN counter could advance past the WAL insertion point; and if it did
4566 * happen, attempting to flush WAL through that location would fail, with
4567 * disastrous system-wide consequences. To make sure that can't happen,
4568 * skip the flush if the buffer isn't permanent.
4569 */
4570 if (pg_atomic_read_u64(&buf->state) & BM_PERMANENT)
4572
4573 /*
4574 * Now it's safe to write the buffer to disk. Note that no one else should
4575 * have been able to write it, while we were busy with log flushing,
4576 * because we got the exclusive right to perform I/O by setting the
4577 * BM_IO_IN_PROGRESS bit.
4578 */
4580
4581 /* Update page checksum if desired. */
4582 PageSetChecksum((Page) bufBlock, buf->tag.blockNum);
4583
4585
4587 BufTagGetForkNum(&buf->tag),
4588 buf->tag.blockNum,
4589 bufBlock,
4590 false);
4591
4592 /*
4593 * When a strategy is in use, only flushes of dirty buffers already in the
4594 * strategy ring are counted as strategy writes (IOCONTEXT
4595 * [BULKREAD|BULKWRITE|VACUUM] IOOP_WRITE) for the purpose of IO
4596 * statistics tracking.
4597 *
4598 * If a shared buffer initially added to the ring must be flushed before
4599 * being used, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE.
4600 *
4601 * If a shared buffer which was added to the ring later because the
4602 * current strategy buffer is pinned or in use or because all strategy
4603 * buffers were dirty and rejected (for BAS_BULKREAD operations only)
4604 * requires flushing, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE
4605 * (from_ring will be false).
4606 *
4607 * When a strategy is not in use, the write can only be a "regular" write
4608 * of a dirty shared buffer (IOCONTEXT_NORMAL IOOP_WRITE).
4609 */
4612
4614
4615 /*
4616 * Mark the buffer as clean and end the BM_IO_IN_PROGRESS state.
4617 */
4618 TerminateBufferIO(buf, true, 0, true, false);
4619
4621 buf->tag.blockNum,
4622 reln->smgr_rlocator.locator.spcOid,
4623 reln->smgr_rlocator.locator.dbOid,
4624 reln->smgr_rlocator.locator.relNumber);
4625
4626 /* Pop the error context stack */
4627 error_context_stack = errcallback.previous;
4628}
4629
4630/*
4631 * Convenience wrapper around FlushBuffer() that locks/unlocks the buffer
4632 * before/after calling FlushBuffer().
4633 */
4634static void
4644
4645/*
4646 * RelationGetNumberOfBlocksInFork
4647 * Determines the current number of pages in the specified relation fork.
4648 *
4649 * Note that the accuracy of the result will depend on the details of the
4650 * relation's storage. For builtin AMs it'll be accurate, but for external AMs
4651 * it might not be.
4652 */
4655{
4656 if (RELKIND_HAS_TABLE_AM(relation->rd_rel->relkind))
4657 {
4658 /*
4659 * Not every table AM uses BLCKSZ wide fixed size blocks. Therefore
4660 * tableam returns the size in bytes - but for the purpose of this
4661 * routine, we want the number of blocks. Therefore divide, rounding
4662 * up.
4663 */
4665
4666 szbytes = table_relation_size(relation, forkNum);
4667
4668 return (szbytes + (BLCKSZ - 1)) / BLCKSZ;
4669 }
4670 else if (RELKIND_HAS_STORAGE(relation->rd_rel->relkind))
4671 {
4672 return smgrnblocks(RelationGetSmgr(relation), forkNum);
4673 }
4674 else
4675 Assert(false);
4676
4677 return 0; /* keep compiler quiet */
4678}
4679
4680/*
4681 * BufferIsPermanent
4682 * Determines whether a buffer will potentially still be around after
4683 * a crash. Caller must hold a buffer pin.
4684 */
4685bool
4687{
4689
4690 /* Local buffers are used only for temp relations. */
4691 if (BufferIsLocal(buffer))
4692 return false;
4693
4694 /* Make sure we've got a real buffer, and that we hold a pin on it. */
4697
4698 /*
4699 * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
4700 * need not bother with the buffer header spinlock. Even if someone else
4701 * changes the buffer header state while we're doing this, the state is
4702 * changed atomically, so we'll read the old value or the new value, but
4703 * not random garbage.
4704 */
4706 return (pg_atomic_read_u64(&bufHdr->state) & BM_PERMANENT) != 0;
4707}
4708
4709/*
4710 * BufferGetLSNAtomic
4711 * Retrieves the LSN of the buffer atomically.
4712 *
4713 * This is necessary for some callers who may only hold a share lock on
4714 * the buffer. A share lock allows a concurrent backend to set hint bits
4715 * on the page, which in turn may require a WAL record to be emitted.
4716 *
4717 * On platforms with 8 byte atomic reads/writes, we don't need to do any
4718 * additional locking. On platforms not supporting such 8 byte atomic
4719 * reads/writes, we need to actually take the header lock.
4720 */
4723{
4724 /* Make sure we've got a real buffer, and that we hold a pin on it. */
4727
4728#ifdef PG_HAVE_8BYTE_SINGLE_COPY_ATOMICITY
4730#else
4731 {
4732 char *page = BufferGetPage(buffer);
4734 XLogRecPtr lsn;
4735
4736 /*
4737 * If we don't need locking for correctness, fastpath out.
4738 */
4740 return PageGetLSN(page);
4741
4744 lsn = PageGetLSN(page);
4746
4747 return lsn;
4748 }
4749#endif
4750}
4751
4752/* ---------------------------------------------------------------------
4753 * DropRelationBuffers
4754 *
4755 * This function removes from the buffer pool all the pages of the
4756 * specified relation forks that have block numbers >= firstDelBlock.
4757 * (In particular, with firstDelBlock = 0, all pages are removed.)
4758 * Dirty pages are simply dropped, without bothering to write them
4759 * out first. Therefore, this is NOT rollback-able, and so should be
4760 * used only with extreme caution!
4761 *
4762 * Currently, this is called only from smgr.c when the underlying file
4763 * is about to be deleted or truncated (firstDelBlock is needed for
4764 * the truncation case). The data in the affected pages would therefore
4765 * be deleted momentarily anyway, and there is no point in writing it.
4766 * It is the responsibility of higher-level code to ensure that the
4767 * deletion or truncation does not lose any data that could be needed
4768 * later. It is also the responsibility of higher-level code to ensure
4769 * that no other process could be trying to load more pages of the
4770 * relation into buffers.
4771 * --------------------------------------------------------------------
4772 */
4773void
4776{
4777 int i;
4778 int j;
4779 RelFileLocatorBackend rlocator;
4782
4783 rlocator = smgr_reln->smgr_rlocator;
4784
4785 /* If it's a local relation, it's localbuf.c's problem. */
4786 if (RelFileLocatorBackendIsTemp(rlocator))
4787 {
4788 if (rlocator.backend == MyProcNumber)
4789 DropRelationLocalBuffers(rlocator.locator, forkNum, nforks,
4791
4792 return;
4793 }
4794
4795 /*
4796 * To remove all the pages of the specified relation forks from the buffer
4797 * pool, we need to scan the entire buffer pool but we can optimize it by
4798 * finding the buffers from BufMapping table provided we know the exact
4799 * size of each fork of the relation. The exact size is required to ensure
4800 * that we don't leave any buffer for the relation being dropped as
4801 * otherwise the background writer or checkpointer can lead to a PANIC
4802 * error while flushing buffers corresponding to files that don't exist.
4803 *
4804 * To know the exact size, we rely on the size cached for each fork by us
4805 * during recovery which limits the optimization to recovery and on
4806 * standbys but we can easily extend it once we have shared cache for
4807 * relation size.
4808 *
4809 * In recovery, we cache the value returned by the first lseek(SEEK_END)
4810 * and future writes keep the cached value up-to-date. See
4811 * smgrextend. It is possible that the value of the first lseek is smaller
4812 * than the actual number of existing blocks in the file due to buggy
4813 * Linux kernels that might not have accounted for the recent write. But
4814 * that should be fine because there must not be any buffers after that
4815 * file size.
4816 */
4817 for (i = 0; i < nforks; i++)
4818 {
4819 /* Get the number of blocks for a relation's fork */
4821
4823 {
4825 break;
4826 }
4827
4828 /* calculate the number of blocks to be invalidated */
4830 }
4831
4832 /*
4833 * We apply the optimization iff the total number of blocks to invalidate
4834 * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
4835 */
4838 {
4839 for (j = 0; j < nforks; j++)
4840 FindAndDropRelationBuffers(rlocator.locator, forkNum[j],
4842 return;
4843 }
4844
4845 for (i = 0; i < NBuffers; i++)
4846 {
4848
4849 /*
4850 * We can make this a tad faster by prechecking the buffer tag before
4851 * we attempt to lock the buffer; this saves a lot of lock
4852 * acquisitions in typical cases. It should be safe because the
4853 * caller must have AccessExclusiveLock on the relation, or some other
4854 * reason to be certain that no one is loading new pages of the rel
4855 * into the buffer pool. (Otherwise we might well miss such pages
4856 * entirely.) Therefore, while the tag might be changing while we
4857 * look at it, it can't be changing *to* a value we care about, only
4858 * *away* from such a value. So false negatives are impossible, and
4859 * false positives are safe because we'll recheck after getting the
4860 * buffer lock.
4861 *
4862 * We could check forkNum and blockNum as well as the rlocator, but
4863 * the incremental win from doing so seems small.
4864 */
4865 if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator))
4866 continue;
4867
4869
4870 for (j = 0; j < nforks; j++)
4871 {
4872 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator) &&
4873 BufTagGetForkNum(&bufHdr->tag) == forkNum[j] &&
4874 bufHdr->tag.blockNum >= firstDelBlock[j])
4875 {
4876 InvalidateBuffer(bufHdr); /* releases spinlock */
4877 break;
4878 }
4879 }
4880 if (j >= nforks)
4882 }
4883}
4884
4885/* ---------------------------------------------------------------------
4886 * DropRelationsAllBuffers
4887 *
4888 * This function removes from the buffer pool all the pages of all
4889 * forks of the specified relations. It's equivalent to calling
4890 * DropRelationBuffers once per fork per relation with firstDelBlock = 0.
4891 * --------------------------------------------------------------------
4892 */
4893void
4895{
4896 int i;
4897 int n = 0;
4898 SMgrRelation *rels;
4899 BlockNumber (*block)[MAX_FORKNUM + 1];
4902 bool cached = true;
4903 bool use_bsearch;
4904
4905 if (nlocators == 0)
4906 return;
4907
4908 rels = palloc_array(SMgrRelation, nlocators); /* non-local relations */
4909
4910 /* If it's a local relation, it's localbuf.c's problem. */
4911 for (i = 0; i < nlocators; i++)
4912 {
4913 if (RelFileLocatorBackendIsTemp(smgr_reln[i]->smgr_rlocator))
4914 {
4915 if (smgr_reln[i]->smgr_rlocator.backend == MyProcNumber)
4916 DropRelationAllLocalBuffers(smgr_reln[i]->smgr_rlocator.locator);
4917 }
4918 else
4919 rels[n++] = smgr_reln[i];
4920 }
4921
4922 /*
4923 * If there are no non-local relations, then we're done. Release the
4924 * memory and return.
4925 */
4926 if (n == 0)
4927 {
4928 pfree(rels);
4929 return;
4930 }
4931
4932 /*
4933 * This is used to remember the number of blocks in each fork of each of
4934 * the relations.
4935 */
4936 block = (BlockNumber (*)[MAX_FORKNUM + 1])
4937 palloc(sizeof(BlockNumber) * n * (MAX_FORKNUM + 1));
4938
4939 /*
4940 * We can avoid scanning the entire buffer pool if we know the exact size
4941 * of each of the given relation forks. See DropRelationBuffers.
4942 */
4943 for (i = 0; i < n && cached; i++)
4944 {
4945 for (int j = 0; j <= MAX_FORKNUM; j++)
4946 {
4947 /* Get the number of blocks for a relation's fork. */
4948 block[i][j] = smgrnblocks_cached(rels[i], j);
4949
4950 /* We need to consider only the relation forks that exist. */
4951 if (block[i][j] == InvalidBlockNumber)
4952 {
4953 if (!smgrexists(rels[i], j))
4954 continue;
4955 cached = false;
4956 break;
4957 }
4958
4959 /* calculate the total number of blocks to be invalidated */
4960 nBlocksToInvalidate += block[i][j];
4961 }
4962 }
4963
4964 /*
4965 * We apply the optimization iff the total number of blocks to invalidate
4966 * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
4967 */
4969 {
4970 for (i = 0; i < n; i++)
4971 {
4972 for (int j = 0; j <= MAX_FORKNUM; j++)
4973 {
4974 /* ignore relation forks that don't exist */
4975 if (!BlockNumberIsValid(block[i][j]))
4976 continue;
4977
4978 /* drop all the buffers for a particular relation fork */
4979 FindAndDropRelationBuffers(rels[i]->smgr_rlocator.locator,
4980 j, block[i][j], 0);
4981 }
4982 }
4983
4984 pfree(block);
4985 pfree(rels);
4986 return;
4987 }
4988
4989 pfree(block);
4990 locators = palloc_array(RelFileLocator, n); /* non-local relations */
4991 for (i = 0; i < n; i++)
4992 locators[i] = rels[i]->smgr_rlocator.locator;
4993
4994 /*
4995 * For a low number of relations to drop, just use a simple walk-through to
4996 * save the bsearch overhead. The threshold to use is a guess rather than
4997 * an exactly determined value, as it depends on many factors (CPU and RAM
4998 * speeds, amount of shared buffers, etc.).
4999 */
5001
5002 /* sort the list of rlocators if necessary */
5003 if (use_bsearch)
5005
5006 for (i = 0; i < NBuffers; i++)
5007 {
5008 RelFileLocator *rlocator = NULL;
5010
5011 /*
5012 * As in DropRelationBuffers, an unlocked precheck should be safe and
5013 * saves some cycles.
5014 */
5015
5016 if (!use_bsearch)
5017 {
5018 int j;
5019
5020 for (j = 0; j < n; j++)
5021 {
5023 {
5024 rlocator = &locators[j];
5025 break;
5026 }
5027 }
5028 }
5029 else
5030 {
5031 RelFileLocator locator;
5032
5033 locator = BufTagGetRelFileLocator(&bufHdr->tag);
5034 rlocator = bsearch(&locator,
5035 locators, n, sizeof(RelFileLocator),
5037 }
5038
5039 /* buffer doesn't belong to any of the given relfilelocators; skip it */
5040 if (rlocator == NULL)
5041 continue;
5042
5044 if (BufTagMatchesRelFileLocator(&bufHdr->tag, rlocator))
5045 InvalidateBuffer(bufHdr); /* releases spinlock */
5046 else
5048 }
5049
5050 pfree(locators);
5051 pfree(rels);
5052}
5053
5054/* ---------------------------------------------------------------------
5055 * FindAndDropRelationBuffers
5056 *
5057 * This function performs lookups in the BufMapping table and removes from
5058 * the buffer pool all the pages of the specified relation fork that have
5059 * block numbers >= firstDelBlock. (In particular, with firstDelBlock = 0,
5060 * all pages are removed.)
5061 * --------------------------------------------------------------------
5062 */
5063static void
5067{
5068 BlockNumber curBlock;
5069
5070 for (curBlock = firstDelBlock; curBlock < nForkBlock; curBlock++)
5071 {
5072 uint32 bufHash; /* hash value for tag */
5073 BufferTag bufTag; /* identity of requested block */
5074 LWLock *bufPartitionLock; /* buffer partition lock for it */
5075 int buf_id;
5077
5078 /* create a tag so we can lookup the buffer */
5079 InitBufferTag(&bufTag, &rlocator, forkNum, curBlock);
5080
5081 /* determine its hash code and partition lock ID */
5084
5085 /* Check that it is in the buffer pool. If not, do nothing. */
5087 buf_id = BufTableLookup(&bufTag, bufHash);
5089
5090 if (buf_id < 0)
5091 continue;
5092
5093 bufHdr = GetBufferDescriptor(buf_id);
5094
5095 /*
5096 * We need to lock the buffer header and recheck if the buffer is
5097 * still associated with the same block because the buffer could be
5098 * evicted by some other backend loading blocks for a different
5099 * relation after we release lock on the BufMapping table.
5100 */
5102
5103 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator) &&
5104 BufTagGetForkNum(&bufHdr->tag) == forkNum &&
5105 bufHdr->tag.blockNum >= firstDelBlock)
5106 InvalidateBuffer(bufHdr); /* releases spinlock */
5107 else
5109 }
5110}
5111
5112/* ---------------------------------------------------------------------
5113 * DropDatabaseBuffers
5114 *
5115 * This function removes all the buffers in the buffer cache for a
5116 * particular database. Dirty pages are simply dropped, without
5117 * bothering to write them out first. This is used when we destroy a
5118 * database, to avoid trying to flush data to disk when the directory
5119 * tree no longer exists. Implementation is pretty similar to
5120 * DropRelationBuffers() which is for destroying just one relation.
5121 * --------------------------------------------------------------------
5122 */
5123void
5125{
5126 int i;
5127
5128 /*
5129 * We needn't consider local buffers, since by assumption the target
5130 * database isn't our own.
5131 */
5132
5133 for (i = 0; i < NBuffers; i++)
5134 {
5136
5137 /*
5138 * As in DropRelationBuffers, an unlocked precheck should be safe and
5139 * saves some cycles.
5140 */
5141 if (bufHdr->tag.dbOid != dbid)
5142 continue;
5143
5145 if (bufHdr->tag.dbOid == dbid)
5146 InvalidateBuffer(bufHdr); /* releases spinlock */
5147 else
5149 }
5150}
5151
5152/* ---------------------------------------------------------------------
5153 * FlushRelationBuffers
5154 *
5155 * This function writes all dirty pages of a relation out to disk
5156 * (or more accurately, out to kernel disk buffers), ensuring that the
5157 * kernel has an up-to-date view of the relation.
5158 *
5159 * Generally, the caller should be holding AccessExclusiveLock on the
5160 * target relation to ensure that no other backend is busy dirtying
5161 * more blocks of the relation; the effects can't be expected to last
5162 * after the lock is released.
5163 *
5164 * XXX currently it sequentially searches the buffer pool, should be
5165 * changed to more clever ways of searching. This routine is not
5166 * used in any performance-critical code paths, so it's not worth
5167 * adding additional overhead to normal paths to make it go faster.
5168 * --------------------------------------------------------------------
5169 */
5170void
5172{
5173 int i;
5175 SMgrRelation srel = RelationGetSmgr(rel);
5176
5177 if (RelationUsesLocalBuffers(rel))
5178 {
5179 for (i = 0; i < NLocBuffer; i++)
5180 {
5182
5184 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
5185 ((buf_state = pg_atomic_read_u64(&bufHdr->state)) &
5186 (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
5187 {
5188 ErrorContextCallback errcallback;
5189
5190 /* Setup error traceback support for ereport() */
5192 errcallback.arg = bufHdr;
5193 errcallback.previous = error_context_stack;
5194 error_context_stack = &errcallback;
5195
5196 /* Make sure we can handle the pin */
5199
5200 /*
5201 * Pin/unpin mostly to make valgrind work, but it also seems
5202 * like the right thing to do.
5203 */
5204 PinLocalBuffer(bufHdr, false);
5205
5206
5207 FlushLocalBuffer(bufHdr, srel);
5208
5210
5211 /* Pop the error context stack */
5212 error_context_stack = errcallback.previous;
5213 }
5214 }
5215
5216 return;
5217 }
5218
5219 for (i = 0; i < NBuffers; i++)
5220 {
5222
5224
5225 /*
5226 * As in DropRelationBuffers, an unlocked precheck should be safe and
5227 * saves some cycles.
5228 */
5230 continue;
5231
5232 /* Make sure we can handle the pin */
5235
5237 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
5239 {
5243 }
5244 else
5246 }
5247}
5248
5249/* ---------------------------------------------------------------------
5250 * FlushRelationsAllBuffers
5251 *
5252 * This function flushes out of the buffer pool all the pages of all
5253 * forks of the specified smgr relations. It's equivalent to calling
5254 * FlushRelationBuffers once per relation. The relations are assumed not
5255 * to use local buffers.
5256 * --------------------------------------------------------------------
5257 */
5258void
5260{
5261 int i;
5263 bool use_bsearch;
5264
5265 if (nrels == 0)
5266 return;
5267
5268 /* fill-in array for qsort */
5270
5271 for (i = 0; i < nrels; i++)
5272 {
5273 Assert(!RelFileLocatorBackendIsTemp(smgrs[i]->smgr_rlocator));
5274
5275 srels[i].rlocator = smgrs[i]->smgr_rlocator.locator;
5276 srels[i].srel = smgrs[i];
5277 }
5278
5279 /*
5280 * Save the bsearch overhead for a low number of relations to sync. See
5281 * DropRelationsAllBuffers for details.
5282 */
5284
5285 /* sort the list of SMgrRelations if necessary */
5286 if (use_bsearch)
5287 qsort(srels, nrels, sizeof(SMgrSortArray), rlocator_comparator);
5288
5289 for (i = 0; i < NBuffers; i++)
5290 {
5294
5295 /*
5296 * As in DropRelationBuffers, an unlocked precheck should be safe and
5297 * saves some cycles.
5298 */
5299
5300 if (!use_bsearch)
5301 {
5302 int j;
5303
5304 for (j = 0; j < nrels; j++)
5305 {
5306 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srels[j].rlocator))
5307 {
5308 srelent = &srels[j];
5309 break;
5310 }
5311 }
5312 }
5313 else
5314 {
5315 RelFileLocator rlocator;
5316
5317 rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
5318 srelent = bsearch(&rlocator,
5319 srels, nrels, sizeof(SMgrSortArray),
5321 }
5322
5323 /* buffer doesn't belong to any of the given relfilelocators; skip it */
5324 if (srelent == NULL)
5325 continue;
5326
5327 /* Make sure we can handle the pin */
5330
5332 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srelent->rlocator) &&
5334 {
5338 }
5339 else
5341 }
5342
5343 pfree(srels);
5344}
5345
5346/* ---------------------------------------------------------------------
5347 * RelationCopyStorageUsingBuffer
5348 *
5349 * Copy a fork's data using bufmgr. Same as RelationCopyStorage, but instead
5350 * of using smgrread and smgrextend this copies using the bufmgr APIs.
5351 *
5352 * Refer to the comments atop CreateAndCopyRelationData() for details about
5353 * the 'permanent' parameter.
5354 * --------------------------------------------------------------------
5355 */
5356static void
5359 ForkNumber forkNum, bool permanent)
5360{
5361 Buffer srcBuf;
5362 Buffer dstBuf;
5363 Page srcPage;
5364 Page dstPage;
5365 bool use_wal;
5366 BlockNumber nblocks;
5367 BlockNumber blkno;
5374
5375 /*
5376 * In general, we want to write WAL whenever wal_level > 'minimal', but we
5377 * can skip it when copying any fork of an unlogged relation other than
5378 * the init fork.
5379 */
5380 use_wal = XLogIsNeeded() && (permanent || forkNum == INIT_FORKNUM);
5381
5382 /* Get number of blocks in the source relation. */
5384 forkNum);
5385
5386 /* Nothing to copy; just return. */
5387 if (nblocks == 0)
5388 return;
5389
5390 /*
5391 * Bulk extend the destination relation to the same size as the source
5392 * relation before starting to copy block by block.
5393 */
5394 memset(buf.data, 0, BLCKSZ);
5395 smgrextend(smgropen(dstlocator, INVALID_PROC_NUMBER), forkNum, nblocks - 1,
5396 buf.data, true);
5397
5398 /* This is a bulk operation, so use buffer access strategies. */
5401
5402 /* Initialize streaming read */
5403 p.current_blocknum = 0;
5404 p.last_exclusive = nblocks;
5406
5407 /*
5408 * It is safe to use batchmode as block_range_read_stream_cb takes no
5409 * locks.
5410 */
5414 src_smgr,
5416 forkNum,
5418 &p,
5419 0);
5420
5421 /* Iterate over each block of the source relation file. */
5422 for (blkno = 0; blkno < nblocks; blkno++)
5423 {
5425
5426 /* Read block from source relation. */
5430
5434 permanent);
5436
5438
5439 /* Copy page data from the source to the destination. */
5442
5443 /* WAL-log the copied page. */
5444 if (use_wal)
5446
5448
5451 }
5454
5457}
5458
5459/* ---------------------------------------------------------------------
5460 * CreateAndCopyRelationData
5461 *
5462 * Create destination relation storage and copy all forks from the
5463 * source relation to the destination.
5464 *
5465 * Pass permanent as true for permanent relations and false for
5466 * unlogged relations. Currently this API is not supported for
5467 * temporary relations.
5468 * --------------------------------------------------------------------
5469 */
5470void
5472 RelFileLocator dst_rlocator, bool permanent)
5473{
5474 char relpersistence;
5477
5478 /* Set the relpersistence. */
5479 relpersistence = permanent ?
5481
5484
5485 /*
5486 * Create and copy all forks of the relation. During create database we
5487 * have a separate cleanup mechanism which deletes complete database
5488 * directory. Therefore, each individual relation doesn't need to be
5489 * registered for cleanup.
5490 */
5491 RelationCreateStorage(dst_rlocator, relpersistence, false);
5492
5493 /* copy main fork. */
5495 permanent);
5496
5497 /* copy those extra forks that exist */
5498 for (ForkNumber forkNum = MAIN_FORKNUM + 1;
5499 forkNum <= MAX_FORKNUM; forkNum++)
5500 {
5501 if (smgrexists(src_rel, forkNum))
5502 {
5503 smgrcreate(dst_rel, forkNum, false);
5504
5505 /*
5506 * WAL log creation if the relation is persistent, or this is the
5507 * init fork of an unlogged relation.
5508 */
5509 if (permanent || forkNum == INIT_FORKNUM)
5510 log_smgrcreate(&dst_rlocator, forkNum);
5511
5512 /* Copy a fork's data, block by block. */
5514 permanent);
5515 }
5516 }
5517}
5518
5519/* ---------------------------------------------------------------------
5520 * FlushDatabaseBuffers
5521 *
5522 * This function writes all dirty pages of a database out to disk
5523 * (or more accurately, out to kernel disk buffers), ensuring that the
5524 * kernel has an up-to-date view of the database.
5525 *
5526 * Generally, the caller should be holding an appropriate lock to ensure
5527 * no other backend is active in the target database; otherwise more
5528 * pages could get dirtied.
5529 *
5530 * Note we don't worry about flushing any pages of temporary relations.
5531 * It's assumed these wouldn't be interesting.
5532 * --------------------------------------------------------------------
5533 */
5534void
5536{
5537 int i;
5539
5540 for (i = 0; i < NBuffers; i++)
5541 {
5543
5545
5546 /*
5547 * As in DropRelationBuffers, an unlocked precheck should be safe and
5548 * saves some cycles.
5549 */
5550 if (bufHdr->tag.dbOid != dbid)
5551 continue;
5552
5553 /* Make sure we can handle the pin */
5556
5558 if (bufHdr->tag.dbOid == dbid &&
5560 {
5564 }
5565 else
5567 }
5568}
5569
5570/*
5571 * Flush a pinned buffer, previously locked in share-exclusive or exclusive
5572 * mode, to the OS.
5573 */
5574void
5576{
5578
5579 /* currently not needed, but no fundamental reason not to support */
5581
5583
5585
5587
5589}
5590
5591/*
5592 * ReleaseBuffer -- release the pin on a buffer
5593 */
5594void
5596{
5597 if (!BufferIsValid(buffer))
5598 elog(ERROR, "bad buffer ID: %d", buffer);
5599
5600 if (BufferIsLocal(buffer))
5602 else
5604}
5605
5606/*
5607 * UnlockReleaseBuffer -- release the content lock and pin on a buffer
5608 *
5609 * This is just a more efficient shorthand for a common combination.
5610 */
5611void
5613{
5614 int mode;
5615 BufferDesc *buf;
5617 uint64 sub;
5619
5621
5622 if (BufferIsLocal(buffer))
5623 {
5625 return;
5626 }
5627
5629
5631
5633
5634 /* compute state modification for lock release */
5636
5637 /* compute state modification for pin release */
5639 Assert(ref != NULL);
5640 Assert(ref->data.refcount > 0);
5641 ref->data.refcount--;
5642
5643 /* no more backend local pins, reduce shared pin count */
5644 if (likely(ref->data.refcount == 0))
5645 {
5646 /* See comment in UnpinBufferNoOwner() */
5648
5649 sub |= BUF_REFCOUNT_ONE;
5651 }
5652
5653 /* perform the lock and pin release in one atomic op */
5654 lockstate = pg_atomic_sub_fetch_u64(&buf->state, sub);
5655
5656 /* wake up waiters for the lock */
5658
5659 /* wake up waiter for the pin release */
5662
5663 /*
5664 * Now okay to allow cancel/die interrupts again, which were held when the
5665 * lock was acquired.
5666 */
5668}
5669
5670/*
5671 * IncrBufferRefCount
5672 * Increment the pin count on a buffer that we have *already* pinned
5673 * at least once.
5674 *
5675 * This function cannot be used on a buffer we do not have pinned,
5676 * because it doesn't change the shared buffer state.
5677 */
5678void
5695
5696/*
5697 * Shared-buffer only helper for MarkBufferDirtyHint() and
5698 * BufferSetHintBits16().
5699 *
5700 * This is separated out because it turns out that the repeated checks for
5701 * local buffers, repeated GetBufferDescriptor() calls and repeated reads of
5702 * the buffer's state noticeably hurt the performance of BufferSetHintBits16().
5703 */
5704static inline void
5706 bool buffer_std)
5707{
5708 Page page = BufferGetPage(buffer);
5709
5711
5712 /* here, either share-exclusive or exclusive lock is OK */
5715
5716 /*
5717 * This routine might get called many times on the same page, if we are
5718 * making the first scan after commit of an xact that added/deleted many
5719 * tuples. So, be as quick as we can if the buffer is already dirty.
5720 *
5721 * As we are holding (at least) a share-exclusive lock, nobody could have
5722 * cleaned or dirtied the page concurrently, so we can just rely on the
5723 * previously fetched value here without any danger of races.
5724 */
5725 if (unlikely(!(lockstate & BM_DIRTY)))
5726 {
5728 bool wal_log = false;
5730
5731 /*
5732 * If we need to protect hint bit updates from torn writes, WAL-log a
5733 * full page image of the page. This full page image is only necessary
5734 * if the hint bit update is the first change to the page since the
5735 * last checkpoint.
5736 *
5737 * We don't check full_page_writes here because that logic is included
5738 * when we call XLogInsert() since the value changes dynamically.
5739 */
5741 {
5742 /*
5743 * If we must not write WAL, due to a relfilelocator-specific
5744 * condition or being in recovery, don't dirty the page. We can
5745 * set the hint, just not dirty the page as a result so the hint
5746 * is lost when we evict the page or shutdown.
5747 *
5748 * See src/backend/storage/page/README for longer discussion.
5749 */
5750 if (RecoveryInProgress() ||
5752 return;
5753
5754 wal_log = true;
5755 }
5756
5757 /*
5758 * We must mark the page dirty before we emit the WAL record, as per
5759 * the usual rules, to ensure that BufferSync()/SyncOneBuffer() try to
5760 * flush the buffer, even if we haven't inserted the WAL record yet.
5761 * As we hold at least a share-exclusive lock, checkpoints will wait
5762 * for this backend to be done with the buffer before continuing. If
5763 * we did it the other way round, a checkpoint could start between
5764 * writing the WAL record and marking the buffer dirty.
5765 */
5767
5768 /*
5769 * It should not be possible for the buffer to already be dirty, see
5770 * comment above.
5771 */
5775 BM_DIRTY,
5776 0, 0);
5777
5778 /*
5779 * If the block is already dirty because we either made a change or
5780 * set a hint already, then we don't need to write a full page image.
5781 * Note that aggressive cleaning of blocks dirtied by hint bit setting
5782 * would increase the call rate. Bulk setting of hint bits would
5783 * reduce the call rate...
5784 */
5785 if (wal_log)
5787
5788 if (XLogRecPtrIsValid(lsn))
5789 {
5790 /*
5791 * Set the page LSN if we wrote a backup block. To allow backends
5792 * that only hold a share lock on the buffer to read the LSN in a
5793 * tear-free manner, we set the page LSN while holding the buffer
5794 * header lock. This allows any reader of an LSN who holds only a
5795 * share lock to also obtain a buffer header lock before using
5796 * PageGetLSN() to read the LSN in a tear free way. This is done
5797 * in BufferGetLSNAtomic().
5798 *
5799 * If checksums are enabled, you might think we should reset the
5800 * checksum here. That will happen when the page is written
5801 * sometime later in this checkpoint cycle.
5802 */
5804 PageSetLSN(page, lsn);
5806 }
5807
5809 if (VacuumCostActive)
5811 }
5812}
5813
5814/*
5815 * MarkBufferDirtyHint
5816 *
5817 * Mark a buffer dirty for non-critical changes.
5818 *
5819 * This is essentially the same as MarkBufferDirty, except:
5820 *
5821 * 1. The caller does not write WAL; so if checksums are enabled, we may need
5822 * to write an XLOG_FPI_FOR_HINT WAL record to protect against torn pages.
5823 * 2. The caller might have only a share-exclusive-lock instead of an
5824 * exclusive-lock on the buffer's content lock.
5825 * 3. This function does not guarantee that the buffer is always marked dirty
5826 * (e.g. it can't always do so on a hot standby), so it cannot be used for
5827 * important changes.
5828 */
5829inline void
5831{
5833
5835
5836 if (!BufferIsValid(buffer))
5837 elog(ERROR, "bad buffer ID: %d", buffer);
5838
5839 if (BufferIsLocal(buffer))
5840 {
5842 return;
5843 }
5844
5846 pg_atomic_read_u64(&bufHdr->state),
5847 buffer_std);
5848}
5849
5850/*
5851 * Release buffer content locks for shared buffers.
5852 *
5853 * Used to clean up after errors.
5854 *
5855 * Currently, we can expect that resource owner cleanup, via
5856 * ResOwnerReleaseBuffer(), took care of releasing buffer content locks per
5857 * se; the only thing we need to deal with here is clearing any PIN_COUNT
5858 * request that was in progress.
5859 */
5860void
5862{
5864
5865 if (buf)
5866 {
5868 uint64 unset_bits = 0;
5869
5871
5872 /*
5873 * Don't complain if flag bit not set; it could have been reset but we
5874 * got a cancel/die interrupt before getting the signal.
5875 */
5876 if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
5877 buf->wait_backend_pgprocno == MyProcNumber)
5879
5881 0, unset_bits,
5882 0);
5883
5885 }
5886}
5887
5888/*
5889 * Acquire the buffer content lock in the specified mode
5890 *
5891 * If the lock is not available, sleep until it is.
5892 *
5893 * Side effect: cancel/die interrupts are held off until lock release.
5894 *
5895 * This uses almost the same locking approach as lwlock.c's
5896 * LWLockAcquire(). See documentation at the top of lwlock.c for a more
5897 * detailed discussion.
5898 *
5899 * The reason that this, and most of the other BufferLock* functions, get both
5900 * the Buffer and BufferDesc* as parameters, is that looking up one from the
5901 * other repeatedly shows up noticeably in profiles.
5902 *
5903 * Callers should provide a constant for mode, for more efficient code
5904 * generation.
5905 */
5906static inline void
5908{
5909 PrivateRefCountEntry *entry;
5910 int extraWaits = 0;
5911
5912 /*
5913 * Get a reference to the refcount entry before acquiring the lock; it
5914 * seems better to do the lookup while not yet holding the lock.
5915 */
5916 entry = GetPrivateRefCountEntry(buffer, true);
5917
5918 /*
5919 * We better not already hold a lock on the buffer.
5920 */
5922
5923 /*
5924 * Lock out cancel/die interrupts until we exit the code section protected
5925 * by the content lock. This ensures that interrupts will not interfere
5926 * with manipulations of data structures in shared memory.
5927 */
5929
5930 for (;;)
5931 {
5932 uint32 wait_event = 0; /* initialized to avoid compiler warning */
5933 bool mustwait;
5934
5935 /*
5936 * Try to grab the lock the first time, we're not in the waitqueue
5937 * yet/anymore.
5938 */
5940
5941 if (likely(!mustwait))
5942 {
5943 break;
5944 }
5945
5946 /*
5947 * Ok, at this point we couldn't grab the lock on the first try. We
5948 * cannot simply queue ourselves to the end of the list and wait to be
5949 * woken up because by now the lock could long have been released.
5950 * Instead add us to the queue and try to grab the lock again. If we
5951 * succeed we need to revert the queuing and be happy, otherwise we
5952 * recheck the lock. If we still couldn't grab it, we know that the
5953 * other locker will see our queue entries when releasing since they
5954 * existed before we checked for the lock.
5955 */
5956
5957 /* add to the queue */
5959
5960 /* we're now guaranteed to be woken up if necessary */
5962
5963 /* ok, grabbed the lock the second time round, need to undo queueing */
5964 if (!mustwait)
5965 {
5967 break;
5968 }
5969
5970 switch (mode)
5971 {
5974 break;
5977 break;
5978 case BUFFER_LOCK_SHARE:
5980 break;
5981 case BUFFER_LOCK_UNLOCK:
5983
5984 }
5986
5987 /*
5988 * Wait until awakened.
5989 *
5990 * It is possible that we get awakened for a reason other than being
5991 * signaled by BufferLockWakeup(). If so, loop back and wait again.
5992 * Once we've gotten the lock, re-increment the sema by the number of
5993 * additional signals received.
5994 */
5995 for (;;)
5996 {
5999 break;
6000 extraWaits++;
6001 }
6002
6004
6005 /* Retrying, allow BufferLockReleaseSub to release waiters again. */
6007 }
6008
6009 /* Remember that we now hold this lock */
6010 entry->data.lockmode = mode;
6011
6012 /*
6013 * Fix the process wait semaphore's count for any absorbed wakeups.
6014 */
6015 while (unlikely(extraWaits-- > 0))
6017}
6018
6019/*
6020 * Release a previously acquired buffer content lock.
6021 */
6022static void
6024{
6027 uint64 sub;
6028
6030
6031 /*
6032 * Release my hold on the lock; after that it can immediately be acquired
6033 * by others, even if we still have to wake up other waiters.
6034 */
6036
6038
6040
6041 /*
6042 * Now okay to allow cancel/die interrupts.
6043 */
6045}
6046
6047
6048/*
6049 * Acquire the content lock for the buffer, but only if we don't have to wait.
6050 *
6051 * It is allowed to try to conditionally acquire a lock on a buffer that this
6052 * backend has already locked, but the lock acquisition will always fail, even
6053 * if the new lock acquisition does not conflict with an already held lock
6054 * (e.g. two share locks). This is because we currently do not have space to
6055 * track multiple lock ownerships of the same buffer within one backend. That
6056 * is ok for the current uses of BufferLockConditional().
6057 */
6058static bool
6060{
6062 bool mustwait;
6063
6064 /*
6065 * As described above, if we're trying to lock a buffer this backend
6066 * already has locked, return false, independent of the existing and
6067 * desired lock level.
6068 */
6069 if (entry->data.lockmode != BUFFER_LOCK_UNLOCK)
6070 return false;
6071
6072 /*
6073 * Lock out cancel/die interrupts until we exit the code section protected
6074 * by the content lock. This ensures that interrupts will not interfere
6075 * with manipulations of data structures in shared memory.
6076 */
6078
6079 /* Check for the lock */
6081
6082 if (mustwait)
6083 {
6084 /* Failed to get lock, so release interrupt holdoff */
6086 }
6087 else
6088 {
6089 entry->data.lockmode = mode;
6090 }
6091
6092 return !mustwait;
6093}
6094
6095/*
6096 * Internal function that tries to atomically acquire the content lock in the
6097 * passed in mode.
6098 *
6099 * This function will not block waiting for a lock to become free - that's the
6100 * caller's job.
6101 *
6102 * Similar to LWLockAttemptLock().
6103 */
6104static inline bool
6106{
6108
6109 /*
6110 * Read once outside the loop, later iterations will get the newer value
6111 * via compare & exchange.
6112 */
6114
6115 /* loop until we've determined whether we could acquire the lock or not */
6116 while (true)
6117 {
6119 bool lock_free;
6120
6122
6124 {
6125 lock_free = (old_state & BM_LOCK_MASK) == 0;
6126 if (lock_free)
6128 }
6130 {
6132 if (lock_free)
6134 }
6135 else
6136 {
6138 if (lock_free)
6140 }
6141
6142 /*
6143 * Attempt to swap in the state we are expecting. If we didn't see
6144 * the lock to be free, that's just the old value. If we saw it as free,
6145 * we'll attempt to mark it acquired. The reason that we always swap
6146 * in the value is that this doubles as a memory barrier. We could try
6147 * to be smarter and only swap in values if we saw the lock as free,
6148 * but benchmarks haven't shown it to be beneficial so far.
6149 *
6150 * Retry if the value changed since we last looked at it.
6151 */
6154 {
6155 if (lock_free)
6156 {
6157 /* Great! Got the lock. */
6158 return false;
6159 }
6160 else
6161 return true; /* somebody else has the lock */
6162 }
6163 }
6164
6166}
6167
6168/*
6169 * Add ourselves to the end of the content lock's wait queue.
6170 */
6171static void
6173{
6174 /*
6175 * If we don't have a PGPROC structure, there's no way to wait. This
6176 * should never occur, since MyProc should only be null during shared
6177 * memory initialization.
6178 */
6179 if (MyProc == NULL)
6180 elog(PANIC, "cannot wait without a PGPROC structure");
6181
6183 elog(PANIC, "queueing for lock while waiting on another one");
6184
6186
6187 /* setting the flag is protected by the spinlock */
6189
6190 /*
6191 * These are currently used both for lwlocks and buffer content locks,
6192 * which is acceptable, although not pretty, because a backend can't wait
6193 * for both types of locks at the same time.
6194 */
6197
6198 proclist_push_tail(&buf_hdr->lock_waiters, MyProcNumber, lwWaitLink);
6199
6200 /* Can release the mutex now */
6202}
6203
6204/*
6205 * Remove ourselves from the waitlist.
6206 *
6207 * This is used if we queued ourselves because we thought we needed to sleep
6208 * but, after further checking, we discovered that we don't actually need to
6209 * do so.
6210 */
6211static void
6213{
6214 bool on_waitlist;
6215
6217
6219 if (on_waitlist)
6220 proclist_delete(&buf_hdr->lock_waiters, MyProcNumber, lwWaitLink);
6221
6222 if (proclist_is_empty(&buf_hdr->lock_waiters) &&
6224 {
6226 }
6227
6228 /* XXX: combine with fetch_and above? */
6230
6231 /* clear waiting state again, nice for debugging */
6232 if (on_waitlist)
6234 else
6235 {
6236 int extraWaits = 0;
6237
6238
6239 /*
6240 * Somebody else dequeued us and has or will wake us up. Deal with the
6241 * superfluous absorption of a wakeup.
6242 */
6243
6244 /*
6245 * Clear BM_LOCK_WAKE_IN_PROGRESS if somebody woke us before we
6246 * removed ourselves - they'll have set it.
6247 */
6249
6250 /*
6251 * Now wait for the scheduled wakeup, otherwise our ->lwWaiting would
6252 * get reset at some inconvenient point later. Most of the time this
6253 * will immediately return.
6254 */
6255 for (;;)
6256 {
6259 break;
6260 extraWaits++;
6261 }
6262
6263 /*
6264 * Fix the process wait semaphore's count for any absorbed wakeups.
6265 */
6266 while (extraWaits-- > 0)
6268 }
6269}
6270
6271/*
6272 * Stop treating lock as held by current backend.
6273 *
6274 * After calling this function it's the caller's responsibility to ensure that
6275 * the lock gets released, even in case of an error. This only is desirable if
6276 * the lock is going to be released in a different process than the process
6277 * that acquired it.
6278 */
6279static inline void
6285
6286/*
6287 * Stop treating lock as held by current backend.
6288 *
6289 * This is the code that can be shared between actually releasing a lock
6290 * (BufferLockUnlock()) and just not tracking ownership of the lock anymore
6291 * without releasing the lock (BufferLockDisown()).
6292 */
6293static inline int
6295{
6298
6300 if (ref == NULL)
6301 elog(ERROR, "lock %d is not held", buffer);
6302 mode = ref->data.lockmode;
6303 ref->data.lockmode = BUFFER_LOCK_UNLOCK;
6304
6305 return mode;
6306}
6307
6308/*
6309 * Wake up all the lockers that currently have a chance to acquire the lock.
6310 *
6311 * wake_exclusive indicates whether exclusive lock waiters should be woken up.
6312 */
6313static void
6315{
6316 bool new_wake_in_progress = false;
6317 bool wake_share_exclusive = true;
6320
6322
6323 /* lock wait list while collecting backends to wake up */
6325
6326 proclist_foreach_modify(iter, &buf_hdr->lock_waiters, lwWaitLink)
6327 {
6328 PGPROC *waiter = GetPGProcByNumber(iter.cur);
6329
6330 /*
6331 * Already woke up a conflicting lock, so skip over this wait list
6332 * entry.
6333 */
6335 continue;
6337 continue;
6338
6339 proclist_delete(&buf_hdr->lock_waiters, iter.cur, lwWaitLink);
6340 proclist_push_tail(&wakeup, iter.cur, lwWaitLink);
6341
6342 /*
6343 * Prevent additional wakeups until retryer gets to run. Backends that
6344 * are just waiting for the lock to become free don't retry
6345 * automatically.
6346 */
6347 new_wake_in_progress = true;
6348
6349 /*
6350 * Signal that the process isn't on the wait list anymore. This allows
6351 * BufferLockDequeueSelf() to remove itself from the waitlist with a
6352 * proclist_delete(), rather than having to check if it has been
6353 * removed from the list.
6354 */
6355 Assert(waiter->lwWaiting == LW_WS_WAITING);
6357
6358 /*
6359 * Don't wakeup further waiters after waking a conflicting waiter.
6360 */
6361 if (waiter->lwWaitMode == BUFFER_LOCK_SHARE)
6362 {
6363 /*
6364 * Share locks conflict with exclusive locks.
6365 */
6366 wake_exclusive = false;
6367 }
6368 else if (waiter->lwWaitMode == BUFFER_LOCK_SHARE_EXCLUSIVE)
6369 {
6370 /*
6371 * Share-exclusive locks conflict with share-exclusive and
6372 * exclusive locks.
6373 */
6374 wake_exclusive = false;
6375 wake_share_exclusive = false;
6376 }
6377 else if (waiter->lwWaitMode == BUFFER_LOCK_EXCLUSIVE)
6378 {
6379 /*
6380 * Exclusive locks conflict with all other locks, there's no point
6381 * in waking up anybody else.
6382 */
6383 break;
6384 }
6385 }
6386
6388
6389 /* unset required flags, and release lock, in one fell swoop */
6390 {
6393
6395 while (true)
6396 {
6398
6399 /* compute desired flags */
6400
6403 else
6405
6406 if (proclist_is_empty(&buf_hdr->lock_waiters))
6408
6409 desired_state &= ~BM_LOCKED; /* release lock */
6410
6413 break;
6414 }
6415 }
6416
6417 /* Awaken any waiters I removed from the queue. */
6418 proclist_foreach_modify(iter, &wakeup, lwWaitLink)
6419 {
6420 PGPROC *waiter = GetPGProcByNumber(iter.cur);
6421
6422 proclist_delete(&wakeup, iter.cur, lwWaitLink);
6423
6424 /*
6425 * Guarantee that lwWaiting being unset only becomes visible once the
6426 * unlink from the list has completed. Otherwise the target backend
6427 * could be woken up for another reason and enqueue for a new lock - if
6428 * that happens before the list unlink happens, the list would end up
6429 * being corrupted.
6430 *
6431 * The barrier pairs with the LockBufHdr() when enqueuing for another
6432 * lock.
6433 */
6435 waiter->lwWaiting = LW_WS_NOT_WAITING;
6436 PGSemaphoreUnlock(waiter->sem);
6437 }
6438}
6439
6440/*
6441 * Compute subtraction from buffer state for a release of a held lock in
6442 * `mode`.
6443 *
6444 * This is separated from BufferLockUnlock() as we want to combine the lock
6445 * release with other atomic operations when possible, leading to the lock
6446 * release being done in multiple places, each needing to compute what to
6447 * subtract from the lock state.
6448 */
6449static inline uint64
6451{
6452 /*
6453 * Turns out that a switch() leads gcc to generate sufficiently worse code
6454 * for this to show up in profiles...
6455 */
6457 return BM_LOCK_VAL_EXCLUSIVE;
6460 else
6461 {
6463 return BM_LOCK_VAL_SHARED;
6464 }
6465
6466 return 0; /* keep compiler quiet */
6467}
6468
6469/*
6470 * Handle work that needs to be done after releasing a lock that was held in
6471 * `mode`, where `lockstate` is the result of the atomic operation modifying
6472 * the state variable.
6473 *
6474 * This is separated from BufferLockUnlock() as we want to combine the lock
6475 * release with other atomic operations when possible, leading to the lock
6476 * release being done in multiple places.
6477 */
6478static void
6480{
6481 bool check_waiters = false;
6482 bool wake_exclusive = false;
6483
6484 /* nobody else can have that kind of lock */
6486
6487 /*
6488 * If we're still waiting for backends to get scheduled, don't wake them
6489 * up again. Otherwise check if we need to look through the waitqueue to
6490 * wake other backends.
6491 */
6494 {
6495 if ((lockstate & BM_LOCK_MASK) == 0)
6496 {
6497 /*
6498 * We released a lock and the lock was, in that moment, free. We
6499 * therefore can wake waiters for any kind of lock.
6500 */
6501 check_waiters = true;
6502 wake_exclusive = true;
6503 }
6505 {
6506 /*
6507 * We released the lock, but another backend still holds a lock.
6508 * We can't have released an exclusive lock, as there couldn't
6509 * have been other lock holders. If we released a share lock, no
6510 * waiters need to be woken up, as there must be other share
6511 * lockers. However, if we held a share-exclusive lock, another
6512 * backend now could acquire a share-exclusive lock.
6513 */
6514 check_waiters = true;
6515 wake_exclusive = false;
6516 }
6517 }
6518
6519 /*
6520 * As waking up waiters requires the spinlock to be acquired, only do so
6521 * if necessary.
6522 */
6523 if (check_waiters)
6525}
6526
6527/*
6528 * BufferLockHeldByMeInMode - test whether my process holds the content lock
6529 * in the specified mode
6530 *
6531 * This is meant as debug support only.
6532 */
6533static bool
6535{
6536 PrivateRefCountEntry *entry =
6538
6539 if (!entry)
6540 return false;
6541 else
6542 return entry->data.lockmode == mode;
6543}
6544
6545/*
6546 * BufferLockHeldByMe - test whether my process holds the content lock in any
6547 * mode
6548 *
6549 * This is meant as debug support only.
6550 */
6551static bool
6553{
6554 PrivateRefCountEntry *entry =
6556
6557 if (!entry)
6558 return false;
6559 else
6560 return entry->data.lockmode != BUFFER_LOCK_UNLOCK;
6561}
6562
6563/*
6564 * Release the content lock for the buffer.
6565 */
6566void
6568{
6570
6572 if (BufferIsLocal(buffer))
6573 return; /* local buffers need no lock */
6574
6577}
6578
6579/*
6580 * Acquire the content_lock for the buffer.
6581 */
6582void
6584{
6586
6587 /*
6588 * We can't wait if we haven't got a PGPROC. This should only occur
6589 * during bootstrap or shared memory initialization. Put an Assert here
6590 * to catch unsafe coding practices.
6591 */
6593
6594 /* handled in LockBuffer() wrapper */
6596
6598 if (BufferIsLocal(buffer))
6599 return; /* local buffers need no lock */
6600
6602
6603 /*
6604 * Test the most frequent lock modes first. While a switch (mode) would be
6605 * nice, at least gcc generates considerably worse code for it.
6606 *
6607 * Call BufferLockAcquire() with a constant argument for mode, to generate
6608 * more efficient code for the different lock modes.
6609 */
6610 if (mode == BUFFER_LOCK_SHARE)
6612 else if (mode == BUFFER_LOCK_EXCLUSIVE)
6616 else
6617 elog(ERROR, "unrecognized buffer lock mode: %d", mode);
6618}
6619
6620/*
6621 * Acquire the content_lock for the buffer, but only if we don't have to wait.
6622 *
6623 * This assumes the caller wants BUFFER_LOCK_EXCLUSIVE mode.
6624 */
6625bool
6627{
6628 BufferDesc *buf;
6629
6631 if (BufferIsLocal(buffer))
6632 return true; /* act as though we got it */
6633
6635
6637}
6638
6639/*
6640 * Verify that this backend is pinning the buffer exactly once.
6641 *
6642 * NOTE: Like in BufferIsPinned(), what we check here is that *this* backend
6643 * holds a pin on the buffer. We do not care whether some other backend does.
6644 */
6645void
6647{
6648 if (BufferIsLocal(buffer))
6649 {
6650 if (LocalRefCount[-buffer - 1] != 1)
6651 elog(ERROR, "incorrect local pin count: %d",
6652 LocalRefCount[-buffer - 1]);
6653 }
6654 else
6655 {
6656 if (GetPrivateRefCount(buffer) != 1)
6657 elog(ERROR, "incorrect local pin count: %d",
6659 }
6660}
6661
6662/*
6663 * LockBufferForCleanup - lock a buffer in preparation for deleting items
6664 *
6665 * Items may be deleted from a disk page only when the caller (a) holds an
6666 * exclusive lock on the buffer and (b) has observed that no other backend
6667 * holds a pin on the buffer. If there is a pin, then the other backend
6668 * might have a pointer into the buffer (for example, a heapscan reference
6669 * to an item --- see README for more details). It's OK if a pin is added
6670 * after the cleanup starts, however; the newly-arrived backend will be
6671 * unable to look at the page until we release the exclusive lock.
6672 *
6673 * To implement this protocol, a would-be deleter must pin the buffer and
6674 * then call LockBufferForCleanup(). LockBufferForCleanup() is similar to
6675 * LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE), except that it loops until
6676 * it has successfully observed pin count = 1.
6677 */
6678void
6680{
6682 TimestampTz waitStart = 0;
6683 bool waiting = false;
6684 bool logged_recovery_conflict = false;
6685
6688
6690
6691 /*
6692 * We do not yet need to be worried about in-progress AIOs holding a pin,
6693 * as we, so far, only support doing reads via AIO and this function can
6694 * only be called once the buffer is valid (i.e. no read can be in
6695 * flight).
6696 */
6697
6698 /* Nobody else to wait for */
6699 if (BufferIsLocal(buffer))
6700 return;
6701
6703
6704 for (;;)
6705 {
6707 uint64 unset_bits = 0;
6708
6709 /* Try to acquire lock */
6712
6715 {
6716 /* Successfully acquired exclusive lock with pincount 1 */
6718
6719 /*
6720 * Emit the log message if recovery conflict on buffer pin was
6721 * resolved but the startup process waited longer than
6722 * deadlock_timeout for it.
6723 */
6726 waitStart, GetCurrentTimestamp(),
6727 NULL, false);
6728
6729 if (waiting)
6730 {
6731 /* reset ps display to remove the suffix if we added one */
6733 waiting = false;
6734 }
6735 return;
6736 }
6737 /* Failed, so mark myself as waiting for pincount 1 */
6739 {
6742 elog(ERROR, "multiple backends attempting to wait for pincount 1");
6743 }
6744 bufHdr->wait_backend_pgprocno = MyProcNumber;
6748 0);
6750
6751 /* Wait to be signaled by UnpinBuffer() */
6752 if (InHotStandby)
6753 {
6754 if (!waiting)
6755 {
6756 /* adjust the process title to indicate that it's waiting */
6757 set_ps_display_suffix("waiting");
6758 waiting = true;
6759 }
6760
6761 /*
6762 * Emit the log message if the startup process is waiting longer
6763 * than deadlock_timeout for recovery conflict on buffer pin.
6764 *
6765 * Skip this if first time through because the startup process has
6766 * not started waiting yet in this case. So, the wait start
6767 * timestamp is set after this logic.
6768 */
6769 if (waitStart != 0 && !logged_recovery_conflict)
6770 {
6772
6773 if (TimestampDifferenceExceeds(waitStart, now,
6775 {
6777 waitStart, now, NULL, true);
6779 }
6780 }
6781
6782 /*
6783 * Set the wait start timestamp if logging is enabled and first
6784 * time through.
6785 */
6786 if (log_recovery_conflict_waits && waitStart == 0)
6787 waitStart = GetCurrentTimestamp();
6788
6789 /* Publish the bufid that Startup process waits on */
6791 /* Set alarm and then wait to be signaled by UnpinBuffer() */
6793 /* Reset the published bufid */
6795 }
6796 else
6798
6799 /*
6800 * Remove flag marking us as waiter. Normally this will not be set
6801 * anymore, but ProcWaitForSignal() can return for other signals as
6802 * well. We take care to only reset the flag if we're the waiter, as
6803 * theoretically another backend could have started waiting. That's
6804 * impossible with the current usages due to table level locking, but
6805 * better be safe.
6806 */
6808 if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
6809 bufHdr->wait_backend_pgprocno == MyProcNumber)
6811
6813 0, unset_bits,
6814 0);
6815
6817 /* Loop back and try again */
6818 }
6819}
6820
6821/*
6822 * Check called from ProcessRecoveryConflictInterrupts() when Startup process
6823 * requests cancellation of all pin holders that are blocking it.
6824 */
6825bool
6827{
6829
6830 /*
6831 * If we get woken slowly then it's possible that the Startup process was
6832 * already woken by other backends before we got here. Also possible that
6833 * we get here by multiple interrupts or interrupts at inappropriate
6834 * times, so make sure we do nothing if the bufid is not set.
6835 */
6836 if (bufid < 0)
6837 return false;
6838
6839 if (GetPrivateRefCount(bufid + 1) > 0)
6840 return true;
6841
6842 return false;
6843}
6844
6845/*
6846 * ConditionalLockBufferForCleanup - as above, but don't wait to get the lock
6847 *
6848 * We won't loop, but just check once to see if the pin count is OK. If
6849 * not, return false with no lock held.
6850 */
6851bool
6853{
6856 refcount;
6857
6859
6860 /* see AIO related comment in LockBufferForCleanup() */
6861
6862 if (BufferIsLocal(buffer))
6863 {
6864 refcount = LocalRefCount[-buffer - 1];
6865 /* There should be exactly one pin */
6866 Assert(refcount > 0);
6867 if (refcount != 1)
6868 return false;
6869 /* Nobody else to wait for */
6870 return true;
6871 }
6872
6873 /* There should be exactly one local pin */
6874 refcount = GetPrivateRefCount(buffer);
6875 Assert(refcount);
6876 if (refcount != 1)
6877 return false;
6878
6879 /* Try to acquire lock */
6881 return false;
6882
6886
6887 Assert(refcount > 0);
6888 if (refcount == 1)
6889 {
6890 /* Successfully acquired exclusive lock with pincount 1 */
6892 return true;
6893 }
6894
6895 /* Failed, so release the lock */
6898 return false;
6899}
6900
6901/*
6902 * IsBufferCleanupOK - as above, but we already have the lock
6903 *
6904 * Check whether it's OK to perform cleanup on a buffer we've already
6905 * locked. If we observe that the pin count is 1, our exclusive lock
6906 * happens to be a cleanup lock, and we can proceed with anything that
6907 * would have been allowable had we sought a cleanup lock originally.
6908 */
6909bool
6911{
6914
6916
6917 /* see AIO related comment in LockBufferForCleanup() */
6918
6919 if (BufferIsLocal(buffer))
6920 {
6921 /* There should be exactly one pin */
6922 if (LocalRefCount[-buffer - 1] != 1)
6923 return false;
6924 /* Nobody else to wait for */
6925 return true;
6926 }
6927
6928 /* There should be exactly one local pin */
6929 if (GetPrivateRefCount(buffer) != 1)
6930 return false;
6931
6933
6934 /* caller must hold exclusive lock on buffer */
6936
6938
6941 {
6942 /* pincount is OK. */
6944 return true;
6945 }
6946
6948 return false;
6949}
6950
6951/*
6952 * Helper for BufferBeginSetHintBits() and BufferSetHintBits16().
6953 *
6954 * This checks if the current lock mode already suffices to allow hint bits
6955 * being set and, if not, whether the current lock can be upgraded.
6956 *
6957 * Updates *lockstate when returning true.
6958 */
6959static inline bool
6961{
6965
6967
6968 if (ref == NULL)
6969 elog(ERROR, "buffer is not pinned");
6970
6971 mode = ref->data.lockmode;
6972 if (mode == BUFFER_LOCK_UNLOCK)
6973 elog(ERROR, "buffer is not locked");
6974
6975 /* we're done if we are already holding a sufficient lock level */
6977 {
6979 return true;
6980 }
6981
6982 /*
6983 * We are only holding a share lock right now, try to upgrade it to
6984 * SHARE_EXCLUSIVE.
6985 */
6987
6989 while (true)
6990 {
6992
6994
6995 /*
6996 * Can't upgrade if somebody else holds the lock in exclusive or
6997 * share-exclusive mode.
6998 */
7000 {
7001 return false;
7002 }
7003
7004 /* currently held lock state */
7006
7007 /* new lock level */
7009
7012 {
7013 ref->data.lockmode = BUFFER_LOCK_SHARE_EXCLUSIVE;
7015
7016 return true;
7017 }
7018 }
7019}
7020
7021/*
7022 * Try to acquire the right to set hint bits on the buffer.
7023 *
7024 * To be allowed to set hint bits, this backend needs to hold either a
7025 * share-exclusive or an exclusive lock. In case this backend only holds a
7026 * share lock, this function will try to upgrade the lock to
7027 * share-exclusive. The caller is only allowed to set hint bits if true is
7028 * returned.
7029 *
7030 * Once BufferBeginSetHintBits() has returned true, hint bits may be set
7031 * without further calls to BufferBeginSetHintBits(), until the buffer is
7032 * unlocked.
7033 *
7034 *
7035 * Requiring a share-exclusive lock to set hint bits prevents setting hint
7036 * bits on buffers that are currently being written out, which could corrupt
7037 * the checksum on the page. Flushing buffers also requires a share-exclusive
7038 * lock.
7039 *
7040 * Due to a lock >= share-exclusive being required to set hint bits, only one
7041 * backend can set hint bits at a time. Allowing multiple backends to set hint
7042 * bits would require more complicated locking: For setting hint bits we'd
7043 * need to store the count of backends currently setting hint bits, for I/O we
7044 * would need another lock-level conflicting with the hint-setting
7045 * lock-level. Given that the share-exclusive lock for setting hint bits is
7046 * only held for a short time, that backends often would just set the same
7047 * hint bits and that the cost of occasionally not setting hint bits in hotly
7048 * accessed pages is fairly low, this seems like an acceptable tradeoff.
7049 */
7050bool
7052{
7055
7056 if (BufferIsLocal(buffer))
7057 {
7058 /*
7059 * NB: Will need to check if there is a write in progress, once it is
7060 * possible for writes to be done asynchronously.
7061 */
7062 return true;
7063 }
7064
7066
7068}
7069
7070/*
7071 * End a phase of setting hint bits on this buffer, started with
7072 * BufferBeginSetHintBits().
7073 *
7074 * This would strictly speaking not be required (i.e. the caller could do
7075 * MarkBufferDirtyHint() if so desired), but allows us to perform some sanity
7076 * checks.
7077 */
7078void
7088
7089/*
7090 * Try to set hint bits on a single 16bit value in a buffer.
7091 *
7092 * If hint bits are allowed to be set, set *ptr = val, try to mark the buffer
7093 * dirty and return true. Otherwise false is returned.
7094 *
7095 * *ptr needs to be a pointer to memory within the buffer.
7096 *
7097 * This is a bit faster than BufferBeginSetHintBits() /
7098 * BufferFinishSetHintBits() when setting hints once in a buffer, but slower
7099 * than the former when setting hint bits multiple times in the same buffer.
7100 */
7101bool
7103{
7106#ifdef USE_ASSERT_CHECKING
7107 char *page;
7108
7109 /* verify that the address is on the page */
7110 page = BufferGetPage(buffer);
7111 Assert((char *) ptr >= page && (char *) ptr < (page + BLCKSZ));
7112#endif
7113
7114 if (BufferIsLocal(buffer))
7115 {
7116 *ptr = val;
7117
7119
7120 return true;
7121 }
7122
7124
7126 {
7127 *ptr = val;
7128
7130
7131 return true;
7132 }
7133
7134 return false;
7135}
7136
7137
7138/*
7139 * Functions for buffer I/O handling
7140 *
7141 * Also note that these are used only for shared buffers, not local ones.
7142 */
7143
7144/*
7145 * WaitIO -- Block until the IO_IN_PROGRESS flag on 'buf' is cleared.
7146 */
7147static void
7149{
7151
7152 /*
7153 * Should never end up here with unsubmitted IO, as no AIO unaware code
7154 * may be used while in batch mode and AIO aware code needs to have
7155 * submitted all staged IO to avoid deadlocks & slowness.
7156 */
7158
7160 for (;;)
7161 {
7164
7165 /*
7166 * It may not be necessary to acquire the spinlock to check the flag
7167 * here, but since this test is essential for correctness, we'd better
7168 * play it safe.
7169 */
7171
7172 /*
7173 * Copy the wait reference while holding the spinlock. This prevents
7174 * a concurrent TerminateBufferIO() in another backend from
7175 * clearing the wref while it's being read.
7176 */
7177 iow = buf->io_wref;
7179
7180 /* no IO in progress, we don't need to wait */
7182 break;
7183
7184 /*
7185 * The buffer has asynchronous IO in progress, wait for it to
7186 * complete.
7187 */
7188 if (pgaio_wref_valid(&iow))
7189 {
7191
7192 /*
7193 * The AIO subsystem internally uses condition variables and thus
7194 * might remove this backend from the BufferDesc's CV. While that
7195 * wouldn't cause a correctness issue (the first CV sleep just
7196 * immediately returns if not already registered), it seems worth
7197 * avoiding unnecessary loop iterations, given that we take care
7198 * to do so at the start of the function.
7199 */
7201 continue;
7202 }
7203
7204 /* wait on BufferDesc->cv, e.g. for concurrent synchronous IO */
7206 }
7208}
7209
7210/*
7211 * StartSharedBufferIO: begin I/O on this buffer
7212 * (Assumptions)
7213 * The buffer is Pinned
7214 *
7215 * In several scenarios the buffer may already be undergoing I/O in this or
7216 * another backend. How to best handle that depends on the caller's
7217 * situation. It might be appropriate to wait synchronously (e.g., because the
7218 * buffer is about to be invalidated); wait asynchronously, using the buffer's
7219 * IO wait reference (e.g., because the caller is doing readahead and doesn't
7220 * need the buffer to be ready immediately); or to not wait at all (e.g.,
7221 * because the caller is trying to combine IO for this buffer with another
7222 * buffer).
7223 *
7224 * How and whether to wait is controlled by the wait and io_wref
7225 * parameters. In detail:
7226 *
7227 * - If the caller passes a non-NULL io_wref and the buffer has an I/O wait
7228 * reference, *io_wref is set to the buffer's io_wref and
7229 * BUFFER_IO_IN_PROGRESS is returned. This is done regardless of the wait
7230 * parameter.
7231 *
7232 * - If the caller passes a NULL io_wref (i.e. the caller does not want to
7233 * asynchronously wait for the completion of the IO), wait = false and the
7234 * buffer is undergoing IO, BUFFER_IO_IN_PROGRESS is returned.
7235 *
7236 * - If wait = true and either the buffer does not have a wait reference,
7237 * or the caller passes io_wref = NULL, WaitIO() is used to wait for the IO
7238 * to complete. To avoid the potential of deadlocks and unnecessary delays,
7239 * all staged I/O is submitted before waiting.
7240 *
7241 * Input operations are only attempted on buffers that are not BM_VALID, and
7242 * output operations only on buffers that are BM_VALID and BM_DIRTY, so we can
7243 * always tell if the work is already done. If no I/O is necessary,
7244 * BUFFER_IO_ALREADY_DONE is returned.
7245 *
7246 * If we successfully marked the buffer as BM_IO_IN_PROGRESS,
7247 * BUFFER_IO_READY_FOR_IO is returned.
7248 */
7251{
7253
7255
7256 for (;;)
7257 {
7259
7261 break;
7262
7263 /* Join the existing IO */
7264 if (io_wref != NULL && pgaio_wref_valid(&buf->io_wref))
7265 {
7266 *io_wref = buf->io_wref;
7268
7269 return BUFFER_IO_IN_PROGRESS;
7270 }
7271 else if (!wait)
7272 {
7274 return BUFFER_IO_IN_PROGRESS;
7275 }
7276 else
7277 {
7278 /*
7279 * With wait = true, we always have to wait if the caller has
7280 * passed io_wref = NULL.
7281 *
7282 * Even with io_wref != NULL, we have to wait if the buffer's wait
7283 * ref is not valid but the IO is in progress, i.e. someone else
7284 * started IO but hasn't set the wait ref yet. We have no choice
7285 * but to wait until the IO completes.
7286 */
7288
7289 /*
7290 * If this backend currently has staged IO, submit it before
7291 * waiting for in-progress IO, to avoid potential deadlocks and
7292 * unnecessary delays.
7293 */
7295
7296 WaitIO(buf);
7297 }
7298 }
7299
7300 /* Once we get here, there is definitely no I/O active on this buffer */
7301
7302 /* Check if someone else already did the I/O */
7303 if (forInput ? (buf_state & BM_VALID) : !(buf_state & BM_DIRTY))
7304 {
7307 }
7308
7309 /*
7310 * No IO in progress and not already done; we will start IO. It's possible
7311 * that the IO was in progress but we're not done, because the IO errored
7312 * out. We'll do the IO ourselves.
7313 */
7316 0);
7317
7320
7322}
7323
7324/*
7325 * Wrapper around StartSharedBufferIO / StartLocalBufferIO. Only to be used
7326 * when the caller doesn't otherwise need to care about local vs shared. See
7327 * StartSharedBufferIO() for details.
7328 */
7331{
7333
7334 if (BufferIsLocal(buffer))
7335 {
7337
7338 return StartLocalBufferIO(buf_hdr, forInput, wait, io_wref);
7339 }
7340 else
7341 {
7343
7344 return StartSharedBufferIO(buf_hdr, forInput, wait, io_wref);
7345 }
7346}
7347
7348/*
7349 * TerminateBufferIO: release a buffer we were doing I/O on
7350 * (Assumptions)
7351 * My process is executing IO for the buffer
7352 * BM_IO_IN_PROGRESS bit is set for the buffer
7353 * The buffer is Pinned
7354 *
7355 * If clear_dirty is true, we clear the buffer's BM_DIRTY flag. This is
7356 * appropriate when terminating a successful write.
7357 *
7358 * set_flag_bits gets ORed into the buffer's flags. It must include
7359 * BM_IO_ERROR in a failure case. For successful completion it could
7360 * be 0, or BM_VALID if we just finished reading in the page.
7361 *
7362 * If forget_owner is true, we release the buffer I/O from the current
7363 * resource owner. (forget_owner=false is used when the resource owner itself
7364 * is being released)
7365 */
7366void
7368 bool forget_owner, bool release_aio)
7369{
7372 int refcount_change = 0;
7373
7375
7378
7379 /* Clear earlier errors, if this IO failed, it'll be marked again */
7381
7382 if (clear_dirty)
7384
7385 if (release_aio)
7386 {
7387 /* release ownership by the AIO subsystem */
7389 refcount_change = -1;
7390 pgaio_wref_clear(&buf->io_wref);
7391 }
7392
7396
7397 if (forget_owner)
7400
7402
7403 /*
7404 * Support LockBufferForCleanup()
7405 *
7406 * We may have just released the last pin other than the waiter's. In most
7407 * cases, this backend holds another pin on the buffer. But, if, for
7408 * example, this backend is completing an IO issued by another backend, it
7409 * may be time to wake the waiter.
7410 */
7413}
7414
7415/*
7416 * AbortBufferIO: Clean up active buffer I/O after an error.
7417 *
7418 * All LWLocks & content locks we might have held have been released, but we
7419 * haven't yet released buffer pins, so the buffer is still pinned.
7420 *
7421 * If I/O was in progress, we always set BM_IO_ERROR, even though it's
7422 * possible the error condition wasn't related to the I/O.
7423 *
7424 * Note: this does not remove the buffer I/O from the resource owner.
7425 * That's correct when we're releasing the whole resource owner, but
7426 * beware if you use this in other contexts.
7427 */
7428static void
7430{
7433
7436
7437 if (!(buf_state & BM_VALID))
7438 {
7441 }
7442 else
7443 {
7446
7447 /* Issue notice if this is not the first failure... */
7448 if (buf_state & BM_IO_ERROR)
7449 {
7450 /* Buffer is pinned, so we can read tag without spinlock */
7453 errmsg("could not write block %u of %s",
7454 buf_hdr->tag.blockNum,
7456 BufTagGetForkNum(&buf_hdr->tag)).str),
7457 errdetail("Multiple failures --- write error might be permanent.")));
7458 }
7459 }
7460
7461 TerminateBufferIO(buf_hdr, false, BM_IO_ERROR, false, false);
7462}
7463
7464/*
7465 * Error context callback for errors occurring during shared buffer writes.
7466 */
7467static void
7469{
7471
7472 /* Buffer is pinned, so we can read the tag without locking the spinlock */
7473 if (bufHdr != NULL)
7474 errcontext("writing block %u of relation \"%s\"",
7475 bufHdr->tag.blockNum,
7477 BufTagGetForkNum(&bufHdr->tag)).str);
7478}
7479
7480/*
7481 * Error context callback for errors occurring during local buffer writes.
7482 */
7483static void
7485{
7487
7488 if (bufHdr != NULL)
7489 errcontext("writing block %u of relation \"%s\"",
7490 bufHdr->tag.blockNum,
7493 BufTagGetForkNum(&bufHdr->tag)).str);
7494}
7495
7496/*
7497 * RelFileLocator qsort/bsearch comparator; see RelFileLocatorEquals.
7498 */
7499static int
7500rlocator_comparator(const void *p1, const void *p2)
7501{
7502 RelFileLocator n1 = *(const RelFileLocator *) p1;
7503 RelFileLocator n2 = *(const RelFileLocator *) p2;
7504
7505 if (n1.relNumber < n2.relNumber)
7506 return -1;
7507 else if (n1.relNumber > n2.relNumber)
7508 return 1;
7509
7510 if (n1.dbOid < n2.dbOid)
7511 return -1;
7512 else if (n1.dbOid > n2.dbOid)
7513 return 1;
7514
7515 if (n1.spcOid < n2.spcOid)
7516 return -1;
7517 else if (n1.spcOid > n2.spcOid)
7518 return 1;
7519 else
7520 return 0;
7521}
7522
7523/*
7524 * Lock buffer header - set BM_LOCKED in buffer state.
7525 */
7526uint64
7528{
7530
7532
7533 while (true)
7534 {
7535 /*
7536 * Always try once to acquire the lock directly, without setting up
7537 * the spin-delay infrastructure. The work necessary for that shows up
7538 * in profiles and is rarely necessary.
7539 */
7541 if (likely(!(old_buf_state & BM_LOCKED)))
7542 break; /* got lock */
7543
7544 /* and then spin without atomic operations until lock is released */
7545 {
7547
7549
7550 while (old_buf_state & BM_LOCKED)
7551 {
7554 }
7556 }
7557
7558 /*
7559 * Retry. The lock might obviously already be re-acquired by the time
7560 * we're attempting to get it again.
7561 */
7562 }
7563
7564 return old_buf_state | BM_LOCKED;
7565}
7566
7567/*
7568 * Wait until the BM_LOCKED flag isn't set anymore and return the buffer's
7569 * state at that point.
7570 *
7571 * Obviously the buffer could be locked by the time the value is returned, so
7572 * this is primarily useful in CAS style loops.
7573 */
7576{
7579
7581
7582 buf_state = pg_atomic_read_u64(&buf->state);
7583
7584 while (buf_state & BM_LOCKED)
7585 {
7587 buf_state = pg_atomic_read_u64(&buf->state);
7588 }
7589
7591
7592 return buf_state;
7593}
7594
7595/*
7596 * BufferTag comparator.
7597 */
7598static inline int
7600{
7601 int ret;
7604
7607
7609
7610 if (ret != 0)
7611 return ret;
7612
7614 return -1;
7616 return 1;
7617
7618 if (ba->blockNum < bb->blockNum)
7619 return -1;
7620 if (ba->blockNum > bb->blockNum)
7621 return 1;
7622
7623 return 0;
7624}
7625
7626/*
7627 * Comparator determining the writeout order in a checkpoint.
7628 *
7629 * It is important that tablespaces are compared first, the logic balancing
7630 * writes between tablespaces relies on it.
7631 */
7632static inline int
7634{
7635 /* compare tablespace */
7636 if (a->tsId < b->tsId)
7637 return -1;
7638 else if (a->tsId > b->tsId)
7639 return 1;
7640 /* compare relation */
7641 if (a->relNumber < b->relNumber)
7642 return -1;
7643 else if (a->relNumber > b->relNumber)
7644 return 1;
7645 /* compare fork */
7646 else if (a->forkNum < b->forkNum)
7647 return -1;
7648 else if (a->forkNum > b->forkNum)
7649 return 1;
7650 /* compare block number */
7651 else if (a->blockNum < b->blockNum)
7652 return -1;
7653 else if (a->blockNum > b->blockNum)
7654 return 1;
7655 /* equal page IDs are unlikely, but not impossible */
7656 return 0;
7657}
7658
7659/*
7660 * Comparator for a Min-Heap over the per-tablespace checkpoint completion
7661 * progress.
7662 */
7663static int
7665{
7668
7669 /* we want a min-heap, so return 1 when a < b */
7670 if (sa->progress < sb->progress)
7671 return 1;
7672 else if (sa->progress == sb->progress)
7673 return 0;
7674 else
7675 return -1;
7676}
7677
7678/*
7679 * Initialize a writeback context, discarding potential previous state.
7680 *
7681 * *max_pending is a pointer instead of an immediate value, so the coalesce
7682 * limits can easily changed by the GUC mechanism, and so calling code does
7683 * not have to check the current configuration. A value of 0 means that no
7684 * writeback control will be performed.
7685 */
7686void
7687WritebackContextInit(WritebackContext *context, int *max_pending)
7688{
7689 Assert(*max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
7690
7691 context->max_pending = max_pending;
7692 context->nr_pending = 0;
7693}
7694
7695/*
7696 * Add buffer to list of pending writeback requests.
7697 */
7698void
7700 BufferTag *tag)
7701{
7702 PendingWriteback *pending;
7703
7704 /*
7705 * As pg_flush_data() doesn't do anything with fsync disabled, there's no
7706 * point in tracking in that case.
7707 */
7709 !enableFsync)
7710 return;
7711
7712 /*
7713 * Add buffer to the pending writeback array, unless writeback control is
7714 * disabled.
7715 */
7716 if (*wb_context->max_pending > 0)
7717 {
7719
7720 pending = &wb_context->pending_writebacks[wb_context->nr_pending++];
7721
7722 pending->tag = *tag;
7723 }
7724
7725 /*
7726 * Perform pending flushes if the writeback limit is exceeded. This
7727 * includes the case where previously an item has been added, but control
7728 * is now disabled.
7729 */
7730 if (wb_context->nr_pending >= *wb_context->max_pending)
7732}
7733
7734#define ST_SORT sort_pending_writebacks
7735#define ST_ELEMENT_TYPE PendingWriteback
7736#define ST_COMPARE(a, b) buffertag_comparator(&a->tag, &b->tag)
7737#define ST_SCOPE static
7738#define ST_DEFINE
7739#include "lib/sort_template.h"
7740
7741/*
7742 * Issue all pending writeback requests, previously scheduled with
7743 * ScheduleBufferTagForWriteback, to the OS.
7744 *
7745 * Because this is only used to improve the OS's IO scheduling we try to never
7746 * error out - it's just a hint.
7747 */
7748void
7750{
7752 int i;
7753
7754 if (wb_context->nr_pending == 0)
7755 return;
7756
7757 /*
7758 * Executing the writes in-order can make them a lot faster, and allows
7759 * merging writeback requests to consecutive blocks into larger writebacks.
7760 */
7761 sort_pending_writebacks(wb_context->pending_writebacks,
7762 wb_context->nr_pending);
7763
7765
7766 /*
7767 * Coalesce neighbouring writes, but nothing else. For that we iterate
7768 * through the, now sorted, array of pending flushes, and look forward to
7769 * find all neighbouring (or identical) writes.
7770 */
7771 for (i = 0; i < wb_context->nr_pending; i++)
7772 {
7776 int ahead;
7777 BufferTag tag;
7779 Size nblocks = 1;
7780
7781 cur = &wb_context->pending_writebacks[i];
7782 tag = cur->tag;
7784
7785 /*
7786 * Peek ahead, into following writeback requests, to see if they can
7787 * be combined with the current one.
7788 */
7789 for (ahead = 0; i + ahead + 1 < wb_context->nr_pending; ahead++)
7790 {
7791
7792 next = &wb_context->pending_writebacks[i + ahead + 1];
7793
7794 /* different file, stop */
7796 BufTagGetRelFileLocator(&next->tag)) ||
7797 BufTagGetForkNum(&cur->tag) != BufTagGetForkNum(&next->tag))
7798 break;
7799
7800 /* ok, block queued twice, skip */
7801 if (cur->tag.blockNum == next->tag.blockNum)
7802 continue;
7803
7804 /* only merge consecutive writes */
7805 if (cur->tag.blockNum + 1 != next->tag.blockNum)
7806 break;
7807
7808 nblocks++;
7809 cur = next;
7810 }
7811
7812 i += ahead;
7813
7814 /* and finally tell the kernel to write the data to storage */
7816 smgrwriteback(reln, BufTagGetForkNum(&tag), tag.blockNum, nblocks);
7817 }
7818
7819 /*
7820 * Assume that writeback requests are only issued for buffers containing
7821 * blocks of permanent relations.
7822 */
7824 IOOP_WRITEBACK, io_start, wb_context->nr_pending, 0);
7825
7826 wb_context->nr_pending = 0;
7827}
7828
7829/* ResourceOwner callbacks */
7830
7831static void
7838
7839static char *
7841{
7843
7844 return psprintf("lost track of buffer IO on buffer %d", buffer);
7845}
7846
7847/*
7848 * Release buffer as part of resource owner cleanup. This will only be called
7849 * if the buffer is pinned. If this backend held the content lock at the time
7850 * of the error we also need to release that (note that it is not possible to
7851 * hold a content lock without a pin).
7852 */
7853static void
7855{
7857
7858 /* Like ReleaseBuffer, but don't call ResourceOwnerForgetBuffer */
7859 if (!BufferIsValid(buffer))
7860 elog(ERROR, "bad buffer ID: %d", buffer);
7861
7862 if (BufferIsLocal(buffer))
7864 else
7865 {
7867
7869
7870 /* not having a private refcount would imply resowner corruption */
7871 Assert(ref != NULL);
7872
7873 /*
7874 * If the buffer was locked at the time of the resowner release,
7875 * release the lock now. This should only happen after errors.
7876 */
7877 if (ref->data.lockmode != BUFFER_LOCK_UNLOCK)
7878 {
7880
7881 HOLD_INTERRUPTS(); /* match the upcoming RESUME_INTERRUPTS */
7883 }
7884
7886 }
7887}
7888
7889static char *
7894
7895/*
7896 * Helper function to evict unpinned buffer whose buffer header lock is
7897 * already acquired.
7898 */
7899static bool
7901{
7903 bool result;
7904
7905 *buffer_flushed = false;
7906
7909
7910 if ((buf_state & BM_VALID) == 0)
7911 {
7912 UnlockBufHdr(desc);
7913 return false;
7914 }
7915
7916 /* Check that it's not pinned already. */
7918 {
7919 UnlockBufHdr(desc);
7920 return false;
7921 }
7922
7923 PinBuffer_Locked(desc); /* releases spinlock */
7924
7925 /* If it was dirty, try to clean it once. */
7926 if (buf_state & BM_DIRTY)
7927 {
7929 *buffer_flushed = true;
7930 }
7931
7932 /* This will return false if it becomes dirty or someone else pins it. */
7934
7935 UnpinBuffer(desc);
7936
7937 return result;
7938}
7939
7940/*
7941 * Try to evict the current block in a shared buffer.
7942 *
7943 * This function is intended for testing/development use only!
7944 *
7945 * To succeed, the buffer must not be pinned on entry, so if the caller had a
7946 * particular block in mind, it might already have been replaced by some other
7947 * block by the time this function runs. It's also unpinned on return, so the
7948 * buffer might be occupied again by the time control is returned, potentially
7949 * even by the same block. This inherent raciness without other interlocking
7950 * makes the function unsuitable for non-testing usage.
7951 *
7952 * *buffer_flushed is set to true if the buffer was dirty and has been
7953 * flushed, false otherwise. However, *buffer_flushed=true does not
7954 * necessarily mean that we flushed the buffer, it could have been flushed by
7955 * someone else.
7956 *
7957 * Returns true if the buffer was valid and it has now been made invalid.
7958 * Returns false if it wasn't valid, if it couldn't be evicted due to a pin,
7959 * or if the buffer becomes dirty again while we're trying to write it out.
7960 */
7961bool
7963{
7964 BufferDesc *desc;
7965
7967
7968 /* Make sure we can pin the buffer. */
7971
7972 desc = GetBufferDescriptor(buf - 1);
7973 LockBufHdr(desc);
7974
7976}
7977
7978/*
7979 * Try to evict all the shared buffers.
7980 *
7981 * This function is intended for testing/development use only! See
7982 * EvictUnpinnedBuffer().
7983 *
7984 * The buffers_* parameters are mandatory and indicate the total count of
7985 * buffers that:
7986 * - buffers_evicted - were evicted
7987 * - buffers_flushed - were flushed
7988 * - buffers_skipped - could not be evicted
7989 */
7990void
7993{
7994 *buffers_evicted = 0;
7995 *buffers_skipped = 0;
7996 *buffers_flushed = 0;
7997
7998 for (int buf = 1; buf <= NBuffers; buf++)
7999 {
8000 BufferDesc *desc = GetBufferDescriptor(buf - 1);
8002 bool buffer_flushed;
8003
8005
8007 if (!(buf_state & BM_VALID))
8008 continue;
8009
8012
8013 LockBufHdr(desc);
8014
8016 (*buffers_evicted)++;
8017 else
8018 (*buffers_skipped)++;
8019
8020 if (buffer_flushed)
8021 (*buffers_flushed)++;
8022 }
8023}
8024
8025/*
8026 * Try to evict all the shared buffers containing provided relation's pages.
8027 *
8028 * This function is intended for testing/development use only! See
8029 * EvictUnpinnedBuffer().
8030 *
8031 * The caller must hold at least AccessShareLock on the relation to prevent
8032 * the relation from being dropped.
8033 *
8034 * The buffers_* parameters are mandatory and indicate the total count of
8035 * buffers that:
8036 * - buffers_evicted - were evicted
8037 * - buffers_flushed - were flushed
8038 * - buffers_skipped - could not be evicted
8039 */
8040void
8043{
8045
8046 *buffers_skipped = 0;
8047 *buffers_evicted = 0;
8048 *buffers_flushed = 0;
8049
8050 for (int buf = 1; buf <= NBuffers; buf++)
8051 {
8052 BufferDesc *desc = GetBufferDescriptor(buf - 1);
8054 bool buffer_flushed;
8055
8057
8058 /* An unlocked precheck should be safe and saves some cycles. */
8059 if ((buf_state & BM_VALID) == 0 ||
8061 continue;
8062
8063 /* Make sure we can pin the buffer. */
8066
8067 buf_state = LockBufHdr(desc);
8068
8069 /* recheck, could have changed without the lock */
8070 if ((buf_state & BM_VALID) == 0 ||
8072 {
8073 UnlockBufHdr(desc);
8074 continue;
8075 }
8076
8078 (*buffers_evicted)++;
8079 else
8080 (*buffers_skipped)++;
8081
8082 if (buffer_flushed)
8083 (*buffers_flushed)++;
8084 }
8085}
8086
8087/*
8088 * Helper function to mark unpinned buffer dirty whose buffer header lock is
8089 * already acquired.
8090 */
8091static bool
8094{
8096 bool result = false;
8097
8098 *buffer_already_dirty = false;
8099
8102
8103 if ((buf_state & BM_VALID) == 0)
8104 {
8105 UnlockBufHdr(desc);
8106 return false;
8107 }
8108
8109 /* Check that it's not pinned already. */
8111 {
8112 UnlockBufHdr(desc);
8113 return false;
8114 }
8115
8116 /* Pin the buffer and then release the buffer spinlock */
8117 PinBuffer_Locked(desc);
8118
8119 /* If it was not already dirty, mark it as dirty. */
8120 if (!(buf_state & BM_DIRTY))
8121 {
8124 result = true;
8125 BufferLockUnlock(buf, desc);
8126 }
8127 else
8128 *buffer_already_dirty = true;
8129
8130 UnpinBuffer(desc);
8131
8132 return result;
8133}
8134
8135/*
8136 * Try to mark the provided shared buffer as dirty.
8137 *
8138 * This function is intended for testing/development use only!
8139 *
8140 * Same as EvictUnpinnedBuffer() but with MarkBufferDirty() call inside.
8141 *
8142 * The buffer_already_dirty parameter is mandatory and indicates if the buffer
8143 * could not be dirtied because it is already dirty.
8144 *
8145 * Returns true if the buffer has successfully been marked as dirty.
8146 */
8147bool
8149{
8150 BufferDesc *desc;
8151 bool buffer_dirtied = false;
8152
8154
8155 /* Make sure we can pin the buffer. */
8158
8159 desc = GetBufferDescriptor(buf - 1);
8160 LockBufHdr(desc);
8161
8163 /* Both can not be true at the same time */
8165
8166 return buffer_dirtied;
8167}
8168
8169/*
8170 * Try to mark all the shared buffers containing provided relation's pages as
8171 * dirty.
8172 *
8173 * This function is intended for testing/development use only! See
8174 * MarkDirtyUnpinnedBuffer().
8175 *
8176 * The buffers_* parameters are mandatory and indicate the total count of
8177 * buffers that:
8178 * - buffers_dirtied - were dirtied
8179 * - buffers_already_dirty - were already dirty
8180 * - buffers_skipped - could not be dirtied because of a reason different
8181 * than a buffer being already dirty.
8182 */
8183void
8188{
8190
8191 *buffers_dirtied = 0;
8193 *buffers_skipped = 0;
8194
8195 for (int buf = 1; buf <= NBuffers; buf++)
8196 {
8197 BufferDesc *desc = GetBufferDescriptor(buf - 1);
8200
8202
8203 /* An unlocked precheck should be safe and saves some cycles. */
8204 if ((buf_state & BM_VALID) == 0 ||
8206 continue;
8207
8208 /* Make sure we can pin the buffer. */
8211
8212 buf_state = LockBufHdr(desc);
8213
8214 /* recheck, could have changed without the lock */
8215 if ((buf_state & BM_VALID) == 0 ||
8217 {
8218 UnlockBufHdr(desc);
8219 continue;
8220 }
8221
8223 (*buffers_dirtied)++;
8224 else if (buffer_already_dirty)
8225 (*buffers_already_dirty)++;
8226 else
8227 (*buffers_skipped)++;
8228 }
8229}
8230
8231/*
8232 * Try to mark all the shared buffers as dirty.
8233 *
8234 * This function is intended for testing/development use only! See
8235 * MarkDirtyUnpinnedBuffer().
8236 *
8237 * See MarkDirtyRelUnpinnedBuffers() above for details about the buffers_*
8238 * parameters.
8239 */
8240void
8244{
8245 *buffers_dirtied = 0;
8247 *buffers_skipped = 0;
8248
8249 for (int buf = 1; buf <= NBuffers; buf++)
8250 {
8251 BufferDesc *desc = GetBufferDescriptor(buf - 1);
8254
8256
8258 if (!(buf_state & BM_VALID))
8259 continue;
8260
8263
8264 LockBufHdr(desc);
8265
8267 (*buffers_dirtied)++;
8268 else if (buffer_already_dirty)
8269 (*buffers_already_dirty)++;
8270 else
8271 (*buffers_skipped)++;
8272 }
8273}
8274
8275/*
8276 * Generic implementation of the AIO handle staging callback for readv/writev
8277 * on local/shared buffers.
8278 *
8279 * Each readv/writev can target multiple buffers. The buffers have already
8280 * been registered with the IO handle.
8281 *
8282 * To make the IO ready for execution ("staging"), we need to ensure that the
8283 * targeted buffers are in an appropriate state while the IO is ongoing. For
8284 * that the AIO subsystem needs to have its own buffer pin, otherwise an error
8285 * in this backend could lead to this backend's buffer pin being released as
8286 * part of error handling, which in turn could lead to the buffer being
8287 * replaced while IO is ongoing.
8288 */
8291{
8292 uint64 *io_data;
8293 uint8 handle_data_len;
8296
8297 io_data = pgaio_io_get_handle_data(ioh, &handle_data_len);
8298
8300
8301 /* iterate over all buffers affected by the vectored readv/writev */
8302 for (int i = 0; i < handle_data_len; i++)
8303 {
8305 BufferDesc *buf_hdr = is_temp ?
8309
8310 /*
8311 * Check that all the buffers are actually ones that could conceivably
8312 * be done in one IO, i.e. are sequential. This is the last
8313 * buffer-aware code before IO is actually executed and confusion
8314 * about which buffers are targeted by IO can be hard to debug, making
8315 * it worth doing extra-paranoid checks.
8316 */
8317 if (i == 0)
8318 first = buf_hdr->tag;
8319 else
8320 {
8321 Assert(buf_hdr->tag.relNumber == first.relNumber);
8322 Assert(buf_hdr->tag.blockNum == first.blockNum + i);
8323 }
8324
8325 if (is_temp)
8327 else
8329
8330 /* verify the buffer is in the expected state */
8332 if (is_write)
8333 {
8336 }
8337 else
8338 {
8341 }
8342
8343 /* temp buffers don't use BM_IO_IN_PROGRESS */
8344 if (!is_temp)
8346
8348
8349 /*
8350 * Reflect that the buffer is now owned by the AIO subsystem.
8351 *
8352 * For local buffers: This can't be done just via LocalRefCount, as
8353 * one might initially think, as this backend could error out while
8354 * AIO is still in progress, releasing all the pins by the backend
8355 * itself.
8356 *
8357 * This pin is released again in TerminateBufferIO().
8358 */
8359 buf_hdr->io_wref = io_ref;
8360
8361 if (is_temp)
8362 {
8365 }
8366 else
8368
8369 /*
8370 * Ensure the content lock that prevents buffer modifications while
8371 * the buffer is being written out is not released early due to an
8372 * error.
8373 */
8374 if (is_write && !is_temp)
8375 {
8377
8378 /*
8379 * Lock is now owned by AIO subsystem.
8380 */
8382 }
8383
8384 /*
8385 * Stop tracking this buffer via the resowner - the AIO system now
8386 * keeps track.
8387 */
8388 if (!is_temp)
8390 }
8391}
8392
8393/*
8394 * Decode readv errors as encoded by buffer_readv_encode_error().
8395 */
8396static inline void
8398 bool *zeroed_any,
8399 bool *ignored_any,
8403{
8404 uint32 rem_error = result.error_data;
8405
8406 /* see static asserts in buffer_readv_encode_error */
8407#define READV_COUNT_BITS 7
8408#define READV_COUNT_MASK ((1 << READV_COUNT_BITS) - 1)
8409
8410 *zeroed_any = rem_error & 1;
8411 rem_error >>= 1;
8412
8413 *ignored_any = rem_error & 1;
8414 rem_error >>= 1;
8415
8418
8421
8424}
8425
8426/*
8427 * Helper to encode errors for buffer_readv_complete()
8428 *
8429 * Errors are encoded as follows:
8430 * - bit 0 indicates whether any page was zeroed (1) or not (0)
8431 * - bit 1 indicates whether any checksum failure was ignored (1) or not (0)
8432 * - next READV_COUNT_BITS bits indicate the number of errored or zeroed pages
8433 * - next READV_COUNT_BITS bits indicate the number of checksum failures
8434 * - next READV_COUNT_BITS bits indicate the first offset of the first page
8435 * that was errored or zeroed or, if no errors/zeroes, the first ignored
8436 * checksum
8437 */
8438static inline void
8440 bool is_temp,
8441 bool zeroed_any,
8442 bool ignored_any,
8449{
8450
8451 uint8 shift = 0;
8455
8457 "PG_IOV_MAX is bigger than reserved space for error data");
8459 "PGAIO_RESULT_ERROR_BITS is insufficient for buffer_readv");
8460
8461 /*
8462 * We only have space to encode one offset - but luckily that's good
8463 * enough. If there is an error, the error is the interesting offset, same
8464 * with a zeroed buffer vs an ignored buffer.
8465 */
8466 if (error_count > 0)
8468 else if (zeroed_count > 0)
8470 else
8472
8473 Assert(!zeroed_any || error_count == 0);
8474
8475 result->error_data = 0;
8476
8477 result->error_data |= zeroed_any << shift;
8478 shift += 1;
8479
8480 result->error_data |= ignored_any << shift;
8481 shift += 1;
8482
8483 result->error_data |= ((uint32) zeroed_or_error_count) << shift;
8484 shift += READV_COUNT_BITS;
8485
8486 result->error_data |= ((uint32) checkfail_count) << shift;
8487 shift += READV_COUNT_BITS;
8488
8489 result->error_data |= ((uint32) first_off) << shift;
8490 shift += READV_COUNT_BITS;
8491
8492 result->id = is_temp ? PGAIO_HCB_LOCAL_BUFFER_READV :
8494
8495 if (error_count > 0)
8496 result->status = PGAIO_RS_ERROR;
8497 else
8498 result->status = PGAIO_RS_WARNING;
8499
8500 /*
8501 * The encoding is complicated enough to warrant cross-checking it against
8502 * the decode function.
8503 */
8504#ifdef USE_ASSERT_CHECKING
8505 {
8506 bool zeroed_any_2,
8511
8516 &first_off_2);
8522 }
8523#endif
8524
8525#undef READV_COUNT_BITS
8526#undef READV_COUNT_MASK
8527}
8528
8529/*
8530 * Helper for AIO readv completion callbacks, supporting both shared and temp
8531 * buffers. Gets called once for each buffer in a multi-page read.
8532 */
8535 uint8 flags, bool failed, bool is_temp,
8536 bool *buffer_invalid,
8537 bool *failed_checksum,
8538 bool *ignored_checksum,
8539 bool *zeroed_buffer)
8540{
8541 BufferDesc *buf_hdr = is_temp ?
8544 BufferTag tag = buf_hdr->tag;
8545 char *bufdata = BufferGetBlock(buffer);
8547 int piv_flags;
8548
8549 /* check that the buffer is in the expected state for a read */
8550#ifdef USE_ASSERT_CHECKING
8551 {
8553
8556 /* temp buffers don't use BM_IO_IN_PROGRESS */
8557 if (!is_temp)
8560 }
8561#endif
8562
8563 *buffer_invalid = false;
8564 *failed_checksum = false;
8565 *ignored_checksum = false;
8566 *zeroed_buffer = false;
8567
8568 /*
8569 * We ask PageIsVerified() to only log the message about checksum errors,
8570 * as the completion might be run in any backend (or IO workers). We will
8571 * report checksum errors in buffer_readv_report().
8572 */
8574
8575 /* the local zero_damaged_pages may differ from the definer's */
8578
8579 /*
8580 * If the buffers are marked for zero on error, we want to log that in
8581 * case of a checksum failure.
8582 */
8583 if (flags & READ_BUFFERS_ZERO_ON_ERROR)
8585
8586 /* Check for garbage data. */
8587 if (!failed)
8588 {
8589 /*
8590 * If the buffer is not currently pinned by this backend, e.g. because
8591 * we're completing this IO after an error, the buffer data will have
8592 * been marked as inaccessible when the buffer was unpinned. The AIO
8593 * subsystem holds a pin, but that doesn't prevent the buffer from
8594 * having been marked as inaccessible. The completion might also be
8595 * executed in a different process.
8596 */
8597#ifdef USE_VALGRIND
8598 if (!BufferIsPinned(buffer))
8600#endif
8601
8602 if (!PageIsVerified((Page) bufdata, tag.blockNum, piv_flags,
8604 {
8605 if (flags & READ_BUFFERS_ZERO_ON_ERROR)
8606 {
8607 memset(bufdata, 0, BLCKSZ);
8608 *zeroed_buffer = true;
8609 }
8610 else
8611 {
8612 *buffer_invalid = true;
8613 /* mark buffer as having failed */
8614 failed = true;
8615 }
8616 }
8617 else if (*failed_checksum)
8618 *ignored_checksum = true;
8619
8620 /* undo what we did above */
8621#ifdef USE_VALGRIND
8622 if (!BufferIsPinned(buffer))
8624#endif
8625
8626 /*
8627 * Immediately log a message about the invalid page, but only to the
8628 * server log. One reason to do so immediately is that this may be
8629 * executed in a different backend than the one that originated the
8630 * request. Another reason is that the originator
8631 * might not process the query result immediately (because it is busy
8632 * doing another part of query processing) or at all (e.g. if it was
8633 * cancelled or errored out due to another IO also failing). The
8634 * definer of the IO will emit an ERROR or WARNING when processing the
8635 * IO's results.
8636 *
8637 * To avoid duplicating the code to emit these log messages, we reuse
8638 * buffer_readv_report().
8639 */
8641 {
8642 PgAioResult result_one = {0};
8643
8648 *zeroed_buffer ? 1 : 0,
8649 *failed_checksum ? 1 : 0,
8652 }
8653 }
8654
8655 /* Terminate I/O and set BM_VALID. */
8656 set_flag_bits = failed ? BM_IO_ERROR : BM_VALID;
8657 if (is_temp)
8659 else
8660 TerminateBufferIO(buf_hdr, false, set_flag_bits, false, true);
8661
8662 /*
8663 * Call the BUFFER_READ_DONE tracepoint in the callback, even though the
8664 * callback may not be executed in the same backend that called
8665 * BUFFER_READ_START. The alternative would be to defer calling the
8666 * tracepoint to a later point (e.g. the local completion callback for
8667 * shared buffer reads), which seems even less helpful.
8668 */
8670 tag.blockNum,
8671 tag.spcOid,
8672 tag.dbOid,
8673 tag.relNumber,
8675 false);
8676}
8677
8678/*
8679 * Perform completion handling of a single AIO read. This read may cover
8680 * multiple blocks / buffers.
8681 *
8682 * Shared between shared and local buffers, to reduce code duplication.
8683 */
8686 uint8 cb_data, bool is_temp)
8687{
8693 uint8 error_count = 0;
8694 uint8 zeroed_count = 0;
8695 uint8 ignored_count = 0;
8697 uint64 *io_data;
8698 uint8 handle_data_len;
8699
8700 if (is_temp)
8701 {
8702 Assert(td->smgr.is_temp);
8704 }
8705 else
8706 Assert(!td->smgr.is_temp);
8707
8708 /*
8709 * Iterate over all the buffers affected by this IO and call the
8710 * per-buffer completion function for each buffer.
8711 */
8712 io_data = pgaio_io_get_handle_data(ioh, &handle_data_len);
8713 for (uint8 buf_off = 0; buf_off < handle_data_len; buf_off++)
8714 {
8716 bool failed;
8717 bool failed_verification = false;
8718 bool failed_checksum = false;
8719 bool zeroed_buffer = false;
8720 bool ignored_checksum = false;
8721
8723
8724 /*
8725 * If the entire I/O failed on a lower-level, each buffer needs to be
8726 * marked as failed. In case of a partial read, the first few buffers
8727 * may be ok.
8728 */
8729 failed =
8731 || prior_result.result <= buf_off;
8732
8733 buffer_readv_complete_one(td, buf_off, buf, cb_data, failed, is_temp,
8737 &zeroed_buffer);
8738
8739 /*
8740 * Track information about the number of different kinds of error
8741 * conditions across all pages, as there can be multiple pages failing
8742 * verification as part of one IO.
8743 */
8746 if (zeroed_buffer && zeroed_count++ == 0)
8748 if (ignored_checksum && ignored_count++ == 0)
8750 if (failed_checksum)
8752 }
8753
8754 /*
8755 * If the smgr read succeeded [partially] and page verification failed for
8756 * some of the pages, adjust the IO's result state appropriately.
8757 */
8758 if (prior_result.status != PGAIO_RS_ERROR &&
8759 (error_count > 0 || ignored_count > 0 || zeroed_count > 0))
8760 {
8762 zeroed_count > 0, ignored_count > 0,
8767 }
8768
8769 /*
8770 * For shared relations this reporting is done in
8771 * shared_buffer_readv_complete_local().
8772 */
8773 if (is_temp && checkfail_count > 0)
8776
8777 return result;
8778}
8779
8780/*
8781 * AIO error reporting callback for aio_shared_buffer_readv_cb and
8782 * aio_local_buffer_readv_cb.
8783 *
8784 * The error is encoded / decoded in buffer_readv_encode_error() /
8785 * buffer_readv_decode_error().
8786 */
8787static void
8789 int elevel)
8790{
8791 int nblocks = td->smgr.nblocks;
8792 BlockNumber first = td->smgr.blockNum;
8793 BlockNumber last = first + nblocks - 1;
8796 RelPathStr rpath =
8798 bool zeroed_any,
8802 first_off;
8804 const char *msg_one,
8805 *msg_mult,
8806 *det_mult,
8807 *hint_mult;
8808
8812 &first_off);
8813
8814 /*
8815 * Treat a read that had both zeroed buffers *and* ignored checksums as a
8816 * special case, it's too irregular to be emitted the same way as the
8817 * other cases.
8818 */
8819 if (zeroed_any && ignored_any)
8820 {
8822 Assert(nblocks > 1); /* same block can't be both zeroed and ignored */
8823 Assert(result.status != PGAIO_RS_ERROR);
8825
8826 ereport(elevel,
8828 errmsg("zeroing %u page(s) and ignoring %u checksum failure(s) among blocks %u..%u of relation \"%s\"",
8829 affected_count, checkfail_count, first, last, rpath.str),
8830 affected_count > 1 ?
8831 errdetail("Block %u held the first zeroed page.",
8832 first + first_off) : 0,
8833 errhint_plural("See server log for details about the other %d invalid block.",
8834 "See server log for details about the other %d invalid blocks.",
8837 return;
8838 }
8839
8840 /*
8841 * The other messages are highly repetitive. To avoid duplicating a long
8842 * and complicated ereport(), gather the translated format strings
8843 * separately and then do one common ereport.
8844 */
8845 if (result.status == PGAIO_RS_ERROR)
8846 {
8847 Assert(!zeroed_any); /* can't have invalid pages when zeroing them */
8849 msg_one = _("invalid page in block %u of relation \"%s\"");
8850 msg_mult = _("%u invalid pages among blocks %u..%u of relation \"%s\"");
8851 det_mult = _("Block %u held the first invalid page.");
8852 hint_mult = _("See server log for the other %u invalid block(s).");
8853 }
8854 else if (zeroed_any && !ignored_any)
8855 {
8857 msg_one = _("invalid page in block %u of relation \"%s\"; zeroing out page");
8858 msg_mult = _("zeroing out %u invalid pages among blocks %u..%u of relation \"%s\"");
8859 det_mult = _("Block %u held the first zeroed page.");
8860 hint_mult = _("See server log for the other %u zeroed block(s).");
8861 }
8862 else if (!zeroed_any && ignored_any)
8863 {
8865 msg_one = _("ignoring checksum failure in block %u of relation \"%s\"");
8866 msg_mult = _("ignoring %u checksum failures among blocks %u..%u of relation \"%s\"");
8867 det_mult = _("Block %u held the first ignored page.");
8868 hint_mult = _("See server log for the other %u ignored block(s).");
8869 }
8870 else
8872
8873 ereport(elevel,
8875 affected_count == 1 ?
8876 errmsg_internal(msg_one, first + first_off, rpath.str) :
8877 errmsg_internal(msg_mult, affected_count, first, last, rpath.str),
8880}
8881
8882static void
8887
8888static PgAioResult
8894
8895/*
8896 * We need a backend-local completion callback for shared buffers, to be able
8897 * to report checksum errors correctly. Unfortunately that can only safely
8898 * happen if the reporting backend has previously called
8899 * pgstat_prepare_report_checksum_failure(), which we can only guarantee in
8900 * the backend that started the IO. Hence this callback.
8901 */
8902static PgAioResult
8932
8933static void
8938
8939static PgAioResult
8945
8946/* readv callback is passed READ_BUFFERS_* flags as callback data */
8949 .complete_shared = shared_buffer_readv_complete,
8950 /* need a local callback to report checksum failures */
8951 .complete_local = shared_buffer_readv_complete_local,
8952 .report = buffer_readv_report,
8953};
8954
8955/* readv callback is passed READ_BUFFERS_* flags as callback data */
8958
8959 /*
8960 * Note that this, in contrast to the shared_buffers case, uses
8961 * complete_local, as only the issuing backend has access to the required
8962 * datastructures. This is important in case the IO completion may be
8963 * consumed incidentally by another backend.
8964 */
8965 .complete_local = local_buffer_readv_complete,
8966 .report = buffer_readv_report,
8967};
int io_method
Definition aio.c:74
bool pgaio_wref_valid(PgAioWaitRef *iow)
Definition aio.c:971
PgAioHandle * pgaio_io_acquire(struct ResourceOwnerData *resowner, PgAioReturn *ret)
Definition aio.c:162
void pgaio_wref_clear(PgAioWaitRef *iow)
Definition aio.c:964
void pgaio_io_get_wref(PgAioHandle *ioh, PgAioWaitRef *iow)
Definition aio.c:366
void pgaio_io_set_flag(PgAioHandle *ioh, PgAioHandleFlags flag)
Definition aio.c:330
bool pgaio_have_staged(void)
Definition aio.c:1117
bool pgaio_wref_check_done(PgAioWaitRef *iow)
Definition aio.c:1005
ProcNumber pgaio_io_get_owner(PgAioHandle *ioh)
Definition aio.c:355
void pgaio_submit_staged(void)
Definition aio.c:1133
void pgaio_wref_wait(PgAioWaitRef *iow)
Definition aio.c:991
void pgaio_io_release(PgAioHandle *ioh)
Definition aio.c:240
PgAioHandle * pgaio_io_acquire_nb(struct ResourceOwnerData *resowner, PgAioReturn *ret)
Definition aio.c:188
@ PGAIO_HCB_LOCAL_BUFFER_READV
Definition aio.h:200
@ PGAIO_HCB_SHARED_BUFFER_READV
Definition aio.h:198
@ IOMETHOD_SYNC
Definition aio.h:34
@ PGAIO_HF_SYNCHRONOUS
Definition aio.h:70
@ PGAIO_HF_REFERENCES_LOCAL
Definition aio.h:60
void pgaio_io_set_handle_data_32(PgAioHandle *ioh, uint32 *data, uint8 len)
void pgaio_io_register_callbacks(PgAioHandle *ioh, PgAioHandleCallbackID cb_id, uint8 cb_data)
uint64 * pgaio_io_get_handle_data(PgAioHandle *ioh, uint8 *len)
void pgaio_result_report(PgAioResult result, const PgAioTargetData *target_data, int elevel)
PgAioTargetData * pgaio_io_get_target_data(PgAioHandle *ioh)
Definition aio_target.c:73
#define PGAIO_RESULT_ERROR_BITS
Definition aio_types.h:98
PgAioResultStatus
Definition aio_types.h:79
@ PGAIO_RS_OK
Definition aio_types.h:81
@ PGAIO_RS_UNKNOWN
Definition aio_types.h:80
@ PGAIO_RS_PARTIAL
Definition aio_types.h:82
@ PGAIO_RS_ERROR
Definition aio_types.h:84
@ PGAIO_RS_WARNING
Definition aio_types.h:83
static bool pg_atomic_compare_exchange_u64(volatile pg_atomic_uint64 *ptr, uint64 *expected, uint64 newval)
Definition atomics.h:522
#define pg_write_barrier()
Definition atomics.h:155
static void pg_atomic_unlocked_write_u64(volatile pg_atomic_uint64 *ptr, uint64 val)
Definition atomics.h:494
static uint64 pg_atomic_sub_fetch_u64(volatile pg_atomic_uint64 *ptr, int64 sub_)
Definition atomics.h:578
static uint64 pg_atomic_fetch_and_u64(volatile pg_atomic_uint64 *ptr, uint64 and_)
Definition atomics.h:551
static uint64 pg_atomic_fetch_or_u64(volatile pg_atomic_uint64 *ptr, uint64 or_)
Definition atomics.h:560
static uint64 pg_atomic_read_u64(volatile pg_atomic_uint64 *ptr)
Definition atomics.h:467
static uint64 pg_atomic_fetch_sub_u64(volatile pg_atomic_uint64 *ptr, int64 sub_)
Definition atomics.h:541
bool TimestampDifferenceExceeds(TimestampTz start_time, TimestampTz stop_time, int msec)
Definition timestamp.c:1789
TimestampTz GetCurrentTimestamp(void)
Definition timestamp.c:1649
Datum now(PG_FUNCTION_ARGS)
Definition timestamp.c:1613
int BgWriterDelay
Definition bgwriter.c:59
void binaryheap_build(binaryheap *heap)
Definition binaryheap.c:136
void binaryheap_replace_first(binaryheap *heap, bh_node_type d)
Definition binaryheap.c:253
bh_node_type binaryheap_first(binaryheap *heap)
Definition binaryheap.c:175
bh_node_type binaryheap_remove_first(binaryheap *heap)
Definition binaryheap.c:190
void binaryheap_free(binaryheap *heap)
Definition binaryheap.c:73
void binaryheap_add_unordered(binaryheap *heap, bh_node_type d)
Definition binaryheap.c:114
binaryheap * binaryheap_allocate(int capacity, binaryheap_comparator compare, void *arg)
Definition binaryheap.c:37
#define binaryheap_empty(h)
Definition binaryheap.h:65
uint32 BlockNumber
Definition block.h:31
#define InvalidBlockNumber
Definition block.h:33
static bool BlockNumberIsValid(BlockNumber blockNumber)
Definition block.h:71
#define MaxBlockNumber
Definition block.h:35
static int32 next
Definition blutils.c:225
int Buffer
Definition buf.h:23
#define InvalidBuffer
Definition buf.h:25
#define BufferIsLocal(buffer)
Definition buf.h:37
CkptSortItem * CkptBufferIds
Definition buf_init.c:28
WritebackContext BackendWritebackContext
Definition buf_init.c:27
#define BM_MAX_USAGE_COUNT
static void InitBufferTag(BufferTag *tag, const RelFileLocator *rlocator, ForkNumber forkNum, BlockNumber blockNum)
#define BM_TAG_VALID
#define BM_PERMANENT
#define BUF_USAGECOUNT_MASK
static ForkNumber BufTagGetForkNum(const BufferTag *tag)
#define BM_LOCK_VAL_SHARED
#define BUF_REFCOUNT_ONE
static ConditionVariable * BufferDescriptorGetIOCV(const BufferDesc *bdesc)
static uint64 UnlockBufHdrExt(BufferDesc *desc, uint64 old_buf_state, uint64 set_bits, uint64 unset_bits, int refcount_change)
static bool BufferTagsEqual(const BufferTag *tag1, const BufferTag *tag2)
static RelFileNumber BufTagGetRelNumber(const BufferTag *tag)
static void UnlockBufHdr(BufferDesc *desc)
#define BM_LOCK_VAL_EXCLUSIVE
static bool BufTagMatchesRelFileLocator(const BufferTag *tag, const RelFileLocator *rlocator)
#define BUF_FLAG_MASK
#define BM_PIN_COUNT_WAITER
#define BM_DIRTY
#define BM_LOCK_WAKE_IN_PROGRESS
static void ResourceOwnerRememberBufferIO(ResourceOwner owner, Buffer buffer)
#define BM_LOCKED
#define BUF_STATE_GET_USAGECOUNT(state)
#define BM_LOCK_MASK
StartBufferIOResult
@ BUFFER_IO_IN_PROGRESS
@ BUFFER_IO_ALREADY_DONE
@ BUFFER_IO_READY_FOR_IO
static void ResourceOwnerForgetBufferIO(ResourceOwner owner, Buffer buffer)
#define BM_IO_IN_PROGRESS
static void ClearBufferTag(BufferTag *tag)
static void ResourceOwnerRememberBuffer(ResourceOwner owner, Buffer buffer)
static void ResourceOwnerForgetBuffer(ResourceOwner owner, Buffer buffer)
#define BUF_USAGECOUNT_ONE
#define BUF_STATE_GET_REFCOUNT(state)
static LWLock * BufMappingPartitionLock(uint32 hashcode)
static RelFileLocator BufTagGetRelFileLocator(const BufferTag *tag)
#define BM_VALID
#define BM_LOCK_HAS_WAITERS
#define BM_IO_ERROR
static BufferDesc * GetLocalBufferDescriptor(uint32 id)
static BufferDesc * GetBufferDescriptor(uint32 id)
#define BM_LOCK_VAL_SHARE_EXCLUSIVE
static Buffer BufferDescriptorGetBuffer(const BufferDesc *bdesc)
#define BM_CHECKPOINT_NEEDED
void BufTableDelete(BufferTag *tagPtr, uint32 hashcode)
Definition buf_table.c:154
int BufTableLookup(BufferTag *tagPtr, uint32 hashcode)
Definition buf_table.c:96
uint32 BufTableHashCode(BufferTag *tagPtr)
Definition buf_table.c:84
int BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id)
Definition buf_table.c:124
bool track_io_timing
Definition bufmgr.c:192
static void ResOwnerReleaseBuffer(Datum res)
Definition bufmgr.c:7854
void CheckBufferIsPinnedOnce(Buffer buffer)
Definition bufmgr.c:6646
void FlushRelationsAllBuffers(SMgrRelation *smgrs, int nrels)
Definition bufmgr.c:5259
void IncrBufferRefCount(Buffer buffer)
Definition bufmgr.c:5679
static void MarkSharedBufferDirtyHint(Buffer buffer, BufferDesc *bufHdr, uint64 lockstate, bool buffer_std)
Definition bufmgr.c:5705
void DropDatabaseBuffers(Oid dbid)
Definition bufmgr.c:5124
bool BufferSetHintBits16(uint16 *ptr, uint16 val, Buffer buffer)
Definition bufmgr.c:7102
static int ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b)
Definition bufmgr.c:7633
static pg_attribute_always_inline PgAioResult buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data, bool is_temp)
Definition bufmgr.c:8685
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition bufmgr.c:4455
static PrivateRefCountEntry * NewPrivateRefCountEntry(Buffer buffer)
Definition bufmgr.c:388
static Buffer PrivateRefCountArrayKeys[REFCOUNT_ARRAY_ENTRIES]
Definition bufmgr.c:263
void BufferFinishSetHintBits(Buffer buffer, bool mark_dirty, bool buffer_std)
Definition bufmgr.c:7079
void DropRelationBuffers(SMgrRelation smgr_reln, ForkNumber *forkNum, int nforks, BlockNumber *firstDelBlock)
Definition bufmgr.c:4774
Buffer ReleaseAndReadBuffer(Buffer buffer, Relation relation, BlockNumber blockNum)
Definition bufmgr.c:3221
static int ReservedRefCountSlot
Definition bufmgr.c:268
static PgAioResult shared_buffer_readv_complete_local(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
Definition bufmgr.c:8903
static pg_attribute_always_inline bool StartReadBuffersImpl(ReadBuffersOperation *operation, Buffer *buffers, BlockNumber blockNum, int *nblocks, int flags, bool allow_forwarding)
Definition bufmgr.c:1371
static void CheckReadBuffersOperation(ReadBuffersOperation *operation, bool is_complete)
Definition bufmgr.c:1656
PrefetchBufferResult PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
Definition bufmgr.c:787
static uint32 PrivateRefCountClock
Definition bufmgr.c:267
static void FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object, IOContext io_context)
Definition bufmgr.c:4512
static void ResOwnerReleaseBufferIO(Datum res)
Definition bufmgr.c:7832
static PgAioResult local_buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
Definition bufmgr.c:8940
bool StartReadBuffers(ReadBuffersOperation *operation, Buffer *buffers, BlockNumber blockNum, int *nblocks, int flags)
Definition bufmgr.c:1618
void EvictAllUnpinnedBuffers(int32 *buffers_evicted, int32 *buffers_flushed, int32 *buffers_skipped)
Definition bufmgr.c:7991
int io_max_combine_limit
Definition bufmgr.c:217
static void FlushUnlockedBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object, IOContext io_context)
Definition bufmgr.c:4635
const ResourceOwnerDesc buffer_io_resowner_desc
Definition bufmgr.c:285
bool zero_damaged_pages
Definition bufmgr.c:189
#define BUF_DROP_FULL_SCAN_THRESHOLD
Definition bufmgr.c:95
static void PinBuffer_Locked(BufferDesc *buf)
Definition bufmgr.c:3397
void EvictRelUnpinnedBuffers(Relation rel, int32 *buffers_evicted, int32 *buffers_flushed, int32 *buffers_skipped)
Definition bufmgr.c:8041
static pg_attribute_always_inline void buffer_readv_complete_one(PgAioTargetData *td, uint8 buf_off, Buffer buffer, uint8 flags, bool failed, bool is_temp, bool *buffer_invalid, bool *failed_checksum, bool *ignored_checksum, bool *zeroed_buffer)
Definition bufmgr.c:8534
static char * ResOwnerPrintBuffer(Datum res)
Definition bufmgr.c:7890
static void BufferLockAcquire(Buffer buffer, BufferDesc *buf_hdr, BufferLockMode mode)
Definition bufmgr.c:5907
static bool BufferLockAttempt(BufferDesc *buf_hdr, BufferLockMode mode)
Definition bufmgr.c:6105
static int buffertag_comparator(const BufferTag *ba, const BufferTag *bb)
Definition bufmgr.c:7599
bool IsBufferCleanupOK(Buffer buffer)
Definition bufmgr.c:6910
#define BufferGetLSN(bufHdr)
Definition bufmgr.c:77
static char * ResOwnerPrintBufferIO(Datum res)
Definition bufmgr.c:7840
bool BufferIsLockedByMeInMode(Buffer buffer, BufferLockMode mode)
Definition bufmgr.c:3096
static void BufferLockDisown(Buffer buffer, BufferDesc *buf_hdr)
Definition bufmgr.c:6280
Buffer ExtendBufferedRel(BufferManagerRelation bmr, ForkNumber forkNum, BufferAccessStrategy strategy, uint32 flags)
Definition bufmgr.c:970
void AtEOXact_Buffers(bool isCommit)
Definition bufmgr.c:4208
static void AbortBufferIO(Buffer buffer)
Definition bufmgr.c:7429
const PgAioHandleCallbacks aio_shared_buffer_readv_cb
Definition bufmgr.c:8947
static void BufferLockUnlock(Buffer buffer, BufferDesc *buf_hdr)
Definition bufmgr.c:6023
BlockNumber ExtendBufferedRelBy(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, Buffer *buffers, uint32 *extended_by)
Definition bufmgr.c:1002
static Buffer ReadBuffer_common(Relation rel, SMgrRelation smgr, char smgr_persistence, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
Definition bufmgr.c:1276
static void ProcessReadBuffersResult(ReadBuffersOperation *operation)
Definition bufmgr.c:1714
pg_noinline uint64 WaitBufHdrUnlocked(BufferDesc *buf)
Definition bufmgr.c:7575
static void ZeroAndLockBuffer(Buffer buffer, ReadBufferMode mode, bool already_valid)
Definition bufmgr.c:1137
static BufferDesc * BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, bool *foundPtr, IOContext io_context)
Definition bufmgr.c:2197
static void CheckForBufferLeaks(void)
Definition bufmgr.c:4272
void CreateAndCopyRelationData(RelFileLocator src_rlocator, RelFileLocator dst_rlocator, bool permanent)
Definition bufmgr.c:5471
void DropRelationsAllBuffers(SMgrRelation *smgr_reln, int nlocators)
Definition bufmgr.c:4894
static void BufferLockDequeueSelf(BufferDesc *buf_hdr)
Definition bufmgr.c:6212
static int rlocator_comparator(const void *p1, const void *p2)
Definition bufmgr.c:7500
static bool BufferLockHeldByMeInMode(BufferDesc *buf_hdr, BufferLockMode mode)
Definition bufmgr.c:6534
Buffer ExtendBufferedRelTo(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, BlockNumber extend_to, ReadBufferMode mode)
Definition bufmgr.c:1031
static pg_attribute_always_inline void TrackBufferHit(IOObject io_object, IOContext io_context, Relation rel, char persistence, SMgrRelation smgr, ForkNumber forknum, BlockNumber blocknum)
Definition bufmgr.c:1683
const PgAioHandleCallbacks aio_local_buffer_readv_cb
Definition bufmgr.c:8956
static bool InvalidateVictimBuffer(BufferDesc *buf_hdr)
Definition bufmgr.c:2471
static void AtProcExit_Buffers(int code, Datum arg)
Definition bufmgr.c:4254
int io_combine_limit_guc
Definition bufmgr.c:216
static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg)
Definition bufmgr.c:7664
void BufferGetTag(Buffer buffer, RelFileLocator *rlocator, ForkNumber *forknum, BlockNumber *blknum)
Definition bufmgr.c:4476
#define BufHdrGetBlock(bufHdr)
Definition bufmgr.c:76
static bool BufferLockConditional(Buffer buffer, BufferDesc *buf_hdr, BufferLockMode mode)
Definition bufmgr.c:6059
const ResourceOwnerDesc buffer_resowner_desc
Definition bufmgr.c:294
static refcount_hash * PrivateRefCountHash
Definition bufmgr.c:265
static pg_attribute_always_inline void buffer_stage_common(PgAioHandle *ioh, bool is_write, bool is_temp)
Definition bufmgr.c:8290
void UnlockBuffer(Buffer buffer)
Definition bufmgr.c:6567
#define BUF_REUSABLE
Definition bufmgr.c:85
static void local_buffer_write_error_callback(void *arg)
Definition bufmgr.c:7484
static void BufferSync(int flags)
Definition bufmgr.c:3561
static bool AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress)
Definition bufmgr.c:1938
static void local_buffer_readv_stage(PgAioHandle *ioh, uint8 cb_data)
Definition bufmgr.c:8934
char * DebugPrintBufferRefcount(Buffer buffer)
Definition bufmgr.c:4398
void CheckPointBuffers(int flags)
Definition bufmgr.c:4441
bool BufferIsDirty(Buffer buffer)
Definition bufmgr.c:3123
static uint32 MaxProportionalPins
Definition bufmgr.c:271
static void BufferLockQueueSelf(BufferDesc *buf_hdr, BufferLockMode mode)
Definition bufmgr.c:6172
static BlockNumber ExtendBufferedRelShared(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
Definition bufmgr.c:2795
static int BufferLockDisownInternal(Buffer buffer, BufferDesc *buf_hdr)
Definition bufmgr.c:6294
bool BgBufferSync(WritebackContext *wb_context)
Definition bufmgr.c:3840
uint64 LockBufHdr(BufferDesc *desc)
Definition bufmgr.c:7527
static void WakePinCountWaiter(BufferDesc *buf)
Definition bufmgr.c:3429
static pg_attribute_always_inline Buffer PinBufferForBlock(Relation rel, SMgrRelation smgr, char persistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, IOObject io_object, IOContext io_context, bool *foundPtr)
Definition bufmgr.c:1223
bool BufferIsPermanent(Buffer buffer)
Definition bufmgr.c:4686
void MarkDirtyAllUnpinnedBuffers(int32 *buffers_dirtied, int32 *buffers_already_dirty, int32 *buffers_skipped)
Definition bufmgr.c:8241
#define REFCOUNT_ARRAY_ENTRIES
Definition bufmgr.c:145
static void shared_buffer_readv_stage(PgAioHandle *ioh, uint8 cb_data)
Definition bufmgr.c:8883
static void BufferLockProcessRelease(BufferDesc *buf_hdr, BufferLockMode mode, uint64 lockstate)
Definition bufmgr.c:6479
void UnlockBuffers(void)
Definition bufmgr.c:5861
PrefetchBufferResult PrefetchSharedBuffer(SMgrRelation smgr_reln, ForkNumber forkNum, BlockNumber blockNum)
Definition bufmgr.c:697
static PgAioResult shared_buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
Definition bufmgr.c:8889
static Buffer GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
Definition bufmgr.c:2548
bool ConditionalLockBuffer(Buffer buffer)
Definition bufmgr.c:6626
BlockNumber RelationGetNumberOfBlocksInFork(Relation relation, ForkNumber forkNum)
Definition bufmgr.c:4654
StartBufferIOResult StartSharedBufferIO(BufferDesc *buf, bool forInput, bool wait, PgAioWaitRef *io_wref)
Definition bufmgr.c:7250
int bgwriter_flush_after
Definition bufmgr.c:224
void ReleaseBuffer(Buffer buffer)
Definition bufmgr.c:5595
bool BufferIsLockedByMe(Buffer buffer)
Definition bufmgr.c:3070
static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy, bool skip_if_not_valid)
Definition bufmgr.c:3281
static void FindAndDropRelationBuffers(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber nForkBlock, BlockNumber firstDelBlock)
Definition bufmgr.c:5064
XLogRecPtr BufferGetLSNAtomic(Buffer buffer)
Definition bufmgr.c:4722
void LockBufferInternal(Buffer buffer, BufferLockMode mode)
Definition bufmgr.c:6583
bool HoldingBufferPinThatDelaysRecovery(void)
Definition bufmgr.c:6826
bool MarkDirtyUnpinnedBuffer(Buffer buf, bool *buffer_already_dirty)
Definition bufmgr.c:8148
int checkpoint_flush_after
Definition bufmgr.c:223
void UnlockReleaseBuffer(Buffer buffer)
Definition bufmgr.c:5612
static void UnpinBufferNoOwner(BufferDesc *buf)
Definition bufmgr.c:3474
static void shared_buffer_write_error_callback(void *arg)
Definition bufmgr.c:7468
void ScheduleBufferTagForWriteback(WritebackContext *wb_context, IOContext io_context, BufferTag *tag)
Definition bufmgr.c:7699
void WritebackContextInit(WritebackContext *context, int *max_pending)
Definition bufmgr.c:7687
StartBufferIOResult StartBufferIO(Buffer buffer, bool forInput, bool wait, PgAioWaitRef *io_wref)
Definition bufmgr.c:7330
void MarkBufferDirty(Buffer buffer)
Definition bufmgr.c:3156
#define BufferIsPinned(bufnum)
Definition bufmgr.c:599
double bgwriter_lru_multiplier
Definition bufmgr.c:191
static bool EvictUnpinnedBufferInternal(BufferDesc *desc, bool *buffer_flushed)
Definition bufmgr.c:7900
bool BufferBeginSetHintBits(Buffer buffer)
Definition bufmgr.c:7051
int backend_flush_after
Definition bufmgr.c:225
void LimitAdditionalPins(uint32 *additional_pins)
Definition bufmgr.c:2733
static void buffer_readv_report(PgAioResult result, const PgAioTargetData *td, int elevel)
Definition bufmgr.c:8788
static void ReservePrivateRefCountEntry(void)
Definition bufmgr.c:309
static BufferDesc * PinCountWaitBuf
Definition bufmgr.c:228
static void BufferLockWakeup(BufferDesc *buf_hdr, bool wake_exclusive)
Definition bufmgr.c:6314
static pg_noinline PrivateRefCountEntry * GetPrivateRefCountEntrySlow(Buffer buffer, bool do_move)
Definition bufmgr.c:419
static int32 GetPrivateRefCount(Buffer buffer)
Definition bufmgr.c:542
bool WaitReadBuffers(ReadBuffersOperation *operation)
Definition bufmgr.c:1759
static BlockNumber ExtendBufferedRelCommon(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
Definition bufmgr.c:2751
void LockBufferForCleanup(Buffer buffer)
Definition bufmgr.c:6679
static bool SharedBufferBeginSetHintBits(Buffer buffer, BufferDesc *buf_hdr, uint64 *lockstate)
Definition bufmgr.c:6960
void MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
Definition bufmgr.c:5830
void FlushRelationBuffers(Relation rel)
Definition bufmgr.c:5171
#define READV_COUNT_BITS
static uint64 BufferLockReleaseSub(BufferLockMode mode)
Definition bufmgr.c:6450
void IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context)
Definition bufmgr.c:7749
static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref)
Definition bufmgr.c:565
bool EvictUnpinnedBuffer(Buffer buf, bool *buffer_flushed)
Definition bufmgr.c:7962
Buffer ReadBufferWithoutRelcache(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy, bool permanent)
Definition bufmgr.c:954
bool ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum, Buffer recent_buffer)
Definition bufmgr.c:818
#define RELS_BSEARCH_THRESHOLD
Definition bufmgr.c:87
int maintenance_io_concurrency
Definition bufmgr.c:207
static void UnpinBuffer(BufferDesc *buf)
Definition bufmgr.c:3465
void FlushDatabaseBuffers(Oid dbid)
Definition bufmgr.c:5535
static void InvalidateBuffer(BufferDesc *buf)
Definition bufmgr.c:2370
static void RelationCopyStorageUsingBuffer(RelFileLocator srclocator, RelFileLocator dstlocator, ForkNumber forkNum, bool permanent)
Definition bufmgr.c:5357
int effective_io_concurrency
Definition bufmgr.c:200
static PrivateRefCountEntry * GetPrivateRefCountEntry(Buffer buffer, bool do_move)
Definition bufmgr.c:507
static bool BufferLockHeldByMe(BufferDesc *buf_hdr)
Definition bufmgr.c:6552
void TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint64 set_flag_bits, bool forget_owner, bool release_aio)
Definition bufmgr.c:7367
void MarkDirtyRelUnpinnedBuffers(Relation rel, int32 *buffers_dirtied, int32 *buffers_already_dirty, int32 *buffers_skipped)
Definition bufmgr.c:8184
bool StartReadBuffer(ReadBuffersOperation *operation, Buffer *buffer, BlockNumber blocknum, int flags)
Definition bufmgr.c:1637
Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
Definition bufmgr.c:926
static bool MarkDirtyUnpinnedBufferInternal(Buffer buf, BufferDesc *desc, bool *buffer_already_dirty)
Definition bufmgr.c:8092
static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES]
Definition bufmgr.c:264
static void buffer_readv_decode_error(PgAioResult result, bool *zeroed_any, bool *ignored_any, uint8 *zeroed_or_error_count, uint8 *checkfail_count, uint8 *first_off)
Definition bufmgr.c:8397
#define READV_COUNT_MASK
static int PrivateRefCountEntryLast
Definition bufmgr.c:269
int io_combine_limit
Definition bufmgr.c:215
void InitBufferManagerAccess(void)
Definition bufmgr.c:4225
static void buffer_readv_encode_error(PgAioResult *result, bool is_temp, bool zeroed_any, bool ignored_any, uint8 error_count, uint8 zeroed_count, uint8 checkfail_count, uint8 first_error_off, uint8 first_zeroed_off, uint8 first_ignored_off)
Definition bufmgr.c:8439
static int SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
Definition bufmgr.c:4138
uint32 GetAdditionalPinLimit(void)
Definition bufmgr.c:2707
Buffer ReadBuffer(Relation reln, BlockNumber blockNum)
Definition bufmgr.c:879
void TrackNewBufferPin(Buffer buf)
Definition bufmgr.c:3521
static int32 PrivateRefCountOverflowed
Definition bufmgr.c:266
bool ConditionalLockBufferForCleanup(Buffer buffer)
Definition bufmgr.c:6852
int bgwriter_lru_maxpages
Definition bufmgr.c:190
uint32 GetPinLimit(void)
Definition bufmgr.c:2695
static void WaitIO(BufferDesc *buf)
Definition bufmgr.c:7148
#define BUF_WRITTEN
Definition bufmgr.c:84
void FlushOneBuffer(Buffer buffer)
Definition bufmgr.c:5575
@ BAS_BULKREAD
Definition bufmgr.h:37
@ BAS_BULKWRITE
Definition bufmgr.h:39
#define P_NEW
Definition bufmgr.h:200
#define READ_BUFFERS_ZERO_ON_ERROR
Definition bufmgr.h:122
static Page BufferGetPage(Buffer buffer)
Definition bufmgr.h:468
#define DEFAULT_IO_COMBINE_LIMIT
Definition bufmgr.h:176
static Block BufferGetBlock(Buffer buffer)
Definition bufmgr.h:435
#define READ_BUFFERS_ISSUE_ADVICE
Definition bufmgr.h:124
BufferLockMode
Definition bufmgr.h:206
@ BUFFER_LOCK_SHARE_EXCLUSIVE
Definition bufmgr.h:217
@ BUFFER_LOCK_SHARE
Definition bufmgr.h:212
@ BUFFER_LOCK_EXCLUSIVE
Definition bufmgr.h:222
@ BUFFER_LOCK_UNLOCK
Definition bufmgr.h:207
#define MAX_IO_COMBINE_LIMIT
Definition bufmgr.h:175
#define DEFAULT_EFFECTIVE_IO_CONCURRENCY
Definition bufmgr.h:170
#define READ_BUFFERS_IGNORE_CHECKSUM_FAILURES
Definition bufmgr.h:126
#define DEFAULT_MAINTENANCE_IO_CONCURRENCY
Definition bufmgr.h:171
void * Block
Definition bufmgr.h:26
static void LockBuffer(Buffer buffer, BufferLockMode mode)
Definition bufmgr.h:334
#define BMR_GET_SMGR(bmr)
Definition bufmgr.h:118
@ EB_LOCK_TARGET
Definition bufmgr.h:93
@ EB_CLEAR_SIZE_CACHE
Definition bufmgr.h:90
@ EB_PERFORMING_RECOVERY
Definition bufmgr.h:78
@ EB_CREATE_FORK_IF_NEEDED
Definition bufmgr.h:84
@ EB_SKIP_EXTENSION_LOCK
Definition bufmgr.h:75
@ EB_LOCK_FIRST
Definition bufmgr.h:87
#define READ_BUFFERS_SYNCHRONOUSLY
Definition bufmgr.h:128
ReadBufferMode
Definition bufmgr.h:45
@ RBM_ZERO_ON_ERROR
Definition bufmgr.h:51
@ RBM_ZERO_AND_CLEANUP_LOCK
Definition bufmgr.h:49
@ RBM_ZERO_AND_LOCK
Definition bufmgr.h:47
@ RBM_NORMAL
Definition bufmgr.h:46
#define BMR_REL(p_rel)
Definition bufmgr.h:114
static bool BufferIsValid(Buffer bufnum)
Definition bufmgr.h:419
bool ignore_checksum_failure
Definition bufpage.c:27
void PageSetChecksum(Page page, BlockNumber blkno)
Definition bufpage.c:1518
bool PageIsVerified(PageData *page, BlockNumber blkno, int flags, bool *checksum_failure_p)
Definition bufpage.c:94
#define PIV_LOG_LOG
Definition bufpage.h:500
#define PIV_ZERO_BUFFERS_ON_ERROR
Definition bufpage.h:502
static bool PageIsNew(const PageData *page)
Definition bufpage.h:258
static void PageSetLSN(Page page, XLogRecPtr lsn)
Definition bufpage.h:416
PageData * Page
Definition bufpage.h:81
static XLogRecPtr PageGetLSN(const PageData *page)
Definition bufpage.h:410
#define PIV_IGNORE_CHECKSUM_FAILURE
Definition bufpage.h:501
#define pg_noinline
Definition c.h:321
#define likely(x)
Definition c.h:437
uint8_t uint8
Definition c.h:622
#define PG_USED_FOR_ASSERTS_ONLY
Definition c.h:249
#define Max(x, y)
Definition c.h:1085
#define Assert(condition)
Definition c.h:943
double float8
Definition c.h:714
#define pg_attribute_always_inline
Definition c.h:305
int16_t int16
Definition c.h:619
int32_t int32
Definition c.h:620
uint64_t uint64
Definition c.h:625
uint16_t uint16
Definition c.h:623
#define pg_unreachable()
Definition c.h:367
#define unlikely(x)
Definition c.h:438
uint32_t uint32
Definition c.h:624
#define lengthof(array)
Definition c.h:873
#define MemSet(start, val, len)
Definition c.h:1107
#define StaticAssertDecl(condition, errmessage)
Definition c.h:1008
size_t Size
Definition c.h:689
bool IsCatalogRelationOid(Oid relid)
Definition catalog.c:121
bool IsCatalogTextUniqueIndexOid(Oid relid)
Definition catalog.c:156
void CheckpointWriteDelay(int flags, double progress)
uint32 result
memcpy(sums, checksumBaseOffsets, sizeof(checksumBaseOffsets))
bool ConditionVariableCancelSleep(void)
void ConditionVariableBroadcast(ConditionVariable *cv)
void ConditionVariablePrepareToSleep(ConditionVariable *cv)
void ConditionVariableSleep(ConditionVariable *cv, uint32 wait_event_info)
static DataChecksumsWorkerOperation operation
int64 TimestampTz
Definition timestamp.h:39
struct cursor * cur
Definition ecpg.c:29
Datum arg
Definition elog.c:1323
ErrorContextCallback * error_context_stack
Definition elog.c:100
int errcode(int sqlerrcode)
Definition elog.c:875
#define _(x)
Definition elog.c:96
int int errdetail_internal(const char *fmt,...) pg_attribute_printf(1
#define errcontext
Definition elog.h:200
int int int errhint_plural(const char *fmt_singular, const char *fmt_plural, unsigned long n,...) pg_attribute_printf(1
#define DEBUG3
Definition elog.h:29
int errdetail(const char *fmt,...) pg_attribute_printf(1
#define LOG_SERVER_ONLY
Definition elog.h:33
int int errmsg_internal(const char *fmt,...) pg_attribute_printf(1
#define WARNING
Definition elog.h:37
#define DEBUG2
Definition elog.h:30
#define PANIC
Definition elog.h:44
#define DEBUG1
Definition elog.h:31
#define ERROR
Definition elog.h:40
#define elog(elevel,...)
Definition elog.h:228
#define ereport(elevel,...)
Definition elog.h:152
int int errhint_internal(const char *fmt,...) pg_attribute_printf(1
int io_direct_flags
Definition fd.c:172
#define IO_DIRECT_DATA
Definition fd.h:54
#define palloc_array(type, count)
Definition fe_memutils.h:91
int StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
Definition freelist.c:331
BufferAccessStrategy GetAccessStrategy(BufferAccessStrategyType btype)
Definition freelist.c:426
BufferDesc * StrategyGetBuffer(BufferAccessStrategy strategy, uint64 *buf_state, bool *from_ring)
Definition freelist.c:184
void FreeAccessStrategy(BufferAccessStrategy strategy)
Definition freelist.c:608
IOContext IOContextForStrategy(BufferAccessStrategy strategy)
Definition freelist.c:712
bool StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_ring)
Definition freelist.c:752
volatile sig_atomic_t ProcSignalBarrierPending
Definition globals.c:40
int NBuffers
Definition globals.c:144
bool enableFsync
Definition globals.c:131
ProcNumber MyProcNumber
Definition globals.c:92
int VacuumCostPageMiss
Definition globals.c:155
bool VacuumCostActive
Definition globals.c:161
bool IsUnderPostmaster
Definition globals.c:122
int VacuumCostBalance
Definition globals.c:160
int MaxBackends
Definition globals.c:149
int VacuumCostPageDirty
Definition globals.c:156
int VacuumCostPageHit
Definition globals.c:154
const char * str
long val
Definition informix.c:689
BufferUsage pgBufferUsage
Definition instrument.c:25
void on_shmem_exit(pg_on_exit_callback function, Datum arg)
Definition ipc.c:372
int b
Definition isn.c:74
int a
Definition isn.c:73
int j
Definition isn.c:78
int i
Definition isn.c:77
void LockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition lmgr.c:424
void UnlockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition lmgr.c:474
int32 * LocalRefCount
Definition localbuf.c:49
void FlushLocalBuffer(BufferDesc *bufHdr, SMgrRelation reln)
Definition localbuf.c:183
void UnpinLocalBuffer(Buffer buffer)
Definition localbuf.c:865
void AtEOXact_LocalBuffers(bool isCommit)
Definition localbuf.c:1027
StartBufferIOResult StartLocalBufferIO(BufferDesc *bufHdr, bool forInput, bool wait, PgAioWaitRef *io_wref)
Definition localbuf.c:532
void AtProcExit_LocalBuffers(void)
Definition localbuf.c:1038
bool PinLocalBuffer(BufferDesc *buf_hdr, bool adjust_usagecount)
Definition localbuf.c:829
void MarkLocalBufferDirty(Buffer buffer)
Definition localbuf.c:500
void DropRelationAllLocalBuffers(RelFileLocator rlocator)
Definition localbuf.c:726
void TerminateLocalBufferIO(BufferDesc *bufHdr, bool clear_dirty, uint64 set_flag_bits, bool release_aio)
Definition localbuf.c:586
int NLocBuffer
Definition localbuf.c:45
PrefetchBufferResult PrefetchLocalBuffer(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum)
Definition localbuf.c:72
BlockNumber ExtendBufferedRelLocal(BufferManagerRelation bmr, ForkNumber fork, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
Definition localbuf.c:355
void UnpinLocalBufferNoOwner(Buffer buffer)
Definition localbuf.c:872
void DropRelationLocalBuffers(RelFileLocator rlocator, ForkNumber *forkNum, int nforks, BlockNumber *firstDelBlock)
Definition localbuf.c:689
BufferDesc * LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, bool *foundPtr)
Definition localbuf.c:119
#define ExclusiveLock
Definition lockdefs.h:42
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition lwlock.c:1150
void LWLockRelease(LWLock *lock)
Definition lwlock.c:1767
@ LW_WS_NOT_WAITING
Definition lwlock.h:30
@ LW_WS_WAITING
Definition lwlock.h:31
@ LW_WS_PENDING_WAKEUP
Definition lwlock.h:32
@ LW_SHARED
Definition lwlock.h:105
@ LW_EXCLUSIVE
Definition lwlock.h:104
void * repalloc(void *pointer, Size size)
Definition mcxt.c:1635
void pfree(void *pointer)
Definition mcxt.c:1619
void * palloc(Size size)
Definition mcxt.c:1390
MemoryContext CurrentMemoryContext
Definition mcxt.c:161
#define VALGRIND_MAKE_MEM_DEFINED(addr, size)
Definition memdebug.h:26
#define VALGRIND_MAKE_MEM_NOACCESS(addr, size)
Definition memdebug.h:27
#define RESUME_INTERRUPTS()
Definition miscadmin.h:138
#define START_CRIT_SECTION()
Definition miscadmin.h:152
#define CHECK_FOR_INTERRUPTS()
Definition miscadmin.h:125
#define HOLD_INTERRUPTS()
Definition miscadmin.h:136
#define END_CRIT_SECTION()
Definition miscadmin.h:154
static char * errmsg
#define ERRCODE_DATA_CORRUPTED
static PgChecksumMode mode
static int64 current_size
#define WRITEBACK_MAX_PENDING_FLUSHES
#define DEFAULT_BACKEND_FLUSH_AFTER
#define DEFAULT_CHECKPOINT_FLUSH_AFTER
#define DEFAULT_BGWRITER_FLUSH_AFTER
const void * data
#define PG_IOV_MAX
Definition pg_iovec.h:47
static char buf[DEFAULT_XLOG_SEG_SIZE]
IOObject
Definition pgstat.h:280
@ IOOBJECT_RELATION
Definition pgstat.h:281
@ IOOBJECT_TEMP_RELATION
Definition pgstat.h:282
#define pgstat_count_buffer_read(rel)
Definition pgstat.h:742
IOContext
Definition pgstat.h:289
@ IOCONTEXT_NORMAL
Definition pgstat.h:293
@ IOOP_EXTEND
Definition pgstat.h:318
@ IOOP_READ
Definition pgstat.h:319
@ IOOP_WRITEBACK
Definition pgstat.h:315
@ IOOP_HIT
Definition pgstat.h:313
@ IOOP_EVICT
Definition pgstat.h:311
@ IOOP_REUSE
Definition pgstat.h:314
@ IOOP_WRITE
Definition pgstat.h:320
#define pgstat_count_buffer_hit(rel)
Definition pgstat.h:747
PgStat_BgWriterStats PendingBgWriterStats
PgStat_CheckpointerStats PendingCheckpointerStats
void pgstat_prepare_report_checksum_failure(Oid dboid)
void pgstat_report_checksum_failures_in_db(Oid dboid, int failurecount)
instr_time pgstat_prepare_io_time(bool track_io_guc)
Definition pgstat_io.c:91
void pgstat_count_io_op(IOObject io_object, IOContext io_context, IOOp io_op, uint32 cnt, uint64 bytes)
Definition pgstat_io.c:68
void pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op, instr_time start_time, uint32 cnt, uint64 bytes)
Definition pgstat_io.c:122
#define qsort(a, b, c, d)
Definition port.h:496
void PGSemaphoreUnlock(PGSemaphore sema)
Definition posix_sema.c:333
void PGSemaphoreLock(PGSemaphore sema)
Definition posix_sema.c:313
uint64_t Datum
Definition postgres.h:70
static Pointer DatumGetPointer(Datum X)
Definition postgres.h:332
static int32 DatumGetInt32(Datum X)
Definition postgres.h:202
#define PointerGetDatum(X)
Definition postgres.h:354
#define InvalidOid
unsigned int Oid
static int fb(int x)
#define NUM_AUXILIARY_PROCS
Definition proc.h:527
#define GetPGProcByNumber(n)
Definition proc.h:504
#define proclist_delete(list, procno, link_member)
Definition proclist.h:187
static void proclist_init(proclist_head *list)
Definition proclist.h:29
#define proclist_push_tail(list, procno, link_member)
Definition proclist.h:191
#define proclist_foreach_modify(iter, lhead, link_member)
Definition proclist.h:206
static bool proclist_is_empty(const proclist_head *list)
Definition proclist.h:38
#define INVALID_PROC_NUMBER
Definition procnumber.h:26
int ProcNumber
Definition procnumber.h:24
void ProcessProcSignalBarrier(void)
Definition procsignal.c:503
void set_ps_display_remove_suffix(void)
Definition ps_status.c:440
void set_ps_display_suffix(const char *suffix)
Definition ps_status.c:388
char * psprintf(const char *fmt,...)
Definition psprintf.c:43
ReadStream * read_stream_begin_smgr_relation(int flags, BufferAccessStrategy strategy, SMgrRelation smgr, char smgr_persistence, ForkNumber forknum, ReadStreamBlockNumberCB callback, void *callback_private_data, size_t per_buffer_data_size)
Buffer read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
void read_stream_end(ReadStream *stream)
BlockNumber block_range_read_stream_cb(ReadStream *stream, void *callback_private_data, void *per_buffer_data)
#define READ_STREAM_USE_BATCHING
Definition read_stream.h:64
#define READ_STREAM_FULL
Definition read_stream.h:43
static unsigned hash(unsigned *uv, int n)
Definition rege_dfa.c:724
static SMgrRelation RelationGetSmgr(Relation rel)
Definition rel.h:578
#define RelationUsesLocalBuffers(relation)
Definition rel.h:648
#define RELATION_IS_OTHER_TEMP(relation)
Definition rel.h:678
#define RelationIsValid(relation)
Definition rel.h:491
#define RelFileLocatorBackendIsTemp(rlocator)
#define RelFileLocatorEquals(locator1, locator2)
ForkNumber
Definition relpath.h:56
@ MAIN_FORKNUM
Definition relpath.h:58
@ INIT_FORKNUM
Definition relpath.h:61
#define MAX_FORKNUM
Definition relpath.h:70
#define relpath(rlocator, forknum)
Definition relpath.h:150
#define relpathbackend(rlocator, backend, forknum)
Definition relpath.h:141
#define relpathperm(rlocator, forknum)
Definition relpath.h:146
ResourceOwner CurrentResourceOwner
Definition resowner.c:173
void ResourceOwnerEnlarge(ResourceOwner owner)
Definition resowner.c:449
#define RELEASE_PRIO_BUFFER_IOS
Definition resowner.h:62
@ RESOURCE_RELEASE_BEFORE_LOCKS
Definition resowner.h:54
#define RELEASE_PRIO_BUFFER_PINS
Definition resowner.h:63
void perform_spin_delay(SpinDelayStatus *status)
Definition s_lock.c:126
void finish_spin_delay(SpinDelayStatus *status)
Definition s_lock.c:186
#define init_local_spin_delay(status)
Definition s_lock.h:749
BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum)
Definition smgr.c:819
void smgrstartreadv(PgAioHandle *ioh, SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void **buffers, BlockNumber nblocks)
Definition smgr.c:753
void smgrwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks)
Definition smgr.c:805
SMgrRelation smgropen(RelFileLocator rlocator, ProcNumber backend)
Definition smgr.c:240
void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
Definition smgr.c:481
BlockNumber smgrnblocks_cached(SMgrRelation reln, ForkNumber forknum)
Definition smgr.c:847
uint32 smgrmaxcombine(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
Definition smgr.c:697
void smgrzeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks, bool skipFsync)
Definition smgr.c:649
void smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync)
Definition smgr.c:620
bool smgrexists(SMgrRelation reln, ForkNumber forknum)
Definition smgr.c:462
bool smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks)
Definition smgr.c:678
static void smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync)
Definition smgr.h:131
#define free(a)
void ProcSendSignal(ProcNumber procNumber)
Definition proc.c:2027
PGPROC * MyProc
Definition proc.c:71
int GetStartupBufferPinWaitBufId(void)
Definition proc.c:771
int DeadlockTimeout
Definition proc.c:62
void SetStartupBufferPinWaitBufId(int bufid)
Definition proc.c:759
void ProcWaitForSignal(uint32 wait_event_info)
Definition proc.c:2015
void ResolveRecoveryConflictWithBufferPin(void)
Definition standby.c:795
bool log_recovery_conflict_waits
Definition standby.c:43
void LogRecoveryConflict(RecoveryConflictReason reason, TimestampTz wait_start, TimestampTz now, VirtualTransactionId *wait_list, bool still_waiting)
Definition standby.c:275
@ RECOVERY_CONFLICT_BUFFERPIN
Definition standby.h:49
bool RelFileLocatorSkippingWAL(RelFileLocator rlocator)
Definition storage.c:573
SMgrRelation RelationCreateStorage(RelFileLocator rlocator, char relpersistence, bool register_delete)
Definition storage.c:122
void log_smgrcreate(const RelFileLocator *rlocator, ForkNumber forkNum)
Definition storage.c:187
BufferTag tag
pg_atomic_uint64 state
int64 shared_blks_dirtied
Definition instrument.h:28
int64 local_blks_hit
Definition instrument.h:30
int64 shared_blks_read
Definition instrument.h:27
int64 shared_blks_written
Definition instrument.h:29
int64 local_blks_read
Definition instrument.h:31
int64 shared_blks_hit
Definition instrument.h:26
int ckpt_bufs_written
Definition xlog.h:179
ForkNumber forkNum
RelFileNumber relNumber
BlockNumber blockNum
float8 progress_slice
Definition bufmgr.c:164
int num_scanned
Definition bufmgr.c:169
float8 progress
Definition bufmgr.c:163
int num_to_scan
Definition bufmgr.c:167
struct ErrorContextCallback * previous
Definition elog.h:299
void(* callback)(void *arg)
Definition elog.h:300
Definition proc.h:179
uint8 lwWaitMode
Definition proc.h:284
PGSemaphore sem
Definition proc.h:258
uint8 lwWaiting
Definition proc.h:283
PgAioHandleCallbackStage stage
Definition aio.h:219
uint32 status
Definition aio_types.h:108
PgAioResult result
Definition aio_types.h:132
PgStat_Counter buf_written_clean
Definition pgstat.h:246
PgStat_Counter maxwritten_clean
Definition pgstat.h:247
PgStat_Counter buf_alloc
Definition pgstat.h:248
PgStat_Counter buffers_written
Definition pgstat.h:270
BufferLockMode lockmode
Definition bufmgr.c:112
PrivateRefCountData data
Definition bufmgr.c:130
RelFileLocator locator
RelFileNumber relNumber
char str[REL_PATH_STR_MAXLEN+1]
Definition relpath.h:123
RelFileLocator rd_locator
Definition rel.h:57
Form_pg_class rd_rel
Definition rel.h:111
const char * name
Definition resowner.h:93
RelFileLocatorBackend smgr_rlocator
Definition smgr.h:38
SMgrRelation srel
Definition bufmgr.c:185
RelFileLocator rlocator
Definition bufmgr.c:184
BlockNumber blockNum
RelFileNumber relNumber
ForkNumber forkNum
static uint64 table_relation_size(Relation rel, ForkNumber forkNumber)
Definition tableam.h:1938
BlockNumber blockNum
Definition aio_types.h:66
RelFileLocator rlocator
Definition aio_types.h:65
struct PgAioTargetData::@131 smgr
BlockNumber nblocks
Definition aio_types.h:67
ForkNumber forkNum
Definition aio_types.h:68
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition wait_event.h:67
static void pgstat_report_wait_end(void)
Definition wait_event.h:83
static volatile sig_atomic_t waiting
static TimestampTz wakeup[NUM_WALRCV_WAKEUPS]
bool RecoveryInProgress(void)
Definition xlog.c:6832
bool XLogNeedsFlush(XLogRecPtr record)
Definition xlog.c:3159
CheckpointStatsData CheckpointStats
Definition xlog.c:216
void XLogFlush(XLogRecPtr record)
Definition xlog.c:2801
#define CHECKPOINT_FLUSH_UNLOGGED
Definition xlog.h:155
#define CHECKPOINT_END_OF_RECOVERY
Definition xlog.h:152
#define CHECKPOINT_IS_SHUTDOWN
Definition xlog.h:151
#define XLogIsNeeded()
Definition xlog.h:112
#define XLogHintBitIsNeeded()
Definition xlog.h:123
#define XLogRecPtrIsValid(r)
Definition xlogdefs.h:29
uint64 XLogRecPtr
Definition xlogdefs.h:21
#define InvalidXLogRecPtr
Definition xlogdefs.h:28
XLogRecPtr XLogSaveBufferForHint(Buffer buffer, bool buffer_std)
XLogRecPtr log_newpage_buffer(Buffer buffer, bool page_std)
#define InHotStandby
Definition xlogutils.h:60