/*
 * [review] The following is doxygen page chrome left over from an HTML scrape
 * of bufmgr.c, not source content: "PostgreSQL Source Code git master" /
 * "Loading..." / "Searching..." / "No Matches" / "bufmgr.c" /
 * "Go to the documentation of this file."
 */
/*-------------------------------------------------------------------------
 *
 * bufmgr.c
 *	  buffer manager interface routines
 *
 * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/storage/buffer/bufmgr.c
 *
 *-------------------------------------------------------------------------
 */
/*
 * Principal entry points:
 *
 * ReadBuffer() -- find or create a buffer holding the requested page,
 *		and pin it so that no one can destroy it while this process
 *		is using it.
 *
 * StartReadBuffer() -- as above, with separate wait step
 * StartReadBuffers() -- multiple block version
 * WaitReadBuffers() -- second step of above
 *
 * ReleaseBuffer() -- unpin a buffer
 *
 * MarkBufferDirty() -- mark a pinned buffer's contents as "dirty".
 *		The disk write is delayed until buffer replacement or checkpoint.
 *
 * See also these files:
 *		freelist.c -- chooses victim for buffer replacement
 *		buf_table.c -- manages the buffer lookup table
 */
35#include "postgres.h"
36
37#include <sys/file.h>
38#include <unistd.h>
39
40#include "access/tableam.h"
41#include "access/xloginsert.h"
42#include "access/xlogutils.h"
43#ifdef USE_ASSERT_CHECKING
44#include "catalog/pg_tablespace_d.h"
45#endif
46#include "catalog/storage.h"
48#include "common/hashfn.h"
49#include "executor/instrument.h"
50#include "lib/binaryheap.h"
51#include "miscadmin.h"
52#include "pg_trace.h"
53#include "pgstat.h"
54#include "postmaster/bgwriter.h"
55#include "storage/aio.h"
57#include "storage/bufmgr.h"
58#include "storage/fd.h"
59#include "storage/ipc.h"
60#include "storage/lmgr.h"
61#include "storage/proc.h"
62#include "storage/proclist.h"
63#include "storage/procsignal.h"
64#include "storage/read_stream.h"
65#include "storage/smgr.h"
66#include "storage/standby.h"
67#include "utils/memdebug.h"
68#include "utils/ps_status.h"
69#include "utils/rel.h"
70#include "utils/resowner.h"
71#include "utils/timestamp.h"
72#include "utils/wait_event.h"
73
74
/* Note: these two macros only work on shared buffers, not local ones! */
#define BufHdrGetBlock(bufHdr)	((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
#define BufferGetLSN(bufHdr)	(PageGetLSN(BufHdrGetBlock(bufHdr)))

/* Note: this macro only works on local buffers, not shared ones! */
#define LocalBufHdrGetBlock(bufHdr) \
	LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]

/* Bits in SyncOneBuffer's return value */
#define BUF_WRITTEN				0x01
#define BUF_REUSABLE			0x02

/*
 * NOTE(review): threshold on the number of relations above which a sorted
 * array + bsearch() is presumably used instead of a linear scan — confirm
 * against the rlocator_comparator() callers.
 */
#define RELS_BSEARCH_THRESHOLD	20

/*
 * This is the size (in the number of blocks) above which we scan the
 * entire buffer pool to remove the buffers for all the pages of relation
 * being dropped.  For the relations with size below this threshold, we find
 * the buffers by doing lookups in BufMapping table.
 */
#define BUF_DROP_FULL_SCAN_THRESHOLD	(uint64) (NBuffers / 32)
96
97/*
98 * This is separated out from PrivateRefCountEntry to allow for copying all
99 * the data members via struct assignment.
100 */
102{
103 /*
104 * How many times has the buffer been pinned by this backend.
105 */
107
108 /*
109 * Is the buffer locked by this backend? BUFFER_LOCK_UNLOCK indicates that
110 * the buffer is not locked.
111 */
114
116{
117 /*
118 * Note that this needs to be same as the entry's corresponding
119 * PrivateRefCountArrayKeys[i], if the entry is stored in the array. We
120 * store it in both places as this is used for the hashtable key and
121 * because it is more convenient (passing around a PrivateRefCountEntry
122 * suffices to identify the buffer) and faster (checking the keys array is
123 * faster when checking many entries, checking the entry is faster if just
124 * checking a single entry).
125 */
127
128 char status;
129
132
133#define SH_PREFIX refcount
134#define SH_ELEMENT_TYPE PrivateRefCountEntry
135#define SH_KEY_TYPE Buffer
136#define SH_KEY buffer
137#define SH_HASH_KEY(tb, key) murmurhash32((uint32) (key))
138#define SH_EQUAL(tb, a, b) ((a) == (b))
139#define SH_SCOPE static inline
140#define SH_DECLARE
141#define SH_DEFINE
142#include "lib/simplehash.h"
143
144/* 64 bytes, about the size of a cache line on common systems */
145#define REFCOUNT_ARRAY_ENTRIES 8
146
147/*
148 * Status of buffers to checkpoint for a particular tablespace, used
149 * internally in BufferSync.
150 */
151typedef struct CkptTsStatus
152{
153 /* oid of the tablespace */
155
156 /*
157 * Checkpoint progress for this tablespace. To make progress comparable
158 * between tablespaces the progress is, for each tablespace, measured as a
159 * number between 0 and the total number of to-be-checkpointed pages. Each
160 * page checkpointed in this tablespace increments this space's progress
161 * by progress_slice.
162 */
165
166 /* number of to-be checkpointed pages in this tablespace */
168 /* already processed pages in this tablespace */
170
171 /* current offset in CkptBufferIds for this tablespace */
172 int index;
174
175/*
176 * Type for array used to sort SMgrRelations
177 *
178 * FlushRelationsAllBuffers shares the same comparator function with
179 * DropRelationsAllBuffers. Pointer to this struct and RelFileLocator must be
180 * compatible.
181 */
182typedef struct SMgrSortArray
183{
184 RelFileLocator rlocator; /* This must be the first member */
187
188/* GUC variables */
192bool track_io_timing = false;
193
194/*
195 * How many buffers PrefetchBuffer callers should try to stay ahead of their
196 * ReadBuffer calls by. Zero means "never prefetch". This value is only used
197 * for buffers not belonging to tablespaces that have their
198 * effective_io_concurrency parameter set.
199 */
201
202/*
203 * Like effective_io_concurrency, but used by maintenance code paths that might
204 * benefit from a higher setting because they work on behalf of many sessions.
205 * Overridden by the tablespace setting of the same name.
206 */
208
209/*
210 * Limit on how many blocks should be handled in single I/O operations.
211 * StartReadBuffers() callers should respect it, as should other operations
212 * that call smgr APIs directly. It is computed as the minimum of underlying
213 * GUCs io_combine_limit_guc and io_max_combine_limit.
214 */
218
219/*
220 * GUC variables about triggering kernel writeback for buffers written; OS
221 * dependent defaults are set via the GUC mechanism.
222 */
226
227/* local state for LockBufferForCleanup */
229
/*
 * Backend-Private refcount management:
 *
 * Each buffer also has a private refcount that keeps track of the number of
 * times the buffer is pinned in the current process.  This is so that the
 * shared refcount needs to be modified only once if a buffer is pinned more
 * than once by an individual backend.  It's also used to check that no
 * buffers are still pinned at the end of transactions and when exiting.  We
 * also use this mechanism to track whether this backend has a buffer locked,
 * and, if so, in what mode.
 *
 *
 * To avoid - as we used to - requiring an array with NBuffers entries to keep
 * track of local buffers, we use a small sequentially searched array
 * (PrivateRefCountArrayKeys, with the corresponding data stored in
 * PrivateRefCountArray) and an overflow hash table (PrivateRefCountHash) to
 * keep track of backend local pins.
 *
 * Until no more than REFCOUNT_ARRAY_ENTRIES buffers are pinned at once, all
 * refcounts are kept track of in the array; after that, new array entries
 * displace old ones into the hash table.  That way a frequently used entry
 * can't get "stuck" in the hashtable while infrequent ones clog the array.
 *
 * Note that in most scenarios the number of pinned buffers will not exceed
 * REFCOUNT_ARRAY_ENTRIES.
 *
 *
 * To enter a buffer into the refcount tracking mechanism first reserve a free
 * entry using ReservePrivateRefCountEntry() and then later, if necessary,
 * fill it with NewPrivateRefCountEntry().  That split lets us avoid doing
 * memory allocations in NewPrivateRefCountEntry() which can be important
 * because in some scenarios it's called with a spinlock held...
 */
268static int ReservedRefCountSlot = -1;
270
272
273static void ReservePrivateRefCountEntry(void);
278
279/* ResourceOwner callbacks to hold in-progress I/Os and buffer pins */
280static void ResOwnerReleaseBufferIO(Datum res);
281static char *ResOwnerPrintBufferIO(Datum res);
282static void ResOwnerReleaseBuffer(Datum res);
283static char *ResOwnerPrintBuffer(Datum res);
284
286{
287 .name = "buffer io",
288 .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
289 .release_priority = RELEASE_PRIO_BUFFER_IOS,
290 .ReleaseResource = ResOwnerReleaseBufferIO,
291 .DebugPrint = ResOwnerPrintBufferIO
292};
293
295{
296 .name = "buffer",
297 .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
298 .release_priority = RELEASE_PRIO_BUFFER_PINS,
299 .ReleaseResource = ResOwnerReleaseBuffer,
300 .DebugPrint = ResOwnerPrintBuffer
301};
302
303/*
304 * Ensure that the PrivateRefCountArray has sufficient space to store one more
305 * entry. This has to be called before using NewPrivateRefCountEntry() to fill
306 * a new entry - but it's perfectly fine to not use a reserved entry.
307 */
308static void
310{
311 /* Already reserved (or freed), nothing to do */
312 if (ReservedRefCountSlot != -1)
313 return;
314
315 /*
316 * First search for a free entry the array, that'll be sufficient in the
317 * majority of cases.
318 */
319 {
320 int i;
321
322 for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
323 {
325 {
327
328 /*
329 * We could return immediately, but iterating till the end of
330 * the array allows compiler-autovectorization.
331 */
332 }
333 }
334
335 if (ReservedRefCountSlot != -1)
336 return;
337 }
338
339 /*
340 * No luck. All array entries are full. Move one array entry into the hash
341 * table.
342 */
343 {
344 /*
345 * Move entry from the current clock position in the array into the
346 * hashtable. Use that slot.
347 */
348 int victim_slot;
351 bool found;
352
353 /* select victim slot */
357
358 /* Better be used, otherwise we shouldn't get here. */
362
363 /* enter victim array entry into hashtable */
366 &found);
367 Assert(!found);
368 /* move data from the entry in the array to the hash entry */
369 hashent->data = victim_entry->data;
370
371 /* clear the now free array slot */
373 victim_entry->buffer = InvalidBuffer;
374
375 /* clear the whole data member, just for future proofing */
376 memset(&victim_entry->data, 0, sizeof(victim_entry->data));
377 victim_entry->data.refcount = 0;
378 victim_entry->data.lockmode = BUFFER_LOCK_UNLOCK;
379
381 }
382}
383
384/*
385 * Fill a previously reserved refcount entry.
386 */
389{
391
392 /* only allowed to be called when a reservation has been made */
394
395 /* use up the reserved entry */
397
398 /* and fill it */
400 res->buffer = buffer;
401 res->data.refcount = 0;
403
404 /* update cache for the next lookup */
406
408
409 return res;
410}
411
412/*
413 * Slow-path for GetPrivateRefCountEntry(). This is big enough to not be worth
414 * inlining. This particularly seems to be true if the compiler is capable of
415 * auto-vectorizing the code, as that imposes additional stack-alignment
416 * requirements etc.
417 */
420{
422 int match = -1;
423 int i;
424
425 /*
426 * First search for references in the array, that'll be sufficient in the
427 * majority of cases.
428 */
429 for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
430 {
432 {
433 match = i;
434 /* see ReservePrivateRefCountEntry() for why we don't return */
435 }
436 }
437
438 if (likely(match != -1))
439 {
440 /* update cache for the next lookup */
442
443 return &PrivateRefCountArray[match];
444 }
445
446 /*
447 * By here we know that the buffer, if already pinned, isn't residing in
448 * the array.
449 *
450 * Only look up the buffer in the hashtable if we've previously overflowed
451 * into it.
452 */
454 return NULL;
455
457
458 if (res == NULL)
459 return NULL;
460 else if (!do_move)
461 {
462 /* caller doesn't want us to move the hash entry into the array */
463 return res;
464 }
465 else
466 {
467 /* move buffer from hashtable into the free array slot */
470
471 /* Save data and delete from hashtable while res is still valid */
472 data = res->data;
476
477 /* Ensure there's a free array slot */
479
480 /* Use up the reserved slot */
484 Assert(free->buffer == InvalidBuffer);
485
486 /* and fill it */
487 free->buffer = buffer;
488 free->data = data;
490 /* update cache for the next lookup */
492
494
495 return free;
496 }
497}
498
499/*
500 * Return the PrivateRefCount entry for the passed buffer.
501 *
502 * Returns NULL if a buffer doesn't have a refcount entry. Otherwise, if
503 * do_move is true, and the entry resides in the hashtable the entry is
504 * optimized for frequent access by moving it to the array.
505 */
506static inline PrivateRefCountEntry *
508{
511
512 /*
513 * It's very common to look up the same buffer repeatedly. To make that
514 * fast, we have a one-entry cache.
515 *
516 * In contrast to the loop in GetPrivateRefCountEntrySlow(), here it
517 * faster to check PrivateRefCountArray[].buffer, as in the case of a hit
518 * fewer addresses are computed and fewer cachelines are accessed. Whereas
519 * in GetPrivateRefCountEntrySlow()'s case, checking
520 * PrivateRefCountArrayKeys saves a lot of memory accesses.
521 */
522 if (likely(PrivateRefCountEntryLast != -1) &&
524 {
526 }
527
528 /*
529 * The code for the cached lookup is small enough to be worth inlining
530 * into the caller. In the miss case however, that empirically doesn't
531 * seem worth it.
532 */
534}
535
536/*
537 * Returns how many times the passed buffer is pinned by this backend.
538 *
539 * Only works for shared memory buffers!
540 */
541static inline int32
543{
545
548
549 /*
550 * Not moving the entry - that's ok for the current users, but we might
551 * want to change this one day.
552 */
554
555 if (ref == NULL)
556 return 0;
557 return ref->data.refcount;
558}
559
560/*
561 * Release resources used to track the reference count of a buffer which we no
562 * longer have pinned and don't want to pin again immediately.
563 */
564static void
566{
567 Assert(ref->data.refcount == 0);
568 Assert(ref->data.lockmode == BUFFER_LOCK_UNLOCK);
569
570 if (ref >= &PrivateRefCountArray[0] &&
572 {
573 ref->buffer = InvalidBuffer;
575
576
577 /*
578 * Mark the just used entry as reserved - in many scenarios that
579 * allows us to avoid ever having to search the array/hash for free
580 * entries.
581 */
583 }
584 else
585 {
589 }
590}
591
/*
 * BufferIsPinned
 *		True iff the buffer is pinned (also checks for valid buffer number).
 *
 * NOTE: what we check here is that *this* backend holds a pin on
 * the buffer.  We do not care whether some other backend does.
 *
 * An invalid buffer number yields false.  Local buffers (negative buffer
 * numbers) are looked up in LocalRefCount; shared buffers go through
 * GetPrivateRefCount().
 */
#define BufferIsPinned(bufnum) \
( \
	!BufferIsValid(bufnum) ? \
		false \
	: \
		BufferIsLocal(bufnum) ? \
			(LocalRefCount[-(bufnum) - 1] > 0) \
		: \
	(GetPrivateRefCount(bufnum) > 0) \
)
609
610
613 ForkNumber forkNum, BlockNumber blockNum,
617 BufferAccessStrategy strategy,
618 uint32 flags,
621 Buffer *buffers,
625 BufferAccessStrategy strategy,
626 uint32 flags,
629 Buffer *buffers,
631static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy,
632 bool skip_if_not_valid);
633static void PinBuffer_Locked(BufferDesc *buf);
634static void UnpinBuffer(BufferDesc *buf);
635static void UnpinBufferNoOwner(BufferDesc *buf);
636static void BufferSync(int flags);
637static int SyncOneBuffer(int buf_id, bool skip_recently_used,
639static void WaitIO(BufferDesc *buf);
640static void AbortBufferIO(Buffer buffer);
641static void shared_buffer_write_error_callback(void *arg);
642static void local_buffer_write_error_callback(void *arg);
643static inline BufferDesc *BufferAlloc(SMgrRelation smgr,
644 char relpersistence,
645 ForkNumber forkNum,
646 BlockNumber blockNum,
647 BufferAccessStrategy strategy,
650static void CheckReadBuffersOperation(ReadBuffersOperation *operation, bool is_complete);
651
654 Relation rel, char persistence, SMgrRelation smgr,
655 ForkNumber forknum, BlockNumber blocknum);
661static void FindAndDropRelationBuffers(RelFileLocator rlocator,
662 ForkNumber forkNum,
667 ForkNumber forkNum, bool permanent);
668static void AtProcExit_Buffers(int code, Datum arg);
669static void CheckForBufferLeaks(void);
670#ifdef USE_ASSERT_CHECKING
672#endif
673static int rlocator_comparator(const void *p1, const void *p2);
674static inline int buffertag_comparator(const BufferTag *ba, const BufferTag *bb);
675static inline int ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b);
676static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg);
677
683static inline void BufferLockDisown(Buffer buffer, BufferDesc *buf_hdr);
688static void BufferLockWakeup(BufferDesc *buf_hdr, bool unlocked);
691
692
693/*
694 * Implementation of PrefetchBuffer() for shared buffers.
695 */
698 ForkNumber forkNum,
699 BlockNumber blockNum)
700{
702 BufferTag newTag; /* identity of requested block */
703 uint32 newHash; /* hash value for newTag */
704 LWLock *newPartitionLock; /* buffer partition lock for it */
705 int buf_id;
706
707 Assert(BlockNumberIsValid(blockNum));
708
709 /* create a tag so we can lookup the buffer */
710 InitBufferTag(&newTag, &smgr_reln->smgr_rlocator.locator,
711 forkNum, blockNum);
712
713 /* determine its hash code and partition lock ID */
716
717 /* see if the block is in the buffer pool already */
719 buf_id = BufTableLookup(&newTag, newHash);
721
722 /* If not in buffers, initiate prefetch */
723 if (buf_id < 0)
724 {
725#ifdef USE_PREFETCH
726 /*
727 * Try to initiate an asynchronous read. This returns false in
728 * recovery if the relation file doesn't exist.
729 */
730 if ((io_direct_flags & IO_DIRECT_DATA) == 0 &&
731 smgrprefetch(smgr_reln, forkNum, blockNum, 1))
732 {
733 result.initiated_io = true;
734 }
735#endif /* USE_PREFETCH */
736 }
737 else
738 {
739 /*
740 * Report the buffer it was in at that time. The caller may be able
741 * to avoid a buffer table lookup, but it's not pinned and it must be
742 * rechecked!
743 */
744 result.recent_buffer = buf_id + 1;
745 }
746
747 /*
748 * If the block *is* in buffers, we do nothing. This is not really ideal:
749 * the block might be just about to be evicted, which would be stupid
750 * since we know we are going to need it soon. But the only easy answer
751 * is to bump the usage_count, which does not seem like a great solution:
752 * when the caller does ultimately touch the block, usage_count would get
753 * bumped again, resulting in too much favoritism for blocks that are
754 * involved in a prefetch sequence. A real fix would involve some
755 * additional per-buffer state, and it's not clear that there's enough of
756 * a problem to justify that.
757 */
758
759 return result;
760}
761
/*
 * PrefetchBuffer -- initiate asynchronous read of a block of a relation
 *
 * This is named by analogy to ReadBuffer but doesn't actually allocate a
 * buffer.  Instead it tries to ensure that a future ReadBuffer for the given
 * block will not be delayed by the I/O.  Prefetching is optional.
 *
 * There are three possible outcomes:
 *
 * 1.  If the block is already cached, the result includes a valid buffer that
 * could be used by the caller to avoid the need for a later buffer lookup, but
 * it's not pinned, so the caller must recheck it.
 *
 * 2.  If the kernel has been asked to initiate I/O, the initiated_io member is
 * true.  Currently there is no way to know if the data was already cached by
 * the kernel and therefore didn't really initiate I/O, and no way to know when
 * the I/O completes other than using synchronous ReadBuffer().
 *
 * 3.  Otherwise, the buffer wasn't already cached by PostgreSQL, and
 * USE_PREFETCH is not defined (this build doesn't support prefetching due to
 * lack of a kernel facility), direct I/O is enabled, or the underlying
 * relation file wasn't found and we are in recovery.  (If the relation file
 * wasn't found and we are not in recovery, an error is raised).
 */
788{
790 Assert(BlockNumberIsValid(blockNum));
791
793 {
794 /* see comments in ReadBufferExtended */
798 errmsg("cannot access temporary tables of other sessions")));
799
800 /* pass it off to localbuf.c */
801 return PrefetchLocalBuffer(RelationGetSmgr(reln), forkNum, blockNum);
802 }
803 else
804 {
805 /* pass it to the shared buffer version */
806 return PrefetchSharedBuffer(RelationGetSmgr(reln), forkNum, blockNum);
807 }
808}
809
810/*
811 * ReadRecentBuffer -- try to pin a block in a recently observed buffer
812 *
813 * Compared to ReadBuffer(), this avoids a buffer mapping lookup when it's
814 * successful. Return true if the buffer is valid and still has the expected
815 * tag. In that case, the buffer is pinned and the usage count is bumped.
816 */
817bool
819 Buffer recent_buffer)
820{
822 BufferTag tag;
824
825 Assert(BufferIsValid(recent_buffer));
826
829 InitBufferTag(&tag, &rlocator, forkNum, blockNum);
830
831 if (BufferIsLocal(recent_buffer))
832 {
833 int b = -recent_buffer - 1;
834
837
838 /* Is it still valid and holding the right tag? */
839 if ((buf_state & BM_VALID) && BufferTagsEqual(&tag, &bufHdr->tag))
840 {
841 PinLocalBuffer(bufHdr, true);
842
844
845 return true;
846 }
847 }
848 else
849 {
850 bufHdr = GetBufferDescriptor(recent_buffer - 1);
851
852 /*
853 * Is it still valid and holding the right tag? We do an unlocked tag
854 * comparison first, to make it unlikely that we'll increment the
855 * usage counter of the wrong buffer, if someone calls us with a very
856 * out of date recent_buffer. Then we'll check it again if we get the
857 * pin.
858 */
859 if (BufferTagsEqual(&tag, &bufHdr->tag) &&
860 PinBuffer(bufHdr, NULL, true))
861 {
862 if (BufferTagsEqual(&tag, &bufHdr->tag))
863 {
865 return true;
866 }
868 }
869 }
870
871 return false;
872}
873
874/*
875 * ReadBuffer -- a shorthand for ReadBufferExtended, for reading from main
876 * fork with RBM_NORMAL mode and default strategy.
877 */
878Buffer
883
/*
 * ReadBufferExtended -- returns a buffer containing the requested
 *		block of the requested relation.  If the blknum
 *		requested is P_NEW, extend the relation file and
 *		allocate a new block.  (Caller is responsible for
 *		ensuring that only one backend tries to extend a
 *		relation at the same time!)
 *
 * Returns: the buffer number for the buffer containing
 *		the block read.  The returned buffer has been pinned.
 *		Does not return on error --- elog's instead.
 *
 * Assume when this function is called, that reln has been opened already.
 *
 * In RBM_NORMAL mode, the page is read from disk, and the page header is
 * validated.  An error is thrown if the page header is not valid.  (But
 * note that an all-zero page is considered "valid"; see
 * PageIsVerified().)
 *
 * RBM_ZERO_ON_ERROR is like the normal mode, but if the page header is not
 * valid, the page is zeroed instead of throwing an error.  This is intended
 * for non-critical data, where the caller is prepared to repair errors.
 *
 * In RBM_ZERO_AND_LOCK mode, if the page isn't in buffer cache already, it's
 * filled with zeros instead of reading it from disk.  Useful when the caller
 * is going to fill the page from scratch, since this saves I/O and avoids
 * unnecessary failure if the page-on-disk has corrupt page headers.
 * The page is returned locked to ensure that the caller has a chance to
 * initialize the page before it's made visible to others.
 * Caution: do not use this mode to read a page that is beyond the relation's
 * current physical EOF; that is likely to cause problems in md.c when
 * the page is modified and written out.  P_NEW is OK, though.
 *
 * RBM_ZERO_AND_CLEANUP_LOCK is the same as RBM_ZERO_AND_LOCK, but acquires
 * a cleanup-strength lock on the page.
 *
 * RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here.
 *
 * If strategy is not NULL, a nondefault buffer access strategy is used.
 * See buffer/README for details.
 */
925inline Buffer
928{
929 Buffer buf;
930
931 /*
932 * Reject attempts to read non-local temporary relations; we would be
933 * likely to get wrong data since we have no visibility into the owning
934 * session's local buffers.
935 */
939 errmsg("cannot access temporary tables of other sessions")));
940
941 /*
942 * Read the buffer, and update pgstat counters to reflect a cache hit or
943 * miss.
944 */
946 forkNum, blockNum, mode, strategy);
947
948 return buf;
949}
950
951
952/*
953 * ReadBufferWithoutRelcache -- like ReadBufferExtended, but doesn't require
954 * a relcache entry for the relation.
955 *
956 * Pass permanent = true for a RELPERSISTENCE_PERMANENT relation, and
957 * permanent = false for a RELPERSISTENCE_UNLOGGED relation. This function
958 * cannot be used for temporary relations (and making that work might be
959 * difficult, unless we only want to read temporary relations for our own
960 * ProcNumber).
961 */
962Buffer
965 BufferAccessStrategy strategy, bool permanent)
966{
967 SMgrRelation smgr = smgropen(rlocator, INVALID_PROC_NUMBER);
968
969 return ReadBuffer_common(NULL, smgr,
971 forkNum, blockNum,
972 mode, strategy);
973}
974
975/*
976 * Convenience wrapper around ExtendBufferedRelBy() extending by one block.
977 */
978Buffer
980 ForkNumber forkNum,
981 BufferAccessStrategy strategy,
982 uint32 flags)
983{
984 Buffer buf;
985 uint32 extend_by = 1;
986
987 ExtendBufferedRelBy(bmr, forkNum, strategy, flags, extend_by,
988 &buf, &extend_by);
989
990 return buf;
991}
992
993/*
994 * Extend relation by multiple blocks.
995 *
996 * Tries to extend the relation by extend_by blocks. Depending on the
997 * availability of resources the relation may end up being extended by a
998 * smaller number of pages (unless an error is thrown, always by at least one
999 * page). *extended_by is updated to the number of pages the relation has been
1000 * extended to.
1001 *
1002 * buffers needs to be an array that is at least extend_by long. Upon
1003 * completion, the first extend_by array elements will point to a pinned
1004 * buffer.
1005 *
1006 * If EB_LOCK_FIRST is part of flags, the first returned buffer is
1007 * locked. This is useful for callers that want a buffer that is guaranteed to
1008 * be empty.
1009 */
1013 BufferAccessStrategy strategy,
1014 uint32 flags,
1016 Buffer *buffers,
1018{
1019 Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
1020 Assert(bmr.smgr == NULL || bmr.relpersistence != '\0');
1021 Assert(extend_by > 0);
1022
1023 if (bmr.relpersistence == '\0')
1024 bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
1025
1026 return ExtendBufferedRelCommon(bmr, fork, strategy, flags,
1028 buffers, extended_by);
1029}
1030
1031/*
1032 * Extend the relation so it is at least extend_to blocks large, return buffer
1033 * (extend_to - 1).
1034 *
1035 * This is useful for callers that want to write a specific page, regardless
1036 * of the current size of the relation (e.g. useful for visibilitymap and for
1037 * crash recovery).
1038 */
1039Buffer
1042 BufferAccessStrategy strategy,
1043 uint32 flags,
1046{
1048 uint32 extended_by = 0;
1050 Buffer buffers[64];
1051
1052 Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
1053 Assert(bmr.smgr == NULL || bmr.relpersistence != '\0');
1055
1056 if (bmr.relpersistence == '\0')
1057 bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
1058
1059 /*
1060 * If desired, create the file if it doesn't exist. If
1061 * smgr_cached_nblocks[fork] is positive then it must exist, no need for
1062 * an smgrexists call.
1063 */
1064 if ((flags & EB_CREATE_FORK_IF_NEEDED) &&
1065 (BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] == 0 ||
1066 BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] == InvalidBlockNumber) &&
1068 {
1070
1071 /* recheck, fork might have been created concurrently */
1074
1076 }
1077
1078 /*
1079 * If requested, invalidate size cache, so that smgrnblocks asks the
1080 * kernel.
1081 */
1082 if (flags & EB_CLEAR_SIZE_CACHE)
1083 BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] = InvalidBlockNumber;
1084
1085 /*
1086 * Estimate how many pages we'll need to extend by. This avoids acquiring
1087 * unnecessarily many victim buffers.
1088 */
1090
1091 /*
1092 * Since no-one else can be looking at the page contents yet, there is no
1093 * difference between an exclusive lock and a cleanup-strength lock. Note
1094 * that we pass the original mode to ReadBuffer_common() below, when
1095 * falling back to reading the buffer to a concurrent relation extension.
1096 */
1098 flags |= EB_LOCK_TARGET;
1099
1100 while (current_size < extend_to)
1101 {
1102 uint32 num_pages = lengthof(buffers);
1104
1105 if ((uint64) current_size + num_pages > extend_to)
1106 num_pages = extend_to - current_size;
1107
1108 first_block = ExtendBufferedRelCommon(bmr, fork, strategy, flags,
1109 num_pages, extend_to,
1110 buffers, &extended_by);
1111
1113 Assert(num_pages != 0 || current_size >= extend_to);
1114
1115 for (uint32 i = 0; i < extended_by; i++)
1116 {
1117 if (first_block + i != extend_to - 1)
1118 ReleaseBuffer(buffers[i]);
1119 else
1120 buffer = buffers[i];
1121 }
1122 }
1123
1124 /*
1125 * It's possible that another backend concurrently extended the relation.
1126 * In that case read the buffer.
1127 *
1128 * XXX: Should we control this via a flag?
1129 */
1130 if (buffer == InvalidBuffer)
1131 {
1132 Assert(extended_by == 0);
1133 buffer = ReadBuffer_common(bmr.rel, BMR_GET_SMGR(bmr), bmr.relpersistence,
1134 fork, extend_to - 1, mode, strategy);
1135 }
1136
1137 return buffer;
1138}
1139
1140/*
1141 * Lock and optionally zero a buffer, as part of the implementation of
1142 * RBM_ZERO_AND_LOCK or RBM_ZERO_AND_CLEANUP_LOCK. The buffer must be already
1143 * pinned. If the buffer is not already valid, it is zeroed and made valid.
1144 */
1145static void
1147{
1149 bool need_to_zero;
1152
1154
1155 if (already_valid)
1156 {
1157 /*
1158 * If the caller already knew the buffer was valid, we can skip some
1159 * header interaction. The caller just wants to lock the buffer.
1160 */
1161 need_to_zero = false;
1162 }
1163 else
1164 {
1165 if (isLocalBuf)
1166 {
1167 /* Simple case for non-shared buffers. */
1169 sbres = StartLocalBufferIO(bufHdr, true, true, NULL);
1170 }
1171 else
1172 {
1173 /*
1174 * Take BM_IO_IN_PROGRESS, or discover that BM_VALID has been set
1175 * concurrently. Even though we aren't doing I/O, that ensures
1176 * that we don't zero a page that someone else has pinned. An
1177 * exclusive content lock wouldn't be enough, because readers are
1178 * allowed to drop the content lock after determining that a tuple
1179 * is visible (see buffer access rules in README).
1180 */
1182 sbres = StartSharedBufferIO(bufHdr, true, true, NULL);
1183 }
1184
1187 }
1188
1189 if (need_to_zero)
1190 {
1192
1193 /*
1194 * Grab the buffer content lock before marking the page as valid, to
1195 * make sure that no other backend sees the zeroed page before the
1196 * caller has had a chance to initialize it.
1197 *
1198 * Since no-one else can be looking at the page contents yet, there is
1199 * no difference between an exclusive lock and a cleanup-strength
1200 * lock. (Note that we cannot use LockBuffer() or
1201 * LockBufferForCleanup() here, because they assert that the buffer is
1202 * already valid.)
1203 */
1204 if (!isLocalBuf)
1206
1207 /* Set BM_VALID, terminate IO, and wake up any waiters */
1208 if (isLocalBuf)
1209 TerminateLocalBufferIO(bufHdr, false, BM_VALID, false);
1210 else
1211 TerminateBufferIO(bufHdr, false, BM_VALID, true, false);
1212 }
1213 else if (!isLocalBuf)
1214 {
1215 /*
1216 * The buffer is valid, so we can't zero it. The caller still expects
1217 * the page to be locked on return.
1218 */
1219 if (mode == RBM_ZERO_AND_LOCK)
1221 else
1223 }
1224}
1225
1226/*
1227 * Pin a buffer for a given block. *foundPtr is set to true if the block was
1228 * already present, or false if more work is required to either read it in or
1229 * zero it.
1230 */
1233 SMgrRelation smgr,
1234 char persistence,
1235 ForkNumber forkNum,
1236 BlockNumber blockNum,
1237 BufferAccessStrategy strategy,
1240 bool *foundPtr)
1241{
1243
1244 Assert(blockNum != P_NEW);
1245
1246 /* Persistence should be set before */
1247 Assert((persistence == RELPERSISTENCE_TEMP ||
1248 persistence == RELPERSISTENCE_PERMANENT ||
1249 persistence == RELPERSISTENCE_UNLOGGED));
1250
1251 TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
1255 smgr->smgr_rlocator.backend);
1256
1257 if (persistence == RELPERSISTENCE_TEMP)
1258 bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, foundPtr);
1259 else
1260 bufHdr = BufferAlloc(smgr, persistence, forkNum, blockNum,
1261 strategy, foundPtr, io_context);
1262
1263 if (*foundPtr)
1264 TrackBufferHit(io_object, io_context, rel, persistence, smgr, forkNum, blockNum);
1265
1266 if (rel)
1267 {
1268 /*
1269 * While pgBufferUsage's "read" counter isn't bumped unless we reach
1270 * WaitReadBuffers() (so, not for hits, and not for buffers that are
1271 * zeroed instead), the per-relation stats always count them.
1272 */
1274 }
1275
1277}
1278
1279/*
1280 * ReadBuffer_common -- common logic for all ReadBuffer variants
1281 *
1282 * smgr is required, rel is optional unless using P_NEW.
1283 */
1286 ForkNumber forkNum,
1288 BufferAccessStrategy strategy)
1289{
1291 Buffer buffer;
1292 int flags;
1293 char persistence;
1294
1295 /*
1296 * Backward compatibility path, most code should use ExtendBufferedRel()
1297 * instead, as acquiring the extension lock inside ExtendBufferedRel()
1298 * scales a lot better.
1299 */
1300 if (unlikely(blockNum == P_NEW))
1301 {
1303
1304 /*
1305 * Since no-one else can be looking at the page contents yet, there is
1306 * no difference between an exclusive lock and a cleanup-strength
1307 * lock.
1308 */
1310 flags |= EB_LOCK_FIRST;
1311
1312 return ExtendBufferedRel(BMR_REL(rel), forkNum, strategy, flags);
1313 }
1314
1315 if (rel)
1316 persistence = rel->rd_rel->relpersistence;
1317 else
1318 persistence = smgr_persistence;
1319
1322 {
1323 bool found;
1326
1327 if (persistence == RELPERSISTENCE_TEMP)
1328 {
1331 }
1332 else
1333 {
1334 io_context = IOContextForStrategy(strategy);
1336 }
1337
1338 buffer = PinBufferForBlock(rel, smgr, persistence,
1339 forkNum, blockNum, strategy,
1340 io_object, io_context, &found);
1341 ZeroAndLockBuffer(buffer, mode, found);
1342 return buffer;
1343 }
1344
1345 /*
1346 * Signal that we are going to immediately wait. If we're immediately
1347 * waiting, there is no benefit in actually executing the IO
1348 * asynchronously, it would just add dispatch overhead.
1349 */
1351 if (mode == RBM_ZERO_ON_ERROR)
1353 operation.smgr = smgr;
1354 operation.rel = rel;
1355 operation.persistence = persistence;
1356 operation.forknum = forkNum;
1357 operation.strategy = strategy;
1359 &buffer,
1360 blockNum,
1361 flags))
1363
1364 return buffer;
1365}
1366
1369 Buffer *buffers,
1370 BlockNumber blockNum,
1371 int *nblocks,
1372 int flags,
1373 bool allow_forwarding)
1374{
1375 int actual_nblocks = *nblocks;
1376 int maxcombine = 0;
1377 bool did_start_io;
1380
1381 Assert(*nblocks == 1 || allow_forwarding);
1382 Assert(*nblocks > 0);
1383 Assert(*nblocks <= MAX_IO_COMBINE_LIMIT);
1384
1385 if (operation->persistence == RELPERSISTENCE_TEMP)
1386 {
1389 }
1390 else
1391 {
1394 }
1395
1396 for (int i = 0; i < actual_nblocks; ++i)
1397 {
1398 bool found;
1399
1400 if (allow_forwarding && buffers[i] != InvalidBuffer)
1401 {
1403
1404 /*
1405 * This is a buffer that was pinned by an earlier call to
1406 * StartReadBuffers(), but couldn't be handled in one operation at
1407 * that time. The operation was split, and the caller has passed
1408 * an already pinned buffer back to us to handle the rest of the
1409 * operation. It must continue at the expected block number.
1410 */
1411 Assert(BufferGetBlockNumber(buffers[i]) == blockNum + i);
1412
1413 /*
1414 * It might be an already valid buffer (a hit) that followed the
1415 * final contiguous block of an earlier I/O (a miss) marking the
1416 * end of it, or a buffer that some other backend has since made
1417 * valid by performing the I/O for us, in which case we can handle
1418 * it as a hit now. It is safe to check for a BM_VALID flag with
1419 * a relaxed load, because we got a fresh view of it while pinning
1420 * it in the previous call.
1421 *
1422 * On the other hand if we don't see BM_VALID yet, it must be an
1423 * I/O that was split by the previous call and we need to try to
1424 * start a new I/O from this block. We're also racing against any
1425 * other backend that might start the I/O or even manage to mark
1426 * it BM_VALID after this check, but StartBufferIO() will handle
1427 * those cases.
1428 */
1429 if (BufferIsLocal(buffers[i]))
1430 bufHdr = GetLocalBufferDescriptor(-buffers[i] - 1);
1431 else
1432 bufHdr = GetBufferDescriptor(buffers[i] - 1);
1434 found = pg_atomic_read_u64(&bufHdr->state) & BM_VALID;
1435 }
1436 else
1437 {
1438 buffers[i] = PinBufferForBlock(operation->rel,
1439 operation->smgr,
1440 operation->persistence,
1441 operation->forknum,
1442 blockNum + i,
1443 operation->strategy,
1445 &found);
1446 }
1447
1448 if (found)
1449 {
1450 /*
1451 * We have a hit. If it's the first block in the requested range,
1452 * we can return it immediately and report that WaitReadBuffers()
1453 * does not need to be called. If the initial value of *nblocks
1454 * was larger, the caller will have to call again for the rest.
1455 */
1456 if (i == 0)
1457 {
1458 *nblocks = 1;
1459
1460#ifdef USE_ASSERT_CHECKING
1461
1462 /*
1463 * Initialize enough of ReadBuffersOperation to make
1464 * CheckReadBuffersOperation() work. Outside of assertions
1465 * that's not necessary when no IO is issued.
1466 */
1467 operation->buffers = buffers;
1468 operation->blocknum = blockNum;
1469 operation->nblocks = 1;
1470 operation->nblocks_done = 1;
1472#endif
1473 return false;
1474 }
1475
1476 /*
1477 * Otherwise we already have an I/O to perform, but this block
1478 * can't be included as it is already valid. Split the I/O here.
1479 * There may or may not be more blocks requiring I/O after this
1480 * one, we haven't checked, but they can't be contiguous with this
1481 * one in the way. We'll leave this buffer pinned, forwarding it
1482 * to the next call, avoiding the need to unpin it here and re-pin
1483 * it in the next call.
1484 */
1485 actual_nblocks = i;
1486 break;
1487 }
1488 else
1489 {
1490 /*
1491 * Check how many blocks we can cover with the same IO. The smgr
1492 * implementation might e.g. be limited due to a segment boundary.
1493 */
1494 if (i == 0 && actual_nblocks > 1)
1495 {
1497 operation->forknum,
1498 blockNum);
1500 {
1501 elog(DEBUG2, "limiting nblocks at %u from %u to %u",
1502 blockNum, actual_nblocks, maxcombine);
1504 }
1505 }
1506 }
1507 }
1508 *nblocks = actual_nblocks;
1509
1510 /* Populate information needed for I/O. */
1511 operation->buffers = buffers;
1512 operation->blocknum = blockNum;
1513 operation->flags = flags;
1514 operation->nblocks = actual_nblocks;
1515 operation->nblocks_done = 0;
1516 pgaio_wref_clear(&operation->io_wref);
1517
1518 /*
1519 * When using AIO, start the IO in the background. If not, issue prefetch
1520 * requests if desired by the caller.
1521 *
1522 * The reason we have a dedicated path for IOMETHOD_SYNC here is to
1523 * de-risk the introduction of AIO somewhat. It's a large architectural
1524 * change, with lots of chances for unanticipated performance effects.
1525 *
1526 * Use of IOMETHOD_SYNC already leads to not actually performing IO
1527 * asynchronously, but without the check here we'd execute IO earlier than
1528 * we used to. Eventually this IOMETHOD_SYNC specific path should go away.
1529 */
1530 if (io_method != IOMETHOD_SYNC)
1531 {
1532 /*
1533 * Try to start IO asynchronously. It's possible that no IO needs to
1534 * be started, if another backend already performed the IO.
1535 *
1536 * Note that if an IO is started, it might not cover the entire
1537 * requested range, e.g. because an intermediary block has been read
1538 * in by another backend. In that case any "trailing" buffers we
1539 * already pinned above will be "forwarded" by read_stream.c to the
1540 * next call to StartReadBuffers().
1541 *
1542 * This is signalled to the caller by decrementing *nblocks *and*
1543 * reducing operation->nblocks. The latter is done here, but not below
1544 * WaitReadBuffers(), as in WaitReadBuffers() we can't "shorten" the
1545 * overall read size anymore, we need to retry until done in its
1546 * entirety or until failed.
1547 */
1549
1550 operation->nblocks = *nblocks;
1551 }
1552 else
1553 {
1555
1556 if (flags & READ_BUFFERS_ISSUE_ADVICE)
1557 {
1558 /*
1559 * In theory we should only do this if PinBufferForBlock() had to
1560 * allocate new buffers above. That way, if two calls to
1561 * StartReadBuffers() were made for the same blocks before
1562 * WaitReadBuffers(), only the first would issue the advice.
1563 * That'd be a better simulation of true asynchronous I/O, which
1564 * would only start the I/O once, but isn't done here for
1565 * simplicity.
1566 */
1567 smgrprefetch(operation->smgr,
1568 operation->forknum,
1569 blockNum,
1571 }
1572
1573 /*
1574 * Indicate that WaitReadBuffers() should be called. WaitReadBuffers()
1575 * will initiate the necessary IO.
1576 */
1577 did_start_io = true;
1578 }
1579
1581
1582 return did_start_io;
1583}
1584
1585/*
1586 * Begin reading a range of blocks beginning at blockNum and extending for
1587 * *nblocks. *nblocks and the buffers array are in/out parameters. On entry,
1588 * the buffers elements covered by *nblocks must hold either InvalidBuffer or
1589 * buffers forwarded by an earlier call to StartReadBuffers() that was split
1590 * and is now being continued. On return, *nblocks holds the number of blocks
1591 * accepted by this operation. If it is less than the original number then
1592 * this operation has been split, but buffer elements up to the original
1593 * requested size may hold forwarded buffers to be used for a continuing
1594 * operation. The caller must either start a new I/O beginning at the block
1595 * immediately following the blocks accepted by this call and pass those
1596 * buffers back in, or release them if it chooses not to. It shouldn't make
1597 * any other use of or assumptions about forwarded buffers.
1598 *
1599 * If false is returned, no I/O is necessary and the buffers covered by
1600 * *nblocks on exit are valid and ready to be accessed. If true is returned,
1601 * an I/O has been started, and WaitReadBuffers() must be called with the same
1602 * operation object before the buffers covered by *nblocks on exit can be
1603 * accessed. Along with the operation object, the caller-supplied array of
1604 * buffers must remain valid until WaitReadBuffers() is called, and any
1605 * forwarded buffers must also be preserved for a continuing call unless
1606 * they are explicitly released.
1607 */
1608bool
1610 Buffer *buffers,
1611 BlockNumber blockNum,
1612 int *nblocks,
1613 int flags)
1614{
1615 return StartReadBuffersImpl(operation, buffers, blockNum, nblocks, flags,
1616 true /* expect forwarded buffers */ );
1617}
1618
1619/*
1620 * Single block version of the StartReadBuffers(). This might save a few
1621 * instructions when called from another translation unit, because it is
1622 * specialized for nblocks == 1.
1623 *
1624 * This version does not support "forwarded" buffers: they cannot be created
1625 * by reading only one block and *buffer is ignored on entry.
1626 */
1627bool
1629 Buffer *buffer,
1630 BlockNumber blocknum,
1631 int flags)
1632{
1633 int nblocks = 1;
1634 bool result;
1635
1636 result = StartReadBuffersImpl(operation, buffer, blocknum, &nblocks, flags,
1637 false /* single block, no forwarding */ );
1638 Assert(nblocks == 1); /* single block can't be short */
1639
1640 return result;
1641}
1642
1643/*
1644 * Perform sanity checks on the ReadBuffersOperation.
1645 */
1646static void
1648{
1649#ifdef USE_ASSERT_CHECKING
1650 Assert(operation->nblocks_done <= operation->nblocks);
1651 Assert(!is_complete || operation->nblocks == operation->nblocks_done);
1652
1653 for (int i = 0; i < operation->nblocks; i++)
1654 {
1655 Buffer buffer = operation->buffers[i];
1659
1660 Assert(BufferGetBlockNumber(buffer) == operation->blocknum + i);
1662
1663 if (i < operation->nblocks_done)
1665 }
1666#endif
1667}
1668
1669/*
1670 * We track various stats related to buffer hits. Because this is done in a
1671 * few separate places, this helper exists for convenience.
1672 */
1675 Relation rel, char persistence, SMgrRelation smgr,
1676 ForkNumber forknum, BlockNumber blocknum)
1677{
1679 blocknum,
1683 smgr->smgr_rlocator.backend,
1684 true);
1685
1686 if (persistence == RELPERSISTENCE_TEMP)
1688 else
1690
1692
1693 if (VacuumCostActive)
1695
1696 if (rel)
1698}
1699
1700/*
1701 * Helper for WaitReadBuffers() that processes the results of a readv
1702 * operation, raising an error if necessary.
1703 */
1704static void
1706{
1707 PgAioReturn *aio_ret = &operation->io_return;
1709 int newly_read_blocks = 0;
1710
1711 Assert(pgaio_wref_valid(&operation->io_wref));
1712 Assert(aio_ret->result.status != PGAIO_RS_UNKNOWN);
1713
1714 /*
1715 * SMGR reports the number of blocks successfully read as the result of
1716 * the IO operation. Thus we can simply add that to ->nblocks_done.
1717 */
1718
1719 if (likely(rs != PGAIO_RS_ERROR))
1720 newly_read_blocks = aio_ret->result.result;
1721
1722 if (rs == PGAIO_RS_ERROR || rs == PGAIO_RS_WARNING)
1723 pgaio_result_report(aio_ret->result, &aio_ret->target_data,
1724 rs == PGAIO_RS_ERROR ? ERROR : WARNING);
1725 else if (aio_ret->result.status == PGAIO_RS_PARTIAL)
1726 {
1727 /*
1728 * We'll retry, so we just emit a debug message to the server log (or
1729 * not even that in prod scenarios).
1730 */
1731 pgaio_result_report(aio_ret->result, &aio_ret->target_data, DEBUG1);
1732 elog(DEBUG3, "partial read, will retry");
1733 }
1734
1737
1738 operation->nblocks_done += newly_read_blocks;
1739
1740 Assert(operation->nblocks_done <= operation->nblocks);
1741}
1742
1743/*
1744 * Wait for the IO operation initiated by StartReadBuffers() et al to
1745 * complete.
1746 *
1747 * Returns true if we needed to wait for the IO operation, false otherwise.
1748 */
1749bool
1751{
1752 PgAioReturn *aio_ret = &operation->io_return;
1755 bool needed_wait = false;
1756
1757 if (operation->persistence == RELPERSISTENCE_TEMP)
1758 {
1761 }
1762 else
1763 {
1766 }
1767
1768 /*
1769 * If we get here without an IO operation having been issued, the
1770 * io_method == IOMETHOD_SYNC path must have been used. Otherwise the
1771 * caller should not have called WaitReadBuffers().
1772 *
1773 * In the case of IOMETHOD_SYNC, we start - as we used to before the
1774 * introducing of AIO - the IO in WaitReadBuffers(). This is done as part
1775 * of the retry logic below, no extra code is required.
1776 *
1777 * This path is expected to eventually go away.
1778 */
1779 if (!pgaio_wref_valid(&operation->io_wref) && io_method != IOMETHOD_SYNC)
1780 elog(ERROR, "waiting for read operation that didn't read");
1781
1782 /*
1783 * To handle partial reads, and IOMETHOD_SYNC, we re-issue IO until we're
1784 * done. We may need multiple retries, not just because we could get
1785 * multiple partial reads, but also because some of the remaining
1786 * to-be-read buffers may have been read in by other backends, limiting
1787 * the IO size.
1788 */
1789 while (true)
1790 {
1792
1794
1795 /*
1796 * If there is an IO associated with the operation, we may need to
1797 * wait for it.
1798 */
1799 if (pgaio_wref_valid(&operation->io_wref))
1800 {
1801 /*
1802 * Track the time spent waiting for the IO to complete. As
1803 * tracking a wait even if we don't actually need to wait
1804 *
1805 * a) is not cheap, due to the timestamping overhead
1806 *
1807 * b) reports some time as waiting, even if we never waited
1808 *
1809 * we first check if we already know the IO is complete.
1810 *
1811 * Note that operation->io_return is uninitialized for foreign IO,
1812 * so we cannot use the cheaper PGAIO_RS_UNKNOWN pre-check.
1813 */
1814 if ((operation->foreign_io || aio_ret->result.status == PGAIO_RS_UNKNOWN) &&
1815 !pgaio_wref_check_done(&operation->io_wref))
1816 {
1818
1819 pgaio_wref_wait(&operation->io_wref);
1820 needed_wait = true;
1821
1822 /*
1823 * The IO operation itself was already counted earlier, in
1824 * AsyncReadBuffers(), this just accounts for the wait time.
1825 */
1827 io_start, 0, 0);
1828 }
1829 else
1830 {
1832 }
1833
1834 if (unlikely(operation->foreign_io))
1835 {
1836 Buffer buffer = operation->buffers[operation->nblocks_done];
1841
1842 if (buf_state & BM_VALID)
1843 {
1844 BlockNumber blocknum = operation->blocknum + operation->nblocks_done;
1845
1846 operation->nblocks_done += 1;
1847 Assert(operation->nblocks_done <= operation->nblocks);
1848
1849 /*
1850 * Track this as a 'hit' for this backend. The backend
1851 * performing the IO will track it as a 'read'.
1852 */
1854 operation->rel, operation->persistence,
1855 operation->smgr, operation->forknum,
1856 blocknum);
1857 }
1858
1859 /*
1860 * If the foreign IO failed and left the buffer invalid,
1861 * nblocks_done is not incremented. The retry loop below will
1862 * call AsyncReadBuffers() which will attempt the IO itself.
1863 */
1864 }
1865 else
1866 {
1867 /*
1868 * We now are sure the IO completed. Check the results. This
1869 * includes reporting on errors if there were any.
1870 */
1872 }
1873 }
1874
1875 /*
1876 * Most of the time, the one IO we already started, will read in
1877 * everything. But we need to deal with partial reads and buffers not
1878 * needing IO anymore.
1879 */
1880 if (operation->nblocks_done == operation->nblocks)
1881 break;
1882
1884
1885 /*
1886 * If the IO completed only partially, we need to perform additional
1887 * work, consider that a form of having had to wait.
1888 */
1889 needed_wait = true;
1890
1891 /*
1892 * This may only complete the IO partially, either because some
1893 * buffers were already valid, or because of a partial read.
1894 *
1895 * NB: In contrast to after the AsyncReadBuffers() call in
1896 * StartReadBuffers(), we do *not* reduce
1897 * ReadBuffersOperation->nblocks here, callers expect the full
1898 * operation to be completed at this point (as more operations may
1899 * have been queued).
1900 */
1902 }
1903
1905
1906 /* NB: READ_DONE tracepoint was already executed in completion callback */
1907 return needed_wait;
1908}
1909
1910/*
1911 * Initiate IO for the ReadBuffersOperation
1912 *
1913 * This function only starts a single IO at a time. The size of the IO may be
1914 * limited to below the to-be-read blocks, if one of the buffers has
1915 * concurrently been read in. If the first to-be-read buffer is already valid,
1916 * no IO will be issued.
1917 *
1918 * To support retries after partial reads, the first operation->nblocks_done
1919 * buffers are skipped.
1920 *
1921 * On return *nblocks_progress is updated to reflect the number of buffers
1922 * affected by the call. If the first buffer is valid, *nblocks_progress is
1923 * set to 1 and operation->nblocks_done is incremented.
1924 *
1925 * Returns true if IO was initiated or is already in progress (foreign IO),
1926 * false if the buffer was already valid.
1927 */
1928static bool
1930{
1931 Buffer *buffers = &operation->buffers[0];
1932 int flags = operation->flags;
1933 ForkNumber forknum = operation->forknum;
1934 char persistence = operation->persistence;
1935 int16 nblocks_done = operation->nblocks_done;
1936 BlockNumber blocknum = operation->blocknum + nblocks_done;
1937 Buffer *io_buffers = &operation->buffers[nblocks_done];
1938 int io_buffers_len = 0;
1940 uint32 ioh_flags = 0;
1946
1947 if (persistence == RELPERSISTENCE_TEMP)
1948 {
1951 }
1952 else
1953 {
1956 }
1957
1958 /*
1959 * When this IO is executed synchronously, either because the caller will
1960 * immediately block waiting for the IO or because IOMETHOD_SYNC is used,
1961 * the AIO subsystem needs to know.
1962 */
1963 if (flags & READ_BUFFERS_SYNCHRONOUSLY)
1965
1966 if (persistence == RELPERSISTENCE_TEMP)
1968
1969 /*
1970 * If zero_damaged_pages is enabled, add the READ_BUFFERS_ZERO_ON_ERROR
1971 * flag. The reason for that is that, hopefully, zero_damaged_pages isn't
1972 * set globally, but on a per-session basis. The completion callback,
1973 * which may be run in other processes, e.g. in IO workers, may have a
1974 * different value of the zero_damaged_pages GUC.
1975 *
1976 * XXX: We probably should eventually use a different flag for
1977 * zero_damaged_pages, so we can report different log levels / error codes
1978 * for zero_damaged_pages and ZERO_ON_ERROR.
1979 */
1982
1983 /*
1984 * For the same reason as with zero_damaged_pages we need to use this
1985 * backend's ignore_checksum_failure value.
1986 */
1989
1990
1991 /*
1992 * To be allowed to report stats in the local completion callback we need
1993 * to prepare to report stats now. This ensures we can safely report the
1994 * checksum failure even in a critical section.
1995 */
1996 pgstat_prepare_report_checksum_failure(operation->smgr->smgr_rlocator.locator.dbOid);
1997
1998 /*
1999 * We must get an IO handle before StartBufferIO(), as pgaio_io_acquire()
2000 * might block, which we don't want after setting IO_IN_PROGRESS. If we
2001 * don't need to do the IO, we'll release the handle.
2002 *
2003 * If we need to wait for IO before we can get a handle, submit
2004 * already-staged IO first, so that other backends don't need to wait.
2005 * There wouldn't be a deadlock risk, as pgaio_io_acquire() just needs to
2006 * wait for already submitted IO, which doesn't require additional locks,
2007 * but it could still cause undesirable waits.
2008 *
2009 * A secondary benefit is that this would allow us to measure the time in
2010 * pgaio_io_acquire() without causing undue timer overhead in the common,
2011 * non-blocking, case. However, currently the pgstats infrastructure
2012 * doesn't really allow that, as it a) asserts that an operation can't
2013 * have time without operations b) doesn't have an API to report
2014 * "accumulated" time.
2015 */
2017 if (unlikely(!ioh))
2018 {
2021 }
2022
2023 operation->foreign_io = false;
2024 pgaio_wref_clear(&operation->io_wref);
2025
2026 /*
2027 * Try to start IO on the first buffer in a new run of blocks. If AIO is
2028 * in progress, be it in this backend or another backend, we just
2029 * associate the wait reference with the operation and wait in
2030 * WaitReadBuffers(). This turns out to be important for performance in
2031 * two workloads:
2032 *
2033 * 1) A read stream that has to read the same block multiple times within
2034 * the readahead distance. This can happen e.g. for the table accesses of
2035 * an index scan.
2036 *
2037 * 2) Concurrent scans by multiple backends on the same relation.
2038 *
2039 * If we were to synchronously wait for the in-progress IO, we'd not be
2040 * able to keep enough I/O in flight.
2041 *
2042 * If we do find there is ongoing I/O for the buffer, we set up a 1-block
2043 * ReadBuffersOperation that WaitReadBuffers then can wait on.
2044 *
2045 * It's possible that another backend has started IO on the buffer but not
2046 * yet set its wait reference. In this case, we have no choice but to wait
2047 * for either the wait reference to be valid or the IO to be done.
2048 */
2049 status = StartBufferIO(buffers[nblocks_done], true, true,
2050 &operation->io_wref);
2052 {
2054 *nblocks_progress = 1;
2056 {
2057 /*
2058 * Someone has already completed this block, we're done.
2059 *
2060 * When IO is necessary, ->nblocks_done is updated in
2061 * ProcessReadBuffersResult(), but that is not called if no IO is
2062 * necessary. Thus update here.
2063 */
2064 operation->nblocks_done += 1;
2065 Assert(operation->nblocks_done <= operation->nblocks);
2066
2067 Assert(!pgaio_wref_valid(&operation->io_wref));
2068
2069 /*
2070 * Report and track this as a 'hit' for this backend, even though
2071 * it must have started out as a miss in PinBufferForBlock(). The
2072 * other backend will track this as a 'read'.
2073 */
2075 operation->rel, operation->persistence,
2076 operation->smgr, operation->forknum,
2077 blocknum);
2078 return false;
2079 }
2080
2081 /* The IO is already in-progress */
2083 Assert(pgaio_wref_valid(&operation->io_wref));
2084 operation->foreign_io = true;
2085
2086 return true;
2087 }
2088
2089 Assert(io_buffers[0] == buffers[nblocks_done]);
2090 io_pages[0] = BufferGetBlock(buffers[nblocks_done]);
2091 io_buffers_len = 1;
2092
2093 /*
2094 * NB: As little code as possible should be added between the
2095 * StartBufferIO() above, the further StartBufferIO()s below and the
2096 * smgrstartreadv(), as some of the buffers are now marked as
2097 * IO_IN_PROGRESS and will thus cause other backends to wait.
2098 */
2099
2100 /*
2101 * How many neighboring-on-disk blocks can we scatter-read into other
2102 * buffers at the same time? In this case we don't wait if we see an I/O
2103 * already in progress (see comment above).
2104 */
2105 for (int i = nblocks_done + 1; i < operation->nblocks; i++)
2106 {
2107 /* Must be consecutive block numbers. */
2108 Assert(BufferGetBlockNumber(buffers[i - 1]) ==
2109 BufferGetBlockNumber(buffers[i]) - 1);
2110
2111 status = StartBufferIO(buffers[i], true, false, NULL);
2113 break;
2114
2115 Assert(io_buffers[io_buffers_len] == buffers[i]);
2116
2117 io_pages[io_buffers_len++] = BufferGetBlock(buffers[i]);
2118 }
2119
2120 /* get a reference to wait for in WaitReadBuffers() */
2121 pgaio_io_get_wref(ioh, &operation->io_wref);
2122
2123 /* provide the list of buffers to the completion callbacks */
2125
2127 persistence == RELPERSISTENCE_TEMP ?
2130 flags);
2131
2133
2134 /* ---
2135 * Even though we're trying to issue IO asynchronously, track the time
2136 * in smgrstartreadv():
2137 * - if io_method == IOMETHOD_SYNC, we will always perform the IO
2138 * immediately
2139 * - the io method might not support the IO (e.g. worker IO for a temp
2140 * table)
2141 * ---
2142 */
2144 smgrstartreadv(ioh, operation->smgr, forknum,
2145 blocknum,
2149
2150 if (persistence == RELPERSISTENCE_TEMP)
2152 else
2154
2155 /*
2156 * Track vacuum cost when issuing IO, not after waiting for it. Otherwise
2157 * we could end up issuing a lot of IO in a short timespan, despite a low
2158 * cost limit.
2159 */
2160 if (VacuumCostActive)
2162
2164
2165 return true;
2166}
2167
2168/*
2169 * BufferAlloc -- subroutine for PinBufferForBlock. Handles lookup of a shared
2170 * buffer. If no buffer exists already, selects a replacement victim and
2171 * evicts the old page, but does NOT read in new page.
2172 *
2173 * "strategy" can be a buffer replacement strategy object, or NULL for
2174 * the default strategy. The selected buffer's usage_count is advanced when
2175 * using the default strategy, but otherwise possibly not (see PinBuffer).
2176 *
2177 * The returned buffer is pinned and is already marked as holding the
2178 * desired page. If it already did have the desired page, *foundPtr is
2179 * set true. Otherwise, *foundPtr is set false.
2180 *
2181 * io_context is passed as an output parameter to avoid calling
2182 * IOContextForStrategy() when there is a shared buffers hit and no IO
2183 * statistics need be captured.
2184 *
2185 * No locks are held either at entry or exit.
2186 */
2188BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
2189 BlockNumber blockNum,
2190 BufferAccessStrategy strategy,
2192{
2193 BufferTag newTag; /* identity of requested block */
2194 uint32 newHash; /* hash value for newTag */
2195 LWLock *newPartitionLock; /* buffer partition lock for it */
2196 int existing_buf_id;
2200 uint64 set_bits = 0;
2201
2202 /* Make sure we will have room to remember the buffer pin */
2205
2206 /* create a tag so we can lookup the buffer */
2207 InitBufferTag(&newTag, &smgr->smgr_rlocator.locator, forkNum, blockNum);
2208
2209 /* determine its hash code and partition lock ID */
2212
2213 /* see if the block is in the buffer pool already */
2216 if (existing_buf_id >= 0)
2217 {
2218 BufferDesc *buf;
2219 bool valid;
2220
2221 /*
2222 * Found it. Now, pin the buffer so no one can steal it from the
2223 * buffer pool, and check to see if the correct data has been loaded
2224 * into the buffer.
2225 */
2227
2228 valid = PinBuffer(buf, strategy, false);
2229
2230 /* Can release the mapping lock as soon as we've pinned it */
2232
2233 *foundPtr = true;
2234
2235 if (!valid)
2236 {
2237 /*
2238 * We can only get here if (a) someone else is still reading in
2239 * the page, (b) a previous read attempt failed, or (c) someone
2240 * called StartReadBuffers() but not yet WaitReadBuffers().
2241 */
2242 *foundPtr = false;
2243 }
2244
2245 return buf;
2246 }
2247
2248 /*
2249 * Didn't find it in the buffer pool. We'll have to initialize a new
2250 * buffer. Remember to unlock the mapping lock while doing the work.
2251 */
2253
2254 /*
2255 * Acquire a victim buffer. Somebody else might try to do the same, we
2256 * don't hold any conflicting locks. If so we'll have to undo our work
2257 * later.
2258 */
2261
2262 /*
2263 * Try to make a hashtable entry for the buffer under its new tag. If
2264 * somebody else inserted another buffer for the tag, we'll release the
2265 * victim buffer we acquired and use the already inserted one.
2266 */
2269 if (existing_buf_id >= 0)
2270 {
2272 bool valid;
2273
2274 /*
2275 * Got a collision. Someone has already done what we were about to do.
2276 * We'll just handle this as if it were found in the buffer pool in
2277 * the first place. First, give up the buffer we were planning to
2278 * use.
2279 *
2280 * We could do this after releasing the partition lock, but then we'd
2281 * have to call ResourceOwnerEnlarge() & ReservePrivateRefCountEntry()
2282 * before acquiring the lock, for the rare case of such a collision.
2283 */
2285
2286 /* remaining code should match code at top of routine */
2287
2289
2290 valid = PinBuffer(existing_buf_hdr, strategy, false);
2291
2292 /* Can release the mapping lock as soon as we've pinned it */
2294
2295 *foundPtr = true;
2296
2297 if (!valid)
2298 {
2299 /*
2300 * We can only get here if (a) someone else is still reading in
2301 * the page, (b) a previous read attempt failed, or (c) someone
2302 * called StartReadBuffers() but not yet WaitReadBuffers().
2303 */
2304 *foundPtr = false;
2305 }
2306
2307 return existing_buf_hdr;
2308 }
2309
2310 /*
2311 * Need to lock the buffer header too in order to change its tag.
2312 */
2314
2315 /* some sanity checks while we hold the buffer header lock */
2318
2319 victim_buf_hdr->tag = newTag;
2320
2321 /*
2322 * Make sure BM_PERMANENT is set for buffers that must be written at every
2323 * checkpoint. Unlogged buffers only need to be written at shutdown
2324 * checkpoints, except for their "init" forks, which need to be treated
2325 * just like permanent relations.
2326 */
2328 if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM)
2330
2332 set_bits, 0, 0);
2333
2335
2336 /*
2337 * Buffer contents are currently invalid.
2338 */
2339 *foundPtr = false;
2340
2341 return victim_buf_hdr;
2342}
2343
2344/*
2345 * InvalidateBuffer -- mark a shared buffer invalid.
2346 *
2347 * The buffer header spinlock must be held at entry. We drop it before
2348 * returning. (This is sane because the caller must have locked the
2349 * buffer in order to be sure it should be dropped.)
2350 *
2351 * This is used only in contexts such as dropping a relation. We assume
2352 * that no other backend could possibly be interested in using the page,
2353 * so the only reason the buffer might be pinned is if someone else is
2354 * trying to write it out. We have to let them finish before we can
2355 * reclaim the buffer.
2356 *
2357 * The buffer could get reclaimed by someone else while we are waiting
2358 * to acquire the necessary locks; if so, don't mess it up.
2359 */
/*
 * NOTE(review): this listing is doxygen-extraction residue. Lines that held
 * hyperlinked identifiers were stripped, leaving only their bare original
 * line numbers (e.g. "2364", "2389"); the function's own signature line is
 * among the missing ones. Recover the missing statements from the upstream
 * bufmgr.c before attempting to compile — TODO confirm against git master.
 */
2360static void
2362{
2364 uint32 oldHash; /* hash value for oldTag */
2365 LWLock *oldPartitionLock; /* buffer partition lock for it */
2368
2369 /* Save the original buffer tag before dropping the spinlock */
2370 oldTag = buf->tag;
2371
2373
2374 /*
2375 * Need to compute the old tag's hashcode and partition lock ID. XXX is it
2376 * worth storing the hashcode in BufferDesc so we need not recompute it
2377 * here? Probably not.
2378 */
2381
2382retry:
2383
2384 /*
2385 * Acquire exclusive mapping lock in preparation for changing the buffer's
2386 * association.
2387 */
2389
2390 /* Re-lock the buffer header */
2392
2393 /* If it's changed while we were waiting for lock, do nothing */
2394 if (!BufferTagsEqual(&buf->tag, &oldTag))
2395 {
2398 return;
2399 }
2400
2401 /*
2402 * We assume the reason for it to be pinned is that either we were
2403 * asynchronously reading the page in before erroring out or someone else
2404 * is flushing the page out. Wait for the IO to finish. (This could be
2405 * an infinite loop if the refcount is messed up... it would be nice to
2406 * time out after awhile, but there seems no way to be sure how many loops
2407 * may be needed. Note that if the other guy has pinned the buffer but
2408 * not yet done StartBufferIO, WaitIO will fall through and we'll
2409 * effectively be busy-looping here.)
2410 */
2412 {
2415 /* safety check: should definitely not be our *own* pin */
2417 elog(ERROR, "buffer is pinned in InvalidateBuffer");
2418 WaitIO(buf);
2419 goto retry;
2420 }
2421
2422 /*
2423 * An invalidated buffer should not have any backends waiting to lock the
2424 * buffer, therefore BM_LOCK_WAKE_IN_PROGRESS should not be set.
2425 */
2427
2428 /*
2429 * Clear out the buffer's tag and flags. We must do this to ensure that
2430 * linear scans of the buffer array don't think the buffer is valid.
2431 */
2433 ClearBufferTag(&buf->tag);
2434
2436 0,
2438 0);
2439
2440 /*
2441 * Remove the buffer from the lookup hashtable, if it was in there.
2442 */
2443 if (oldFlags & BM_TAG_VALID)
2445
2446 /*
2447 * Done with mapping lock.
2448 */
2450}
2451
2452/*
2453 * Helper routine for GetVictimBuffer()
2454 *
2455 * Needs to be called on a buffer with a valid tag, pinned, but without the
2456 * buffer header spinlock held.
2457 *
2458 * Returns true if the buffer can be reused, in which case the buffer is only
2459 * pinned by this backend and marked as invalid, false otherwise.
2460 */
/*
 * NOTE(review): extraction residue — identifier-bearing lines (including the
 * signature) were stripped; bare line numbers mark where statements belong.
 * Restore from the upstream file before compiling.
 */
2461static bool
2463{
2465 uint32 hash;
2467 BufferTag tag;
2468
2470
2471 /* have buffer pinned, so it's safe to read tag without lock */
2472 tag = buf_hdr->tag;
2473
2474 hash = BufTableHashCode(&tag);
2476
2478
2479 /* lock the buffer header */
2481
2482 /*
2483 * We have the buffer pinned nobody else should have been able to unset
2484 * this concurrently.
2485 */
2488 Assert(BufferTagsEqual(&buf_hdr->tag, &tag));
2489
2490 /*
2491 * If somebody else pinned the buffer since, or even worse, dirtied it,
2492 * give up on this buffer: It's clearly in use.
2493 */
2495 {
2497
2500
2501 return false;
2502 }
2503
2504 /*
2505 * An invalidated buffer should not have any backends waiting to lock the
2506 * buffer, therefore BM_LOCK_WAKE_IN_PROGRESS should not be set.
2507 */
2509
2510 /*
2511 * Clear out the buffer's tag and flags and usagecount. This is not
2512 * strictly required, as BM_TAG_VALID/BM_VALID needs to be checked before
2513 * doing anything with the buffer. But currently it's beneficial, as the
2514 * cheaper pre-check for several linear scans of shared buffers use the
2515 * tag (see e.g. FlushDatabaseBuffers()).
2516 */
2517 ClearBufferTag(&buf_hdr->tag);
2519 0,
2521 0);
2522
2524
2525 /* finally delete buffer from the buffer mapping table */
2526 BufTableDelete(&tag, hash);
2527
2529
2534
2535 return true;
2536}
2537
/*
 * NOTE(review): extraction residue — the signature line and many statements
 * (the StrategyGetBuffer() call, lock acquisition/release, FlushBuffer(),
 * the InvalidateVictimBuffer() call, presumably) were stripped, leaving bare
 * line numbers. Recover from upstream before compiling — TODO confirm.
 */
2538static Buffer
2540{
2542 Buffer buf;
2544 bool from_ring;
2545
2546 /*
2547 * Ensure, before we pin a victim buffer, that there's a free refcount
2548 * entry and resource owner slot for the pin.
2549 */
2552
2553 /* we return here if a prospective victim buffer gets used concurrently */
2554again:
2555
2556 /*
2557 * Select a victim buffer. The buffer is returned pinned and owned by
2558 * this backend.
2559 */
2562
2563 /*
2564 * We shouldn't have any other pins for this buffer.
2565 */
2567
2568 /*
2569 * If the buffer was dirty, try to write it out. There is a race
2570 * condition here, another backend could dirty the buffer between
2571 * StrategyGetBuffer() checking that it is not in use and invalidating the
2572 * buffer below. That's addressed by InvalidateVictimBuffer() verifying
2573 * that the buffer is not dirty.
2574 */
2575 if (buf_state & BM_DIRTY)
2576 {
2579
2580 /*
2581 * We need a share-exclusive lock on the buffer contents to write it
2582 * out (else we might write invalid data, eg because someone else is
2583 * compacting the page contents while we write). We must use a
2584 * conditional lock acquisition here to avoid deadlock. Even though
2585 * the buffer was not pinned (and therefore surely not locked) when
2586 * StrategyGetBuffer returned it, someone else could have pinned and
2587 * (share-)exclusive-locked it by the time we get here. If we try to
2588 * get the lock unconditionally, we'd block waiting for them; if they
2589 * later block waiting for us, deadlock ensues. (This has been
2590 * observed to happen when two backends are both trying to split btree
2591 * index pages, and the second one just happens to be trying to split
2592 * the page the first one got from StrategyGetBuffer.)
2593 */
2595 {
2596 /*
2597 * Someone else has locked the buffer, so give it up and loop back
2598 * to get another one.
2599 */
2601 goto again;
2602 }
2603
2604 /*
2605 * If using a nondefault strategy, and this victim came from the
2606 * strategy ring, let the strategy decide whether to reject it when
2607 * reusing it would require a WAL flush. This only applies to
2608 * permanent buffers; unlogged buffers can have fake LSNs, so
2609 * XLogNeedsFlush() is not meaningful for them.
2610 *
2611 * We need to hold the content lock in at least share-exclusive mode
2612 * to safely inspect the page LSN, so this couldn't have been done
2613 * inside StrategyGetBuffer().
2614 */
2615 if (strategy && from_ring &&
2619 {
2621 goto again;
2622 }
2623
2624 /* OK, do the I/O */
2627
2629 &buf_hdr->tag);
2630 }
2631
2632
2633 if (buf_state & BM_VALID)
2634 {
2635 /*
2636 * When a BufferAccessStrategy is in use, blocks evicted from shared
2637 * buffers are counted as IOOP_EVICT in the corresponding context
2638 * (e.g. IOCONTEXT_BULKWRITE). Shared buffers are evicted by a
2639 * strategy in two cases: 1) while initially claiming buffers for the
2640 * strategy ring 2) to replace an existing strategy ring buffer
2641 * because it is pinned or in use and cannot be reused.
2642 *
2643 * Blocks evicted from buffers already in the strategy ring are
2644 * counted as IOOP_REUSE in the corresponding strategy context.
2645 *
2646 * At this point, we can accurately count evictions and reuses,
2647 * because we have successfully claimed the valid buffer. Previously,
2648 * we may have been forced to release the buffer due to concurrent
2649 * pinners or erroring out.
2650 */
2652 from_ring ? IOOP_REUSE : IOOP_EVICT, 1, 0);
2653 }
2654
2655 /*
2656 * If the buffer has an entry in the buffer mapping table, delete it. This
2657 * can fail because another backend could have pinned or dirtied the
2658 * buffer.
2659 */
2661 {
2663 goto again;
2664 }
2665
2666 /* a final set of sanity checks */
2667#ifdef USE_ASSERT_CHECKING
2669
2672
2674#endif
2675
2676 return buf;
2677}
2678
2679/*
2680 * Return the maximum number of buffers that a backend should try to pin once,
2681 * to avoid exceeding its fair share. This is the highest value that
2682 * GetAdditionalPinLimit() could ever return. Note that it may be zero on a
2683 * system with a very small buffer pool relative to max_connections.
2684 */
/*
 * Returns the backend's proportional pin limit (MaxProportionalPins); see
 * the header comment above. NOTE(review): the signature line was stripped
 * by extraction — restore it from the upstream file.
 */
2685uint32
2687{
2688 return MaxProportionalPins;
2689}
2690
2691/*
2692 * Return the maximum number of additional buffers that this backend should
2693 * pin if it wants to stay under the per-backend limit, considering the number
2694 * of buffers it has already pinned. Unlike LimitAdditionalPins(), the limit
2695 * return by this function can be zero.
2696 */
/*
 * NOTE(review): the signature line and the lines computing the remaining
 * headroom were stripped by extraction (bare line numbers 2701/2708/2714
 * remain); only the "already over fair share -> 0" early return survives.
 * Restore from the upstream file before compiling.
 */
2697uint32
2699{
2701
2702 /*
2703 * We get the number of "overflowed" pins for free, but don't know the
2704 * number of pins in PrivateRefCountArray. The cost of calculating that
2705 * exactly doesn't seem worth it, so just assume the max.
2706 */
2708
2709 /* Is this backend already holding more than its fair share? */
2711 return 0;
2712
2714}
2715
2716/*
2717 * Limit the number of pins a batch operation may additionally acquire, to
2718 * avoid running out of pinnable buffers.
2719 *
2720 * One additional pin is always allowed, on the assumption that the operation
2721 * requires at least one to make progress.
2722 */
/*
 * Clamps *additional_pins to GetAdditionalPinLimit(), but never below 1 so a
 * batch operation can always make progress (see header comment above).
 * NOTE(review): only the signature line appears stripped here — the body
 * itself survived extraction intact.
 */
2723void
2725{
2726 uint32 limit;
2727
2728 if (*additional_pins <= 1)
2729 return;
2730
2731 limit = GetAdditionalPinLimit();
2732 limit = Max(limit, 1);
2733 if (limit < *additional_pins)
2734 *additional_pins = limit;
2735}
2736
2737/*
2738 * Logic shared between ExtendBufferedRelBy(), ExtendBufferedRelTo(). Just to
2739 * avoid duplicating the tracing and relpersistence related logic.
2740 */
/*
 * Dispatches to the local- or shared-buffer extension path based on
 * bmr.relpersistence, wrapped in entry/exit tracing. NOTE(review):
 * extraction stripped the function name line, some parameter lines, the
 * TRACE_POSTGRESQL_* probe names, and the local-buffer branch's call —
 * restore from the upstream file before compiling.
 */
2741static BlockNumber
2744 BufferAccessStrategy strategy,
2745 uint32 flags,
2748 Buffer *buffers,
2750{
2752
2754 BMR_GET_SMGR(bmr)->smgr_rlocator.locator.spcOid,
2755 BMR_GET_SMGR(bmr)->smgr_rlocator.locator.dbOid,
2756 BMR_GET_SMGR(bmr)->smgr_rlocator.locator.relNumber,
2757 BMR_GET_SMGR(bmr)->smgr_rlocator.backend,
2758 extend_by);
2759
2760 if (bmr.relpersistence == RELPERSISTENCE_TEMP)
2763 buffers, &extend_by);
2764 else
2765 first_block = ExtendBufferedRelShared(bmr, fork, strategy, flags,
2767 buffers, &extend_by);
2769
2771 BMR_GET_SMGR(bmr)->smgr_rlocator.locator.spcOid,
2772 BMR_GET_SMGR(bmr)->smgr_rlocator.locator.dbOid,
2773 BMR_GET_SMGR(bmr)->smgr_rlocator.locator.relNumber,
2774 BMR_GET_SMGR(bmr)->smgr_rlocator.backend,
2775 *extended_by,
2776 first_block);
2777
2778 return first_block;
2779}
2780
2781/*
2782 * Implementation of ExtendBufferedRelBy() and ExtendBufferedRelTo() for
2783 * shared buffers.
2784 */
/*
 * NOTE(review): heavy extraction damage — the name line, several parameter
 * and declaration lines, the smgrnblocks()/smgrzeroextend() calls, lock
 * acquire/release calls, and the buffer-table insert call were stripped
 * (bare line numbers remain). The surviving comments describe the intended
 * sequence; restore the statements from the upstream file before compiling.
 */
2785static BlockNumber
2788 BufferAccessStrategy strategy,
2789 uint32 flags,
2792 Buffer *buffers,
2794{
2798
2800
2801 /*
2802 * Acquire victim buffers for extension without holding extension lock.
2803 * Writing out victim buffers is the most expensive part of extending the
2804 * relation, particularly when doing so requires WAL flushes. Zeroing out
2805 * the buffers is also quite expensive, so do that before holding the
2806 * extension lock as well.
2807 *
2808 * These pages are pinned by us and not valid. While we hold the pin they
2809 * can't be acquired as victim buffers by another backend.
2810 */
2811 for (uint32 i = 0; i < extend_by; i++)
2812 {
2814
2815 buffers[i] = GetVictimBuffer(strategy, io_context);
2817
2818 /* new buffers are zero-filled */
2819 MemSet(buf_block, 0, BLCKSZ);
2820 }
2821
2822 /*
2823 * Lock relation against concurrent extensions, unless requested not to.
2824 *
2825 * We use the same extension lock for all forks. That's unnecessarily
2826 * restrictive, but currently extensions for forks don't happen often
2827 * enough to make it worth locking more granularly.
2828 *
2829 * Note that another backend might have extended the relation by the time
2830 * we get the lock.
2831 */
2832 if (!(flags & EB_SKIP_EXTENSION_LOCK))
2834
2835 /*
2836 * If requested, invalidate size cache, so that smgrnblocks asks the
2837 * kernel.
2838 */
2839 if (flags & EB_CLEAR_SIZE_CACHE)
2840 BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] = InvalidBlockNumber;
2841
2843
2844 /*
2845 * Now that we have the accurate relation size, check if the caller wants
2846 * us to extend to only up to a specific size. If there were concurrent
2847 * extensions, we might have acquired too many buffers and need to release
2848 * them.
2849 */
2851 {
2853
2855 extend_by = 0;
2856 else if ((uint64) first_block + extend_by > extend_upto)
2858
2859 for (uint32 i = extend_by; i < orig_extend_by; i++)
2860 {
2861 BufferDesc *buf_hdr = GetBufferDescriptor(buffers[i] - 1);
2862
2864 }
2865
2866 if (extend_by == 0)
2867 {
2868 if (!(flags & EB_SKIP_EXTENSION_LOCK))
2871 return first_block;
2872 }
2873 }
2874
2875 /* Fail if relation is already at maximum possible length */
2877 ereport(ERROR,
2879 errmsg("cannot extend relation %s beyond %u blocks",
2880 relpath(BMR_GET_SMGR(bmr)->smgr_rlocator, fork).str,
2881 MaxBlockNumber)));
2882
2883 /*
2884 * Insert buffers into buffer table, mark as IO_IN_PROGRESS.
2885 *
2886 * This needs to happen before we extend the relation, because as soon as
2887 * we do, other backends can start to read in those pages.
2888 */
2889 for (uint32 i = 0; i < extend_by; i++)
2890 {
2891 Buffer victim_buf = buffers[i];
2893 BufferTag tag;
2894 uint32 hash;
2896 int existing_id;
2897
2898 /* in case we need to pin an existing buffer below */
2901
2902 InitBufferTag(&tag, &BMR_GET_SMGR(bmr)->smgr_rlocator.locator, fork,
2903 first_block + i);
2904 hash = BufTableHashCode(&tag);
2906
2908
2910
2911 /*
2912 * We get here only in the corner case where we are trying to extend
2913 * the relation but we found a pre-existing buffer. This can happen
2914 * because a prior attempt at extending the relation failed, and
2915 * because mdread doesn't complain about reads beyond EOF (when
2916 * zero_damaged_pages is ON) and so a previous attempt to read a block
2917 * beyond EOF could have left a "valid" zero-filled buffer.
2918 *
2919 * This has also been observed when relation was overwritten by
2920 * external process. Since the legitimate cases should always have
2921 * left a zero-filled buffer, complain if not PageIsNew.
2922 */
2923 if (existing_id >= 0)
2924 {
2927 bool valid;
2928
2929 /*
2930 * Pin the existing buffer before releasing the partition lock,
2931 * preventing it from being evicted.
2932 */
2933 valid = PinBuffer(existing_hdr, strategy, false);
2934
2937
2940
2941 if (valid && !PageIsNew((Page) buf_block))
2942 ereport(ERROR,
2943 (errmsg("unexpected data beyond EOF in block %u of relation \"%s\"",
2944 existing_hdr->tag.blockNum,
2945 relpath(BMR_GET_SMGR(bmr)->smgr_rlocator, fork).str)));
2946
2947 /*
2948 * We *must* do smgr[zero]extend before succeeding, else the page
2949 * will not be reserved by the kernel, and the next P_NEW call
2950 * will decide to return the same page. Clear the BM_VALID bit,
2951 * do StartSharedBufferIO() and proceed.
2952 *
2953 * Loop to handle the very small possibility that someone re-sets
2954 * BM_VALID between our clearing it and StartSharedBufferIO
2955 * inspecting it.
2956 */
2957 while (true)
2958 {
2960
2962
2964
2966 break;
2967 }
2968 }
2969 else
2970 {
2972 uint64 set_bits = 0;
2973
2975
2976 /* some sanity checks while we hold the buffer header lock */
2979
2980 victim_buf_hdr->tag = tag;
2981
2983 if (bmr.relpersistence == RELPERSISTENCE_PERMANENT || fork == INIT_FORKNUM)
2985
2987 set_bits, 0,
2988 0);
2989
2991
2992 /* XXX: could combine the locked operations in it with the above */
2994 }
2995 }
2996
2998
2999 /*
3000 * Note: if smgrzeroextend fails, we will end up with buffers that are
3001 * allocated but not marked BM_VALID. The next relation extension will
3002 * still select the same block number (because the relation didn't get any
3003 * longer on disk) and so future attempts to extend the relation will find
3004 * the same buffers (if they have not been recycled) but come right back
3005 * here to try smgrzeroextend again.
3006 *
3007 * We don't need to set checksum for all-zero pages.
3008 */
3010
3011 /*
3012 * Release the file-extension lock; it's now OK for someone else to extend
3013 * the relation some more.
3014 *
3015 * We remove IO_IN_PROGRESS after this, as waking up waiting backends can
3016 * take noticeable time.
3017 */
3018 if (!(flags & EB_SKIP_EXTENSION_LOCK))
3020
3022 io_start, 1, extend_by * BLCKSZ);
3023
3024 /* Set BM_VALID, terminate IO, and wake up any waiters */
3025 for (uint32 i = 0; i < extend_by; i++)
3026 {
3027 Buffer buf = buffers[i];
3029 bool lock = false;
3030
3031 if (flags & EB_LOCK_FIRST && i == 0)
3032 lock = true;
3033 else if (flags & EB_LOCK_TARGET)
3034 {
3036 if (first_block + i + 1 == extend_upto)
3037 lock = true;
3038 }
3039
3040 if (lock)
3042
3043 TerminateBufferIO(buf_hdr, false, BM_VALID, true, false);
3044 }
3045
3047
3049
3050 return first_block;
3051}
3052
3053/*
3054 * BufferIsLockedByMe
3055 *
3056 * Checks if this backend has the buffer locked in any mode.
3057 *
3058 * Buffer must be pinned.
3059 */
/*
 * Local buffers have no content locks, so they trivially report true;
 * shared buffers defer to BufferLockHeldByMe(). NOTE(review): signature
 * line and assertion/declaration lines stripped by extraction.
 */
3060bool
3062{
3064
3066
3067 if (BufferIsLocal(buffer))
3068 {
3069 /* Content locks are not maintained for local buffers. */
3070 return true;
3071 }
3072 else
3073 {
3075 return BufferLockHeldByMe(bufHdr);
3076 }
3077}
3078
3079/*
3080 * BufferIsLockedByMeInMode
3081 *
3082 * Checks if this backend has the buffer locked in the specified mode.
3083 *
3084 * Buffer must be pinned.
3085 */
/*
 * Mode-specific variant of BufferIsLockedByMe(); see header comment above.
 * NOTE(review): signature line and the shared-buffer return statement were
 * stripped by extraction (bare numbers 3090/3092/3102 remain).
 */
3086bool
3088{
3090
3092
3093 if (BufferIsLocal(buffer))
3094 {
3095 /* Content locks are not maintained for local buffers. */
3096 return true;
3097 }
3098 else
3099 {
3102
3103 }
3104
3105/*
3106 * BufferIsDirty
3107 *
3108 * Checks if buffer is already dirty.
3109 *
3110 * Buffer must be pinned and [share-]exclusive-locked. (Without such a lock,
3111 * the result may be stale before it's returned.)
3112 */
/*
 * Reads BM_DIRTY from the buffer header state word (pg_atomic_read_u64).
 * NOTE(review): signature, assertions, and the bufHdr assignments in both
 * branches were stripped by extraction — restore from upstream.
 */
3113bool
3115{
3117
3119
3120 if (BufferIsLocal(buffer))
3121 {
3122 int bufid = -buffer - 1;
3123
3125 /* Content locks are not maintained for local buffers. */
3126 }
3127 else
3128 {
3132 }
3133
3134 return pg_atomic_read_u64(&bufHdr->state) & BM_DIRTY;
3135}
3136
3137/*
3138 * MarkBufferDirty
3139 *
3140 * Marks buffer contents as dirty (actual write happens later).
3141 *
3142 * Buffer must be pinned and exclusive-locked. (If caller does not hold
3143 * exclusive lock, then somebody could be in process of writing the buffer,
3144 * leading to risk of bad data written to disk.)
3145 */
/*
 * NOTE(review): extraction stripped the signature, local declarations, the
 * local-buffer delegation call, and the CAS read/modify lines inside the
 * retry loop (bare numbers remain). Restore from upstream before compiling.
 */
3146void
3148{
3152
3153 if (!BufferIsValid(buffer))
3154 elog(ERROR, "bad buffer ID: %d", buffer);
3155
3156 if (BufferIsLocal(buffer))
3157 {
3159 return;
3160 }
3161
3163
3166
3167 /*
3168 * NB: We have to wait for the buffer header spinlock to be not held, as
3169 * TerminateBufferIO() relies on the spinlock.
3170 */
3172 for (;;)
3173 {
3176
3178
3181
3183 buf_state))
3184 break;
3185 }
3186
3187 /*
3188 * If the buffer was not dirty already, do vacuum accounting.
3189 */
3190 if (!(old_buf_state & BM_DIRTY))
3191 {
3193 if (VacuumCostActive)
3195 }
3196}
3197
3198/*
3199 * ReleaseAndReadBuffer -- combine ReleaseBuffer() and ReadBuffer()
3200 *
3201 * Formerly, this saved one cycle of acquiring/releasing the BufMgrLock
3202 * compared to calling the two routines separately. Now it's mainly just
3203 * a convenience function. However, if the passed buffer is valid and
3204 * already contains the desired block, we just return it as-is; and that
3205 * does save considerable work compared to a full release and reacquire.
3206 *
3207 * Note: it is OK to pass buffer == InvalidBuffer, indicating that no old
3208 * buffer actually needs to be released. This case is the same as ReadBuffer,
3209 * but can save some tests in the caller.
3210 */
/*
 * Fast path: if the passed buffer already holds (relation, MAIN_FORKNUM,
 * blockNum), return it unchanged; otherwise release it (released lines were
 * stripped — see bare numbers 3229/3239) and fall through to ReadBuffer().
 * NOTE(review): the name/first-parameter line and bufHdr assignments are
 * missing due to extraction.
 */
3211Buffer
3213 Relation relation,
3214 BlockNumber blockNum)
3215{
3216 ForkNumber forkNum = MAIN_FORKNUM;
3218
3219 if (BufferIsValid(buffer))
3220 {
3222 if (BufferIsLocal(buffer))
3223 {
3225 if (bufHdr->tag.blockNum == blockNum &&
3226 BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
3227 BufTagGetForkNum(&bufHdr->tag) == forkNum)
3228 return buffer;
3230 }
3231 else
3232 {
3234 /* we have pin, so it's ok to examine tag without spinlock */
3235 if (bufHdr->tag.blockNum == blockNum &&
3236 BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
3237 BufTagGetForkNum(&bufHdr->tag) == forkNum)
3238 return buffer;
3240 }
3241 }
3242
3243 return ReadBuffer(relation, blockNum);
3244}
3245
3246/*
3247 * PinBuffer -- make buffer unavailable for replacement.
3248 *
3249 * For the default access strategy, the buffer's usage_count is incremented
3250 * when we first pin it; for other strategies we just make sure the usage_count
3251 * isn't zero. (The idea of the latter is that we don't want synchronized
3252 * heap scans to inflate the count, but we need it to not be zero to discourage
3253 * other backends from stealing buffers from our ring. As long as we cycle
3254 * through the ring faster than the global clock-sweep cycles, buffers in
3255 * our ring won't be chosen as victims for replacement by other backends.)
3256 *
3257 * This should be applied only to shared buffers, never local ones.
3258 *
3259 * Since buffers are pinned/unpinned very frequently, pin buffers without
3260 * taking the buffer header lock; instead update the state variable in loop of
3261 * CAS operations. Hopefully it's just a single CAS.
3262 *
3263 * Note that ResourceOwnerEnlarge() and ReservePrivateRefCountEntry()
3264 * must have been done already.
3265 *
3266 * Returns true if buffer is BM_VALID, else false. This provision allows
3267 * some callers to avoid an extra spinlock cycle. If skip_if_not_valid is
3268 * true, then a false return value also indicates that the buffer was
3269 * (recently) invalid and has not been pinned.
3270 */
/*
 * NOTE(review): extraction stripped the name/first-parameter line and the
 * atomic read/CAS statements of the pin loop (bare numbers remain). The
 * surviving structure matches the header comment above: first-pin path via
 * a CAS loop, repeat-pin path via the private refcount entry.
 */
3271static bool
3273 bool skip_if_not_valid)
3274{
3276 bool result;
3278
3281
3282 ref = GetPrivateRefCountEntry(b, true);
3283
3284 if (ref == NULL)
3285 {
3288
3290 for (;;)
3291 {
3293 return false;
3294
3295 /*
3296 * We're not allowed to increase the refcount while the buffer
3297 * header spinlock is held. Wait for the lock to be released.
3298 */
3300 {
3302
3303 /* perform checks at the top of the loop again */
3304 continue;
3305 }
3306
3308
3309 /* increase refcount */
3311
3312 if (strategy == NULL)
3313 {
3314 /* Default case: increase usagecount unless already max. */
3317 }
3318 else
3319 {
3320 /*
3321 * Ring buffers shouldn't evict others from pool. Thus we
3322 * don't make usagecount more than 1.
3323 */
3326 }
3327
3329 buf_state))
3330 {
3331 result = (buf_state & BM_VALID) != 0;
3332
3334 break;
3335 }
3336 }
3337 }
3338 else
3339 {
3340 /*
3341 * If we previously pinned the buffer, it is likely to be valid, but
3342 * it may not be if StartReadBuffers() was called and
3343 * WaitReadBuffers() hasn't been called yet. We'll check by loading
3344 * the flags without locking. This is racy, but it's OK to return
3345 * false spuriously: when WaitReadBuffers() calls StartBufferIO(),
3346 * it'll see that it's now valid.
3347 *
3348 * Note: We deliberately avoid a Valgrind client request here.
3349 * Individual access methods can optionally superimpose buffer page
3350 * client requests on top of our client requests to enforce that
3351 * buffers are only accessed while locked (and pinned). It's possible
3352 * that the buffer page is legitimately non-accessible here. We
3353 * cannot meddle with that.
3354 */
3355 result = (pg_atomic_read_u64(&buf->state) & BM_VALID) != 0;
3356
3357 Assert(ref->data.refcount > 0);
3358 ref->data.refcount++;
3360 }
3361
3362 return result;
3363}
3364
3365/*
3366 * PinBuffer_Locked -- as above, but caller already locked the buffer header.
3367 * The spinlock is released before return.
3368 *
3369 * As this function is called with the spinlock held, the caller has to
3370 * previously call ReservePrivateRefCountEntry() and
3371 * ResourceOwnerEnlarge(CurrentResourceOwner);
3372 *
3373 * Currently, no callers of this function want to modify the buffer's
3374 * usage_count at all, so there's no need for a strategy parameter.
3375 * Also we don't bother with a BM_VALID test (the caller could check that for
3376 * itself).
3377 *
3378 * Also all callers only ever use this function when it's known that the
3379 * buffer can't have a preexisting pin by this backend. That allows us to skip
3380 * searching the private refcount array & hash, which is a boon, because the
3381 * spinlock is still held.
3382 *
3383 * Note: use of this routine is frequently mandatory, not just an optimization
3384 * to save a spin lock/unlock cycle, because we need to pin a buffer before
3385 * its state can change under us.
3386 */
/*
 * NOTE(review): extraction stripped the signature and the combined
 * state-update/lock-release statements (bare numbers 3391/3397/3403/3405/
 * 3408 remain). Only the "0, 0, 1" argument tail of one call survived.
 */
3387static void
3389{
3391
3392 /*
3393 * As explained, We don't expect any preexisting pins. That allows us to
3394 * manipulate the PrivateRefCount after releasing the spinlock
3395 */
3397
3398 /*
3399 * Since we hold the buffer spinlock, we can update the buffer state and
3400 * release the lock in one operation.
3401 */
3403
3405 0, 0, 1);
3406
3408}
3409
3410/*
3411 * Support for waking up another backend that is waiting for the cleanup lock
3412 * to be released using BM_PIN_COUNT_WAITER.
3413 *
3414 * See LockBufferForCleanup().
3415 *
3416 * Expected to be called just after releasing a buffer pin (in a BufferDesc,
3417 * not just reducing the backend-local pincount for the buffer).
3418 */
/*
 * NOTE(review): name inferred from the header comment's description of
 * BM_PIN_COUNT_WAITER support — confirm against upstream. Extraction
 * stripped the signature, the header-lock acquisition, the re-check
 * condition, the flag-clearing call, and the else-branch unlock (bare
 * numbers remain).
 */
3419static void
3421{
3422 /*
3423 * Acquire the buffer header lock, re-check that there's a waiter. Another
3424 * backend could have unpinned this buffer, and already woken up the
3425 * waiter.
3426 *
3427 * There's no danger of the buffer being replaced after we unpinned it
3428 * above, as it's pinned by the waiter. The waiter removes
3429 * BM_PIN_COUNT_WAITER if it stops waiting for a reason other than this
3430 * backend waking it up.
3431 */
3433
3436 {
3437 /* we just released the last pin other than the waiter's */
3438 int wait_backend_pgprocno = buf->wait_backend_pgprocno;
3439
3442 0);
3443 ProcSendSignal(wait_backend_pgprocno);
3444 }
3445 else
3447}
3448
3449/*
3450 * UnpinBuffer -- make buffer available for replacement.
3451 *
3452 * This should be applied only to shared buffers, never local ones. This
3453 * always adjusts CurrentResourceOwner.
3454 */
/*
 * NOTE(review): this definition was almost entirely lost in extraction —
 * only the storage-class line and a bare line number remain. Per the header
 * comment above, it unpins a shared buffer and adjusts CurrentResourceOwner;
 * recover the body from the upstream file.
 */
3455static void
3463
/*
 * NOTE(review): extraction stripped the signature, local declarations, the
 * Valgrind client request, the shared refcount decrement, the cleanup-lock
 * wakeup call, and the entry-forget call (bare numbers remain). The
 * surviving code handles the backend-local refcount bookkeeping.
 */
3464static void
3466{
3469
3471
3472 /* not moving as we're likely deleting it soon anyway */
3473 ref = GetPrivateRefCountEntry(b, false);
3474 Assert(ref != NULL);
3475 Assert(ref->data.refcount > 0);
3476 ref->data.refcount--;
3477 if (ref->data.refcount == 0)
3478 {
3480
3481 /*
3482 * Mark buffer non-accessible to Valgrind.
3483 *
3484 * Note that the buffer may have already been marked non-accessible
3485 * within access method code that enforces that buffers are only
3486 * accessed while a buffer lock is held.
3487 */
3489
3490 /*
3491 * I'd better not still hold the buffer content lock. Can't use
3492 * BufferIsLockedByMe(), as that asserts the buffer is pinned.
3493 */
3495
3496 /* decrement the shared reference count */
3498
3499 /* Support LockBufferForCleanup() */
3502
3504 }
3505}
3506
3507/*
3508 * Set up backend-local tracking of a buffer pinned the first time by this
3509 * backend.
3510 */
/*
 * NOTE(review): extraction stripped the signature, the refcount-entry
 * creation, the resource-owner remember call, and the start of the Valgrind
 * macro invocation (only its "BLCKSZ);" tail survived). Restore from the
 * upstream file before compiling.
 */
3511inline void
3513{
3515
3517 ref->data.refcount++;
3518
3520
3521 /*
3522 * This is the first pin for this page by this backend, mark its page as
3523 * defined to valgrind. While the page contents might not actually be
3524 * valid yet, we don't currently guarantee that such pages are marked
3525 * undefined or non-accessible.
3526 *
3527 * It's not necessarily the prettiest to do this here, but otherwise we'd
3528 * need this block of code in multiple places.
3529 */
3531 BLCKSZ);
3532}
3533
3534#define ST_SORT sort_checkpoint_bufferids
3535#define ST_ELEMENT_TYPE CkptSortItem
3536#define ST_COMPARE(a, b) ckpt_buforder_comparator(a, b)
3537#define ST_SCOPE static
3538#define ST_DEFINE
3539#include "lib/sort_template.h"
3540
3541/*
3542 * BufferSync -- Write out all dirty buffers in the pool.
3543 *
3544 * This is called at checkpoint time to write out all dirty shared buffers.
3545 * The checkpoint request flags should be passed in. If CHECKPOINT_FAST is
3546 * set, we disable delays between writes; if CHECKPOINT_IS_SHUTDOWN,
3547 * CHECKPOINT_END_OF_RECOVERY or CHECKPOINT_FLUSH_UNLOGGED is set, we write
3548 * even unlogged buffers, which are otherwise skipped. The remaining flags
3549 * currently have no effect here.
3550 */
3551static void
3552BufferSync(int flags)
3553{
3555 int buf_id;
3556 int num_to_scan;
3557 int num_spaces;
3558 int num_processed;
3559 int num_written;
3561 Oid last_tsid;
3563 int i;
3564 uint64 mask = BM_DIRTY;
3566
3567 /*
3568 * Unless this is a shutdown checkpoint or we have been explicitly told,
3569 * we write only permanent, dirty buffers. But at shutdown or end of
3570 * recovery, we write all dirty buffers.
3571 */
3574 mask |= BM_PERMANENT;
3575
3576 /*
3577 * Loop over all buffers, and mark the ones that need to be written with
3578 * BM_CHECKPOINT_NEEDED. Count them as we go (num_to_scan), so that we
3579 * can estimate how much work needs to be done.
3580 *
3581 * This allows us to write only those pages that were dirty when the
3582 * checkpoint began, and not those that get dirtied while it proceeds.
3583 * Whenever a page with BM_CHECKPOINT_NEEDED is written out, either by us
3584 * later in this function, or by normal backends or the bgwriter cleaning
3585 * scan, the flag is cleared. Any buffer dirtied after this point won't
3586 * have the flag set.
3587 *
3588 * Note that if we fail to write some buffer, we may leave buffers with
3589 * BM_CHECKPOINT_NEEDED still set. This is OK since any such buffer would
3590 * certainly need to be written for the next checkpoint attempt, too.
3591 */
3592 num_to_scan = 0;
3593 for (buf_id = 0; buf_id < NBuffers; buf_id++)
3594 {
3596 uint64 set_bits = 0;
3597
3598 /*
3599 * Header spinlock is enough to examine BM_DIRTY, see comment in
3600 * SyncOneBuffer.
3601 */
3603
3604 if ((buf_state & mask) == mask)
3605 {
3606 CkptSortItem *item;
3607
3609
3610 item = &CkptBufferIds[num_to_scan++];
3611 item->buf_id = buf_id;
3612 item->tsId = bufHdr->tag.spcOid;
3613 item->relNumber = BufTagGetRelNumber(&bufHdr->tag);
3614 item->forkNum = BufTagGetForkNum(&bufHdr->tag);
3615 item->blockNum = bufHdr->tag.blockNum;
3616 }
3617
3619 set_bits, 0,
3620 0);
3621
3622 /* Check for barrier events in case NBuffers is large. */
3625 }
3626
3627 if (num_to_scan == 0)
3628 return; /* nothing to do */
3629
3631
3633
3634 /*
3635 * Sort buffers that need to be written to reduce the likelihood of random
3636 * IO. The sorting is also important for the implementation of balancing
3637 * writes between tablespaces. Without balancing writes we'd potentially
3638 * end up writing to the tablespaces one-by-one; possibly overloading the
3639 * underlying system.
3640 */
3642
3643 num_spaces = 0;
3644
3645 /*
3646 * Allocate progress status for each tablespace with buffers that need to
3647 * be flushed. This requires the to-be-flushed array to be sorted.
3648 */
3650 for (i = 0; i < num_to_scan; i++)
3651 {
3652 CkptTsStatus *s;
3653 Oid cur_tsid;
3654
3656
3657 /*
3658 * Grow array of per-tablespace status structs, every time a new
3659 * tablespace is found.
3660 */
3662 {
3663 Size sz;
3664
3665 num_spaces++;
3666
3667 /*
3668 * Not worth adding grow-by-power-of-2 logic here - even with a
3669 * few hundred tablespaces this should be fine.
3670 */
3671 sz = sizeof(CkptTsStatus) * num_spaces;
3672
3673 if (per_ts_stat == NULL)
3675 else
3677
3678 s = &per_ts_stat[num_spaces - 1];
3679 memset(s, 0, sizeof(*s));
3680 s->tsId = cur_tsid;
3681
3682 /*
3683 * The first buffer in this tablespace. As CkptBufferIds is sorted
3684 * by tablespace all (s->num_to_scan) buffers in this tablespace
3685 * will follow afterwards.
3686 */
3687 s->index = i;
3688
3689 /*
3690 * progress_slice will be determined once we know how many buffers
3691 * are in each tablespace, i.e. after this loop.
3692 */
3693
3695 }
3696 else
3697 {
3698 s = &per_ts_stat[num_spaces - 1];
3699 }
3700
3701 s->num_to_scan++;
3702
3703 /* Check for barrier events. */
3706 }
3707
3708 Assert(num_spaces > 0);
3709
3710 /*
3711 * Build a min-heap over the write-progress in the individual tablespaces,
3712 * and compute how large a portion of the total progress a single
3713 * processed buffer is.
3714 */
3717 NULL);
3718
3719 for (i = 0; i < num_spaces; i++)
3720 {
3722
3723 ts_stat->progress_slice = (float8) num_to_scan / ts_stat->num_to_scan;
3724
3726 }
3727
3729
3730 /*
3731 * Iterate through to-be-checkpointed buffers and write the ones (still)
3732 * marked with BM_CHECKPOINT_NEEDED. The writes are balanced between
3733 * tablespaces; otherwise the sorting would lead to only one tablespace
3734 * receiving writes at a time, making inefficient use of the hardware.
3735 */
3736 num_processed = 0;
3737 num_written = 0;
3738 while (!binaryheap_empty(ts_heap))
3739 {
3743
3744 buf_id = CkptBufferIds[ts_stat->index].buf_id;
3745 Assert(buf_id != -1);
3746
3747 bufHdr = GetBufferDescriptor(buf_id);
3748
3749 num_processed++;
3750
3751 /*
3752 * We don't need to acquire the lock here, because we're only looking
3753 * at a single bit. It's possible that someone else writes the buffer
3754 * and clears the flag right after we check, but that doesn't matter
3755 * since SyncOneBuffer will then do nothing. However, there is a
3756 * further race condition: it's conceivable that between the time we
3757 * examine the bit here and the time SyncOneBuffer acquires the lock,
3758 * someone else not only wrote the buffer but replaced it with another
3759 * page and dirtied it. In that improbable case, SyncOneBuffer will
3760 * write the buffer though we didn't need to. It doesn't seem worth
3761 * guarding against this, though.
3762 */
3764 {
3765 if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
3766 {
3769 num_written++;
3770 }
3771 }
3772
3773 /*
3774 * Measure progress independent of actually having to flush the buffer
3775 * - otherwise writing become unbalanced.
3776 */
3777 ts_stat->progress += ts_stat->progress_slice;
3778 ts_stat->num_scanned++;
3779 ts_stat->index++;
3780
3781 /* Have all the buffers from the tablespace been processed? */
3782 if (ts_stat->num_scanned == ts_stat->num_to_scan)
3783 {
3785 }
3786 else
3787 {
3788 /* update heap with the new progress */
3790 }
3791
3792 /*
3793 * Sleep to throttle our I/O rate.
3794 *
3795 * (This will check for barrier events even if it doesn't sleep.)
3796 */
3797 CheckpointWriteDelay(flags, (double) num_processed / num_to_scan);
3798 }
3799
3800 /*
3801 * Issue all pending flushes. Only checkpointer calls BufferSync(), so
3802 * IOContext will always be IOCONTEXT_NORMAL.
3803 */
3805
3807 per_ts_stat = NULL;
3809
3810 /*
3811 * Update checkpoint statistics. As noted above, this doesn't include
3812 * buffers written by other backends or bgwriter scan.
3813 */
3815
3817}
3818
3819/*
3820 * BgBufferSync -- Write out some dirty buffers in the pool.
3821 *
3822 * This is called periodically by the background writer process.
3823 *
3824 * Returns true if it's appropriate for the bgwriter process to go into
3825 * low-power hibernation mode. (This happens if the strategy clock-sweep
3826 * has been "lapped" and no buffer allocations have occurred recently,
3827 * or if the bgwriter has been effectively disabled by setting
3828 * bgwriter_lru_maxpages to 0.)
3829 */
/*
 * NOTE(review): this text is a doxygen-HTML extraction, not the real source.
 * Lines made entirely of hyperlinked identifiers were dropped by the scrape,
 * leaving only bare source line numbers (e.g. "3837", "3879" below), and the
 * function signature (original line 3831 -- presumably taking the
 * WritebackContext* that "wb_context" below refers to; TODO confirm against
 * the real bufmgr.c) is among the lost lines.  Restore the genuine text from
 * src/backend/storage/buffer/bufmgr.c before making any code change here.
 */
3830bool
3832{
3833 /* info obtained from freelist.c */
3834 int strategy_buf_id;
3837
3838 /*
3839 * Information saved between calls so we can determine the strategy
3840 * point's advance rate and avoid scanning already-cleaned buffers.
3841 */
3842 static bool saved_info_valid = false;
3843 static int prev_strategy_buf_id;
3845 static int next_to_clean;
3846 static uint32 next_passes;
3847
3848 /* Moving averages of allocation rate and clean-buffer density */
3849 static float smoothed_alloc = 0;
3850 static float smoothed_density = 10.0;
3851
3852 /* Potentially these could be tunables, but for now, not */
3853 float smoothing_samples = 16;
3854 float scan_whole_pool_milliseconds = 120000.0;
3855
3856 /* Used to compute how far we scan ahead */
3857 long strategy_delta;
3858 int bufs_to_lap;
3859 int bufs_ahead;
3860 float scans_per_alloc;
3863 int min_scan_buffers;
3864
3865 /* Variables for the scanning loop proper */
3866 int num_to_scan;
3867 int num_written;
3868 int reusable_buffers;
3869
3870 /* Variables for final smoothed_density update */
3871 long new_strategy_delta;
3873
3874 /*
3875 * Find out where the clock-sweep currently is, and how many buffer
3876 * allocations have happened since our last call.
3877 */
3879
3880 /* Report buffer alloc counts to pgstat */
3882
3883 /*
3884 * If we're not running the LRU scan, just stop after doing the stats
3885 * stuff. We mark the saved state invalid so that we can recover sanely
3886 * if LRU scan is turned back on later.
3887 */
3888 if (bgwriter_lru_maxpages <= 0)
3889 {
3890 saved_info_valid = false;
3891 return true;
3892 }
3893
3894 /*
3895 * Compute strategy_delta = how many buffers have been scanned by the
3896 * clock-sweep since last time. If first time through, assume none. Then
3897 * see if we are still ahead of the clock-sweep, and if so, how many
3898 * buffers we could scan before we'd catch up with it and "lap" it. Note:
3899 * weird-looking coding of xxx_passes comparisons are to avoid bogus
3900 * behavior when the passes counts wrap around.
3901 */
3902 if (saved_info_valid)
3903 {
/*
 * NOTE(review): the strategy_delta computation (orig. lines 3904-3908) was
 * lost in extraction; only the Assert survives.
 */
3905
3908
3909 Assert(strategy_delta >= 0);
3910
3911 if ((int32) (next_passes - strategy_passes) > 0)
3912 {
3913 /* we're one pass ahead of the strategy point */
3915#ifdef BGW_DEBUG
3916 elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
3920#endif
3921 }
3922 else if (next_passes == strategy_passes &&
3924 {
3925 /* on same pass, but ahead or at least not behind */
3927#ifdef BGW_DEBUG
3928 elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
3932#endif
3933 }
3934 else
3935 {
3936 /*
3937 * We're behind, so skip forward to the strategy point and start
3938 * cleaning from there.
3939 */
3940#ifdef BGW_DEBUG
3941 elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
3945#endif
3949 }
3950 }
3951 else
3952 {
3953 /*
3954 * Initializing at startup or after LRU scanning had been off. Always
3955 * start at the strategy point.
3956 */
3957#ifdef BGW_DEBUG
3958 elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
3960#endif
3961 strategy_delta = 0;
3965 }
3966
3967 /* Update saved info for next time */
3970 saved_info_valid = true;
3971
3972 /*
3973 * Compute how many buffers had to be scanned for each new allocation, ie,
3974 * 1/density of reusable buffers, and track a moving average of that.
3975 *
3976 * If the strategy point didn't move, we don't update the density estimate
3977 */
3978 if (strategy_delta > 0 && recent_alloc > 0)
3979 {
3983 }
3984
3985 /*
3986 * Estimate how many reusable buffers there are between the current
3987 * strategy point and where we've scanned ahead to, based on the smoothed
3988 * density estimate.
3989 */
3992
3993 /*
3994 * Track a moving average of recent buffer allocations. Here, rather than
3995 * a true average we want a fast-attack, slow-decline behavior: we
3996 * immediately follow any increase.
3997 */
3998 if (smoothed_alloc <= (float) recent_alloc)
4000 else
4003
4004 /* Scale the estimate by a GUC to allow more aggressive tuning. */
4006
4007 /*
4008 * If recent_alloc remains at zero for many cycles, smoothed_alloc will
4009 * eventually underflow to zero, and the underflows produce annoying
4010 * kernel warnings on some platforms. Once upcoming_alloc_est has gone to
4011 * zero, there's no point in tracking smaller and smaller values of
4012 * smoothed_alloc, so just reset it to exactly zero to avoid this
4013 * syndrome. It will pop back up as soon as recent_alloc increases.
4014 */
4015 if (upcoming_alloc_est == 0)
4016 smoothed_alloc = 0;
4017
4018 /*
4019 * Even in cases where there's been little or no buffer allocation
4020 * activity, we want to make a small amount of progress through the buffer
4021 * cache so that as many reusable buffers as possible are clean after an
4022 * idle period.
4023 *
4024 * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many times
4025 * the BGW will be called during the scan_whole_pool time; slice the
4026 * buffer pool into that many sections.
4027 */
4029
4031 {
4032#ifdef BGW_DEBUG
4033 elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
4035#endif
4037 }
4038
4039 /*
4040 * Now write out dirty reusable buffers, working forward from the
4041 * next_to_clean point, until we have lapped the strategy scan, or cleaned
4042 * enough buffers to match our estimate of the next cycle's allocation
4043 * requirements, or hit the bgwriter_lru_maxpages limit.
4044 */
4045
4046 num_to_scan = bufs_to_lap;
4047 num_written = 0;
4049
4050 /* Execute the LRU scan */
4051 while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
4052 {
/*
 * NOTE(review): the SyncOneBuffer(...) call that assigns sync_state (orig.
 * line 4053) was lost; only its trailing argument line survives below.
 */
4054 wb_context);
4055
4056 if (++next_to_clean >= NBuffers)
4057 {
4058 next_to_clean = 0;
4059 next_passes++;
4060 }
4061 num_to_scan--;
4062
4063 if (sync_state & BUF_WRITTEN)
4064 {
4067 {
4069 break;
4070 }
4071 }
4072 else if (sync_state & BUF_REUSABLE)
4074 }
4075
4077
4078#ifdef BGW_DEBUG
4079 elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
4082 bufs_to_lap - num_to_scan,
4085#endif
4086
4087 /*
4088 * Consider the above scan as being like a new allocation scan.
4089 * Characterize its density and update the smoothed one based on it. This
4090 * effectively halves the moving average period in cases where both the
4091 * strategy and the background writer are doing some useful scanning,
4092 * which is helpful because a long memory isn't as desirable on the
4093 * density estimates.
4094 */
4095 new_strategy_delta = bufs_to_lap - num_to_scan;
4097 if (new_strategy_delta > 0 && new_recent_alloc > 0)
4098 {
4102
4103#ifdef BGW_DEBUG
4104 elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f",
4107#endif
4108 }
4109
4110 /* Return true if OK to hibernate */
4111 return (bufs_to_lap == 0 && recent_alloc == 0);
4112}
4113
4114/*
4115 * SyncOneBuffer -- process a single buffer during syncing.
4116 *
4117 * If skip_recently_used is true, we don't write currently-pinned buffers, nor
4118 * buffers marked recently used, as these are not replacement candidates.
4119 *
4120 * Returns a bitmask containing the following flag bits:
4121 * BUF_WRITTEN: we wrote the buffer.
4122 * BUF_REUSABLE: buffer is available for replacement, ie, it has
4123 * pin count 0 and usage count 0.
4124 *
4125 * (BUF_WRITTEN could be set in error if FlushBuffer finds the buffer clean
4126 * after locking it, but we don't care all that much.)
4127 */
/*
 * NOTE(review): extraction artifact.  The signature (orig. line 4129 --
 * presumably "SyncOneBuffer(int buf_id, bool skip_recently_used,
 * WritebackContext *wb_context)" given the header comment and callers;
 * TODO confirm) and several statements (pin/lock/flush calls, local
 * declarations) were dropped by the scrape; the bare numbers below mark
 * their positions.  Restore from the real bufmgr.c before editing.
 */
4128static int
4130{
4132 int result = 0;
4134 BufferTag tag;
4135
4136 /* Make sure we can handle the pin */
4139
4140 /*
4141 * Check whether buffer needs writing.
4142 *
4143 * We can make this check without taking the buffer content lock so long
4144 * as we mark pages dirty in access methods *before* logging changes with
4145 * XLogInsert(): if someone marks the buffer dirty just after our check we
4146 * don't worry because our checkpoint.redo points before log record for
4147 * upcoming changes and so we are not required to write such dirty buffer.
4148 */
4150
4153 {
4155 }
4156 else if (skip_recently_used)
4157 {
4158 /* Caller told us not to write recently-used buffers */
4160 return result;
4161 }
4162
4163 if (!(buf_state & BM_VALID) || !(buf_state & BM_DIRTY))
4164 {
4165 /* It's clean, so nothing to do */
4167 return result;
4168 }
4169
4170 /*
4171 * Pin it, share-exclusive-lock it, write it. (FlushBuffer will do
4172 * nothing if the buffer is clean by the time we've locked it.)
4173 */
4175
4177
4178 tag = bufHdr->tag;
4179
4181
4182 /*
4183 * SyncOneBuffer() is only called by checkpointer and bgwriter, so
4184 * IOContext will always be IOCONTEXT_NORMAL.
4185 */
4187
4188 return result | BUF_WRITTEN;
4189}
4190
4191/*
4192 * AtEOXact_Buffers - clean up at end of transaction.
4193 *
4194 * As of PostgreSQL 8.0, buffer pins should get released by the
4195 * ResourceOwner mechanism. This routine is just a debugging
4196 * cross-check that no pins remain.
4197 */
4198void
/*
 * NOTE(review): extraction artifact -- the signature and entire body of
 * AtEOXact_Buffers (orig. lines 4199-4206) were dropped by the HTML scrape;
 * only the bare line number below remains.  Per the header comment above,
 * this is the end-of-transaction debugging cross-check that no buffer pins
 * remain.  Restore the genuine text from bufmgr.c before editing.
 */
4207
4208/*
4209 * Initialize access to shared buffer pool
4210 *
4211 * This is called during backend startup (whether standalone or under the
4212 * postmaster). It sets up for this backend's access to the already-existing
4213 * buffer pool.
4214 */
4215void
/*
 * NOTE(review): extraction artifact -- the signature (orig. line 4216) and
 * the statements at the bare line numbers below (the pin-limit computation
 * and related setup) were dropped by the scrape.  Restore from bufmgr.c
 * before editing.
 */
4217{
4218 /*
4219 * An advisory limit on the number of pins each backend should hold, based
4220 * on shared_buffers and the maximum number of connections possible.
4221 * That's very pessimistic, but outside toy-sized shared_buffers it should
4222 * allow plenty of pins. LimitAdditionalPins() and
4223 * GetAdditionalPinLimit() can be used to check the remaining balance.
4224 */
4226
4229
4231
4232 /*
4233 * AtProcExit_Buffers needs LWLock access, and thereby has to be called at
4234 * the corresponding phase of backend shutdown.
4235 */
4236 Assert(MyProc != NULL);
4238}
4239
4240/*
4241 * During backend exit, ensure that we released all shared-buffer locks and
4242 * assert that we have no remaining pins.
4243 */
4244static void
/*
 * NOTE(review): extraction artifact -- the signature (orig. line 4245) and
 * the statements at the bare numbers below (presumably the shared-buffer
 * leak check and the local-buffer cleanup call; TODO confirm) were dropped
 * by the scrape.  Restore from bufmgr.c before editing.
 */
4246{
4247 UnlockBuffers();
4248
4250
4251 /* localbuf.c needs a chance too */
4253}
4254
4255/*
4256 * CheckForBufferLeaks - ensure this backend holds no buffer pins
4257 *
4258 * As of PostgreSQL 8.0, buffer pins should get released by the
4259 * ResourceOwner mechanism. This routine is just a debugging
4260 * cross-check that no pins remain.
4261 */
/*
 * NOTE(review): extraction artifact -- the signature (orig. line 4263), the
 * declaration of "res", the refcount tests guarding each branch, and the
 * RefCountErrors increments were dropped by the scrape (bare numbers below).
 * What survives shows the intended shape: warn (via DebugPrintBufferRefcount,
 * presumably -- TODO confirm) for each leaked pin found in the fixed array
 * and then in the overflow hash, and Assert none were found.  Restore from
 * bufmgr.c before editing.
 */
4262static void
4264{
4265#ifdef USE_ASSERT_CHECKING
4266 int RefCountErrors = 0;
4268 int i;
4269 char *s;
4270
4271 /* check the array */
4272 for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
4273 {
4275 {
4276 res = &PrivateRefCountArray[i];
4277
4279 elog(WARNING, "buffer refcount leak: %s", s);
4280 pfree(s);
4281
4283 }
4284 }
4285
4286 /* if necessary search the hash */
4288 {
4289 refcount_iterator iter;
4290
4292 while ((res = refcount_iterate(PrivateRefCountHash, &iter)) != NULL)
4293 {
4295 elog(WARNING, "buffer refcount leak: %s", s);
4296 pfree(s);
4298 }
4299 }
4300
4301 Assert(RefCountErrors == 0);
4302#endif
4303}
4304
4305#ifdef USE_ASSERT_CHECKING
4306/*
4307 * Check for exclusive-locked catalog buffers. This is the core of
4308 * AssertCouldGetRelation().
4309 *
4310 * A backend would self-deadlock on the content lock if the catalog scan read
4311 * the exclusive-locked buffer. The main threat is exclusive-locked buffers
4312 * of catalogs used in relcache, because a catcache search on any catalog may
4313 * build that catalog's relcache entry. We don't have an inventory of
4314 * catalogs relcache uses, so just check buffers of most catalogs.
4315 *
4316 * It's better to minimize waits while holding an exclusive buffer lock, so it
4317 * would be nice to broaden this check not to be catalog-specific. However,
4318 * bttextcmp() accesses pg_collation, and non-core opclasses might similarly
4319 * read tables. That is deadlock-free as long as there's no loop in the
4320 * dependency graph: modifying table A may cause an opclass to read table B,
4321 * but it must not cause a read of table A.
4322 */
/*
 * NOTE(review): extraction artifact -- the signature (orig. line 4324), the
 * declaration of "res", the per-entry guard conditions, and the calls made
 * on each held buffer (presumably AssertNotCatalogBufferLock, defined just
 * below; TODO confirm) were dropped by the scrape (bare numbers below).
 * Restore from bufmgr.c before editing.
 */
4323void
4325{
4327
4328 /* check the array */
4329 for (int i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
4330 {
4332 {
4333 res = &PrivateRefCountArray[i];
4334
4335 if (res->buffer == InvalidBuffer)
4336 continue;
4337
4339 }
4340 }
4341
4342 /* if necessary search the hash */
4344 {
4345 refcount_iterator iter;
4346
4348 while ((res = refcount_iterate(PrivateRefCountHash, &iter)) != NULL)
4349 {
4351 }
4352 }
4353}
4354
/*
 * NOTE(review): extraction artifact -- the signature (orig. line 4356), the
 * declaration of bufHdr, the early-return test (orig. lines 4362-4363,
 * presumably "not exclusively locked"; TODO confirm), and the final
 * assertion call (orig. line 4380) were dropped by the scrape.  Restore
 * from bufmgr.c before editing.
 */
4355static void
4357{
4359 BufferTag tag;
4360 Oid relid;
4361
4363 return;
4364
4365 tag = bufHdr->tag;
4366
4367 /*
4368 * This relNumber==relid assumption holds until a catalog experiences
4369 * VACUUM FULL or similar. After a command like that, relNumber will be
4370 * in the normal (non-catalog) range, and we lose the ability to detect
4371 * hazardous access to that catalog. Calling RelidByRelfilenumber() would
4372 * close that gap, but RelidByRelfilenumber() might then deadlock with a
4373 * held lock.
4374 */
4375 relid = tag.relNumber;
4376
4377 if (IsCatalogTextUniqueIndexOid(relid)) /* see comments at the callee */
4378 return;
4380
4381}
4382#endif
4383
4384
4385/*
4386 * Helper routine to issue warnings when a buffer is unexpectedly pinned
4387 */
/*
 * NOTE(review): extraction artifact -- the signature (orig. line 4389,
 * presumably taking "Buffer buffer" given the uses below; TODO confirm),
 * the buf_state declaration, the local/shared descriptor lookups, and two
 * psprintf argument lines were dropped by the scrape (bare numbers below).
 * Builds a human-readable description of a pinned buffer for leak warnings.
 * Restore from bufmgr.c before editing.
 */
4388char *
4390{
4391 BufferDesc *buf;
4393 char *result;
4394 ProcNumber backend;
4396
4398 if (BufferIsLocal(buffer))
4399 {
4402 backend = MyProcNumber;
4403 }
4404 else
4405 {
4408 backend = INVALID_PROC_NUMBER;
4409 }
4410
4411 /* theoretically we should lock the bufHdr here */
4412 buf_state = pg_atomic_read_u64(&buf->state);
4413
4414 result = psprintf("[%03d] (rel=%s, blockNum=%u, flags=0x%" PRIx64 ", refcount=%u %d)",
4415 buffer,
4417 BufTagGetForkNum(&buf->tag)).str,
4418 buf->tag.blockNum, buf_state & BUF_FLAG_MASK,
4420 return result;
4421}
4422
4423/*
4424 * CheckPointBuffers
4425 *
4426 * Flush all dirty blocks in buffer pool to disk at checkpoint time.
4427 *
4428 * Note: temporary relations do not participate in checkpoints, so they don't
4429 * need to be flushed.
4430 */
4431void
/*
 * NOTE(review): extraction artifact -- the signature line (orig. 4432,
 * presumably "CheckPointBuffers(int flags)" given the body; TODO confirm)
 * was dropped by the scrape.  Simply delegates the checkpoint-time flush of
 * all dirty shared buffers to BufferSync().
 */
4433{
4434 BufferSync(flags);
4435}
4436
4437/*
4438 * BufferGetBlockNumber
4439 * Returns the block number associated with a buffer.
4440 *
4441 * Note:
4442 * Assumes that the buffer is valid and pinned, else the
4443 * value may be obsolete immediately...
4444 */
/*
 * NOTE(review): extraction artifact -- the return type and name lines
 * (orig. 4445-4446, "BlockNumber BufferGetBlockNumber(Buffer buffer)" per
 * the header comment above; TODO confirm), the bufHdr declaration, the
 * pinned-buffer assertion, and the two descriptor-lookup statements were
 * dropped by the scrape (bare numbers below).  Restore from bufmgr.c
 * before editing.
 */
4447{
4449
4451
4452 if (BufferIsLocal(buffer))
4454 else
4456
4457 /* pinned, so OK to read tag without spinlock */
4458 return bufHdr->tag.blockNum;
4459}
4460
4461/*
4462 * BufferGetTag
4463 * Returns the relfilelocator, fork number and block number associated with
4464 * a buffer.
4465 */
/*
 * NOTE(review): extraction artifact -- the signature (orig. lines
 * 4467-4468; per the header comment it returns the relfilelocator, fork
 * and block number through the out-parameters used below), the bufHdr
 * declaration, the pin assertion, and the descriptor-lookup statements
 * were dropped by the scrape (bare numbers below).  Restore from bufmgr.c
 * before editing.
 */
4466void
4469{
4471
4472 /* Do the same checks as BufferGetBlockNumber. */
4474
4475 if (BufferIsLocal(buffer))
4477 else
4479
4480 /* pinned, so OK to read tag without spinlock */
4481 *rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
4482 *forknum = BufTagGetForkNum(&bufHdr->tag);
4483 *blknum = bufHdr->tag.blockNum;
4484}
4485
4486/*
4487 * FlushBuffer
4488 * Physically write out a shared buffer.
4489 *
4490 * NOTE: this actually just passes the buffer contents to the kernel; the
4491 * real write to disk won't happen until the kernel feels like it. This
4492 * is okay from our point of view since we can redo the changes from WAL.
4493 * However, we will need to force the changes to disk via fsync before
4494 * we can checkpoint WAL.
4495 *
4496 * The caller must hold a pin on the buffer and have
4497 * (share-)exclusively-locked the buffer contents.
4498 *
4499 * If the caller has an smgr reference for the buffer's relation, pass it
4500 * as the second parameter. If not, pass NULL.
4501 */
/*
 * NOTE(review): extraction artifact -- the signature (orig. lines
 * 4503-4504, taking the BufferDesc and the optional SMgrRelation "reln"
 * per the header comment), several local declarations (bufBlock, the
 * buffer LSN), the errcallback.callback assignment, the smgropen/XLogFlush/
 * smgrwrite and pgstat/trace calls were dropped by the scrape (bare
 * numbers below).  Restore from bufmgr.c before editing.
 */
4502static void
4505{
4507 ErrorContextCallback errcallback;
4510
4513
4514 /*
4515 * Try to start an I/O operation. If StartBufferIO returns false, then
4516 * someone else flushed the buffer before we could, so we need not do
4517 * anything.
4518 */
4519 if (StartSharedBufferIO(buf, false, true, NULL) == BUFFER_IO_ALREADY_DONE)
4520 return;
4521
4522 /* Setup error traceback support for ereport() */
4524 errcallback.arg = buf;
4525 errcallback.previous = error_context_stack;
4526 error_context_stack = &errcallback;
4527
4528 /* Find smgr relation for buffer */
4529 if (reln == NULL)
4531
4533 buf->tag.blockNum,
4534 reln->smgr_rlocator.locator.spcOid,
4535 reln->smgr_rlocator.locator.dbOid,
4536 reln->smgr_rlocator.locator.relNumber);
4537
4538 /*
4539 * As we hold at least a share-exclusive lock on the buffer, the LSN
4540 * cannot change during the flush (and thus can't be torn).
4541 */
4543
4544 /*
4545 * Force XLOG flush up to buffer's LSN. This implements the basic WAL
4546 * rule that log updates must hit disk before any of the data-file changes
4547 * they describe do.
4548 *
4549 * However, this rule does not apply to unlogged relations, which will be
4550 * lost after a crash anyway. Most unlogged relation pages do not bear
4551 * LSNs since we never emit WAL records for them, and therefore flushing
4552 * up through the buffer LSN would be useless, but harmless. However,
4553 * some index AMs use LSNs internally to detect concurrent page
4554 * modifications, and therefore unlogged index pages bear "fake" LSNs
4555 * generated by XLogGetFakeLSN. It is unlikely but possible that the fake
4556 * LSN counter could advance past the WAL insertion point; and if it did
4557 * happen, attempting to flush WAL through that location would fail, with
4558 * disastrous system-wide consequences. To make sure that can't happen,
4559 * skip the flush if the buffer isn't permanent.
4560 */
4561 if (pg_atomic_read_u64(&buf->state) & BM_PERMANENT)
4563
4564 /*
4565 * Now it's safe to write the buffer to disk. Note that no one else should
4566 * have been able to write it, while we were busy with log flushing,
4567 * because we got the exclusive right to perform I/O by setting the
4568 * BM_IO_IN_PROGRESS bit.
4569 */
4571
4572 /* Update page checksum if desired. */
4573 PageSetChecksum((Page) bufBlock, buf->tag.blockNum);
4574
4576
4578 BufTagGetForkNum(&buf->tag),
4579 buf->tag.blockNum,
4580 bufBlock,
4581 false);
4582
4583 /*
4584 * When a strategy is in use, only flushes of dirty buffers already in the
4585 * strategy ring are counted as strategy writes (IOCONTEXT
4586 * [BULKREAD|BULKWRITE|VACUUM] IOOP_WRITE) for the purpose of IO
4587 * statistics tracking.
4588 *
4589 * If a shared buffer initially added to the ring must be flushed before
4590 * being used, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE.
4591 *
4592 * If a shared buffer which was added to the ring later because the
4593 * current strategy buffer is pinned or in use or because all strategy
4594 * buffers were dirty and rejected (for BAS_BULKREAD operations only)
4595 * requires flushing, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE
4596 * (from_ring will be false).
4597 *
4598 * When a strategy is not in use, the write can only be a "regular" write
4599 * of a dirty shared buffer (IOCONTEXT_NORMAL IOOP_WRITE).
4600 */
4603
4605
4606 /*
4607 * Mark the buffer as clean and end the BM_IO_IN_PROGRESS state.
4608 */
4609 TerminateBufferIO(buf, true, 0, true, false);
4611
4612 buf->tag.blockNum,
4613 reln->smgr_rlocator.locator.spcOid,
4614 reln->smgr_rlocator.locator.dbOid,
4615 reln->smgr_rlocator.locator.relNumber);
4616
4617 /* Pop the error context stack */
4618 error_context_stack = errcallback.previous;
4619}
4620
4621/*
4622 * Convenience wrapper around FlushBuffer() that locks/unlocks the buffer
4623 * before/after calling FlushBuffer().
4624 */
4625static void
/*
 * NOTE(review): extraction artifact -- the signature and entire body of
 * this function (orig. lines 4626-4634) were dropped by the HTML scrape;
 * only the bare line number below remains.  Per the header comment above,
 * it is the convenience wrapper that takes the buffer content lock, calls
 * FlushBuffer(), and releases the lock.  Restore from bufmgr.c before
 * editing.
 */
4635
4636/*
4637 * RelationGetNumberOfBlocksInFork
4638 * Determines the current number of pages in the specified relation fork.
4639 *
4640 * Note that the accuracy of the result will depend on the details of the
4641 * relation's storage. For builtin AMs it'll be accurate, but for external AMs
4642 * it might not be.
4643 */
/*
 * NOTE(review): extraction artifact -- the return type and name lines
 * (orig. 4644-4645, "BlockNumber RelationGetNumberOfBlocksInFork(Relation
 * relation, ForkNumber forkNum)" per the header comment and body; TODO
 * confirm) and the szbytes declaration (orig. line 4655) were dropped by
 * the scrape.  Restore from bufmgr.c before editing.
 */
4646{
4647 if (RELKIND_HAS_TABLE_AM(relation->rd_rel->relkind))
4648 {
4649 /*
4650 * Not every table AM uses BLCKSZ wide fixed size blocks. Therefore
4651 * tableam returns the size in bytes - but for the purpose of this
4652 * routine, we want the number of blocks. Therefore divide, rounding
4653 * up.
4654 */
4656
4657 szbytes = table_relation_size(relation, forkNum);
4658
4659 return (szbytes + (BLCKSZ - 1)) / BLCKSZ;
4660 }
4661 else if (RELKIND_HAS_STORAGE(relation->rd_rel->relkind))
4662 {
4663 return smgrnblocks(RelationGetSmgr(relation), forkNum);
4664 }
4665 else
4666 Assert(false);
4667
4668 return 0; /* keep compiler quiet */
4669}
4670
4671/*
4672 * BufferIsPermanent
4673 * Determines whether a buffer will potentially still be around after
4674 * a crash. Caller must hold a buffer pin.
4675 */
/*
 * NOTE(review): extraction artifact -- the signature (orig. line 4677,
 * presumably "BufferIsPermanent(Buffer buffer)"; TODO confirm), the bufHdr
 * declaration, and the validity/pin assertions were dropped by the scrape
 * (bare numbers below).  Restore from bufmgr.c before editing.
 */
4676bool
4678{
4680
4681 /* Local buffers are used only for temp relations. */
4682 if (BufferIsLocal(buffer))
4683 return false;
4684
4685 /* Make sure we've got a real buffer, and that we hold a pin on it. */
4688
4689 /*
4690 * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
4691 * need not bother with the buffer header spinlock. Even if someone else
4692 * changes the buffer header state while we're doing this, the state is
4693 * changed atomically, so we'll read the old value or the new value, but
4694 * not random garbage.
4695 */
4697 return (pg_atomic_read_u64(&bufHdr->state) & BM_PERMANENT) != 0;
4698}
4699
4700/*
4701 * BufferGetLSNAtomic
4702 * Retrieves the LSN of the buffer atomically.
4703 *
4704 * This is necessary for some callers who may only hold a share lock on
4705 * the buffer. A share lock allows a concurrent backend to set hint bits
4706 * on the page, which in turn may require a WAL record to be emitted.
4707 *
4708 * On platforms with 8 byte atomic reads/writes, we don't need to do any
4709 * additional locking. On platforms not supporting such 8 byte atomic
4710 * reads/writes, we need to actually take the header lock.
4711 */
/*
 * NOTE(review): extraction artifact -- the return type and name lines
 * (orig. 4712-4713, "XLogRecPtr BufferGetLSNAtomic(Buffer buffer)" per the
 * header comment; TODO confirm), the pin assertions, the fast-path return
 * on the 8-byte-atomicity branch, the locking-needed test, and the header
 * lock/unlock calls in the fallback branch were dropped by the scrape
 * (bare numbers below).  Restore from bufmgr.c before editing.
 */
4714{
4715 /* Make sure we've got a real buffer, and that we hold a pin on it. */
4718
4719#ifdef PG_HAVE_8BYTE_SINGLE_COPY_ATOMICITY
4721#else
4722 {
4723 char *page = BufferGetPage(buffer);
4725 XLogRecPtr lsn;
4726
4727 /*
4728 * If we don't need locking for correctness, fastpath out.
4729 */
4731 return PageGetLSN(page);
4732
4735 lsn = PageGetLSN(page);
4737
4738 return lsn;
4739 }
4740#endif
4741}
4742
4743/* ---------------------------------------------------------------------
4744 * DropRelationBuffers
4745 *
4746 * This function removes from the buffer pool all the pages of the
4747 * specified relation forks that have block numbers >= firstDelBlock.
4748 * (In particular, with firstDelBlock = 0, all pages are removed.)
4749 * Dirty pages are simply dropped, without bothering to write them
4750 * out first. Therefore, this is NOT rollback-able, and so should be
4751 * used only with extreme caution!
4752 *
4753 * Currently, this is called only from smgr.c when the underlying file
4754 * is about to be deleted or truncated (firstDelBlock is needed for
4755 * the truncation case). The data in the affected pages would therefore
4756 * be deleted momentarily anyway, and there is no point in writing it.
4757 * It is the responsibility of higher-level code to ensure that the
4758 * deletion or truncation does not lose any data that could be needed
4759 * later. It is also the responsibility of higher-level code to ensure
4760 * that no other process could be trying to load more pages of the
4761 * relation into buffers.
4762 * --------------------------------------------------------------------
4763 */
/*
 * NOTE(review): extraction artifact -- the signature (orig. lines
 * 4765-4766, taking the SMgrRelation, fork array, nforks and firstDelBlock
 * array per the body and the header comment), the nBlocksToInvalidate
 * accumulator, the per-fork smgrnblocks_cached() lookups, the
 * BUF_DROP_FULL_SCAN_THRESHOLD test, the per-buffer header lock/unlock
 * calls, and several other statements were dropped by the scrape (bare
 * numbers below).  Restore from bufmgr.c before editing.
 */
4764void
4767{
4768 int i;
4769 int j;
4770 RelFileLocatorBackend rlocator;
4773
4774 rlocator = smgr_reln->smgr_rlocator;
4775
4776 /* If it's a local relation, it's localbuf.c's problem. */
4777 if (RelFileLocatorBackendIsTemp(rlocator))
4778 {
4779 if (rlocator.backend == MyProcNumber)
4780 DropRelationLocalBuffers(rlocator.locator, forkNum, nforks,
4782
4783 return;
4784 }
4785
4786 /*
4787 * To remove all the pages of the specified relation forks from the buffer
4788 * pool, we need to scan the entire buffer pool but we can optimize it by
4789 * finding the buffers from BufMapping table provided we know the exact
4790 * size of each fork of the relation. The exact size is required to ensure
4791 * that we don't leave any buffer for the relation being dropped as
4792 * otherwise the background writer or checkpointer can lead to a PANIC
4793 * error while flushing buffers corresponding to files that don't exist.
4794 *
4795 * To know the exact size, we rely on the size cached for each fork by us
4796 * during recovery which limits the optimization to recovery and on
4797 * standbys but we can easily extend it once we have shared cache for
4798 * relation size.
4799 *
4800 * In recovery, we cache the value returned by the first lseek(SEEK_END)
4801 * and the future writes keeps the cached value up-to-date. See
4802 * smgrextend. It is possible that the value of the first lseek is smaller
4803 * than the actual number of existing blocks in the file due to buggy
4804 * Linux kernels that might not have accounted for the recent write. But
4805 * that should be fine because there must not be any buffers after that
4806 * file size.
4807 */
4808 for (i = 0; i < nforks; i++)
4809 {
4810 /* Get the number of blocks for a relation's fork */
4812
4814 {
4816 break;
4817 }
4818
4819 /* calculate the number of blocks to be invalidated */
4821 }
4822
4823 /*
4824 * We apply the optimization iff the total number of blocks to invalidate
4825 * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
4826 */
4829 {
4830 for (j = 0; j < nforks; j++)
4831 FindAndDropRelationBuffers(rlocator.locator, forkNum[j],
4833 return;
4834 }
4835
4836 for (i = 0; i < NBuffers; i++)
4837 {
4839
4840 /*
4841 * We can make this a tad faster by prechecking the buffer tag before
4842 * we attempt to lock the buffer; this saves a lot of lock
4843 * acquisitions in typical cases. It should be safe because the
4844 * caller must have AccessExclusiveLock on the relation, or some other
4845 * reason to be certain that no one is loading new pages of the rel
4846 * into the buffer pool. (Otherwise we might well miss such pages
4847 * entirely.) Therefore, while the tag might be changing while we
4848 * look at it, it can't be changing *to* a value we care about, only
4849 * *away* from such a value. So false negatives are impossible, and
4850 * false positives are safe because we'll recheck after getting the
4851 * buffer lock.
4852 *
4853 * We could check forkNum and blockNum as well as the rlocator, but
4854 * the incremental win from doing so seems small.
4855 */
4856 if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator))
4857 continue;
4858
4860
4861 for (j = 0; j < nforks; j++)
4862 {
4863 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator) &&
4864 BufTagGetForkNum(&bufHdr->tag) == forkNum[j] &&
4865 bufHdr->tag.blockNum >= firstDelBlock[j])
4866 {
4867 InvalidateBuffer(bufHdr); /* releases spinlock */
4868 break;
4869 }
4870 }
4871 if (j >= nforks)
4873 }
4874}
4875
4876/* ---------------------------------------------------------------------
4877 * DropRelationsAllBuffers
4878 *
4879 * This function removes from the buffer pool all the pages of all
4880 * forks of the specified relations. It's equivalent to calling
4881 * DropRelationBuffers once per fork per relation with firstDelBlock = 0.
4882 * --------------------------------------------------------------------
4883 */
4884void
4886{
/*
 * NOTE(review): this listing is a lossy extraction — the signature line
 * (4885: DropRelationsAllBuffers(SMgrRelation *smgr_reln, int nlocators),
 * per the header comment above) and several declarations/statements whose
 * line numbers are missing below (e.g. nBlocksToInvalidate, locators,
 * use_bsearch assignment, qsort call) were dropped. Verify against
 * upstream bufmgr.c before relying on this text.
 */
 4887 int i;
 4888 int n = 0;
 4889 SMgrRelation *rels;
 4890 BlockNumber (*block)[MAX_FORKNUM + 1];
 4893 bool cached = true;
 4894 bool use_bsearch;
 4895
 4896 if (nlocators == 0)
 4897 return;
 4898
 4899 rels = palloc_array(SMgrRelation, nlocators); /* non-local relations */
 4900
 4901 /* If it's a local relation, it's localbuf.c's problem. */
 4902 for (i = 0; i < nlocators; i++)
 4903 {
 4904 if (RelFileLocatorBackendIsTemp(smgr_reln[i]->smgr_rlocator))
 4905 {
 4906 if (smgr_reln[i]->smgr_rlocator.backend == MyProcNumber)
 4907 DropRelationAllLocalBuffers(smgr_reln[i]->smgr_rlocator.locator);
 4908 }
 4909 else
 4910 rels[n++] = smgr_reln[i];
 4911 }
 4912
 4913 /*
 4914 * If there are no non-local relations, then we're done. Release the
 4915 * memory and return.
 4916 */
 4917 if (n == 0)
 4918 {
 4919 pfree(rels);
 4920 return;
 4921 }
 4922
 4923 /*
 4924 * This is used to remember the number of blocks for all the relations
 4925 * forks.
 4926 */
 4927 block = (BlockNumber (*)[MAX_FORKNUM + 1])
 4928 palloc(sizeof(BlockNumber) * n * (MAX_FORKNUM + 1));
 4929
 4930 /*
 4931 * We can avoid scanning the entire buffer pool if we know the exact size
 4932 * of each of the given relation forks. See DropRelationBuffers.
 4933 */
 4934 for (i = 0; i < n && cached; i++)
 4935 {
 4936 for (int j = 0; j <= MAX_FORKNUM; j++)
 4937 {
 4938 /* Get the number of blocks for a relation's fork. */
 4939 block[i][j] = smgrnblocks_cached(rels[i], j);
 4940
 4941 /* We need to only consider the relation forks that exist. */
 4942 if (block[i][j] == InvalidBlockNumber)
 4943 {
 4944 if (!smgrexists(rels[i], j))
 4945 continue;
 4946 cached = false;
 4947 break;
 4948 }
 4949
 4950 /* calculate the total number of blocks to be invalidated */
 4951 nBlocksToInvalidate += block[i][j];
 4952 }
 4953 }
 4954
 4955 /*
 4956 * We apply the optimization iff the total number of blocks to invalidate
 4957 * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
 4958 */
/* NOTE(review): the if-condition line (4959, testing 'cached' and the
 * threshold per the comment above) was dropped by the extraction. */
 4960 {
 4961 for (i = 0; i < n; i++)
 4962 {
 4963 for (int j = 0; j <= MAX_FORKNUM; j++)
 4964 {
 4965 /* ignore relation forks that don't exist */
 4966 if (!BlockNumberIsValid(block[i][j]))
 4967 continue;
 4968
 4969 /* drop all the buffers for a particular relation fork */
 4970 FindAndDropRelationBuffers(rels[i]->smgr_rlocator.locator,
 4971 j, block[i][j], 0);
 4972 }
 4973 }
 4974
 4975 pfree(block);
 4976 pfree(rels);
 4977 return;
 4978 }
 4979
 4980 pfree(block);
 4981 locators = palloc_array(RelFileLocator, n); /* non-local relations */
 4982 for (i = 0; i < n; i++)
 4983 locators[i] = rels[i]->smgr_rlocator.locator;
 4984
 4985 /*
 4986 * For low number of relations to drop just use a simple walk through, to
 4987 * save the bsearch overhead. The threshold to use is rather a guess than
 4988 * an exactly determined value, as it depends on many factors (CPU and RAM
 4989 * speeds, amount of shared buffers etc.).
 4990 */
 4992
 4993 /* sort the list of rlocators if necessary */
 4994 if (use_bsearch)
/* NOTE(review): the qsort(locators, ...) call line (4995) was dropped. */
 4996
 4997 for (i = 0; i < NBuffers; i++)
 4998 {
 4999 RelFileLocator *rlocator = NULL;
 5001
 5002 /*
 5003 * As in DropRelationBuffers, an unlocked precheck should be safe and
 5004 * saves some cycles.
 5005 */
 5006
 5007 if (!use_bsearch)
 5008 {
 5009 int j;
 5010
 5011 for (j = 0; j < n; j++)
 5012 {
/* NOTE(review): the per-locator match condition line (5013) was dropped. */
 5014 {
 5015 rlocator = &locators[j];
 5016 break;
 5017 }
 5018 }
 5019 }
 5020 else
 5021 {
 5022 RelFileLocator locator;
 5023
 5024 locator = BufTagGetRelFileLocator(&bufHdr->tag);
 5025 rlocator = bsearch(&locator,
 5026 locators, n, sizeof(RelFileLocator),
 5028 }
 5029
 5030 /* buffer doesn't belong to any of the given relfilelocators; skip it */
 5031 if (rlocator == NULL)
 5032 continue;
 5033
 5035 if (BufTagMatchesRelFileLocator(&bufHdr->tag, rlocator))
 5036 InvalidateBuffer(bufHdr); /* releases spinlock */
 5037 else
 5039 }
 5040
 5041 pfree(locators);
 5042 pfree(rels);
 5043}
5044
5045/* ---------------------------------------------------------------------
5046 * FindAndDropRelationBuffers
5047 *
 5048 * This function performs a lookup in the BufMapping table and removes from
 5049 * the buffer pool all pages of the specified relation fork that have block
 5050 * numbers >= firstDelBlock. (In particular, with firstDelBlock = 0, all
5051 * pages are removed.)
5052 * --------------------------------------------------------------------
5053 */
5054static void
5058{
/*
 * NOTE(review): the extraction dropped the parameter list (lines
 * 5055-5057: rlocator, forkNum, nForkBlock, firstDelBlock, judging by the
 * uses below), the bufHdr declaration, and the buffer-mapping partition
 * lock acquire/release lines — confirm against upstream bufmgr.c.
 */
 5059 BlockNumber curBlock;
 5060
 5061 for (curBlock = firstDelBlock; curBlock < nForkBlock; curBlock++)
 5062 {
 5063 uint32 bufHash; /* hash value for tag */
 5064 BufferTag bufTag; /* identity of requested block */
 5065 LWLock *bufPartitionLock; /* buffer partition lock for it */
 5066 int buf_id;
 5068
 5069 /* create a tag so we can lookup the buffer */
 5070 InitBufferTag(&bufTag, &rlocator, forkNum, curBlock);
 5071
 5072 /* determine its hash code and partition lock ID */
 5075
 5076 /* Check that it is in the buffer pool. If not, do nothing. */
 5078 buf_id = BufTableLookup(&bufTag, bufHash);
 5080
 5081 if (buf_id < 0)
 5082 continue;
 5083
 5084 bufHdr = GetBufferDescriptor(buf_id);
 5085
 5086 /*
 5087 * We need to lock the buffer header and recheck if the buffer is
 5088 * still associated with the same block because the buffer could be
 5089 * evicted by some other backend loading blocks for a different
 5090 * relation after we release lock on the BufMapping table.
 5091 */
 5093
 5094 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator) &&
 5095 BufTagGetForkNum(&bufHdr->tag) == forkNum &&
 5096 bufHdr->tag.blockNum >= firstDelBlock)
 5097 InvalidateBuffer(bufHdr); /* releases spinlock */
 5098 else
 5100 }
 5101}
5102
5103/* ---------------------------------------------------------------------
5104 * DropDatabaseBuffers
5105 *
5106 * This function removes all the buffers in the buffer cache for a
5107 * particular database. Dirty pages are simply dropped, without
5108 * bothering to write them out first. This is used when we destroy a
5109 * database, to avoid trying to flush data to disk when the directory
5110 * tree no longer exists. Implementation is pretty similar to
5111 * DropRelationBuffers() which is for destroying just one relation.
5112 * --------------------------------------------------------------------
5113 */
5114void
5116{
/*
 * NOTE(review): the signature (5115, taking the target database's Oid
 * 'dbid' per the uses below), the bufHdr = GetBufferDescriptor(i) line,
 * and the buffer-header lock/unlock lines around the recheck were dropped
 * by the extraction — confirm against upstream bufmgr.c.
 */
 5117 int i;
 5118
 5119 /*
 5120 * We needn't consider local buffers, since by assumption the target
 5121 * database isn't our own.
 5122 */
 5123
 5124 for (i = 0; i < NBuffers; i++)
 5125 {
 5127
 5128 /*
 5129 * As in DropRelationBuffers, an unlocked precheck should be safe and
 5130 * saves some cycles.
 5131 */
 5132 if (bufHdr->tag.dbOid != dbid)
 5133 continue;
 5134
 5136 if (bufHdr->tag.dbOid == dbid)
 5137 InvalidateBuffer(bufHdr); /* releases spinlock */
 5138 else
 5140 }
 5141}
5142
5143/* ---------------------------------------------------------------------
5144 * FlushRelationBuffers
5145 *
5146 * This function writes all dirty pages of a relation out to disk
5147 * (or more accurately, out to kernel disk buffers), ensuring that the
5148 * kernel has an up-to-date view of the relation.
5149 *
5150 * Generally, the caller should be holding AccessExclusiveLock on the
5151 * target relation to ensure that no other backend is busy dirtying
5152 * more blocks of the relation; the effects can't be expected to last
5153 * after the lock is released.
5154 *
5155 * XXX currently it sequentially searches the buffer pool, should be
5156 * changed to more clever ways of searching. This routine is not
5157 * used in any performance-critical code paths, so it's not worth
5158 * adding additional overhead to normal paths to make it go faster.
5159 * --------------------------------------------------------------------
5160 */
5161void
5163{
/*
 * NOTE(review): lossy extraction — the signature (5162, taking Relation
 * 'rel'), several declarations (bufHdr, buf_state), the local-buffer
 * descriptor fetch, the errcallback.callback assignment, the
 * ReservePrivateRefCountEntry/ResourceOwner lines, and the shared-buffer
 * flush block (5229-5236) were dropped. Verify against upstream.
 */
 5164 int i;
 5166 SMgrRelation srel = RelationGetSmgr(rel);
 5167
 5168 if (RelationUsesLocalBuffers(rel))
 5169 {
 5170 for (i = 0; i < NLocBuffer; i++)
 5171 {
 5173
 5175 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
 5176 ((buf_state = pg_atomic_read_u64(&bufHdr->state)) &
 5177 (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
 5178 {
 5179 ErrorContextCallback errcallback;
 5180
 5181 /* Setup error traceback support for ereport() */
 5183 errcallback.arg = bufHdr;
 5184 errcallback.previous = error_context_stack;
 5185 error_context_stack = &errcallback;
 5186
 5187 /* Make sure we can handle the pin */
 5190
 5191 /*
 5192 * Pin/unpin mostly to make valgrind work, but it also seems
 5193 * like the right thing to do.
 5194 */
 5195 PinLocalBuffer(bufHdr, false);
 5196
 5197
 5198 FlushLocalBuffer(bufHdr, srel);
 5199
 5201
 5202 /* Pop the error context stack */
 5203 error_context_stack = errcallback.previous;
 5204 }
 5205 }
 5206
 5207 return;
 5208 }
 5209
 5210 for (i = 0; i < NBuffers; i++)
 5211 {
 5213
 5215
 5216 /*
 5217 * As in DropRelationBuffers, an unlocked precheck should be safe and
 5218 * saves some cycles.
 5219 */
 5221 continue;
 5222
 5223 /* Make sure we can handle the pin */
 5226
 5228 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
 5230 {
 5234 }
 5235 else
 5237 }
 5238}
5239
5240/* ---------------------------------------------------------------------
5241 * FlushRelationsAllBuffers
5242 *
5243 * This function flushes out of the buffer pool all the pages of all
5244 * forks of the specified smgr relations. It's equivalent to calling
5245 * FlushRelationBuffers once per relation. The relations are assumed not
5246 * to use local buffers.
5247 * --------------------------------------------------------------------
5248 */
5249void
5251{
/*
 * NOTE(review): lossy extraction — the signature (5250, taking
 * SMgrRelation *smgrs and int nrels per the uses below), the srels
 * allocation (5260/5261), the use_bsearch assignment (5274), the bsearch
 * comparator argument line (5311), and the pin/flush statements inside the
 * final match block were dropped. Verify against upstream bufmgr.c.
 */
 5252 int i;
 5254 bool use_bsearch;
 5255
 5256 if (nrels == 0)
 5257 return;
 5258
 5259 /* fill-in array for qsort */
 5261
 5262 for (i = 0; i < nrels; i++)
 5263 {
 5264 Assert(!RelFileLocatorBackendIsTemp(smgrs[i]->smgr_rlocator));
 5265
 5266 srels[i].rlocator = smgrs[i]->smgr_rlocator.locator;
 5267 srels[i].srel = smgrs[i];
 5268 }
 5269
 5270 /*
 5271 * Save the bsearch overhead for low number of relations to sync. See
 5272 * DropRelationsAllBuffers for details.
 5273 */
 5275
 5276 /* sort the list of SMgrRelations if necessary */
 5277 if (use_bsearch)
 5278 qsort(srels, nrels, sizeof(SMgrSortArray), rlocator_comparator);
 5279
 5280 for (i = 0; i < NBuffers; i++)
 5281 {
 5285
 5286 /*
 5287 * As in DropRelationBuffers, an unlocked precheck should be safe and
 5288 * saves some cycles.
 5289 */
 5290
 5291 if (!use_bsearch)
 5292 {
 5293 int j;
 5294
 5295 for (j = 0; j < nrels; j++)
 5296 {
 5297 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srels[j].rlocator))
 5298 {
 5299 srelent = &srels[j];
 5300 break;
 5301 }
 5302 }
 5303 }
 5304 else
 5305 {
 5306 RelFileLocator rlocator;
 5307
 5308 rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
 5309 srelent = bsearch(&rlocator,
 5310 srels, nrels, sizeof(SMgrSortArray),
 5312 }
 5313
 5314 /* buffer doesn't belong to any of the given relfilelocators; skip it */
 5315 if (srelent == NULL)
 5316 continue;
 5317
 5318 /* Make sure we can handle the pin */
 5321
 5323 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srelent->rlocator) &&
 5325 {
 5329 }
 5330 else
 5332 }
 5333
 5334 pfree(srels);
 5335}
5336
5337/* ---------------------------------------------------------------------
5338 * RelationCopyStorageUsingBuffer
5339 *
5340 * Copy fork's data using bufmgr. Same as RelationCopyStorage but instead
5341 * of using smgrread and smgrextend this will copy using bufmgr APIs.
5342 *
5343 * Refer comments atop CreateAndCopyRelationData() for details about
5344 * 'permanent' parameter.
5345 * --------------------------------------------------------------------
5346 */
5347static void
 5350 ForkNumber forkNum, bool permanent)
5351{
/*
 * NOTE(review): lossy extraction — the first signature lines (5348-5349,
 * presumably the source/destination RelFileLocators per the header comment
 * and the smgropen(dstlocator, ...) use below), the PGIOAlignedBlock buf /
 * strategy / read-stream declarations, the smgrnblocks call target, the
 * read_stream_begin* call name, and the per-block read/copy/WAL/mark-dirty
 * statements were dropped. Verify against upstream bufmgr.c.
 */
 5352 Buffer srcBuf;
 5353 Buffer dstBuf;
 5354 Page srcPage;
 5355 Page dstPage;
 5356 bool use_wal;
 5357 BlockNumber nblocks;
 5358 BlockNumber blkno;
 5365
 5366 /*
 5367 * In general, we want to write WAL whenever wal_level > 'minimal', but we
 5368 * can skip it when copying any fork of an unlogged relation other than
 5369 * the init fork.
 5370 */
 5371 use_wal = XLogIsNeeded() && (permanent || forkNum == INIT_FORKNUM);
 5372
 5373 /* Get number of blocks in the source relation. */
 5375 forkNum);
 5376
 5377 /* Nothing to copy; just return. */
 5378 if (nblocks == 0)
 5379 return;
 5380
 5381 /*
 5382 * Bulk extend the destination relation of the same size as the source
 5383 * relation before starting to copy block by block.
 5384 */
 5385 memset(buf.data, 0, BLCKSZ);
 5386 smgrextend(smgropen(dstlocator, INVALID_PROC_NUMBER), forkNum, nblocks - 1,
 5387 buf.data, true);
 5388
 5389 /* This is a bulk operation, so use buffer access strategies. */
 5392
 5393 /* Initialize streaming read */
 5394 p.current_blocknum = 0;
 5395 p.last_exclusive = nblocks;
 5397
 5398 /*
 5399 * It is safe to use batchmode as block_range_read_stream_cb takes no
 5400 * locks.
 5401 */
 5405 src_smgr,
 5407 forkNum,
 5409 &p,
 5410 0);
 5411
 5412 /* Iterate over each block of the source relation file. */
 5413 for (blkno = 0; blkno < nblocks; blkno++)
 5414 {
 5416
 5417 /* Read block from source relation. */
 5421
 5425 permanent);
 5427
 5429
 5430 /* Copy page data from the source to the destination. */
 5433
 5434 /* WAL-log the copied page. */
 5435 if (use_wal)
 5437
 5439
 5442 }
 5445
 5448}
5449
5450/* ---------------------------------------------------------------------
5451 * CreateAndCopyRelationData
5452 *
5453 * Create destination relation storage and copy all forks from the
5454 * source relation to the destination.
5455 *
5456 * Pass permanent as true for permanent relations and false for
5457 * unlogged relations. Currently this API is not supported for
5458 * temporary relations.
5459 * --------------------------------------------------------------------
5460 */
5461void
 5463 RelFileLocator dst_rlocator, bool permanent)
5464{
/*
 * NOTE(review): lossy extraction — the first signature line (5462,
 * presumably RelFileLocator src_rlocator per the header comment), the
 * src_rel/dst_rel declarations, the relpersistence ternary's second line,
 * the smgropen calls, and the main-fork copy call's first line were
 * dropped. Verify against upstream bufmgr.c.
 */
 5465 char relpersistence;
 5468
 5469 /* Set the relpersistence. */
 5470 relpersistence = permanent ?
 5472
 5475
 5476 /*
 5477 * Create and copy all forks of the relation. During create database we
 5478 * have a separate cleanup mechanism which deletes complete database
 5479 * directory. Therefore, each individual relation doesn't need to be
 5480 * registered for cleanup.
 5481 */
 5482 RelationCreateStorage(dst_rlocator, relpersistence, false);
 5483
 5484 /* copy main fork. */
 5486 permanent);
 5487
 5488 /* copy those extra forks that exist */
 5489 for (ForkNumber forkNum = MAIN_FORKNUM + 1;
 5490 forkNum <= MAX_FORKNUM; forkNum++)
 5491 {
 5492 if (smgrexists(src_rel, forkNum))
 5493 {
 5494 smgrcreate(dst_rel, forkNum, false);
 5495
 5496 /*
 5497 * WAL log creation if the relation is persistent, or this is the
 5498 * init fork of an unlogged relation.
 5499 */
 5500 if (permanent || forkNum == INIT_FORKNUM)
 5501 log_smgrcreate(&dst_rlocator, forkNum);
 5502
 5503 /* Copy a fork's data, block by block. */
 5505 permanent);
 5506 }
 5507 }
 5508}
5509
5510/* ---------------------------------------------------------------------
5511 * FlushDatabaseBuffers
5512 *
5513 * This function writes all dirty pages of a database out to disk
5514 * (or more accurately, out to kernel disk buffers), ensuring that the
5515 * kernel has an up-to-date view of the database.
5516 *
5517 * Generally, the caller should be holding an appropriate lock to ensure
5518 * no other backend is active in the target database; otherwise more
5519 * pages could get dirtied.
5520 *
5521 * Note we don't worry about flushing any pages of temporary relations.
5522 * It's assumed these wouldn't be interesting.
5523 * --------------------------------------------------------------------
5524 */
5525void
5527{
/*
 * NOTE(review): lossy extraction — the signature (5526, taking Oid 'dbid'
 * per the uses below), bufHdr declaration/fetch, the pin-reservation line,
 * and the pin/lock/FlushBuffer/unpin statements inside the match block
 * were dropped. Verify against upstream bufmgr.c.
 */
 5528 int i;
 5530
 5531 for (i = 0; i < NBuffers; i++)
 5532 {
 5534
 5536
 5537 /*
 5538 * As in DropRelationBuffers, an unlocked precheck should be safe and
 5539 * saves some cycles.
 5540 */
 5541 if (bufHdr->tag.dbOid != dbid)
 5542 continue;
 5543
 5544 /* Make sure we can handle the pin */
 5547
 5549 if (bufHdr->tag.dbOid == dbid &&
 5551 {
 5555 }
 5556 else
 5558 }
 5559}
5560
5561/*
5562 * Flush a previously, share-exclusively or exclusively, locked and pinned
5563 * buffer to the OS.
5564 */
5565void
5567{
/*
 * NOTE(review): the extraction dropped this function's signature and
 * essentially its entire body (lines 5566, 5568, 5571, 5573, 5575, 5577,
 * 5579); per the header comment above it flushes an already locked and
 * pinned buffer to the OS. Verify against upstream bufmgr.c.
 */
 5569
 5570 /* currently not needed, but no fundamental reason not to support */
 5572
 5574
 5576
 5578
 5580}
5581
5582/*
5583 * ReleaseBuffer -- release the pin on a buffer
5584 */
5585void
5587{
/*
 * NOTE(review): the signature line (5586, taking Buffer 'buffer') and the
 * two branch bodies (local vs. shared unpin, lines 5592/5594) were dropped
 * by the extraction. Verify against upstream bufmgr.c.
 */
 5588 if (!BufferIsValid(buffer))
 5589 elog(ERROR, "bad buffer ID: %d", buffer);
 5590
 5591 if (BufferIsLocal(buffer))
 5593 else
 5595}
5596
5597/*
5598 * UnlockReleaseBuffer -- release the content lock and pin on a buffer
5599 *
 5600 * This is just a more efficient shorthand for a common combination.
5601 */
5602void
5604{
/*
 * NOTE(review): lossy extraction — the signature (5603, taking Buffer
 * 'buffer'), several declarations (lockstate, the private refcount entry
 * 'ref'), the local-buffer branch body, the descriptor lookup, the 'sub'
 * computation for the lock release, and the waiter wakeup / interrupt
 * re-enable calls were dropped. Verify against upstream bufmgr.c.
 */
 5605 int mode;
 5606 BufferDesc *buf;
 5608 uint64 sub;
 5610
 5612
 5613 if (BufferIsLocal(buffer))
 5614 {
 5616 return;
 5617 }
 5618
 5620
 5622
 5624
 5625 /* compute state modification for lock release */
 5627
 5628 /* compute state modification for pin release */
 5630 Assert(ref != NULL);
 5631 Assert(ref->data.refcount > 0);
 5632 ref->data.refcount--;
 5633
 5634 /* no more backend local pins, reduce shared pin count */
 5635 if (likely(ref->data.refcount == 0))
 5636 {
 5637 /* See comment in UnpinBufferNoOwner() */
 5639
 5640 sub |= BUF_REFCOUNT_ONE;
 5642 }
 5643
 5644 /* perform the lock and pin release in one atomic op */
 5645 lockstate = pg_atomic_sub_fetch_u64(&buf->state, sub);
 5646
 5647 /* wake up waiters for the lock */
 5649
 5650 /* wake up waiter for the pin release */
 5653
 5654 /*
 5655 * Now okay to allow cancel/die interrupts again, which were held when the
 5656 * lock was acquired.
 5657 */
 5659}
5660
5661/*
5662 * IncrBufferRefCount
5663 * Increment the pin count on a buffer that we have *already* pinned
5664 * at least once.
5665 *
5666 * This function cannot be used on a buffer we do not have pinned,
5667 * because it doesn't change the shared buffer state.
5668 */
5669void
5686
5687/*
5688 * Shared-buffer only helper for MarkBufferDirtyHint() and
5689 * BufferSetHintBits16().
5690 *
5691 * This is separated out because it turns out that the repeated checks for
5692 * local buffers, repeated GetBufferDescriptor() and repeated reading of the
5693 * buffer's state sufficiently hurts the performance of BufferSetHintBits16().
5694 */
5695static inline void
 5697 bool buffer_std)
5698{
/*
 * NOTE(review): the extraction dropped the function name and first
 * signature line (5696) — per the header comment above this is the
 * shared-buffer helper for MarkBufferDirtyHint() and
 * BufferSetHintBits16(). Also missing: the lsn declaration, the lock-mode
 * assertions, the XLogHintBitIsNeeded-style test opening the wal_log
 * branch, the mark-dirty atomic update's first lines, the
 * XLogSaveBufferForHint call, the buffer-header lock/unlock around
 * PageSetLSN, and the vacuum-cost accounting. Verify against upstream.
 */
 5699 Page page = BufferGetPage(buffer);
 5700
 5702
 5703 /* here, either share-exclusive or exclusive lock is OK */
 5706
 5707 /*
 5708 * This routine might get called many times on the same page, if we are
 5709 * making the first scan after commit of an xact that added/deleted many
 5710 * tuples. So, be as quick as we can if the buffer is already dirty.
 5711 *
 5712 * As we are holding (at least) a share-exclusive lock, nobody could have
 5713 * cleaned or dirtied the page concurrently, so we can just rely on the
 5714 * previously fetched value here without any danger of races.
 5715 */
 5716 if (unlikely(!(lockstate & BM_DIRTY)))
 5717 {
 5719 bool wal_log = false;
 5721
 5722 /*
 5723 * If we need to protect hint bit updates from torn writes, WAL-log a
 5724 * full page image of the page. This full page image is only necessary
 5725 * if the hint bit update is the first change to the page since the
 5726 * last checkpoint.
 5727 *
 5728 * We don't check full_page_writes here because that logic is included
 5729 * when we call XLogInsert() since the value changes dynamically.
 5730 */
 5732 {
 5733 /*
 5734 * If we must not write WAL, due to a relfilelocator-specific
 5735 * condition or being in recovery, don't dirty the page. We can
 5736 * set the hint, just not dirty the page as a result so the hint
 5737 * is lost when we evict the page or shutdown.
 5738 *
 5739 * See src/backend/storage/page/README for longer discussion.
 5740 */
 5741 if (RecoveryInProgress() ||
 5743 return;
 5744
 5745 wal_log = true;
 5746 }
 5747
 5748 /*
 5749 * We must mark the page dirty before we emit the WAL record, as per
 5750 * the usual rules, to ensure that BufferSync()/SyncOneBuffer() try to
 5751 * flush the buffer, even if we haven't inserted the WAL record yet.
 5752 * As we hold at least a share-exclusive lock, checkpoints will wait
 5753 * for this backend to be done with the buffer before continuing. If
 5754 * we did it the other way round, a checkpoint could start between
 5755 * writing the WAL record and marking the buffer dirty.
 5756 */
 5758
 5759 /*
 5760 * It should not be possible for the buffer to already be dirty, see
 5761 * comment above.
 5762 */
 5766 BM_DIRTY,
 5767 0, 0);
 5768
 5769 /*
 5770 * If the block is already dirty because we either made a change or
 5771 * set a hint already, then we don't need to write a full page image.
 5772 * Note that aggressive cleaning of blocks dirtied by hint bit setting
 5773 * would increase the call rate. Bulk setting of hint bits would
 5774 * reduce the call rate...
 5775 */
 5776 if (wal_log)
 5778
 5779 if (XLogRecPtrIsValid(lsn))
 5780 {
 5781 /*
 5782 * Set the page LSN if we wrote a backup block. To allow backends
 5783 * that only hold a share lock on the buffer to read the LSN in a
 5784 * tear-free manner, we set the page LSN while holding the buffer
 5785 * header lock. This allows any reader of an LSN who holds only a
 5786 * share lock to also obtain a buffer header lock before using
 5787 * PageGetLSN() to read the LSN in a tear free way. This is done
 5788 * in BufferGetLSNAtomic().
 5789 *
 5790 * If checksums are enabled, you might think we should reset the
 5791 * checksum here. That will happen when the page is written
 5792 * sometime later in this checkpoint cycle.
 5793 */
 5795 PageSetLSN(page, lsn);
 5797 }
 5798
 5800 if (VacuumCostActive)
 5802 }
 5803}
5804
5805/*
5806 * MarkBufferDirtyHint
5807 *
5808 * Mark a buffer dirty for non-critical changes.
5809 *
5810 * This is essentially the same as MarkBufferDirty, except:
5811 *
5812 * 1. The caller does not write WAL; so if checksums are enabled, we may need
5813 * to write an XLOG_FPI_FOR_HINT WAL record to protect against torn pages.
5814 * 2. The caller might have only a share-exclusive-lock instead of an
5815 * exclusive-lock on the buffer's content lock.
5816 * 3. This function does not guarantee that the buffer is always marked dirty
 5817 * (e.g. it can't always do so on a hot standby), so it cannot be used for
5818 * important changes.
5819 */
5820inline void
5822{
/*
 * NOTE(review): the signature (5821, taking Buffer 'buffer' and bool
 * 'buffer_std' per the forwarding call below), the bufHdr declaration and
 * lookup, the local-buffer branch body, and the first line of the
 * forwarding call to the shared helper were dropped by the extraction.
 * Verify against upstream bufmgr.c.
 */
 5824
 5826
 5827 if (!BufferIsValid(buffer))
 5828 elog(ERROR, "bad buffer ID: %d", buffer);
 5829
 5830 if (BufferIsLocal(buffer))
 5831 {
 5833 return;
 5834 }
 5835
 5837 pg_atomic_read_u64(&bufHdr->state),
 5838 buffer_std);
 5839}
5840
5841/*
5842 * Release buffer content locks for shared buffers.
5843 *
5844 * Used to clean up after errors.
5845 *
5846 * Currently, we can expect that resource owner cleanup, via
5847 * ResOwnerReleaseBufferPin(), took care of releasing buffer content locks per
5848 * se; the only thing we need to deal with here is clearing any PIN_COUNT
5849 * request that was in progress.
5850 */
5851void
5853{
/*
 * NOTE(review): per the header comment above, this clears any in-progress
 * PIN_COUNT request after an error. The extraction dropped the signature
 * (5852), the PinCountWaitBuf-style fetch into 'buf', the buf_state read,
 * the unset_bits |= BM_PIN_COUNT_WAITER line, the first line of the atomic
 * flag-clearing call, and the final cleanup line. Verify against upstream.
 */
 5855
 5856 if (buf)
 5857 {
 5859 uint64 unset_bits = 0;
 5860
 5862
 5863 /*
 5864 * Don't complain if flag bit not set; it could have been reset but we
 5865 * got a cancel/die interrupt before getting the signal.
 5866 */
 5867 if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
 5868 buf->wait_backend_pgprocno == MyProcNumber)
 5870
 5872 0, unset_bits,
 5873 0);
 5874
 5876 }
 5877}
5878
5879/*
5880 * Acquire the buffer content lock in the specified mode
5881 *
5882 * If the lock is not available, sleep until it is.
5883 *
5884 * Side effect: cancel/die interrupts are held off until lock release.
5885 *
5886 * This uses almost the same locking approach as lwlock.c's
5887 * LWLockAcquire(). See documentation at the top of lwlock.c for a more
5888 * detailed discussion.
5889 *
5890 * The reason that this, and most of the other BufferLock* functions, get both
5891 * the Buffer and BufferDesc* as parameters, is that looking up one from the
5892 * other repeatedly shows up noticeably in profiles.
5893 *
5894 * Callers should provide a constant for mode, for more efficient code
5895 * generation.
5896 */
5897static inline void
5899{
/*
 * NOTE(review): the signature (5898 — per the header comment it takes the
 * Buffer, its BufferDesc *, and the lock mode), the already-held assertion,
 * the HOLD_INTERRUPTS call, the lock-attempt calls, the queue/dequeue
 * calls, the per-mode wait_event assignments, the TRACE/pgstat lines, the
 * semaphore wait loop body, and the final PGSemaphoreUnlock line were
 * dropped by the extraction. Verify against upstream bufmgr.c.
 */
 5900 PrivateRefCountEntry *entry;
 5901 int extraWaits = 0;
 5902
 5903 /*
 5904 * Get reference to the refcount entry before we hold the lock, it seems
 5905 * better to do before holding the lock.
 5906 */
 5907 entry = GetPrivateRefCountEntry(buffer, true);
 5908
 5909 /*
 5910 * We better not already hold a lock on the buffer.
 5911 */
 5913
 5914 /*
 5915 * Lock out cancel/die interrupts until we exit the code section protected
 5916 * by the content lock. This ensures that interrupts will not interfere
 5917 * with manipulations of data structures in shared memory.
 5918 */
 5920
 5921 for (;;)
 5922 {
 5923 uint32 wait_event = 0; /* initialized to avoid compiler warning */
 5924 bool mustwait;
 5925
 5926 /*
 5927 * Try to grab the lock the first time, we're not in the waitqueue
 5928 * yet/anymore.
 5929 */
 5931
 5932 if (likely(!mustwait))
 5933 {
 5934 break;
 5935 }
 5936
 5937 /*
 5938 * Ok, at this point we couldn't grab the lock on the first try. We
 5939 * cannot simply queue ourselves to the end of the list and wait to be
 5940 * woken up because by now the lock could long have been released.
 5941 * Instead add us to the queue and try to grab the lock again. If we
 5942 * succeed we need to revert the queuing and be happy, otherwise we
 5943 * recheck the lock. If we still couldn't grab it, we know that the
 5944 * other locker will see our queue entries when releasing since they
 5945 * existed before we checked for the lock.
 5946 */
 5947
 5948 /* add to the queue */
 5950
 5951 /* we're now guaranteed to be woken up if necessary */
 5953
 5954 /* ok, grabbed the lock the second time round, need to undo queueing */
 5955 if (!mustwait)
 5956 {
 5958 break;
 5959 }
 5960
 5961 switch (mode)
 5962 {
 5965 break;
 5968 break;
 5969 case BUFFER_LOCK_SHARE:
 5971 break;
 5972 case BUFFER_LOCK_UNLOCK:
 5974
 5975 }
 5977
 5978 /*
 5979 * Wait until awakened.
 5980 *
 5981 * It is possible that we get awakened for a reason other than being
 5982 * signaled by BufferLockWakeup(). If so, loop back and wait again.
 5983 * Once we've gotten the lock, re-increment the sema by the number of
 5984 * additional signals received.
 5985 */
 5986 for (;;)
 5987 {
 5990 break;
 5991 extraWaits++;
 5992 }
 5993
 5995
 5996 /* Retrying, allow BufferLockRelease to release waiters again. */
 5998 }
 5999
 6000 /* Remember that we now hold this lock */
 6001 entry->data.lockmode = mode;
 6002
 6003 /*
 6004 * Fix the process wait semaphore's count for any absorbed wakeups.
 6005 */
 6006 while (unlikely(extraWaits-- > 0))
 6008}
6009
6010/*
6011 * Release a previously acquired buffer content lock.
6012 */
6013static void
6015{
/*
 * NOTE(review): the signature (6014), the mode/lockstate declarations, the
 * disown call computing 'sub', the atomic release, the waiter-wakeup call,
 * and the RESUME_INTERRUPTS line were dropped by the extraction. Verify
 * against upstream bufmgr.c.
 */
 6018 uint64 sub;
 6019
 6021
 6022 /*
 6023 * Release my hold on lock, after that it can immediately be acquired by
 6024 * others, even if we still have to wakeup other waiters.
 6025 */
 6027
 6029
 6031
 6032 /*
 6033 * Now okay to allow cancel/die interrupts.
 6034 */
 6036}
6037
6038
6039/*
6040 * Acquire the content lock for the buffer, but only if we don't have to wait.
6041 *
6042 * It is allowed to try to conditionally acquire a lock on a buffer that this
6043 * backend has already locked, but the lock acquisition will always fail, even
6044 * if the new lock acquisition does not conflict with an already held lock
6045 * (e.g. two share locks). This is because we currently do not have space to
6046 * track multiple lock ownerships of the same buffer within one backend. That
6047 * is ok for the current uses of BufferLockConditional().
6048 */
6049static bool
6051{
/*
 * NOTE(review): the signature (6050), the private-refcount 'entry' lookup,
 * the HOLD_INTERRUPTS line, the lock-attempt call setting 'mustwait', and
 * the RESUME_INTERRUPTS in the failure branch were dropped by the
 * extraction. Verify against upstream bufmgr.c.
 */
 6053 bool mustwait;
 6054
 6055 /*
 6056 * As described above, if we're trying to lock a buffer this backend
 6057 * already has locked, return false, independent of the existing and
 6058 * desired lock level.
 6059 */
 6060 if (entry->data.lockmode != BUFFER_LOCK_UNLOCK)
 6061 return false;
 6062
 6063 /*
 6064 * Lock out cancel/die interrupts until we exit the code section protected
 6065 * by the content lock. This ensures that interrupts will not interfere
 6066 * with manipulations of data structures in shared memory.
 6067 */
 6069
 6070 /* Check for the lock */
 6072
 6073 if (mustwait)
 6074 {
 6075 /* Failed to get lock, so release interrupt holdoff */
 6077 }
 6078 else
 6079 {
 6080 entry->data.lockmode = mode;
 6081 }
 6082
 6083 return !mustwait;
 6084}
6085
6086/*
6087 * Internal function that tries to atomically acquire the content lock in the
6088 * passed in mode.
6089 *
6090 * This function will not block waiting for a lock to become free - that's the
6091 * caller's job.
6092 *
6093 * Similar to LWLockAttemptLock().
6094 */
6095static inline bool
6097{
/*
 * NOTE(review): the signature (6096), the old_state declaration/read, the
 * per-mode branch conditions and the desired_state computations, the
 * compare-exchange call, and the pg_unreachable()-style tail were dropped
 * by the extraction. Verify against upstream bufmgr.c.
 */
 6099
 6100 /*
 6101 * Read once outside the loop, later iterations will get the newer value
 6102 * via compare & exchange.
 6103 */
 6105
 6106 /* loop until we've determined whether we could acquire the lock or not */
 6107 while (true)
 6108 {
 6110 bool lock_free;
 6111
 6113
 6115 {
 6116 lock_free = (old_state & BM_LOCK_MASK) == 0;
 6117 if (lock_free)
 6119 }
 6121 {
 6123 if (lock_free)
 6125 }
 6126 else
 6127 {
 6129 if (lock_free)
 6131 }
 6132
 6133 /*
 6134 * Attempt to swap in the state we are expecting. If we didn't see
 6135 * lock to be free, that's just the old value. If we saw it as free,
 6136 * we'll attempt to mark it acquired. The reason that we always swap
 6137 * in the value is that this doubles as a memory barrier. We could try
 6138 * to be smarter and only swap in values if we saw the lock as free,
 6139 * but benchmarks haven't shown it as beneficial so far.
 6140 *
 6141 * Retry if the value changed since we last looked at it.
 6142 */
 6145 {
 6146 if (lock_free)
 6147 {
 6148 /* Great! Got the lock. */
 6149 return false;
 6150 }
 6151 else
 6152 return true; /* somebody else has the lock */
 6153 }
 6154 }
 6157}
6158
6159/*
6160 * Add ourselves to the end of the content lock's wait queue.
6161 */
6162static void
6164{
/*
 * NOTE(review): the signature (6163, taking the BufferDesc * 'buf_hdr' and
 * the wait mode per the uses below), the already-waiting check condition,
 * the header spinlock acquire, the waitflag-setting line, the
 * lwWaiting/lwWaitMode assignments, and the final spinlock release were
 * dropped by the extraction. Verify against upstream bufmgr.c.
 */
 6165 /*
 6166 * If we don't have a PGPROC structure, there's no way to wait. This
 6167 * should never occur, since MyProc should only be null during shared
 6168 * memory initialization.
 6169 */
 6170 if (MyProc == NULL)
 6171 elog(PANIC, "cannot wait without a PGPROC structure");
 6172
 6174 elog(PANIC, "queueing for lock while waiting on another one");
 6175
 6177
 6178 /* setting the flag is protected by the spinlock */
 6180
 6181 /*
 6182 * These are currently used both for lwlocks and buffer content locks,
 6183 * which is acceptable, although not pretty, because a backend can't wait
 6184 * for both types of locks at the same time.
 6185 */
 6188
 6189 proclist_push_tail(&buf_hdr->lock_waiters, MyProcNumber, lwWaitLink);
 6190
 6191 /* Can release the mutex now */
 6193}
6194
6195/*
6196 * Remove ourselves from the waitlist.
6197 *
6198 * This is used if we queued ourselves because we thought we needed to sleep
6199 * but, after further checking, we discovered that we don't actually need to
6200 * do so.
6201 */
6202static void
6204{
/*
 * NOTE(review): the signature (6203), the spinlock acquire and the
 * on_waitlist computation, the flag-clearing branch bodies, the
 * lwWaiting reset, and the semaphore wait / PGSemaphoreUnlock lines were
 * dropped by the extraction. Verify against upstream bufmgr.c.
 */
 6205 bool on_waitlist;
 6206
 6208
 6210 if (on_waitlist)
 6211 proclist_delete(&buf_hdr->lock_waiters, MyProcNumber, lwWaitLink);
 6212
 6213 if (proclist_is_empty(&buf_hdr->lock_waiters) &&
 6215 {
 6217 }
 6218
 6219 /* XXX: combine with fetch_and above? */
 6221
 6222 /* clear waiting state again, nice for debugging */
 6223 if (on_waitlist)
 6225 else
 6226 {
 6227 int extraWaits = 0;
 6228
 6229
 6230 /*
 6231 * Somebody else dequeued us and has or will wake us up. Deal with the
 6232 * superfluous absorption of a wakeup.
 6233 */
 6234
 6235 /*
 6236 * Clear BM_LOCK_WAKE_IN_PROGRESS if somebody woke us before we
 6237 * removed ourselves - they'll have set it.
 6238 */
 6240
 6241 /*
 6242 * Now wait for the scheduled wakeup, otherwise our ->lwWaiting would
 6243 * get reset at some inconvenient point later. Most of the time this
 6244 * will immediately return.
 6245 */
 6246 for (;;)
 6247 {
 6250 break;
 6251 extraWaits++;
 6252 }
 6253
 6254 /*
 6255 * Fix the process wait semaphore's count for any absorbed wakeups.
 6256 */
 6257 while (extraWaits-- > 0)
 6259 }
 6260}
6261
6262/*
6263 * Stop treating lock as held by current backend.
6264 *
6265 * After calling this function it's the callers responsibility to ensure that
6266 * the lock gets released, even in case of an error. This only is desirable if
6267 * the lock is going to be released in a different process than the process
6268 * that acquired it.
6269 */
6270static inline void
6276
6277/*
6278 * Stop treating lock as held by current backend.
6279 *
6280 * This is the code that can be shared between actually releasing a lock
6281 * (BufferLockUnlock()) and just not tracking ownership of the lock anymore
6282 * without releasing the lock (BufferLockDisown()).
6283 */
6284static inline int
6286{
/*
 * NOTE(review): the signature (6285, taking the Buffer per the elog below),
 * the 'mode' declaration, and the private-refcount 'ref' lookup line were
 * dropped by the extraction. Per the header comment, this forgets local
 * ownership of the content lock and returns the mode that was held.
 */
 6289
 6291 if (ref == NULL)
 6292 elog(ERROR, "lock %d is not held", buffer);
 6293 mode = ref->data.lockmode;
 6294 ref->data.lockmode = BUFFER_LOCK_UNLOCK;
 6295
 6296 return mode;
 6297}
6298
/*
 * Wakeup all the lockers that currently have a chance to acquire the lock.
 *
 * wake_exclusive indicates whether exclusive lock waiters should be woken up.
 */
static void
{
	bool		new_wake_in_progress = false;	/* queued at least one wakeup? */
	bool		wake_share_exclusive = true;	/* may share-excl. waiters still be woken? */

	/* lock wait list while collecting backends to wake up */

	proclist_foreach_modify(iter, &buf_hdr->lock_waiters, lwWaitLink)
	{
		PGPROC	   *waiter = GetPGProcByNumber(iter.cur);

		/*
		 * Already woke up a conflicting lock, so skip over this wait list
		 * entry.
		 */
			continue;
			continue;

		/* move waiter from the buffer's wait list to the local wakeup list */
		proclist_delete(&buf_hdr->lock_waiters, iter.cur, lwWaitLink);
		proclist_push_tail(&wakeup, iter.cur, lwWaitLink);

		/*
		 * Prevent additional wakeups until retryer gets to run. Backends that
		 * are just waiting for the lock to become free don't retry
		 * automatically.
		 */
		new_wake_in_progress = true;

		/*
		 * Signal that the process isn't on the wait list anymore. This allows
		 * BufferLockDequeueSelf() to remove itself from the waitlist with a
		 * proclist_delete(), rather than having to check if it has been
		 * removed from the list.
		 */
		Assert(waiter->lwWaiting == LW_WS_WAITING);

		/*
		 * Don't wakeup further waiters after waking a conflicting waiter.
		 */
		if (waiter->lwWaitMode == BUFFER_LOCK_SHARE)
		{
			/*
			 * Share locks conflict with exclusive locks.
			 */
			wake_exclusive = false;
		}
		else if (waiter->lwWaitMode == BUFFER_LOCK_SHARE_EXCLUSIVE)
		{
			/*
			 * Share-exclusive locks conflict with share-exclusive and
			 * exclusive locks.
			 */
			wake_exclusive = false;
			wake_share_exclusive = false;
		}
		else if (waiter->lwWaitMode == BUFFER_LOCK_EXCLUSIVE)
		{
			/*
			 * Exclusive locks conflict with all other locks, there's no point
			 * in waking up anybody else.
			 */
			break;
		}
	}

	/* unset required flags, and release lock, in one fell swoop */
	{

		while (true)
		{

			/* compute desired flags */

			else

			if (proclist_is_empty(&buf_hdr->lock_waiters))

			desired_state &= ~BM_LOCKED;	/* release lock */

				break;
		}
	}

	/* Awaken any waiters I removed from the queue. */
	proclist_foreach_modify(iter, &wakeup, lwWaitLink)
	{
		PGPROC	   *waiter = GetPGProcByNumber(iter.cur);

		proclist_delete(&wakeup, iter.cur, lwWaitLink);

		/*
		 * Guarantee that lwWaiting being unset only becomes visible once the
		 * unlink from the link has completed. Otherwise the target backend
		 * could be woken up for other reason and enqueue for a new lock - if
		 * that happens before the list unlink happens, the list would end up
		 * being corrupted.
		 *
		 * The barrier pairs with the LockBufHdr() when enqueuing for another
		 * lock.
		 */
		waiter->lwWaiting = LW_WS_NOT_WAITING;
		PGSemaphoreUnlock(waiter->sem);
	}
}
6430
/*
 * Compute subtraction from buffer state for a release of a held lock in
 * `mode`.
 *
 * Returns the BM_LOCK_VAL_* value to subtract from the buffer state's lock
 * bits for the given mode.
 *
 * This is separated from BufferLockUnlock() as we want to combine the lock
 * release with other atomic operations when possible, leading to the lock
 * release being done in multiple places, each needing to compute what to
 * subtract from the lock state.
 */
static inline uint64
{
	/*
	 * Turns out that a switch() leads gcc to generate sufficiently worse code
	 * for this to show up in profiles...
	 */
		return BM_LOCK_VAL_EXCLUSIVE;
	else
	{
		return BM_LOCK_VAL_SHARED;
	}

	return 0;					/* keep compiler quiet */
}
6459
/*
 * Handle work that needs to be done after releasing a lock that was held in
 * `mode`, where `lockstate` is the result of the atomic operation modifying
 * the state variable.
 *
 * This is separated from BufferLockUnlock() as we want to combine the lock
 * release with other atomic operations when possible, leading to the lock
 * release being done in multiple places.
 */
static void
{
	bool		check_waiters = false;	/* need to scan the wait queue? */
	bool		wake_exclusive = false; /* may exclusive waiters be woken? */

	/* nobody else can have that kind of lock */

	/*
	 * If we're still waiting for backends to get scheduled, don't wake them
	 * up again. Otherwise check if we need to look through the waitqueue to
	 * wake other backends.
	 */
	{
		if ((lockstate & BM_LOCK_MASK) == 0)
		{
			/*
			 * We released a lock and the lock was, in that moment, free. We
			 * therefore can wake waiters for any kind of lock.
			 */
			check_waiters = true;
			wake_exclusive = true;
		}
		{
			/*
			 * We released the lock, but another backend still holds a lock.
			 * We can't have released an exclusive lock, as there couldn't
			 * have been other lock holders. If we released a share lock, no
			 * waiters need to be woken up, as there must be other share
			 * lockers. However, if we held a share-exclusive lock, another
			 * backend now could acquire a share-exclusive lock.
			 */
			check_waiters = true;
			wake_exclusive = false;
		}
	}

	/*
	 * As waking up waiters requires the spinlock to be acquired, only do so
	 * if necessary.
	 */
	if (check_waiters)
}
6517
/*
 * BufferLockHeldByMeInMode - test whether my process holds the content lock
 * in the specified mode
 *
 * This is meant as debug support only.
 */
static bool
{
	PrivateRefCountEntry *entry =

	if (!entry)
		return false;			/* no private refcount entry -> not locked by us */
	else
		return entry->data.lockmode == mode;
}
6535
/*
 * BufferLockHeldByMe - test whether my process holds the content lock in any
 * mode
 *
 * This is meant as debug support only.
 */
static bool
{
	PrivateRefCountEntry *entry =

	if (!entry)
		return false;			/* no private refcount entry -> not locked by us */
	else
		return entry->data.lockmode != BUFFER_LOCK_UNLOCK;
}
6553
/*
 * Release the content lock for the buffer.
 *
 * Local buffers are backend-private and therefore need no content lock.
 */
void
{
	if (BufferIsLocal(buffer))
		return;					/* local buffers need no lock */

}
6569
/*
 * Acquire the content_lock for the buffer.
 *
 * mode must be one of the BUFFER_LOCK_* modes; an unrecognized mode is
 * reported with elog(ERROR).
 */
void
{
	/*
	 * We can't wait if we haven't got a PGPROC. This should only occur
	 * during bootstrap or shared memory initialization. Put an Assert here
	 * to catch unsafe coding practices.
	 */

	/* handled in LockBuffer() wrapper */

	if (BufferIsLocal(buffer))
		return;					/* local buffers need no lock */

	/*
	 * Test the most frequent lock modes first. While a switch (mode) would be
	 * nice, at least gcc generates considerably worse code for it.
	 *
	 * Call BufferLockAcquire() with a constant argument for mode, to generate
	 * more efficient code for the different lock modes.
	 */
	if (mode == BUFFER_LOCK_SHARE)
	else if (mode == BUFFER_LOCK_EXCLUSIVE)
	else
		elog(ERROR, "unrecognized buffer lock mode: %d", mode);
}
6610
/*
 * Acquire the content_lock for the buffer, but only if we don't have to wait.
 *
 * This assumes the caller wants BUFFER_LOCK_EXCLUSIVE mode.
 *
 * Returns true on success; local buffers always succeed without taking any
 * lock.
 */
bool
{
	BufferDesc *buf;

	if (BufferIsLocal(buffer))
		return true;			/* act as though we got it */

}
6629
/*
 * Verify that this backend is pinning the buffer exactly once.
 *
 * NOTE: Like in BufferIsPinned(), what we check here is that *this* backend
 * holds a pin on the buffer. We do not care whether some other backend does.
 */
void
{
	if (BufferIsLocal(buffer))
	{
		/* local buffers track pins in LocalRefCount[] */
		if (LocalRefCount[-buffer - 1] != 1)
			elog(ERROR, "incorrect local pin count: %d",
				 LocalRefCount[-buffer - 1]);
	}
	else
	{
		/* for shared buffers, the private refcount is this backend's count */
		if (GetPrivateRefCount(buffer) != 1)
			elog(ERROR, "incorrect local pin count: %d",
	}
}
6652
/*
 * LockBufferForCleanup - lock a buffer in preparation for deleting items
 *
 * Items may be deleted from a disk page only when the caller (a) holds an
 * exclusive lock on the buffer and (b) has observed that no other backend
 * holds a pin on the buffer. If there is a pin, then the other backend
 * might have a pointer into the buffer (for example, a heapscan reference
 * to an item --- see README for more details). It's OK if a pin is added
 * after the cleanup starts, however; the newly-arrived backend will be
 * unable to look at the page until we release the exclusive lock.
 *
 * To implement this protocol, a would-be deleter must pin the buffer and
 * then call LockBufferForCleanup(). LockBufferForCleanup() is similar to
 * LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE), except that it loops until
 * it has successfully observed pin count = 1.
 */
void
{
	TimestampTz waitStart = 0;
	bool		waiting = false;
	bool		logged_recovery_conflict = false;

	/*
	 * We do not yet need to be worried about in-progress AIOs holding a pin,
	 * as we, so far, only support doing reads via AIO and this function can
	 * only be called once the buffer is valid (i.e. no read can be in
	 * flight).
	 */

	/* Nobody else to wait for */
	if (BufferIsLocal(buffer))
		return;

	for (;;)
	{
		/* header flag bits we may need to clear at the bottom of the loop */
		uint64		unset_bits = 0;

		/* Try to acquire lock */

		{
			/* Successfully acquired exclusive lock with pincount 1 */

			/*
			 * Emit the log message if recovery conflict on buffer pin was
			 * resolved but the startup process waited longer than
			 * deadlock_timeout for it.
			 */
					waitStart, GetCurrentTimestamp(),
					NULL, false);

			if (waiting)
			{
				/* reset ps display to remove the suffix if we added one */
				waiting = false;
			}
			return;
		}
		/* Failed, so mark myself as waiting for pincount 1 */
		{
			elog(ERROR, "multiple backends attempting to wait for pincount 1");
		}
		bufHdr->wait_backend_pgprocno = MyProcNumber;
				0);

		/* Wait to be signaled by UnpinBuffer() */
		if (InHotStandby)
		{
			if (!waiting)
			{
				/* adjust the process title to indicate that it's waiting */
				set_ps_display_suffix("waiting");
				waiting = true;
			}

			/*
			 * Emit the log message if the startup process is waiting longer
			 * than deadlock_timeout for recovery conflict on buffer pin.
			 *
			 * Skip this if first time through because the startup process has
			 * not started waiting yet in this case. So, the wait start
			 * timestamp is set after this logic.
			 */
			if (waitStart != 0 && !logged_recovery_conflict)
			{

				if (TimestampDifferenceExceeds(waitStart, now,
				{
						waitStart, now, NULL, true);
				}
			}

			/*
			 * Set the wait start timestamp if logging is enabled and first
			 * time through.
			 */
			if (log_recovery_conflict_waits && waitStart == 0)
				waitStart = GetCurrentTimestamp();

			/* Publish the bufid that Startup process waits on */
			/* Set alarm and then wait to be signaled by UnpinBuffer() */
			/* Reset the published bufid */
		}
		else

		/*
		 * Remove flag marking us as waiter. Normally this will not be set
		 * anymore, but ProcWaitForSignal() can return for other signals as
		 * well. We take care to only reset the flag if we're the waiter, as
		 * theoretically another backend could have started waiting. That's
		 * impossible with the current usages due to table level locking, but
		 * better be safe.
		 */
		if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
			bufHdr->wait_backend_pgprocno == MyProcNumber)

			0, unset_bits,
			0);

		/* Loop back and try again */
	}
}
6811
/*
 * Check called from ProcessRecoveryConflictInterrupts() when Startup process
 * requests cancellation of all pin holders that are blocking it.
 *
 * Returns true iff this backend holds a pin on the buffer the Startup
 * process is waiting for.
 */
bool
{
	/*
	 * If we get woken slowly then it's possible that the Startup process was
	 * already woken by other backends before we got here. Also possible that
	 * we get here by multiple interrupts or interrupts at inappropriate
	 * times, so make sure we do nothing if the bufid is not set.
	 */
	if (bufid < 0)
		return false;

	/* bufid is a 0-based buffer id; Buffer values are 1-based, hence the +1 */
	if (GetPrivateRefCount(bufid + 1) > 0)
		return true;

	return false;
}
6835
/*
 * ConditionalLockBufferForCleanup - as above, but don't wait to get the lock
 *
 * We won't loop, but just check once to see if the pin count is OK. If
 * not, return false with no lock held.
 */
bool
{
		refcount;

	/* see AIO related comment in LockBufferForCleanup() */

	if (BufferIsLocal(buffer))
	{
		refcount = LocalRefCount[-buffer - 1];
		/* There should be exactly one pin */
		Assert(refcount > 0);
		if (refcount != 1)
			return false;
		/* Nobody else to wait for */
		return true;
	}

	/* There should be exactly one local pin */
	refcount = GetPrivateRefCount(buffer);
	Assert(refcount);
	if (refcount != 1)
		return false;

	/* Try to acquire lock */
		return false;

	/* re-check the pin count now that we hold the lock */
	Assert(refcount > 0);
	if (refcount == 1)
	{
		/* Successfully acquired exclusive lock with pincount 1 */
		return true;
	}

	/* Failed, so release the lock */
	return false;
}
6891
/*
 * IsBufferCleanupOK - as above, but we already have the lock
 *
 * Check whether it's OK to perform cleanup on a buffer we've already
 * locked. If we observe that the pin count is 1, our exclusive lock
 * happens to be a cleanup lock, and we can proceed with anything that
 * would have been allowable had we sought a cleanup lock originally.
 */
bool
{
	/* see AIO related comment in LockBufferForCleanup() */

	if (BufferIsLocal(buffer))
	{
		/* There should be exactly one pin */
		if (LocalRefCount[-buffer - 1] != 1)
			return false;
		/* Nobody else to wait for */
		return true;
	}

	/* There should be exactly one local pin */
	if (GetPrivateRefCount(buffer) != 1)
		return false;

	/* caller must hold exclusive lock on buffer */

	{
		/* pincount is OK. */
		return true;
	}

	/* somebody else holds a pin, cleanup not allowed */
	return false;
}
6941
/*
 * Helper for BufferBeginSetHintBits() and BufferSetHintBits16().
 *
 * This checks if the current lock mode already suffices to allow hint bits
 * being set and, if not, whether the current lock can be upgraded.
 *
 * Updates *lockstate when returning true.
 */
static inline bool
{
	if (ref == NULL)
		elog(ERROR, "buffer is not pinned");

	mode = ref->data.lockmode;
	if (mode == BUFFER_LOCK_UNLOCK)
		elog(ERROR, "buffer is not locked");

	/* we're done if we are already holding a sufficient lock level */
	{
		return true;
	}

	/*
	 * We are only holding a share lock right now, try to upgrade it to
	 * SHARE_EXCLUSIVE.
	 */

	/* retry loop: exits on successful upgrade or on a conflicting holder */
	while (true)
	{

		/*
		 * Can't upgrade if somebody else holds the lock in exclusive or
		 * share-exclusive mode.
		 */
		{
			return false;
		}

		/* currently held lock state */

		/* new lock level */

		{
			/* upgrade succeeded; record the new mode in our local tracking */
			ref->data.lockmode = BUFFER_LOCK_SHARE_EXCLUSIVE;

			return true;
		}
	}
}
7011
/*
 * Try to acquire the right to set hint bits on the buffer.
 *
 * To be allowed to set hint bits, this backend needs to hold either a
 * share-exclusive or an exclusive lock. In case this backend only holds a
 * share lock, this function will try to upgrade the lock to
 * share-exclusive. The caller is only allowed to set hint bits if true is
 * returned.
 *
 * Once BufferBeginSetHintBits() has returned true, hint bits may be set
 * without further calls to BufferBeginSetHintBits(), until the buffer is
 * unlocked.
 *
 *
 * Requiring a share-exclusive lock to set hint bits prevents setting hint
 * bits on buffers that are currently being written out, which could corrupt
 * the checksum on the page. Flushing buffers also requires a share-exclusive
 * lock.
 *
 * Due to a lock >= share-exclusive being required to set hint bits, only one
 * backend can set hint bits at a time. Allowing multiple backends to set hint
 * bits would require more complicated locking: For setting hint bits we'd
 * need to store the count of backends currently setting hint bits, for I/O we
 * would need another lock-level conflicting with the hint-setting
 * lock-level. Given that the share-exclusive lock for setting hint bits is
 * only held for a short time, that backends often would just set the same
 * hint bits and that the cost of occasionally not setting hint bits in hotly
 * accessed pages is fairly low, this seems like an acceptable tradeoff.
 */
bool
{
	if (BufferIsLocal(buffer))
	{
		/*
		 * NB: Will need to check if there is a write in progress, once it is
		 * possible for writes to be done asynchronously.
		 */
		return true;
	}

}
7060
/*
 * End a phase of setting hint bits on this buffer, started with
 * BufferBeginSetHintBits().
 *
 * Must be paired with a preceding successful BufferBeginSetHintBits() call.
 *
 * This would strictly speaking not be required (i.e. the caller could do
 * MarkBufferDirtyHint() if so desired), but allows us to perform some sanity
 * checks.
 */
void

/*
 * Try to set hint bits on a single 16bit value in a buffer.
 *
 * If hint bits are allowed to be set, set *ptr = val, try to mark the buffer
 * dirty and return true. Otherwise false is returned.
 *
 * *ptr needs to be a pointer to memory within the buffer.
 *
 * This is a bit faster than BufferBeginSetHintBits() /
 * BufferFinishSetHintBits() when setting hints once in a buffer, but slower
 * than the former when setting hint bits multiple times in the same buffer.
 */
bool
{
#ifdef USE_ASSERT_CHECKING
	char	   *page;

	/* verify that the address is on the page */
	page = BufferGetPage(buffer);
	Assert((char *) ptr >= page && (char *) ptr < (page + BLCKSZ));
#endif

	if (BufferIsLocal(buffer))
	{
		/* local buffers are backend-private, no locking protocol needed */
		*ptr = val;

		return true;
	}

	{
		/* permitted to set hint bits; store the value */
		*ptr = val;

		return true;
	}

	/* could not get permission to set hint bits */
	return false;
}
7127
7128
7129/*
7130 * Functions for buffer I/O handling
7131 *
7132 * Also note that these are used only for shared buffers, not local ones.
7133 */
7134
/*
 * WaitIO -- Block until the IO_IN_PROGRESS flag on 'buf' is cleared.
 */
static void
{
	/*
	 * Should never end up here with unsubmitted IO, as no AIO unaware code
	 * may be used while in batch mode and AIO aware code needs to have
	 * submitted all staged IO to avoid deadlocks & slowness.
	 */

	/* loop until no IO is in progress on the buffer */
	for (;;)
	{

		/*
		 * It may not be necessary to acquire the spinlock to check the flag
		 * here, but since this test is essential for correctness, we'd better
		 * play it safe.
		 */

		/*
		 * Copy the wait reference while holding the spinlock. This protects
		 * against a concurrent TerminateBufferIO() in another backend from
		 * clearing the wref while it's being read.
		 */
		iow = buf->io_wref;

		/* no IO in progress, we don't need to wait */
			break;

		/*
		 * The buffer has asynchronous IO in progress, wait for it to
		 * complete.
		 */
		if (pgaio_wref_valid(&iow))
		{

			/*
			 * The AIO subsystem internally uses condition variables and thus
			 * might remove this backend from the BufferDesc's CV. While that
			 * wouldn't cause a correctness issue (the first CV sleep just
			 * immediately returns if not already registered), it seems worth
			 * avoiding unnecessary loop iterations, given that we take care
			 * to do so at the start of the function.
			 */
			continue;
		}

		/* wait on BufferDesc->cv, e.g. for concurrent synchronous IO */
	}
}
7200
/*
 * StartSharedBufferIO: begin I/O on this buffer
 * (Assumptions)
 *	The buffer is Pinned
 *
 * In several scenarios the buffer may already be undergoing I/O in this or
 * another backend. How to best handle that depends on the caller's
 * situation. It might be appropriate to wait synchronously (e.g., because the
 * buffer is about to be invalidated); wait asynchronously, using the buffer's
 * IO wait reference (e.g., because the caller is doing readahead and doesn't
 * need the buffer to be ready immediately); or to not wait at all (e.g.,
 * because the caller is trying to combine IO for this buffer with another
 * buffer).
 *
 * How and whether to wait is controlled by the wait and io_wref
 * parameters. In detail:
 *
 * - If the caller passes a non-NULL io_wref and the buffer has an I/O wait
 *   reference, the *io_wref is set to the buffer's io_wref and
 *   BUFFER_IO_IN_PROGRESS is returned. This is done regardless of the wait
 *   parameter.
 *
 * - If the caller passes a NULL io_wref (i.e. the caller does not want to
 *   asynchronously wait for the completion of the IO), wait = false and the
 *   buffer is undergoing IO, BUFFER_IO_IN_PROGRESS is returned.
 *
 * - If wait = true and either the buffer does not have a wait reference,
 *   or the caller passes io_wref = NULL, WaitIO() is used to wait for the IO
 *   to complete. To avoid the potential of deadlocks and unnecessary delays,
 *   all staged I/O is submitted before waiting.
 *
 * Input operations are only attempted on buffers that are not BM_VALID, and
 * output operations only on buffers that are BM_VALID and BM_DIRTY, so we can
 * always tell if the work is already done. If no I/O is necessary,
 * BUFFER_IO_ALREADY_DONE is returned.
 *
 * If we successfully marked the buffer as BM_IO_IN_PROGRESS,
 * BUFFER_IO_READY_FOR_IO is returned.
 */
{
	/* loop until there is no concurrent IO, or until we join/report it */
	for (;;)
	{

			break;

		/* Join the existing IO */
		if (io_wref != NULL && pgaio_wref_valid(&buf->io_wref))
		{
			*io_wref = buf->io_wref;

			return BUFFER_IO_IN_PROGRESS;
		}
		else if (!wait)
		{
			return BUFFER_IO_IN_PROGRESS;
		}
		else
		{
			/*
			 * With wait = true, we always have to wait if the caller has
			 * passed io_wref = NULL.
			 *
			 * Even with io_wref != NULL, we have to wait if the buffer's wait
			 * ref is not valid but the IO is in progress, someone else
			 * started IO but hasn't set the wait ref yet. We have no choice
			 * but to wait until the IO completes.
			 */

			/*
			 * If this backend currently has staged IO, submit it before
			 * waiting for in-progress IO, to avoid potential deadlocks and
			 * unnecessary delays.
			 */

			WaitIO(buf);
		}
	}

	/* Once we get here, there is definitely no I/O active on this buffer */

	/* Check if someone else already did the I/O */
	if (forInput ? (buf_state & BM_VALID) : !(buf_state & BM_DIRTY))
	{
	}

	/*
	 * No IO in progress and not already done; we will start IO. It's possible
	 * that the IO was in progress but we're not done, because the IO errored
	 * out. We'll do the IO ourselves.
	 */
		0);

}
7314
/*
 * Wrapper around StartSharedBufferIO / StartLocalBufferIO. Only to be used
 * when the caller doesn't otherwise need to care about local vs shared. See
 * StartSharedBufferIO() for details.
 */
{
	/* dispatch on buffer type: negative Buffer values are backend-local */
	if (BufferIsLocal(buffer))
	{

		return StartLocalBufferIO(buf_hdr, forInput, wait, io_wref);
	}
	else
	{

		return StartSharedBufferIO(buf_hdr, forInput, wait, io_wref);
	}
}
7338
/*
 * TerminateBufferIO: release a buffer we were doing I/O on
 * (Assumptions)
 *	My process is executing IO for the buffer
 *	BM_IO_IN_PROGRESS bit is set for the buffer
 *	The buffer is Pinned
 *
 * If clear_dirty is true, we clear the buffer's BM_DIRTY flag. This is
 * appropriate when terminating a successful write.
 *
 * set_flag_bits gets ORed into the buffer's flags. It must include
 * BM_IO_ERROR in a failure case. For successful completion it could
 * be 0, or BM_VALID if we just finished reading in the page.
 *
 * If forget_owner is true, we release the buffer I/O from the current
 * resource owner. (forget_owner=false is used when the resource owner itself
 * is being released)
 */
void
				  bool forget_owner, bool release_aio)
{
	/* refcount adjustment to apply together with the flag changes */
	int			refcount_change = 0;

	/* Clear earlier errors, if this IO failed, it'll be marked again */

	if (clear_dirty)

	if (release_aio)
	{
		/* release ownership by the AIO subsystem */
		refcount_change = -1;
		pgaio_wref_clear(&buf->io_wref);
	}

	if (forget_owner)

	/*
	 * Support LockBufferForCleanup()
	 *
	 * We may have just released the last pin other than the waiter's. In most
	 * cases, this backend holds another pin on the buffer. But, if, for
	 * example, this backend is completing an IO issued by another backend, it
	 * may be time to wake the waiter.
	 */
}
7405
/*
 * AbortBufferIO: Clean up active buffer I/O after an error.
 *
 * All LWLocks & content locks we might have held have been released, but we
 * haven't yet released buffer pins, so the buffer is still pinned.
 *
 * If I/O was in progress, we always set BM_IO_ERROR, even though it's
 * possible the error condition wasn't related to the I/O.
 *
 * Note: this does not remove the buffer I/O from the resource owner.
 * That's correct when we're releasing the whole resource owner, but
 * beware if you use this in other contexts.
 */
static void
{
	if (!(buf_state & BM_VALID))
	{
		/* the failed IO was a read: page contents never became valid */
	}
	else
	{
		/* the failed IO was a write of an already-valid page */

		/* Issue notice if this is not the first failure... */
		if (buf_state & BM_IO_ERROR)
		{
			/* Buffer is pinned, so we can read tag without spinlock */
					errmsg("could not write block %u of %s",
						   buf_hdr->tag.blockNum,
							   BufTagGetForkNum(&buf_hdr->tag)).str),
					errdetail("Multiple failures --- write error might be permanent.")));
		}
	}

	TerminateBufferIO(buf_hdr, false, BM_IO_ERROR, false, false);
}
7454
/*
 * Error context callback for errors occurring during shared buffer writes.
 *
 * Appends the buffer's block/relation identity to the error report.
 */
static void
{
	/* Buffer is pinned, so we can read the tag without locking the spinlock */
	if (bufHdr != NULL)
		errcontext("writing block %u of relation \"%s\"",
				   bufHdr->tag.blockNum,
					   BufTagGetForkNum(&bufHdr->tag)).str);
}
7470
/*
 * Error context callback for errors occurring during local buffer writes.
 *
 * Appends the local buffer's block/relation identity to the error report.
 */
static void
{
	if (bufHdr != NULL)
		errcontext("writing block %u of relation \"%s\"",
				   bufHdr->tag.blockNum,
					   BufTagGetForkNum(&bufHdr->tag)).str);
}
7486
7487/*
7488 * RelFileLocator qsort/bsearch comparator; see RelFileLocatorEquals.
7489 */
7490static int
7491rlocator_comparator(const void *p1, const void *p2)
7492{
7493 RelFileLocator n1 = *(const RelFileLocator *) p1;
7494 RelFileLocator n2 = *(const RelFileLocator *) p2;
7495
7496 if (n1.relNumber < n2.relNumber)
7497 return -1;
7498 else if (n1.relNumber > n2.relNumber)
7499 return 1;
7500
7501 if (n1.dbOid < n2.dbOid)
7502 return -1;
7503 else if (n1.dbOid > n2.dbOid)
7504 return 1;
7505
7506 if (n1.spcOid < n2.spcOid)
7507 return -1;
7508 else if (n1.spcOid > n2.spcOid)
7509 return 1;
7510 else
7511 return 0;
7512}
7513
/*
 * Lock buffer header - set BM_LOCKED in buffer state.
 *
 * Returns the buffer state as of acquisition, with BM_LOCKED set.
 */
uint64
{
	while (true)
	{
		/*
		 * Always try once to acquire the lock directly, without setting up
		 * the spin-delay infrastructure. The work necessary for that shows up
		 * in profiles and is rarely necessary.
		 */
		if (likely(!(old_buf_state & BM_LOCKED)))
			break;				/* got lock */

		/* and then spin without atomic operations until lock is released */
		{

			while (old_buf_state & BM_LOCKED)
			{
			}
		}

		/*
		 * Retry. The lock might obviously already be re-acquired by the time
		 * we're attempting to get it again.
		 */
	}

	/* report the state we acquired the lock with, including BM_LOCKED */
	return old_buf_state | BM_LOCKED;
}
7557
/*
 * Wait until the BM_LOCKED flag isn't set anymore and return the buffer's
 * state at that point.
 *
 * Obviously the buffer could be locked by the time the value is returned, so
 * this is primarily useful in CAS style loops.
 */
{
	buf_state = pg_atomic_read_u64(&buf->state);

	/* spin (re-reading the state) while the header lock is held */
	while (buf_state & BM_LOCKED)
	{
		buf_state = pg_atomic_read_u64(&buf->state);
	}

	return buf_state;
}
7585
/*
 * BufferTag comparator.
 *
 * Returns qsort-style -1/0/1; relation identity is compared before the
 * block number.
 */
static inline int
{
	int			ret;

	if (ret != 0)
		return ret;

		return -1;
		return 1;

	/* finally, order by block number within the same relation/fork */
	if (ba->blockNum < bb->blockNum)
		return -1;
	if (ba->blockNum > bb->blockNum)
		return 1;

	return 0;
}
7616
/*
 * Comparator determining the writeout order in a checkpoint.
 *
 * It is important that tablespaces are compared first, the logic balancing
 * writes between tablespaces relies on it.
 *
 * Returns the usual qsort-style -1/0/1 result.
 */
static inline int
{
	/* compare tablespace */
	if (a->tsId < b->tsId)
		return -1;
	else if (a->tsId > b->tsId)
		return 1;
	/* compare relation */
	if (a->relNumber < b->relNumber)
		return -1;
	else if (a->relNumber > b->relNumber)
		return 1;
	/* compare fork */
	else if (a->forkNum < b->forkNum)
		return -1;
	else if (a->forkNum > b->forkNum)
		return 1;
	/* compare block number */
	else if (a->blockNum < b->blockNum)
		return -1;
	else if (a->blockNum > b->blockNum)
		return 1;
	/* equal page IDs are unlikely, but not impossible */
	return 0;
}
7649
/*
 * Comparator for a Min-Heap over the per-tablespace checkpoint completion
 * progress.
 *
 * The comparison is inverted (1 when a < b) so that the heap, which picks
 * the "largest" element, yields the tablespace with the least progress.
 */
static int
{
	/* we want a min-heap, so return 1 for the a < b */
	if (sa->progress < sb->progress)
		return 1;
	else if (sa->progress == sb->progress)
		return 0;
	else
		return -1;
}
7668
7669/*
7670 * Initialize a writeback context, discarding potential previous state.
7671 *
7672 * *max_pending is a pointer instead of an immediate value, so the coalesce
7673 * limits can easily changed by the GUC mechanism, and so calling code does
7674 * not have to check the current configuration. A value of 0 means that no
7675 * writeback control will be performed.
7676 */
7677void
7678WritebackContextInit(WritebackContext *context, int *max_pending)
7679{
7680 Assert(*max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
7681
7682 context->max_pending = max_pending;
7683 context->nr_pending = 0;
7684}
7685
/*
 * Add buffer to list of pending writeback requests.
 *
 * Once enough requests have accumulated (or writeback control is disabled),
 * the pending requests are issued to the OS.
 */
void
							  BufferTag *tag)
{
	PendingWriteback *pending;

	/*
	 * As pg_flush_data() doesn't do anything with fsync disabled, there's no
	 * point in tracking in that case.
	 */
		!enableFsync)
		return;

	/*
	 * Add buffer to the pending writeback array, unless writeback control is
	 * disabled.
	 */
	if (*wb_context->max_pending > 0)
	{

		pending = &wb_context->pending_writebacks[wb_context->nr_pending++];

		pending->tag = *tag;
	}

	/*
	 * Perform pending flushes if the writeback limit is exceeded. This
	 * includes the case where previously an item has been added, but control
	 * is now disabled.
	 */
	if (wb_context->nr_pending >= *wb_context->max_pending)
}
7724
/*
 * Instantiate a file-local sort_pending_writebacks() through the generic
 * sort template, ordering PendingWriteback entries by buffer tag (see
 * ST_COMPARE).  IssuePendingWritebacks() relies on this ordering to place
 * neighbouring blocks adjacently so their writeback requests can be merged.
 */
7725#define ST_SORT sort_pending_writebacks
7726#define ST_ELEMENT_TYPE PendingWriteback
7727#define ST_COMPARE(a, b) buffertag_comparator(&a->tag, &b->tag)
7728#define ST_SCOPE static
7729#define ST_DEFINE
7730#include "lib/sort_template.h"
7731
7732/*
7733 * Issue all pending writeback requests, previously scheduled with
7734 * ScheduleBufferTagForWriteback, to the OS.
7735 *
7736 * Because this is only used to improve the OSs IO scheduling we try to never
7737 * error out - it's just a hint.
7738 */
/*
 * NOTE(review): the parameter list, some local declarations (e.g. "cur",
 * "next", "reln", the IO-timing start value), the smgropen() lookup, and
 * parts of the "different file" condition and the final stats call were
 * lost in this extract -- confirm against the full source.
 */
7739void
7741{
7743 int i;
7744
 /* nothing scheduled, nothing to do */
7745 if (wb_context->nr_pending == 0)
7746 return;
7747
7748 /*
7749 * Executing the writes in-order can make them a lot faster, and allows to
7750 * merge writeback requests to consecutive blocks into larger writebacks.
7751 */
7752 sort_pending_writebacks(wb_context->pending_writebacks,
7753 wb_context->nr_pending);
7754
7756
7757 /*
7758 * Coalesce neighbouring writes, but nothing else. For that we iterate
7759 * through the, now sorted, array of pending flushes, and look forward to
7760 * find all neighbouring (or identical) writes.
7761 */
7762 for (i = 0; i < wb_context->nr_pending; i++)
7763 {
7767 int ahead;
7768 BufferTag tag;
7770 Size nblocks = 1;
7771
7772 cur = &wb_context->pending_writebacks[i];
7773 tag = cur->tag;
7775
7776 /*
7777 * Peek ahead, into following writeback requests, to see if they can
7778 * be combined with the current one.
7779 */
7780 for (ahead = 0; i + ahead + 1 < wb_context->nr_pending; ahead++)
7781 {
7782
7783 next = &wb_context->pending_writebacks[i + ahead + 1];
7784
7785 /* different file, stop */
7787 BufTagGetRelFileLocator(&next->tag)) ||
7788 BufTagGetForkNum(&cur->tag) != BufTagGetForkNum(&next->tag))
7789 break;
7790
7791 /* ok, block queued twice, skip */
7792 if (cur->tag.blockNum == next->tag.blockNum)
7793 continue;
7794
7795 /* only merge consecutive writes */
7796 if (cur->tag.blockNum + 1 != next->tag.blockNum)
7797 break;
7798
 /* extend the current range by one block and keep scanning */
7799 nblocks++;
7800 cur = next;
7801 }
7802
 /* skip over the entries just merged into this request */
7803 i += ahead;
7804
7805 /* and finally tell the kernel to write the data to storage */
7807 smgrwriteback(reln, BufTagGetForkNum(&tag), tag.blockNum, nblocks);
7808 }
7809
7810 /*
7811 * Assume that writeback requests are only issued for buffers containing
7812 * blocks of permanent relations.
7813 */
7815 IOOP_WRITEBACK, io_start, wb_context->nr_pending, 0);
7816
 /* all requests handed to the kernel; reset the queue */
7817 wb_context->nr_pending = 0;
7818}
7819
7820/* ResourceOwner callbacks */
/*
 * NOTE(review): this extract collapsed several callback definitions; the
 * function names, parameter lists, and a number of statements (the Datum
 * conversions, the local-buffer branch, the actual lock release and the
 * final UnpinBuffer call) are missing -- confirm against the full source.
 * From the psprintf() message below, the first surviving body belongs to
 * the "print buffer IO" debug callback.
 */
7822static void
7829
7830static char *
7832{
7834
7835 return psprintf("lost track of buffer IO on buffer %d", buffer);
7836}
7837
7838/*
7839 * Release buffer as part of resource owner cleanup. This will only be called
7840 * if the buffer is pinned. If this backend held the content lock at the time
7841 * of the error we also need to release that (note that it is not possible to
7842 * hold a content lock without a pin).
7843 */
7844static void
7846{
7848
7849 /* Like ReleaseBuffer, but don't call ResourceOwnerForgetBuffer */
7850 if (!BufferIsValid(buffer))
7851 elog(ERROR, "bad buffer ID: %d", buffer);
7852
7853 if (BufferIsLocal(buffer))
7855 else
7856 {
7858
7860
7861 /* not having a private refcount would imply resowner corruption */
7862 Assert(ref != NULL);
7863
7864 /*
7865 * If the buffer was locked at the time of the resowner release,
7866 * release the lock now. This should only happen after errors.
7867 */
7868 if (ref->data.lockmode != BUFFER_LOCK_UNLOCK)
7869 {
7871
7872 HOLD_INTERRUPTS(); /* match the upcoming RESUME_INTERRUPTS */
7874 }
7875
7877 }
7878}
7879
7880static char *
7885
7886/*
7887 * Helper function to evict unpinned buffer whose buffer header lock is
7888 * already acquired.
7889 */
/*
 * NOTE(review): missing from this extract: the signature, the buf_state
 * read, the refcount condition of the pin check, the flush call inside the
 * BM_DIRTY branch, and the call that performs the invalidation and assigns
 * "result" -- confirm against the full source.  On every early-return path
 * visible here the header spinlock is released before returning.
 */
7890static bool
7892{
7894 bool result;
7895
7896 *buffer_flushed = false;
7897
7900
 /* invalid buffer: nothing to evict */
7901 if ((buf_state & BM_VALID) == 0)
7902 {
7903 UnlockBufHdr(desc);
7904 return false;
7905 }
7906
7907 /* Check that it's not pinned already. */
7909 {
7910 UnlockBufHdr(desc);
7911 return false;
7912 }
7913
7914 PinBuffer_Locked(desc); /* releases spinlock */
7915
7916 /* If it was dirty, try to clean it once. */
7917 if (buf_state & BM_DIRTY)
7918 {
7920 *buffer_flushed = true;
7921 }
7922
7923 /* This will return false if it becomes dirty or someone else pins it. */
7925
7926 UnpinBuffer(desc);
7927
7928 return result;
7929}
7930
7931/*
7932 * Try to evict the current block in a shared buffer.
7933 *
7934 * This function is intended for testing/development use only!
7935 *
7936 * To succeed, the buffer must not be pinned on entry, so if the caller had a
7937 * particular block in mind, it might already have been replaced by some other
7938 * block by the time this function runs. It's also unpinned on return, so the
7939 * buffer might be occupied again by the time control is returned, potentially
7940 * even by the same block. This inherent raciness without other interlocking
7941 * makes the function unsuitable for non-testing usage.
7942 *
7943 * *buffer_flushed is set to true if the buffer was dirty and has been
7944 * flushed, false otherwise. However, *buffer_flushed=true does not
7945 * necessarily mean that we flushed the buffer, it could have been flushed by
7946 * someone else.
7947 *
7948 * Returns true if the buffer was valid and it has now been made invalid.
7949 * Returns false if it wasn't valid, if it couldn't be evicted due to a pin,
7950 * or if the buffer becomes dirty again while we're trying to write it out.
7951 */
/*
 * NOTE(review): the parameter list, sanity Assert(s), the pin-availability
 * preparation, and the tail call into the internal evict helper (which
 * supplies the return value) were dropped from this extract.
 */
7952bool
7954{
7955 BufferDesc *desc;
7956
7958
7959 /* Make sure we can pin the buffer. */
7962
 /* take the header spinlock; the helper releases it on all paths */
7963 desc = GetBufferDescriptor(buf - 1);
7964 LockBufHdr(desc);
7965
7967}
7968
7969/*
7970 * Try to evict all the shared buffers.
7971 *
7972 * This function is intended for testing/development use only! See
7973 * EvictUnpinnedBuffer().
7974 *
7975 * The buffers_* parameters are mandatory and indicate the total count of
7976 * buffers that:
7977 * - buffers_evicted - were evicted
7978 * - buffers_flushed - were flushed
7979 * - buffers_skipped - could not be evicted
7980 */
/*
 * NOTE(review): the parameter list, the buf_state declaration/read, the
 * pin-availability preparation, and the if-condition in front of the
 * "(*buffers_evicted)++" line (presumably the internal evict helper call)
 * were dropped from this extract.
 */
7981void
7984{
7985 *buffers_evicted = 0;
7986 *buffers_skipped = 0;
7987 *buffers_flushed = 0;
7988
 /* scan every shared buffer; Buffer IDs are 1-based */
7989 for (int buf = 1; buf <= NBuffers; buf++)
7990 {
7991 BufferDesc *desc = GetBufferDescriptor(buf - 1);
7993 bool buffer_flushed;
7994
7996
 /* unlocked precheck: skip buffers that aren't valid */
7998 if (!(buf_state & BM_VALID))
7999 continue;
8000
8003
8004 LockBufHdr(desc);
8005
8007 (*buffers_evicted)++;
8008 else
8009 (*buffers_skipped)++;
8010
8011 if (buffer_flushed)
8012 (*buffers_flushed)++;
8013 }
8014}
8015
8016/*
8017 * Try to evict all the shared buffers containing provided relation's pages.
8018 *
8019 * This function is intended for testing/development use only! See
8020 * EvictUnpinnedBuffer().
8021 *
8022 * The caller must hold at least AccessShareLock on the relation to prevent
8023 * the relation from being dropped.
8024 *
8025 * The buffers_* parameters are mandatory and indicate the total count of
8026 * buffers that:
8027 * - buffers_evicted - were evicted
8028 * - buffers_flushed - were flushed
8029 * - buffers_skipped - could not be evicted
8030 */
/*
 * NOTE(review): the parameter list, an Assert on the relation, the
 * buf_state declaration, the relation-match part of both the unlocked
 * precheck and the locked recheck, the pin-availability preparation, and
 * the evict-helper call in front of "(*buffers_evicted)++" were dropped
 * from this extract.
 */
8031void
8034{
8036
8037 *buffers_skipped = 0;
8038 *buffers_evicted = 0;
8039 *buffers_flushed = 0;
8040
8041 for (int buf = 1; buf <= NBuffers; buf++)
8042 {
8043 BufferDesc *desc = GetBufferDescriptor(buf - 1);
8045 bool buffer_flushed;
8046
8048
8049 /* An unlocked precheck should be safe and saves some cycles. */
8050 if ((buf_state & BM_VALID) == 0 ||
8052 continue;
8053
8054 /* Make sure we can pin the buffer. */
8057
8058 buf_state = LockBufHdr(desc);
8059
8060 /* recheck, could have changed without the lock */
8061 if ((buf_state & BM_VALID) == 0 ||
8063 {
8064 UnlockBufHdr(desc);
8065 continue;
8066 }
8067
8069 (*buffers_evicted)++;
8070 else
8071 (*buffers_skipped)++;
8072
8073 if (buffer_flushed)
8074 (*buffers_flushed)++;
8075 }
8076}
8077
8078/*
8079 * Helper function to mark unpinned buffer dirty whose buffer header lock is
8080 * already acquired.
8081 */
/*
 * NOTE(review): missing from this extract: the signature, the buf_state
 * read, the refcount condition of the pin check, and the lock acquisition /
 * MarkBufferDirty-style call at the top of the !BM_DIRTY branch (whose
 * matching BufferLockUnlock() survives below) -- confirm against the full
 * source.
 */
8082static bool
8085{
8087 bool result = false;
8088
8089 *buffer_already_dirty = false;
8090
8093
 /* invalid buffer: nothing to dirty */
8094 if ((buf_state & BM_VALID) == 0)
8095 {
8096 UnlockBufHdr(desc);
8097 return false;
8098 }
8099
8100 /* Check that it's not pinned already. */
8102 {
8103 UnlockBufHdr(desc);
8104 return false;
8105 }
8106
8107 /* Pin the buffer and then release the buffer spinlock */
8108 PinBuffer_Locked(desc);
8109
8110 /* If it was not already dirty, mark it as dirty. */
8111 if (!(buf_state & BM_DIRTY))
8112 {
8115 result = true;
8116 BufferLockUnlock(buf, desc);
8117 }
8118 else
8119 *buffer_already_dirty = true;
8120
8121 UnpinBuffer(desc);
8122
8123 return result;
8124}
8125
8126/*
8127 * Try to mark the provided shared buffer as dirty.
8128 *
8129 * This function is intended for testing/development use only!
8130 *
8131 * Same as EvictUnpinnedBuffer() but with MarkBufferDirty() call inside.
8132 *
8133 * The buffer_already_dirty parameter is mandatory and indicate if the buffer
8134 * could not be dirtied because it is already dirty.
8135 *
8136 * Returns true if the buffer has successfully been marked as dirty.
8137 */
/*
 * NOTE(review): the parameter list, sanity Assert(s), pin-availability
 * preparation, the helper call that assigns "buffer_dirtied", and the
 * Assert documented by the "Both can not be true" comment were dropped
 * from this extract.
 */
8138bool
8140{
8141 BufferDesc *desc;
8142 bool buffer_dirtied = false;
8143
8145
8146 /* Make sure we can pin the buffer. */
8149
 /* take the header spinlock; the helper releases it on all paths */
8150 desc = GetBufferDescriptor(buf - 1);
8151 LockBufHdr(desc);
8152
8154 /* Both can not be true at the same time */
8156
8157 return buffer_dirtied;
8158}
8159
8160/*
8161 * Try to mark all the shared buffers containing provided relation's pages as
8162 * dirty.
8163 *
8164 * This function is intended for testing/development use only! See
8165 * MarkDirtyUnpinnedBuffer().
8166 *
8167 * The buffers_* parameters are mandatory and indicate the total count of
8168 * buffers that:
8169 * - buffers_dirtied - were dirtied
8170 * - buffers_already_dirty - were already dirty
8171 * - buffers_skipped - could not be dirtied because of a reason different
8172 * than a buffer being already dirty.
8173 */
/*
 * NOTE(review): the parameter list, the buffers_already_dirty reset, local
 * declarations (buf_state, buffer_already_dirty), the relation-match part
 * of the precheck/recheck conditions, the pin-availability preparation,
 * and the dirty-helper call in front of "(*buffers_dirtied)++" were
 * dropped from this extract.
 */
8174void
8179{
8181
8182 *buffers_dirtied = 0;
8184 *buffers_skipped = 0;
8185
8186 for (int buf = 1; buf <= NBuffers; buf++)
8187 {
8188 BufferDesc *desc = GetBufferDescriptor(buf - 1);
8191
8193
8194 /* An unlocked precheck should be safe and saves some cycles. */
8195 if ((buf_state & BM_VALID) == 0 ||
8197 continue;
8198
8199 /* Make sure we can pin the buffer. */
8202
8203 buf_state = LockBufHdr(desc);
8204
8205 /* recheck, could have changed without the lock */
8206 if ((buf_state & BM_VALID) == 0 ||
8208 {
8209 UnlockBufHdr(desc);
8210 continue;
8211 }
8212
8214 (*buffers_dirtied)++;
8215 else if (buffer_already_dirty)
8216 (*buffers_already_dirty)++;
8217 else
8218 (*buffers_skipped)++;
8219 }
8220}
8221
8222/*
8223 * Try to mark all the shared buffers as dirty.
8224 *
8225 * This function is intended for testing/development use only! See
8226 * MarkDirtyUnpinnedBuffer().
8227 *
8228 * See MarkDirtyRelUnpinnedBuffers() above for details about the buffers_*
8229 * parameters.
8230 */
/*
 * NOTE(review): the parameter list, the buffers_already_dirty reset, the
 * buf_state/buffer_already_dirty declarations, the BM_VALID precheck read,
 * the pin-availability preparation, and the dirty-helper call in front of
 * "(*buffers_dirtied)++" were dropped from this extract.
 */
8231void
8235{
8236 *buffers_dirtied = 0;
8238 *buffers_skipped = 0;
8239
 /* scan every shared buffer; Buffer IDs are 1-based */
8240 for (int buf = 1; buf <= NBuffers; buf++)
8241 {
8242 BufferDesc *desc = GetBufferDescriptor(buf - 1);
8245
8247
8249 if (!(buf_state & BM_VALID))
8250 continue;
8251
8254
8255 LockBufHdr(desc);
8256
8258 (*buffers_dirtied)++;
8259 else if (buffer_already_dirty)
8260 (*buffers_already_dirty)++;
8261 else
8262 (*buffers_skipped)++;
8263 }
8264}
8265
8266/*
8267 * Generic implementation of the AIO handle staging callback for readv/writev
8268 * on local/shared buffers.
8269 *
8270 * Each readv/writev can target multiple buffers. The buffers have already
8271 * been registered with the IO handle.
8272 *
8273 * To make the IO ready for execution ("staging"), we need to ensure that the
8274 * targeted buffers are in an appropriate state while the IO is ongoing. For
8275 * that the AIO subsystem needs to have its own buffer pin, otherwise an error
8276 * in this backend could lead to this backend's buffer pin being released as
8277 * part of error handling, which in turn could lead to the buffer being
8278 * replaced while IO is ongoing.
8279 */
/*
 * NOTE(review): the function signature (name and the ioh/is_write/is_temp
 * parameters implied by the body), several local declarations (first,
 * io_ref), the pgaio wait-ref setup, the buf_state reads, the state
 * Asserts, the pin transfers, and the local/shared branch bodies were
 * dropped from this extract -- confirm against the full source.
 */
8282{
8283 uint64 *io_data;
8284 uint8 handle_data_len;
8287
 /* handle data holds the buffer IDs registered for this vectored IO */
8288 io_data = pgaio_io_get_handle_data(ioh, &handle_data_len);
8291
8292 /* iterate over all buffers affected by the vectored readv/writev */
8293 for (int i = 0; i < handle_data_len; i++)
8294 {
8296 BufferDesc *buf_hdr = is_temp ?
8300
8301 /*
8302 * Check that all the buffers are actually ones that could conceivably
8303 * be done in one IO, i.e. are sequential. This is the last
8304 * buffer-aware code before IO is actually executed and confusion
8305 * about which buffers are targeted by IO can be hard to debug, making
8306 * it worth doing extra-paranoid checks.
8307 */
8308 if (i == 0)
8309 first = buf_hdr->tag;
8310 else
8311 {
8312 Assert(buf_hdr->tag.relNumber == first.relNumber);
8313 Assert(buf_hdr->tag.blockNum == first.blockNum + i);
8314 }
8315
8316 if (is_temp)
8318 else
8320
8321 /* verify the buffer is in the expected state */
8323 if (is_write)
8324 {
8327 }
8328 else
8329 {
8332 }
8333
8334 /* temp buffers don't use BM_IO_IN_PROGRESS */
8335 if (!is_temp)
8337
8339
8340 /*
8341 * Reflect that the buffer is now owned by the AIO subsystem.
8342 *
8343 * For local buffers: This can't be done just via LocalRefCount, as
8344 * one might initially think, as this backend could error out while
8345 * AIO is still in progress, releasing all the pins by the backend
8346 * itself.
8347 *
8348 * This pin is released again in TerminateBufferIO().
8349 */
8350 buf_hdr->io_wref = io_ref;
8351
8352 if (is_temp)
8353 {
8356 }
8357 else
8359
8360 /*
8361 * Ensure the content lock that prevents buffer modifications while
8362 * the buffer is being written out is not released early due to an
8363 * error.
8364 */
8365 if (is_write && !is_temp)
8366 {
8368
8369 /*
8370 * Lock is now owned by AIO subsystem.
8371 */
8373 }
8374
8375 /*
8376 * Stop tracking this buffer via the resowner - the AIO system now
8377 * keeps track.
8378 */
8379 if (!is_temp)
8381 }
8382}
8383
8384/*
8385 * Decode readv errors as encoded by buffer_readv_encode_error().
8386 */
/*
 * NOTE(review): the remaining output parameters of the signature and the
 * three masked extractions (zeroed_or_error_count, checkfail_count,
 * first_off -- the mirror image of the shifts in the encode function) were
 * dropped from this extract.
 */
8388 bool *zeroed_any,
8389 bool *ignored_any,
8394{
8395 uint32 rem_error = result.error_data;
8396
8397 /* see static asserts in buffer_readv_encode_error */
8398#define READV_COUNT_BITS 7
8399#define READV_COUNT_MASK ((1 << READV_COUNT_BITS) - 1)
8400
 /* bit 0: any page zeroed */
8401 *zeroed_any = rem_error & 1;
8402 rem_error >>= 1;
8403
 /* bit 1: any checksum failure ignored */
8404 *ignored_any = rem_error & 1;
8405 rem_error >>= 1;
8406
8409
8412
8415}
8416
8417/*
8418 * Helper to encode errors for buffer_readv_complete()
8419 *
8420 * Errors are encoded as follows:
8421 * - bit 0 indicates whether any page was zeroed (1) or not (0)
8422 * - bit 1 indicates whether any checksum failure was ignored (1) or not (0)
8423 * - next READV_COUNT_BITS bits indicate the number of errored or zeroed pages
8424 * - next READV_COUNT_BITS bits indicate the number of checksum failures
8425 * - next READV_COUNT_BITS bits indicate the first offset of the first page
8426 * that was errored or zeroed or, if no errors/zeroes, the first ignored
8427 * checksum
8428 */
/*
 * NOTE(review): several parameters (result pointer, the error/zeroed/
 * checkfail counts and first-offset inputs), the StaticAssert condition
 * lines, the first_off selection assignments, and the decode-side
 * cross-check calls in the assert block were dropped from this extract.
 */
8429static inline void
8431 bool is_temp,
8432 bool zeroed_any,
8433 bool ignored_any,
8440{
8441
8442 uint8 shift = 0;
8446
8448 "PG_IOV_MAX is bigger than reserved space for error data");
8450 "PGAIO_RESULT_ERROR_BITS is insufficient for buffer_readv");
8451
8452 /*
8453 * We only have space to encode one offset - but luckily that's good
8454 * enough. If there is an error, the error is the interesting offset, same
8455 * with a zeroed buffer vs an ignored buffer.
8456 */
8457 if (error_count > 0)
8459 else if (zeroed_count > 0)
8461 else
8463
8464 Assert(!zeroed_any || error_count == 0);
8465
8466 result->error_data = 0;
8467
 /* pack the fields in the layout documented above, lowest bits first */
8468 result->error_data |= zeroed_any << shift;
8469 shift += 1;
8470
8471 result->error_data |= ignored_any << shift;
8472 shift += 1;
8473
8474 result->error_data |= ((uint32) zeroed_or_error_count) << shift;
8475 shift += READV_COUNT_BITS;
8476
8477 result->error_data |= ((uint32) checkfail_count) << shift;
8478 shift += READV_COUNT_BITS;
8479
8480 result->error_data |= ((uint32) first_off) << shift;
8481 shift += READV_COUNT_BITS;
8482
 /* route the result to the matching (local vs shared) completion callback */
8483 result->id = is_temp ? PGAIO_HCB_LOCAL_BUFFER_READV :
8485
 /* hard errors get ERROR status, everything else only warrants a warning */
8486 if (error_count > 0)
8487 result->status = PGAIO_RS_ERROR;
8488 else
8489 result->status = PGAIO_RS_WARNING;
8490
8491 /*
8492 * The encoding is complicated enough to warrant cross-checking it against
8493 * the decode function.
8494 */
8495#ifdef USE_ASSERT_CHECKING
8496 {
8497 bool zeroed_any_2,
8502
8507 &first_off_2);
8513 }
8514#endif
8515
8516#undef READV_COUNT_BITS
8517#undef READV_COUNT_MASK
8518}
8519
8520/*
8521 * Helper for AIO readv completion callbacks, supporting both shared and temp
8522 * buffers. Gets called once for each buffer in a multi-page read.
8523 */
/*
 * NOTE(review): the signature's leading parameters (target data, buffer
 * offset, buffer), some declarations (set_flag_bits, checksum-failure
 * flag wiring), the piv_flags initialization, the Valgrind make-defined /
 * make-noaccess calls, the error-report encode call arguments, and parts
 * of the condition guarding the immediate log were dropped from this
 * extract -- confirm against the full source.
 */
8526 uint8 flags, bool failed, bool is_temp,
8527 bool *buffer_invalid,
8528 bool *failed_checksum,
8529 bool *ignored_checksum,
8530 bool *zeroed_buffer)
8531{
8532 BufferDesc *buf_hdr = is_temp ?
8535 BufferTag tag = buf_hdr->tag;
8536 char *bufdata = BufferGetBlock(buffer);
8538 int piv_flags;
8539
8540 /* check that the buffer is in the expected state for a read */
8541#ifdef USE_ASSERT_CHECKING
8542 {
8544
8547 /* temp buffers don't use BM_IO_IN_PROGRESS */
8548 if (!is_temp)
8551 }
8552#endif
8553
 /* all outcome flags default to false; set below as appropriate */
8554 *buffer_invalid = false;
8555 *failed_checksum = false;
8556 *ignored_checksum = false;
8557 *zeroed_buffer = false;
8558
8559 /*
8560 * We ask PageIsVerified() to only log the message about checksum errors,
8561 * as the completion might be run in any backend (or IO workers). We will
8562 * report checksum errors in buffer_readv_report().
8563 */
8565
8566 /* the local zero_damaged_pages may differ from the definer's */
8569
8570 /*
8571 * If the buffers are marked for zero on error, we want to log that in
8572 * case of a checksum failure.
8573 */
8574 if (flags & READ_BUFFERS_ZERO_ON_ERROR)
8576
8577 /* Check for garbage data. */
8578 if (!failed)
8579 {
8580 /*
8581 * If the buffer is not currently pinned by this backend, e.g. because
8582 * we're completing this IO after an error, the buffer data will have
8583 * been marked as inaccessible when the buffer was unpinned. The AIO
8584 * subsystem holds a pin, but that doesn't prevent the buffer from
8585 * having been marked as inaccessible. The completion might also be
8586 * executed in a different process.
8587 */
8588#ifdef USE_VALGRIND
8589 if (!BufferIsPinned(buffer))
8591#endif
8592
8593 if (!PageIsVerified((Page) bufdata, tag.blockNum, piv_flags,
8595 {
 /* invalid page: either zero it out or mark the whole buffer failed */
8596 if (flags & READ_BUFFERS_ZERO_ON_ERROR)
8597 {
8598 memset(bufdata, 0, BLCKSZ);
8599 *zeroed_buffer = true;
8600 }
8601 else
8602 {
8603 *buffer_invalid = true;
8604 /* mark buffer as having failed */
8605 failed = true;
8606 }
8607 }
8608 else if (*failed_checksum)
8609 *ignored_checksum = true;
8610
8611 /* undo what we did above */
8612#ifdef USE_VALGRIND
8613 if (!BufferIsPinned(buffer))
8615#endif
8616
8617 /*
8618 * Immediately log a message about the invalid page, but only to the
8619 * server log. The reason to do so immediately is that this may be
8620 * executed in a different backend than the one that originated the
8621 * request. The reason to do so immediately is that the originator
8622 * might not process the query result immediately (because it is busy
8623 * doing another part of query processing) or at all (e.g. if it was
8624 * cancelled or errored out due to another IO also failing). The
8625 * definer of the IO will emit an ERROR or WARNING when processing the
8626 * IO's results
8627 *
8628 * To avoid duplicating the code to emit these log messages, we reuse
8629 * buffer_readv_report().
8630 */
8632 {
8633 PgAioResult result_one = {0};
8634
8639 *zeroed_buffer ? 1 : 0,
8640 *failed_checksum ? 1 : 0,
8643 }
8644 }
8645
8646 /* Terminate I/O and set BM_VALID. */
8647 set_flag_bits = failed ? BM_IO_ERROR : BM_VALID;
8648 if (is_temp)
8650 else
8651 TerminateBufferIO(buf_hdr, false, set_flag_bits, false, true);
8652
8653 /*
8654 * Call the BUFFER_READ_DONE tracepoint in the callback, even though the
8655 * callback may not be executed in the same backend that called
8656 * BUFFER_READ_START. The alternative would be to defer calling the
8657 * tracepoint to a later point (e.g. the local completion callback for
8658 * shared buffer reads), which seems even less helpful.
8659 */
8661 tag.blockNum,
8662 tag.spcOid,
8663 tag.dbOid,
8664 tag.relNumber,
8666 false);
8667}
8668
8669/*
8670 * Perform completion handling of a single AIO read. This read may cover
8671 * multiple blocks / buffers.
8672 *
8673 * Shared between shared and local buffers, to reduce code duplication.
8674 */
/*
 * NOTE(review): the signature's leading parameters (ioh, prior_result),
 * the declarations of result/td/first_error_off and related locals, the
 * temp-buffer owner check, the per-buffer Buffer extraction from io_data,
 * the first-offset bookkeeping lines, the encode-error call arguments, and
 * the checksum-failure stats report were dropped from this extract.
 */
8677 uint8 cb_data, bool is_temp)
8678{
8684 uint8 error_count = 0;
8685 uint8 zeroed_count = 0;
8686 uint8 ignored_count = 0;
8688 uint64 *io_data;
8689 uint8 handle_data_len;
8690
 /* a temp-buffer IO must carry a temp target, and vice versa */
8691 if (is_temp)
8692 {
8693 Assert(td->smgr.is_temp);
8695 }
8696 else
8697 Assert(!td->smgr.is_temp);
8698
8699 /*
8700 * Iterate over all the buffers affected by this IO and call the
8701 * per-buffer completion function for each buffer.
8702 */
8703 io_data = pgaio_io_get_handle_data(ioh, &handle_data_len);
8704 for (uint8 buf_off = 0; buf_off < handle_data_len; buf_off++)
8705 {
8707 bool failed;
8708 bool failed_verification = false;
8709 bool failed_checksum = false;
8710 bool zeroed_buffer = false;
8711 bool ignored_checksum = false;
8712
8714
8715 /*
8716 * If the entire I/O failed on a lower-level, each buffer needs to be
8717 * marked as failed. In case of a partial read, the first few buffers
8718 * may be ok.
8719 */
8720 failed =
8722 || prior_result.result <= buf_off;
8723
8724 buffer_readv_complete_one(td, buf_off, buf, cb_data, failed, is_temp,
8728 &zeroed_buffer);
8729
8730 /*
8731 * Track information about the number of different kinds of error
8732 * conditions across all pages, as there can be multiple pages failing
8733 * verification as part of one IO.
8734 */
8737 if (zeroed_buffer && zeroed_count++ == 0)
8739 if (ignored_checksum && ignored_count++ == 0)
8741 if (failed_checksum)
8743 }
8744
8745 /*
8746 * If the smgr read succeeded [partially] and page verification failed for
8747 * some of the pages, adjust the IO's result state appropriately.
8748 */
8749 if (prior_result.status != PGAIO_RS_ERROR &&
8750 (error_count > 0 || ignored_count > 0 || zeroed_count > 0))
8751 {
8753 zeroed_count > 0, ignored_count > 0,
8758 }
8759
8760 /*
8761 * For shared relations this reporting is done in
8762 * shared_buffer_readv_complete_local().
8763 */
8764 if (is_temp && checkfail_count > 0)
8767
8768 return result;
8769}
8770
8771/*
8772 * AIO error reporting callback for aio_shared_buffer_readv_cb and
8773 * aio_local_buffer_readv_cb.
8774 *
8775 * The error is encoded / decoded in buffer_readv_encode_error() /
8776 * buffer_readv_decode_error().
8777 */
/*
 * NOTE(review): the signature's leading parameters (result, target data),
 * the relpath lookup arguments, several declarations (ignored_any,
 * zeroed_or_error_count, checkfail_count, affected_count), the decode
 * call's leading arguments, the affected_count selection in each branch,
 * and the errcode/errdetail/errhint arguments of the final ereport were
 * dropped from this extract.
 */
8778static void
8780 int elevel)
8781{
8782 int nblocks = td->smgr.nblocks;
8783 BlockNumber first = td->smgr.blockNum;
8784 BlockNumber last = first + nblocks - 1;
8787 RelPathStr rpath =
8789 bool zeroed_any,
8793 first_off;
8795 const char *msg_one,
8796 *msg_mult,
8797 *det_mult,
8798 *hint_mult;
8799
8803 &first_off);
8804
8805 /*
8806 * Treat a read that had both zeroed buffers *and* ignored checksums as a
8807 * special case, it's too irregular to be emitted the same way as the
8808 * other cases.
8809 */
8810 if (zeroed_any && ignored_any)
8811 {
8813 Assert(nblocks > 1); /* same block can't be both zeroed and ignored */
8814 Assert(result.status != PGAIO_RS_ERROR);
8816
8817 ereport(elevel,
8819 errmsg("zeroing %u page(s) and ignoring %u checksum failure(s) among blocks %u..%u of relation \"%s\"",
8820 affected_count, checkfail_count, first, last, rpath.str),
8821 affected_count > 1 ?
8822 errdetail("Block %u held the first zeroed page.",
8823 first + first_off) : 0,
8824 errhint_plural("See server log for details about the other %d invalid block.",
8825 "See server log for details about the other %d invalid blocks.",
8828 return;
8829 }
8830
8831 /*
8832 * The other messages are highly repetitive. To avoid duplicating a long
8833 * and complicated ereport(), gather the translated format strings
8834 * separately and then do one common ereport.
8835 */
8836 if (result.status == PGAIO_RS_ERROR)
8837 {
8838 Assert(!zeroed_any); /* can't have invalid pages when zeroing them */
8840 msg_one = _("invalid page in block %u of relation \"%s\"");
8841 msg_mult = _("%u invalid pages among blocks %u..%u of relation \"%s\"");
8842 det_mult = _("Block %u held the first invalid page.");
8843 hint_mult = _("See server log for the other %u invalid block(s).");
8844 }
8845 else if (zeroed_any && !ignored_any)
8846 {
8848 msg_one = _("invalid page in block %u of relation \"%s\"; zeroing out page");
8849 msg_mult = _("zeroing out %u invalid pages among blocks %u..%u of relation \"%s\"");
8850 det_mult = _("Block %u held the first zeroed page.");
8851 hint_mult = _("See server log for the other %u zeroed block(s).");
8852 }
8853 else if (!zeroed_any && ignored_any)
8854 {
8856 msg_one = _("ignoring checksum failure in block %u of relation \"%s\"");
8857 msg_mult = _("ignoring %u checksum failures among blocks %u..%u of relation \"%s\"");
8858 det_mult = _("Block %u held the first ignored page.");
8859 hint_mult = _("See server log for the other %u ignored block(s).");
8860 }
8861 else
8863
 /* single common report: singular form for one block, plural otherwise */
8864 ereport(elevel,
8866 affected_count == 1 ?
8867 errmsg_internal(msg_one, first + first_off, rpath.str) :
8868 errmsg_internal(msg_mult, affected_count, first, last, rpath.str),
8871}
8872
/*
 * NOTE(review): the bodies of the staging/completion wrappers around
 * buffer_stage_common() / buffer_readv_complete() and the .stage /
 * .complete_shared members of the callback structs were collapsed by this
 * extract; only the surviving members are visible below.
 */
8873static void
8878
8879static PgAioResult
8885
8886/*
8887 * We need a backend-local completion callback for shared buffers, to be able
8888 * to report checksum errors correctly. Unfortunately that can only safely
8889 * happen if the reporting backend has previously called
8890 * pgstat_prepare_report_checksum_failure(), which we can only guarantee in
8891 * the backend that started the IO. Hence this callback.
8892 */
8893static PgAioResult
8923
8924static void
8929
8930static PgAioResult
8936
8937/* readv callback is passed READ_BUFFERS_* flags as callback data */
8940 .complete_shared = shared_buffer_readv_complete,
8941 /* need a local callback to report checksum failures */
8942 .complete_local = shared_buffer_readv_complete_local,
8943 .report = buffer_readv_report,
8944};
8945
8946/* readv callback is passed READ_BUFFERS_* flags as callback data */
8949
8950 /*
8951 * Note that this, in contrast to the shared_buffers case, uses
8952 * complete_local, as only the issuing backend has access to the required
8953 * datastructures. This is important in case the IO completion may be
8954 * consumed incidentally by another backend.
8955 */
8956 .complete_local = local_buffer_readv_complete,
8957 .report = buffer_readv_report,
8958};
int io_method
Definition aio.c:74
bool pgaio_wref_valid(PgAioWaitRef *iow)
Definition aio.c:971
PgAioHandle * pgaio_io_acquire(struct ResourceOwnerData *resowner, PgAioReturn *ret)
Definition aio.c:162
void pgaio_wref_clear(PgAioWaitRef *iow)
Definition aio.c:964
void pgaio_io_get_wref(PgAioHandle *ioh, PgAioWaitRef *iow)
Definition aio.c:366
void pgaio_io_set_flag(PgAioHandle *ioh, PgAioHandleFlags flag)
Definition aio.c:330
bool pgaio_have_staged(void)
Definition aio.c:1117
bool pgaio_wref_check_done(PgAioWaitRef *iow)
Definition aio.c:1005
ProcNumber pgaio_io_get_owner(PgAioHandle *ioh)
Definition aio.c:355
void pgaio_submit_staged(void)
Definition aio.c:1133
void pgaio_wref_wait(PgAioWaitRef *iow)
Definition aio.c:991
void pgaio_io_release(PgAioHandle *ioh)
Definition aio.c:240
PgAioHandle * pgaio_io_acquire_nb(struct ResourceOwnerData *resowner, PgAioReturn *ret)
Definition aio.c:188
@ PGAIO_HCB_LOCAL_BUFFER_READV
Definition aio.h:200
@ PGAIO_HCB_SHARED_BUFFER_READV
Definition aio.h:198
@ IOMETHOD_SYNC
Definition aio.h:34
@ PGAIO_HF_SYNCHRONOUS
Definition aio.h:70
@ PGAIO_HF_REFERENCES_LOCAL
Definition aio.h:60
void pgaio_io_set_handle_data_32(PgAioHandle *ioh, uint32 *data, uint8 len)
void pgaio_io_register_callbacks(PgAioHandle *ioh, PgAioHandleCallbackID cb_id, uint8 cb_data)
uint64 * pgaio_io_get_handle_data(PgAioHandle *ioh, uint8 *len)
void pgaio_result_report(PgAioResult result, const PgAioTargetData *target_data, int elevel)
PgAioTargetData * pgaio_io_get_target_data(PgAioHandle *ioh)
Definition aio_target.c:73
#define PGAIO_RESULT_ERROR_BITS
Definition aio_types.h:98
PgAioResultStatus
Definition aio_types.h:79
@ PGAIO_RS_OK
Definition aio_types.h:81
@ PGAIO_RS_UNKNOWN
Definition aio_types.h:80
@ PGAIO_RS_PARTIAL
Definition aio_types.h:82
@ PGAIO_RS_ERROR
Definition aio_types.h:84
@ PGAIO_RS_WARNING
Definition aio_types.h:83
static bool pg_atomic_compare_exchange_u64(volatile pg_atomic_uint64 *ptr, uint64 *expected, uint64 newval)
Definition atomics.h:522
#define pg_write_barrier()
Definition atomics.h:155
static void pg_atomic_unlocked_write_u64(volatile pg_atomic_uint64 *ptr, uint64 val)
Definition atomics.h:494
static uint64 pg_atomic_sub_fetch_u64(volatile pg_atomic_uint64 *ptr, int64 sub_)
Definition atomics.h:578
static uint64 pg_atomic_fetch_and_u64(volatile pg_atomic_uint64 *ptr, uint64 and_)
Definition atomics.h:551
static uint64 pg_atomic_fetch_or_u64(volatile pg_atomic_uint64 *ptr, uint64 or_)
Definition atomics.h:560
static uint64 pg_atomic_read_u64(volatile pg_atomic_uint64 *ptr)
Definition atomics.h:467
static uint64 pg_atomic_fetch_sub_u64(volatile pg_atomic_uint64 *ptr, int64 sub_)
Definition atomics.h:541
bool TimestampDifferenceExceeds(TimestampTz start_time, TimestampTz stop_time, int msec)
Definition timestamp.c:1775
TimestampTz GetCurrentTimestamp(void)
Definition timestamp.c:1639
Datum now(PG_FUNCTION_ARGS)
Definition timestamp.c:1603
int BgWriterDelay
Definition bgwriter.c:59
void binaryheap_build(binaryheap *heap)
Definition binaryheap.c:136
void binaryheap_replace_first(binaryheap *heap, bh_node_type d)
Definition binaryheap.c:253
bh_node_type binaryheap_first(binaryheap *heap)
Definition binaryheap.c:175
bh_node_type binaryheap_remove_first(binaryheap *heap)
Definition binaryheap.c:190
void binaryheap_free(binaryheap *heap)
Definition binaryheap.c:73
void binaryheap_add_unordered(binaryheap *heap, bh_node_type d)
Definition binaryheap.c:114
binaryheap * binaryheap_allocate(int capacity, binaryheap_comparator compare, void *arg)
Definition binaryheap.c:37
#define binaryheap_empty(h)
Definition binaryheap.h:65
uint32 BlockNumber
Definition block.h:31
#define InvalidBlockNumber
Definition block.h:33
static bool BlockNumberIsValid(BlockNumber blockNumber)
Definition block.h:71
#define MaxBlockNumber
Definition block.h:35
static int32 next
Definition blutils.c:225
int Buffer
Definition buf.h:23
#define InvalidBuffer
Definition buf.h:25
#define BufferIsLocal(buffer)
Definition buf.h:37
CkptSortItem * CkptBufferIds
Definition buf_init.c:28
WritebackContext BackendWritebackContext
Definition buf_init.c:27
#define BM_MAX_USAGE_COUNT
static void InitBufferTag(BufferTag *tag, const RelFileLocator *rlocator, ForkNumber forkNum, BlockNumber blockNum)
#define BM_TAG_VALID
#define BM_PERMANENT
#define BUF_USAGECOUNT_MASK
static ForkNumber BufTagGetForkNum(const BufferTag *tag)
#define BM_LOCK_VAL_SHARED
#define BUF_REFCOUNT_ONE
static ConditionVariable * BufferDescriptorGetIOCV(const BufferDesc *bdesc)
static uint64 UnlockBufHdrExt(BufferDesc *desc, uint64 old_buf_state, uint64 set_bits, uint64 unset_bits, int refcount_change)
static bool BufferTagsEqual(const BufferTag *tag1, const BufferTag *tag2)
static RelFileNumber BufTagGetRelNumber(const BufferTag *tag)
static void UnlockBufHdr(BufferDesc *desc)
#define BM_LOCK_VAL_EXCLUSIVE
static bool BufTagMatchesRelFileLocator(const BufferTag *tag, const RelFileLocator *rlocator)
#define BUF_FLAG_MASK
#define BM_PIN_COUNT_WAITER
#define BM_DIRTY
#define BM_LOCK_WAKE_IN_PROGRESS
static void ResourceOwnerRememberBufferIO(ResourceOwner owner, Buffer buffer)
#define BM_LOCKED
#define BUF_STATE_GET_USAGECOUNT(state)
#define BM_LOCK_MASK
StartBufferIOResult
@ BUFFER_IO_IN_PROGRESS
@ BUFFER_IO_ALREADY_DONE
@ BUFFER_IO_READY_FOR_IO
static void ResourceOwnerForgetBufferIO(ResourceOwner owner, Buffer buffer)
#define BM_IO_IN_PROGRESS
static void ClearBufferTag(BufferTag *tag)
static void ResourceOwnerRememberBuffer(ResourceOwner owner, Buffer buffer)
static void ResourceOwnerForgetBuffer(ResourceOwner owner, Buffer buffer)
#define BUF_USAGECOUNT_ONE
#define BUF_STATE_GET_REFCOUNT(state)
static LWLock * BufMappingPartitionLock(uint32 hashcode)
static RelFileLocator BufTagGetRelFileLocator(const BufferTag *tag)
#define BM_VALID
#define BM_LOCK_HAS_WAITERS
#define BM_IO_ERROR
static BufferDesc * GetLocalBufferDescriptor(uint32 id)
static BufferDesc * GetBufferDescriptor(uint32 id)
#define BM_LOCK_VAL_SHARE_EXCLUSIVE
static Buffer BufferDescriptorGetBuffer(const BufferDesc *bdesc)
#define BM_CHECKPOINT_NEEDED
void BufTableDelete(BufferTag *tagPtr, uint32 hashcode)
Definition buf_table.c:154
int BufTableLookup(BufferTag *tagPtr, uint32 hashcode)
Definition buf_table.c:96
uint32 BufTableHashCode(BufferTag *tagPtr)
Definition buf_table.c:84
int BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id)
Definition buf_table.c:124
bool track_io_timing
Definition bufmgr.c:192
static void ResOwnerReleaseBuffer(Datum res)
Definition bufmgr.c:7845
void CheckBufferIsPinnedOnce(Buffer buffer)
Definition bufmgr.c:6637
void FlushRelationsAllBuffers(SMgrRelation *smgrs, int nrels)
Definition bufmgr.c:5250
void IncrBufferRefCount(Buffer buffer)
Definition bufmgr.c:5670
static void MarkSharedBufferDirtyHint(Buffer buffer, BufferDesc *bufHdr, uint64 lockstate, bool buffer_std)
Definition bufmgr.c:5696
void DropDatabaseBuffers(Oid dbid)
Definition bufmgr.c:5115
bool BufferSetHintBits16(uint16 *ptr, uint16 val, Buffer buffer)
Definition bufmgr.c:7093
static int ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b)
Definition bufmgr.c:7624
static pg_attribute_always_inline PgAioResult buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data, bool is_temp)
Definition bufmgr.c:8676
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition bufmgr.c:4446
static PrivateRefCountEntry * NewPrivateRefCountEntry(Buffer buffer)
Definition bufmgr.c:388
static Buffer PrivateRefCountArrayKeys[REFCOUNT_ARRAY_ENTRIES]
Definition bufmgr.c:263
void BufferFinishSetHintBits(Buffer buffer, bool mark_dirty, bool buffer_std)
Definition bufmgr.c:7070
void DropRelationBuffers(SMgrRelation smgr_reln, ForkNumber *forkNum, int nforks, BlockNumber *firstDelBlock)
Definition bufmgr.c:4765
Buffer ReleaseAndReadBuffer(Buffer buffer, Relation relation, BlockNumber blockNum)
Definition bufmgr.c:3212
static int ReservedRefCountSlot
Definition bufmgr.c:268
static PgAioResult shared_buffer_readv_complete_local(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
Definition bufmgr.c:8894
static pg_attribute_always_inline bool StartReadBuffersImpl(ReadBuffersOperation *operation, Buffer *buffers, BlockNumber blockNum, int *nblocks, int flags, bool allow_forwarding)
Definition bufmgr.c:1368
static void CheckReadBuffersOperation(ReadBuffersOperation *operation, bool is_complete)
Definition bufmgr.c:1647
PrefetchBufferResult PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
Definition bufmgr.c:787
static uint32 PrivateRefCountClock
Definition bufmgr.c:267
static void FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object, IOContext io_context)
Definition bufmgr.c:4503
static void ResOwnerReleaseBufferIO(Datum res)
Definition bufmgr.c:7823
static PgAioResult local_buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
Definition bufmgr.c:8931
bool StartReadBuffers(ReadBuffersOperation *operation, Buffer *buffers, BlockNumber blockNum, int *nblocks, int flags)
Definition bufmgr.c:1609
void EvictAllUnpinnedBuffers(int32 *buffers_evicted, int32 *buffers_flushed, int32 *buffers_skipped)
Definition bufmgr.c:7982
int io_max_combine_limit
Definition bufmgr.c:217
static void FlushUnlockedBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object, IOContext io_context)
Definition bufmgr.c:4626
const ResourceOwnerDesc buffer_io_resowner_desc
Definition bufmgr.c:285
bool zero_damaged_pages
Definition bufmgr.c:189
#define BUF_DROP_FULL_SCAN_THRESHOLD
Definition bufmgr.c:95
static void PinBuffer_Locked(BufferDesc *buf)
Definition bufmgr.c:3388
void EvictRelUnpinnedBuffers(Relation rel, int32 *buffers_evicted, int32 *buffers_flushed, int32 *buffers_skipped)
Definition bufmgr.c:8032
static pg_attribute_always_inline void buffer_readv_complete_one(PgAioTargetData *td, uint8 buf_off, Buffer buffer, uint8 flags, bool failed, bool is_temp, bool *buffer_invalid, bool *failed_checksum, bool *ignored_checksum, bool *zeroed_buffer)
Definition bufmgr.c:8525
static char * ResOwnerPrintBuffer(Datum res)
Definition bufmgr.c:7881
static void BufferLockAcquire(Buffer buffer, BufferDesc *buf_hdr, BufferLockMode mode)
Definition bufmgr.c:5898
static bool BufferLockAttempt(BufferDesc *buf_hdr, BufferLockMode mode)
Definition bufmgr.c:6096
static int buffertag_comparator(const BufferTag *ba, const BufferTag *bb)
Definition bufmgr.c:7590
bool IsBufferCleanupOK(Buffer buffer)
Definition bufmgr.c:6901
#define BufferGetLSN(bufHdr)
Definition bufmgr.c:77
static char * ResOwnerPrintBufferIO(Datum res)
Definition bufmgr.c:7831
bool BufferIsLockedByMeInMode(Buffer buffer, BufferLockMode mode)
Definition bufmgr.c:3087
static void BufferLockDisown(Buffer buffer, BufferDesc *buf_hdr)
Definition bufmgr.c:6271
Buffer ExtendBufferedRel(BufferManagerRelation bmr, ForkNumber forkNum, BufferAccessStrategy strategy, uint32 flags)
Definition bufmgr.c:979
void AtEOXact_Buffers(bool isCommit)
Definition bufmgr.c:4199
static void AbortBufferIO(Buffer buffer)
Definition bufmgr.c:7420
const PgAioHandleCallbacks aio_shared_buffer_readv_cb
Definition bufmgr.c:8938
static void BufferLockUnlock(Buffer buffer, BufferDesc *buf_hdr)
Definition bufmgr.c:6014
BlockNumber ExtendBufferedRelBy(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, Buffer *buffers, uint32 *extended_by)
Definition bufmgr.c:1011
static Buffer ReadBuffer_common(Relation rel, SMgrRelation smgr, char smgr_persistence, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
Definition bufmgr.c:1285
static void BufferLockWakeup(BufferDesc *buf_hdr, bool unlocked)
Definition bufmgr.c:6305
static void ProcessReadBuffersResult(ReadBuffersOperation *operation)
Definition bufmgr.c:1705
pg_noinline uint64 WaitBufHdrUnlocked(BufferDesc *buf)
Definition bufmgr.c:7566
static void ZeroAndLockBuffer(Buffer buffer, ReadBufferMode mode, bool already_valid)
Definition bufmgr.c:1146
static BufferDesc * BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, bool *foundPtr, IOContext io_context)
Definition bufmgr.c:2188
static void CheckForBufferLeaks(void)
Definition bufmgr.c:4263
void CreateAndCopyRelationData(RelFileLocator src_rlocator, RelFileLocator dst_rlocator, bool permanent)
Definition bufmgr.c:5462
void DropRelationsAllBuffers(SMgrRelation *smgr_reln, int nlocators)
Definition bufmgr.c:4885
static void BufferLockDequeueSelf(BufferDesc *buf_hdr)
Definition bufmgr.c:6203
static int rlocator_comparator(const void *p1, const void *p2)
Definition bufmgr.c:7491
static bool BufferLockHeldByMeInMode(BufferDesc *buf_hdr, BufferLockMode mode)
Definition bufmgr.c:6525
Buffer ExtendBufferedRelTo(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, BlockNumber extend_to, ReadBufferMode mode)
Definition bufmgr.c:1040
static pg_attribute_always_inline void TrackBufferHit(IOObject io_object, IOContext io_context, Relation rel, char persistence, SMgrRelation smgr, ForkNumber forknum, BlockNumber blocknum)
Definition bufmgr.c:1674
const PgAioHandleCallbacks aio_local_buffer_readv_cb
Definition bufmgr.c:8947
static bool InvalidateVictimBuffer(BufferDesc *buf_hdr)
Definition bufmgr.c:2462
static void AtProcExit_Buffers(int code, Datum arg)
Definition bufmgr.c:4245
int io_combine_limit_guc
Definition bufmgr.c:216
static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg)
Definition bufmgr.c:7655
void BufferGetTag(Buffer buffer, RelFileLocator *rlocator, ForkNumber *forknum, BlockNumber *blknum)
Definition bufmgr.c:4467
#define BufHdrGetBlock(bufHdr)
Definition bufmgr.c:76
static bool BufferLockConditional(Buffer buffer, BufferDesc *buf_hdr, BufferLockMode mode)
Definition bufmgr.c:6050
const ResourceOwnerDesc buffer_resowner_desc
Definition bufmgr.c:294
static refcount_hash * PrivateRefCountHash
Definition bufmgr.c:265
static pg_attribute_always_inline void buffer_stage_common(PgAioHandle *ioh, bool is_write, bool is_temp)
Definition bufmgr.c:8281
void UnlockBuffer(Buffer buffer)
Definition bufmgr.c:6558
#define BUF_REUSABLE
Definition bufmgr.c:85
static void local_buffer_write_error_callback(void *arg)
Definition bufmgr.c:7475
static void BufferSync(int flags)
Definition bufmgr.c:3552
static bool AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress)
Definition bufmgr.c:1929
static void local_buffer_readv_stage(PgAioHandle *ioh, uint8 cb_data)
Definition bufmgr.c:8925
char * DebugPrintBufferRefcount(Buffer buffer)
Definition bufmgr.c:4389
void CheckPointBuffers(int flags)
Definition bufmgr.c:4432
bool BufferIsDirty(Buffer buffer)
Definition bufmgr.c:3114
static uint32 MaxProportionalPins
Definition bufmgr.c:271
static void BufferLockQueueSelf(BufferDesc *buf_hdr, BufferLockMode mode)
Definition bufmgr.c:6163
static BlockNumber ExtendBufferedRelShared(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
Definition bufmgr.c:2786
static int BufferLockDisownInternal(Buffer buffer, BufferDesc *buf_hdr)
Definition bufmgr.c:6285
bool BgBufferSync(WritebackContext *wb_context)
Definition bufmgr.c:3831
uint64 LockBufHdr(BufferDesc *desc)
Definition bufmgr.c:7518
static void WakePinCountWaiter(BufferDesc *buf)
Definition bufmgr.c:3420
static pg_attribute_always_inline Buffer PinBufferForBlock(Relation rel, SMgrRelation smgr, char persistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, IOObject io_object, IOContext io_context, bool *foundPtr)
Definition bufmgr.c:1232
bool BufferIsPermanent(Buffer buffer)
Definition bufmgr.c:4677
void MarkDirtyAllUnpinnedBuffers(int32 *buffers_dirtied, int32 *buffers_already_dirty, int32 *buffers_skipped)
Definition bufmgr.c:8232
#define REFCOUNT_ARRAY_ENTRIES
Definition bufmgr.c:145
static void shared_buffer_readv_stage(PgAioHandle *ioh, uint8 cb_data)
Definition bufmgr.c:8874
static void BufferLockProcessRelease(BufferDesc *buf_hdr, BufferLockMode mode, uint64 lockstate)
Definition bufmgr.c:6470
void UnlockBuffers(void)
Definition bufmgr.c:5852
PrefetchBufferResult PrefetchSharedBuffer(SMgrRelation smgr_reln, ForkNumber forkNum, BlockNumber blockNum)
Definition bufmgr.c:697
static PgAioResult shared_buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
Definition bufmgr.c:8880
static Buffer GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
Definition bufmgr.c:2539
bool ConditionalLockBuffer(Buffer buffer)
Definition bufmgr.c:6617
BlockNumber RelationGetNumberOfBlocksInFork(Relation relation, ForkNumber forkNum)
Definition bufmgr.c:4645
StartBufferIOResult StartSharedBufferIO(BufferDesc *buf, bool forInput, bool wait, PgAioWaitRef *io_wref)
Definition bufmgr.c:7241
int bgwriter_flush_after
Definition bufmgr.c:224
void ReleaseBuffer(Buffer buffer)
Definition bufmgr.c:5586
bool BufferIsLockedByMe(Buffer buffer)
Definition bufmgr.c:3061
static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy, bool skip_if_not_valid)
Definition bufmgr.c:3272
static void FindAndDropRelationBuffers(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber nForkBlock, BlockNumber firstDelBlock)
Definition bufmgr.c:5055
XLogRecPtr BufferGetLSNAtomic(Buffer buffer)
Definition bufmgr.c:4713
void LockBufferInternal(Buffer buffer, BufferLockMode mode)
Definition bufmgr.c:6574
bool HoldingBufferPinThatDelaysRecovery(void)
Definition bufmgr.c:6817
bool MarkDirtyUnpinnedBuffer(Buffer buf, bool *buffer_already_dirty)
Definition bufmgr.c:8139
int checkpoint_flush_after
Definition bufmgr.c:223
void UnlockReleaseBuffer(Buffer buffer)
Definition bufmgr.c:5603
static void UnpinBufferNoOwner(BufferDesc *buf)
Definition bufmgr.c:3465
static void shared_buffer_write_error_callback(void *arg)
Definition bufmgr.c:7459
void ScheduleBufferTagForWriteback(WritebackContext *wb_context, IOContext io_context, BufferTag *tag)
Definition bufmgr.c:7690
void WritebackContextInit(WritebackContext *context, int *max_pending)
Definition bufmgr.c:7678
StartBufferIOResult StartBufferIO(Buffer buffer, bool forInput, bool wait, PgAioWaitRef *io_wref)
Definition bufmgr.c:7321
void MarkBufferDirty(Buffer buffer)
Definition bufmgr.c:3147
#define BufferIsPinned(bufnum)
Definition bufmgr.c:599
double bgwriter_lru_multiplier
Definition bufmgr.c:191
static bool EvictUnpinnedBufferInternal(BufferDesc *desc, bool *buffer_flushed)
Definition bufmgr.c:7891
bool BufferBeginSetHintBits(Buffer buffer)
Definition bufmgr.c:7042
int backend_flush_after
Definition bufmgr.c:225
void LimitAdditionalPins(uint32 *additional_pins)
Definition bufmgr.c:2724
static void buffer_readv_report(PgAioResult result, const PgAioTargetData *td, int elevel)
Definition bufmgr.c:8779
static void ReservePrivateRefCountEntry(void)
Definition bufmgr.c:309
static BufferDesc * PinCountWaitBuf
Definition bufmgr.c:228
static pg_noinline PrivateRefCountEntry * GetPrivateRefCountEntrySlow(Buffer buffer, bool do_move)
Definition bufmgr.c:419
static int32 GetPrivateRefCount(Buffer buffer)
Definition bufmgr.c:542
bool WaitReadBuffers(ReadBuffersOperation *operation)
Definition bufmgr.c:1750
static BlockNumber ExtendBufferedRelCommon(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
Definition bufmgr.c:2742
void LockBufferForCleanup(Buffer buffer)
Definition bufmgr.c:6670
static bool SharedBufferBeginSetHintBits(Buffer buffer, BufferDesc *buf_hdr, uint64 *lockstate)
Definition bufmgr.c:6951
void MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
Definition bufmgr.c:5821
void FlushRelationBuffers(Relation rel)
Definition bufmgr.c:5162
#define READV_COUNT_BITS
static uint64 BufferLockReleaseSub(BufferLockMode mode)
Definition bufmgr.c:6441
void IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context)
Definition bufmgr.c:7740
static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref)
Definition bufmgr.c:565
bool EvictUnpinnedBuffer(Buffer buf, bool *buffer_flushed)
Definition bufmgr.c:7953
Buffer ReadBufferWithoutRelcache(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy, bool permanent)
Definition bufmgr.c:963
bool ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum, Buffer recent_buffer)
Definition bufmgr.c:818
#define RELS_BSEARCH_THRESHOLD
Definition bufmgr.c:87
int maintenance_io_concurrency
Definition bufmgr.c:207
static void UnpinBuffer(BufferDesc *buf)
Definition bufmgr.c:3456
void FlushDatabaseBuffers(Oid dbid)
Definition bufmgr.c:5526
static void InvalidateBuffer(BufferDesc *buf)
Definition bufmgr.c:2361
static void RelationCopyStorageUsingBuffer(RelFileLocator srclocator, RelFileLocator dstlocator, ForkNumber forkNum, bool permanent)
Definition bufmgr.c:5348
int effective_io_concurrency
Definition bufmgr.c:200
static PrivateRefCountEntry * GetPrivateRefCountEntry(Buffer buffer, bool do_move)
Definition bufmgr.c:507
static bool BufferLockHeldByMe(BufferDesc *buf_hdr)
Definition bufmgr.c:6543
void TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint64 set_flag_bits, bool forget_owner, bool release_aio)
Definition bufmgr.c:7358
void MarkDirtyRelUnpinnedBuffers(Relation rel, int32 *buffers_dirtied, int32 *buffers_already_dirty, int32 *buffers_skipped)
Definition bufmgr.c:8175
bool StartReadBuffer(ReadBuffersOperation *operation, Buffer *buffer, BlockNumber blocknum, int flags)
Definition bufmgr.c:1628
Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
Definition bufmgr.c:926
static bool MarkDirtyUnpinnedBufferInternal(Buffer buf, BufferDesc *desc, bool *buffer_already_dirty)
Definition bufmgr.c:8083
static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES]
Definition bufmgr.c:264
static void buffer_readv_decode_error(PgAioResult result, bool *zeroed_any, bool *ignored_any, uint8 *zeroed_or_error_count, uint8 *checkfail_count, uint8 *first_off)
Definition bufmgr.c:8388
#define READV_COUNT_MASK
static int PrivateRefCountEntryLast
Definition bufmgr.c:269
int io_combine_limit
Definition bufmgr.c:215
void InitBufferManagerAccess(void)
Definition bufmgr.c:4216
static void buffer_readv_encode_error(PgAioResult *result, bool is_temp, bool zeroed_any, bool ignored_any, uint8 error_count, uint8 zeroed_count, uint8 checkfail_count, uint8 first_error_off, uint8 first_zeroed_off, uint8 first_ignored_off)
Definition bufmgr.c:8430
static int SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
Definition bufmgr.c:4129
uint32 GetAdditionalPinLimit(void)
Definition bufmgr.c:2698
Buffer ReadBuffer(Relation reln, BlockNumber blockNum)
Definition bufmgr.c:879
void TrackNewBufferPin(Buffer buf)
Definition bufmgr.c:3512
static int32 PrivateRefCountOverflowed
Definition bufmgr.c:266
bool ConditionalLockBufferForCleanup(Buffer buffer)
Definition bufmgr.c:6843
int bgwriter_lru_maxpages
Definition bufmgr.c:190
uint32 GetPinLimit(void)
Definition bufmgr.c:2686
static void WaitIO(BufferDesc *buf)
Definition bufmgr.c:7139
#define BUF_WRITTEN
Definition bufmgr.c:84
void FlushOneBuffer(Buffer buffer)
Definition bufmgr.c:5566
@ BAS_BULKREAD
Definition bufmgr.h:37
@ BAS_BULKWRITE
Definition bufmgr.h:39
#define P_NEW
Definition bufmgr.h:200
#define READ_BUFFERS_ZERO_ON_ERROR
Definition bufmgr.h:122
static Page BufferGetPage(Buffer buffer)
Definition bufmgr.h:468
#define DEFAULT_IO_COMBINE_LIMIT
Definition bufmgr.h:176
static Block BufferGetBlock(Buffer buffer)
Definition bufmgr.h:435
#define READ_BUFFERS_ISSUE_ADVICE
Definition bufmgr.h:124
BufferLockMode
Definition bufmgr.h:206
@ BUFFER_LOCK_SHARE_EXCLUSIVE
Definition bufmgr.h:217
@ BUFFER_LOCK_SHARE
Definition bufmgr.h:212
@ BUFFER_LOCK_EXCLUSIVE
Definition bufmgr.h:222
@ BUFFER_LOCK_UNLOCK
Definition bufmgr.h:207
#define MAX_IO_COMBINE_LIMIT
Definition bufmgr.h:175
#define DEFAULT_EFFECTIVE_IO_CONCURRENCY
Definition bufmgr.h:170
#define READ_BUFFERS_IGNORE_CHECKSUM_FAILURES
Definition bufmgr.h:126
#define DEFAULT_MAINTENANCE_IO_CONCURRENCY
Definition bufmgr.h:171
void * Block
Definition bufmgr.h:26
static void LockBuffer(Buffer buffer, BufferLockMode mode)
Definition bufmgr.h:334
#define BMR_GET_SMGR(bmr)
Definition bufmgr.h:118
@ EB_LOCK_TARGET
Definition bufmgr.h:93
@ EB_CLEAR_SIZE_CACHE
Definition bufmgr.h:90
@ EB_PERFORMING_RECOVERY
Definition bufmgr.h:78
@ EB_CREATE_FORK_IF_NEEDED
Definition bufmgr.h:84
@ EB_SKIP_EXTENSION_LOCK
Definition bufmgr.h:75
@ EB_LOCK_FIRST
Definition bufmgr.h:87
#define READ_BUFFERS_SYNCHRONOUSLY
Definition bufmgr.h:128
ReadBufferMode
Definition bufmgr.h:45
@ RBM_ZERO_ON_ERROR
Definition bufmgr.h:51
@ RBM_ZERO_AND_CLEANUP_LOCK
Definition bufmgr.h:49
@ RBM_ZERO_AND_LOCK
Definition bufmgr.h:47
@ RBM_NORMAL
Definition bufmgr.h:46
#define BMR_REL(p_rel)
Definition bufmgr.h:114
static bool BufferIsValid(Buffer bufnum)
Definition bufmgr.h:419
bool ignore_checksum_failure
Definition bufpage.c:27
void PageSetChecksum(Page page, BlockNumber blkno)
Definition bufpage.c:1518
bool PageIsVerified(PageData *page, BlockNumber blkno, int flags, bool *checksum_failure_p)
Definition bufpage.c:94
#define PIV_LOG_LOG
Definition bufpage.h:500
#define PIV_ZERO_BUFFERS_ON_ERROR
Definition bufpage.h:502
static bool PageIsNew(const PageData *page)
Definition bufpage.h:258
static void PageSetLSN(Page page, XLogRecPtr lsn)
Definition bufpage.h:416
PageData * Page
Definition bufpage.h:81
static XLogRecPtr PageGetLSN(const PageData *page)
Definition bufpage.h:410
#define PIV_IGNORE_CHECKSUM_FAILURE
Definition bufpage.h:501
#define pg_noinline
Definition c.h:321
#define likely(x)
Definition c.h:437
uint8_t uint8
Definition c.h:622
#define PG_USED_FOR_ASSERTS_ONLY
Definition c.h:249
#define Max(x, y)
Definition c.h:1085
#define Assert(condition)
Definition c.h:943
double float8
Definition c.h:714
#define pg_attribute_always_inline
Definition c.h:305
int16_t int16
Definition c.h:619
int32_t int32
Definition c.h:620
uint64_t uint64
Definition c.h:625
uint16_t uint16
Definition c.h:623
#define pg_unreachable()
Definition c.h:367
#define unlikely(x)
Definition c.h:438
uint32_t uint32
Definition c.h:624
#define lengthof(array)
Definition c.h:873
#define MemSet(start, val, len)
Definition c.h:1107
#define StaticAssertDecl(condition, errmessage)
Definition c.h:1008
size_t Size
Definition c.h:689
bool IsCatalogRelationOid(Oid relid)
Definition catalog.c:121
bool IsCatalogTextUniqueIndexOid(Oid relid)
Definition catalog.c:156
void CheckpointWriteDelay(int flags, double progress)
uint32 result
memcpy(sums, checksumBaseOffsets, sizeof(checksumBaseOffsets))
bool ConditionVariableCancelSleep(void)
void ConditionVariableBroadcast(ConditionVariable *cv)
void ConditionVariablePrepareToSleep(ConditionVariable *cv)
void ConditionVariableSleep(ConditionVariable *cv, uint32 wait_event_info)
static DataChecksumsWorkerOperation operation
int64 TimestampTz
Definition timestamp.h:39
struct cursor * cur
Definition ecpg.c:29
Datum arg
Definition elog.c:1322
ErrorContextCallback * error_context_stack
Definition elog.c:99
int errcode(int sqlerrcode)
Definition elog.c:874
#define _(x)
Definition elog.c:95
int int errdetail_internal(const char *fmt,...) pg_attribute_printf(1
#define errcontext
Definition elog.h:200
int int int errhint_plural(const char *fmt_singular, const char *fmt_plural, unsigned long n,...) pg_attribute_printf(1
#define DEBUG3
Definition elog.h:29
int errdetail(const char *fmt,...) pg_attribute_printf(1
#define LOG_SERVER_ONLY
Definition elog.h:33
int int errmsg_internal(const char *fmt,...) pg_attribute_printf(1
#define WARNING
Definition elog.h:37
#define DEBUG2
Definition elog.h:30
#define PANIC
Definition elog.h:44
#define DEBUG1
Definition elog.h:31
#define ERROR
Definition elog.h:40
#define elog(elevel,...)
Definition elog.h:228
#define ereport(elevel,...)
Definition elog.h:152
int int errhint_internal(const char *fmt,...) pg_attribute_printf(1
int io_direct_flags
Definition fd.c:172
#define IO_DIRECT_DATA
Definition fd.h:54
#define palloc_array(type, count)
Definition fe_memutils.h:76
int StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
Definition freelist.c:331
BufferAccessStrategy GetAccessStrategy(BufferAccessStrategyType btype)
Definition freelist.c:426
BufferDesc * StrategyGetBuffer(BufferAccessStrategy strategy, uint64 *buf_state, bool *from_ring)
Definition freelist.c:184
void FreeAccessStrategy(BufferAccessStrategy strategy)
Definition freelist.c:608
IOContext IOContextForStrategy(BufferAccessStrategy strategy)
Definition freelist.c:712
bool StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_ring)
Definition freelist.c:752
volatile sig_atomic_t ProcSignalBarrierPending
Definition globals.c:40
int NBuffers
Definition globals.c:144
bool enableFsync
Definition globals.c:131
ProcNumber MyProcNumber
Definition globals.c:92
int VacuumCostPageMiss
Definition globals.c:155
bool VacuumCostActive
Definition globals.c:161
bool IsUnderPostmaster
Definition globals.c:122
int VacuumCostBalance
Definition globals.c:160
int MaxBackends
Definition globals.c:149
int VacuumCostPageDirty
Definition globals.c:156
int VacuumCostPageHit
Definition globals.c:154
const char * str
long val
Definition informix.c:689
BufferUsage pgBufferUsage
Definition instrument.c:25
void on_shmem_exit(pg_on_exit_callback function, Datum arg)
Definition ipc.c:372
int b
Definition isn.c:74
int a
Definition isn.c:73
int j
Definition isn.c:78
int i
Definition isn.c:77
void LockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition lmgr.c:424
void UnlockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition lmgr.c:474
int32 * LocalRefCount
Definition localbuf.c:49
void FlushLocalBuffer(BufferDesc *bufHdr, SMgrRelation reln)
Definition localbuf.c:183
void UnpinLocalBuffer(Buffer buffer)
Definition localbuf.c:857
void AtEOXact_LocalBuffers(bool isCommit)
Definition localbuf.c:1019
StartBufferIOResult StartLocalBufferIO(BufferDesc *bufHdr, bool forInput, bool wait, PgAioWaitRef *io_wref)
Definition localbuf.c:524
void AtProcExit_LocalBuffers(void)
Definition localbuf.c:1030
bool PinLocalBuffer(BufferDesc *buf_hdr, bool adjust_usagecount)
Definition localbuf.c:821
void MarkLocalBufferDirty(Buffer buffer)
Definition localbuf.c:492
void DropRelationAllLocalBuffers(RelFileLocator rlocator)
Definition localbuf.c:718
void TerminateLocalBufferIO(BufferDesc *bufHdr, bool clear_dirty, uint64 set_flag_bits, bool release_aio)
Definition localbuf.c:578
int NLocBuffer
Definition localbuf.c:45
PrefetchBufferResult PrefetchLocalBuffer(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum)
Definition localbuf.c:72
BlockNumber ExtendBufferedRelLocal(BufferManagerRelation bmr, ForkNumber fork, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
Definition localbuf.c:347
void UnpinLocalBufferNoOwner(Buffer buffer)
Definition localbuf.c:864
void DropRelationLocalBuffers(RelFileLocator rlocator, ForkNumber *forkNum, int nforks, BlockNumber *firstDelBlock)
Definition localbuf.c:681
BufferDesc * LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, bool *foundPtr)
Definition localbuf.c:119
#define ExclusiveLock
Definition lockdefs.h:42
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition lwlock.c:1150
void LWLockRelease(LWLock *lock)
Definition lwlock.c:1767
@ LW_WS_NOT_WAITING
Definition lwlock.h:30
@ LW_WS_WAITING
Definition lwlock.h:31
@ LW_WS_PENDING_WAKEUP
Definition lwlock.h:32
@ LW_SHARED
Definition lwlock.h:105
@ LW_EXCLUSIVE
Definition lwlock.h:104
void * repalloc(void *pointer, Size size)
Definition mcxt.c:1632
void pfree(void *pointer)
Definition mcxt.c:1616
void * palloc(Size size)
Definition mcxt.c:1387
MemoryContext CurrentMemoryContext
Definition mcxt.c:160
#define VALGRIND_MAKE_MEM_DEFINED(addr, size)
Definition memdebug.h:26
#define VALGRIND_MAKE_MEM_NOACCESS(addr, size)
Definition memdebug.h:27
#define RESUME_INTERRUPTS()
Definition miscadmin.h:138
#define START_CRIT_SECTION()
Definition miscadmin.h:152
#define CHECK_FOR_INTERRUPTS()
Definition miscadmin.h:125
#define HOLD_INTERRUPTS()
Definition miscadmin.h:136
#define END_CRIT_SECTION()
Definition miscadmin.h:154
static char * errmsg
#define ERRCODE_DATA_CORRUPTED
static PgChecksumMode mode
static int64 current_size
#define WRITEBACK_MAX_PENDING_FLUSHES
#define DEFAULT_BACKEND_FLUSH_AFTER
#define DEFAULT_CHECKPOINT_FLUSH_AFTER
#define DEFAULT_BGWRITER_FLUSH_AFTER
const void * data
#define PG_IOV_MAX
Definition pg_iovec.h:47
static char buf[DEFAULT_XLOG_SEG_SIZE]
IOObject
Definition pgstat.h:280
@ IOOBJECT_RELATION
Definition pgstat.h:281
@ IOOBJECT_TEMP_RELATION
Definition pgstat.h:282
#define pgstat_count_buffer_read(rel)
Definition pgstat.h:742
IOContext
Definition pgstat.h:289
@ IOCONTEXT_NORMAL
Definition pgstat.h:293
@ IOOP_EXTEND
Definition pgstat.h:318
@ IOOP_READ
Definition pgstat.h:319
@ IOOP_WRITEBACK
Definition pgstat.h:315
@ IOOP_HIT
Definition pgstat.h:313
@ IOOP_EVICT
Definition pgstat.h:311
@ IOOP_REUSE
Definition pgstat.h:314
@ IOOP_WRITE
Definition pgstat.h:320
#define pgstat_count_buffer_hit(rel)
Definition pgstat.h:747
PgStat_BgWriterStats PendingBgWriterStats
PgStat_CheckpointerStats PendingCheckpointerStats
void pgstat_prepare_report_checksum_failure(Oid dboid)
void pgstat_report_checksum_failures_in_db(Oid dboid, int failurecount)
instr_time pgstat_prepare_io_time(bool track_io_guc)
Definition pgstat_io.c:91
void pgstat_count_io_op(IOObject io_object, IOContext io_context, IOOp io_op, uint32 cnt, uint64 bytes)
Definition pgstat_io.c:68
void pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op, instr_time start_time, uint32 cnt, uint64 bytes)
Definition pgstat_io.c:122
#define qsort(a, b, c, d)
Definition port.h:495
void PGSemaphoreUnlock(PGSemaphore sema)
Definition posix_sema.c:333
void PGSemaphoreLock(PGSemaphore sema)
Definition posix_sema.c:313
static Datum PointerGetDatum(const void *X)
Definition postgres.h:342
uint64_t Datum
Definition postgres.h:70
static Pointer DatumGetPointer(Datum X)
Definition postgres.h:332
static int32 DatumGetInt32(Datum X)
Definition postgres.h:202
#define InvalidOid
unsigned int Oid
static int fb(int x)
#define NUM_AUXILIARY_PROCS
Definition proc.h:527
#define GetPGProcByNumber(n)
Definition proc.h:504
#define proclist_delete(list, procno, link_member)
Definition proclist.h:187
static void proclist_init(proclist_head *list)
Definition proclist.h:29
#define proclist_push_tail(list, procno, link_member)
Definition proclist.h:191
#define proclist_foreach_modify(iter, lhead, link_member)
Definition proclist.h:206
static bool proclist_is_empty(const proclist_head *list)
Definition proclist.h:38
#define INVALID_PROC_NUMBER
Definition procnumber.h:26
int ProcNumber
Definition procnumber.h:24
void ProcessProcSignalBarrier(void)
Definition procsignal.c:503
void set_ps_display_remove_suffix(void)
Definition ps_status.c:440
void set_ps_display_suffix(const char *suffix)
Definition ps_status.c:388
char * psprintf(const char *fmt,...)
Definition psprintf.c:43
ReadStream * read_stream_begin_smgr_relation(int flags, BufferAccessStrategy strategy, SMgrRelation smgr, char smgr_persistence, ForkNumber forknum, ReadStreamBlockNumberCB callback, void *callback_private_data, size_t per_buffer_data_size)
Buffer read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
void read_stream_end(ReadStream *stream)
BlockNumber block_range_read_stream_cb(ReadStream *stream, void *callback_private_data, void *per_buffer_data)
#define READ_STREAM_USE_BATCHING
Definition read_stream.h:64
#define READ_STREAM_FULL
Definition read_stream.h:43
static unsigned hash(unsigned *uv, int n)
Definition rege_dfa.c:715
static SMgrRelation RelationGetSmgr(Relation rel)
Definition rel.h:578
#define RelationUsesLocalBuffers(relation)
Definition rel.h:648
#define RELATION_IS_OTHER_TEMP(relation)
Definition rel.h:669
#define RelationIsValid(relation)
Definition rel.h:491
#define RelFileLocatorBackendIsTemp(rlocator)
#define RelFileLocatorEquals(locator1, locator2)
ForkNumber
Definition relpath.h:56
@ MAIN_FORKNUM
Definition relpath.h:58
@ INIT_FORKNUM
Definition relpath.h:61
#define MAX_FORKNUM
Definition relpath.h:70
#define relpath(rlocator, forknum)
Definition relpath.h:150
#define relpathbackend(rlocator, backend, forknum)
Definition relpath.h:141
#define relpathperm(rlocator, forknum)
Definition relpath.h:146
ResourceOwner CurrentResourceOwner
Definition resowner.c:173
void ResourceOwnerEnlarge(ResourceOwner owner)
Definition resowner.c:449
#define RELEASE_PRIO_BUFFER_IOS
Definition resowner.h:62
@ RESOURCE_RELEASE_BEFORE_LOCKS
Definition resowner.h:54
#define RELEASE_PRIO_BUFFER_PINS
Definition resowner.h:63
void perform_spin_delay(SpinDelayStatus *status)
Definition s_lock.c:126
void finish_spin_delay(SpinDelayStatus *status)
Definition s_lock.c:186
#define init_local_spin_delay(status)
Definition s_lock.h:749
BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum)
Definition smgr.c:819
void smgrstartreadv(PgAioHandle *ioh, SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void **buffers, BlockNumber nblocks)
Definition smgr.c:753
void smgrwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks)
Definition smgr.c:805
SMgrRelation smgropen(RelFileLocator rlocator, ProcNumber backend)
Definition smgr.c:240
void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
Definition smgr.c:481
BlockNumber smgrnblocks_cached(SMgrRelation reln, ForkNumber forknum)
Definition smgr.c:847
uint32 smgrmaxcombine(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
Definition smgr.c:697
void smgrzeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks, bool skipFsync)
Definition smgr.c:649
void smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync)
Definition smgr.c:620
bool smgrexists(SMgrRelation reln, ForkNumber forknum)
Definition smgr.c:462
bool smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks)
Definition smgr.c:678
static void smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync)
Definition smgr.h:131
#define free(a)
void ProcSendSignal(ProcNumber procNumber)
Definition proc.c:2027
PGPROC * MyProc
Definition proc.c:71
int GetStartupBufferPinWaitBufId(void)
Definition proc.c:771
int DeadlockTimeout
Definition proc.c:62
void SetStartupBufferPinWaitBufId(int bufid)
Definition proc.c:759
void ProcWaitForSignal(uint32 wait_event_info)
Definition proc.c:2015
void ResolveRecoveryConflictWithBufferPin(void)
Definition standby.c:795
bool log_recovery_conflict_waits
Definition standby.c:43
void LogRecoveryConflict(RecoveryConflictReason reason, TimestampTz wait_start, TimestampTz now, VirtualTransactionId *wait_list, bool still_waiting)
Definition standby.c:275
@ RECOVERY_CONFLICT_BUFFERPIN
Definition standby.h:49
bool RelFileLocatorSkippingWAL(RelFileLocator rlocator)
Definition storage.c:573
SMgrRelation RelationCreateStorage(RelFileLocator rlocator, char relpersistence, bool register_delete)
Definition storage.c:122
void log_smgrcreate(const RelFileLocator *rlocator, ForkNumber forkNum)
Definition storage.c:187
BufferTag tag
pg_atomic_uint64 state
int64 shared_blks_dirtied
Definition instrument.h:28
int64 local_blks_hit
Definition instrument.h:30
int64 shared_blks_read
Definition instrument.h:27
int64 shared_blks_written
Definition instrument.h:29
int64 local_blks_read
Definition instrument.h:31
int64 shared_blks_hit
Definition instrument.h:26
int ckpt_bufs_written
Definition xlog.h:179
ForkNumber forkNum
RelFileNumber relNumber
BlockNumber blockNum
float8 progress_slice
Definition bufmgr.c:164
int num_scanned
Definition bufmgr.c:169
float8 progress
Definition bufmgr.c:163
int num_to_scan
Definition bufmgr.c:167
struct ErrorContextCallback * previous
Definition elog.h:299
void(* callback)(void *arg)
Definition elog.h:300
Definition proc.h:179
uint8 lwWaitMode
Definition proc.h:284
PGSemaphore sem
Definition proc.h:258
uint8 lwWaiting
Definition proc.h:283
PgAioHandleCallbackStage stage
Definition aio.h:219
uint32 status
Definition aio_types.h:108
PgAioResult result
Definition aio_types.h:132
PgStat_Counter buf_written_clean
Definition pgstat.h:246
PgStat_Counter maxwritten_clean
Definition pgstat.h:247
PgStat_Counter buf_alloc
Definition pgstat.h:248
PgStat_Counter buffers_written
Definition pgstat.h:270
BufferLockMode lockmode
Definition bufmgr.c:112
PrivateRefCountData data
Definition bufmgr.c:130
RelFileLocator locator
RelFileNumber relNumber
char str[REL_PATH_STR_MAXLEN+1]
Definition relpath.h:123
RelFileLocator rd_locator
Definition rel.h:57
Form_pg_class rd_rel
Definition rel.h:111
const char * name
Definition resowner.h:93
RelFileLocatorBackend smgr_rlocator
Definition smgr.h:38
SMgrRelation srel
Definition bufmgr.c:185
RelFileLocator rlocator
Definition bufmgr.c:184
BlockNumber blockNum
RelFileNumber relNumber
ForkNumber forkNum
static uint64 table_relation_size(Relation rel, ForkNumber forkNumber)
Definition tableam.h:1940
BlockNumber blockNum
Definition aio_types.h:66
RelFileLocator rlocator
Definition aio_types.h:65
struct PgAioTargetData::@131 smgr
BlockNumber nblocks
Definition aio_types.h:67
ForkNumber forkNum
Definition aio_types.h:68
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition wait_event.h:67
static void pgstat_report_wait_end(void)
Definition wait_event.h:83
static volatile sig_atomic_t waiting
static TimestampTz wakeup[NUM_WALRCV_WAKEUPS]
bool RecoveryInProgress(void)
Definition xlog.c:6830
bool XLogNeedsFlush(XLogRecPtr record)
Definition xlog.c:3163
CheckpointStatsData CheckpointStats
Definition xlog.c:216
void XLogFlush(XLogRecPtr record)
Definition xlog.c:2801
#define CHECKPOINT_FLUSH_UNLOGGED
Definition xlog.h:155
#define CHECKPOINT_END_OF_RECOVERY
Definition xlog.h:152
#define CHECKPOINT_IS_SHUTDOWN
Definition xlog.h:151
#define XLogIsNeeded()
Definition xlog.h:112
#define XLogHintBitIsNeeded()
Definition xlog.h:123
#define XLogRecPtrIsValid(r)
Definition xlogdefs.h:29
uint64 XLogRecPtr
Definition xlogdefs.h:21
#define InvalidXLogRecPtr
Definition xlogdefs.h:28
XLogRecPtr XLogSaveBufferForHint(Buffer buffer, bool buffer_std)
XLogRecPtr log_newpage_buffer(Buffer buffer, bool page_std)
#define InHotStandby
Definition xlogutils.h:60