1/*-------------------------------------------------------------------------
2 *
3 * bufmgr.c
4 * buffer manager interface routines
5 *
6 * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 *
10 * IDENTIFICATION
11 * src/backend/storage/buffer/bufmgr.c
12 *
13 *-------------------------------------------------------------------------
14 */
15/*
16 * Principal entry points:
17 *
18 * ReadBuffer() -- find or create a buffer holding the requested page,
19 * and pin it so that no one can destroy it while this process
20 * is using it.
21 *
22 * StartReadBuffer() -- as above, with separate wait step
23 * StartReadBuffers() -- multiple block version
24 * WaitReadBuffers() -- second step of above
25 *
26 * ReleaseBuffer() -- unpin a buffer
27 *
28 * MarkBufferDirty() -- mark a pinned buffer's contents as "dirty".
29 * The disk write is delayed until buffer replacement or checkpoint.
30 *
31 * See also these files:
32 * freelist.c -- chooses victim for buffer replacement
33 * buf_table.c -- manages the buffer lookup table
34 */
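/*
 * Illustrative caller pattern (a sketch, not part of this file): a typical
 * reader pins a page, locks it, inspects it, and releases it again; "rel"
 * and "blkno" are assumed to be a valid relation and block number.
 *
 *		Buffer		buf = ReadBuffer(rel, blkno);
 *
 *		LockBuffer(buf, BUFFER_LOCK_SHARE);
 *		... inspect BufferGetPage(buf) ...
 *		UnlockReleaseBuffer(buf);
 *
 * A writer instead takes BUFFER_LOCK_EXCLUSIVE and calls MarkBufferDirty(),
 * typically inside a critical section together with WAL logging, before
 * unlocking and releasing the buffer.
 */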
35#include "postgres.h"
36
37#include <sys/file.h>
38#include <unistd.h>
39
40#include "access/tableam.h"
41#include "access/xloginsert.h"
42#include "access/xlogutils.h"
43#ifdef USE_ASSERT_CHECKING
44#include "catalog/pg_tablespace_d.h"
45#endif
46#include "catalog/storage.h"
48#include "executor/instrument.h"
49#include "lib/binaryheap.h"
50#include "miscadmin.h"
51#include "pg_trace.h"
52#include "pgstat.h"
53#include "postmaster/bgwriter.h"
54#include "storage/aio.h"
56#include "storage/bufmgr.h"
57#include "storage/fd.h"
58#include "storage/ipc.h"
59#include "storage/lmgr.h"
60#include "storage/proc.h"
61#include "storage/proclist.h"
62#include "storage/procsignal.h"
63#include "storage/read_stream.h"
64#include "storage/smgr.h"
65#include "storage/standby.h"
66#include "utils/memdebug.h"
67#include "utils/ps_status.h"
68#include "utils/rel.h"
69#include "utils/resowner.h"
70#include "utils/timestamp.h"
71
72
73/* Note: these two macros only work on shared buffers, not local ones! */
74#define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
75#define BufferGetLSN(bufHdr) (PageGetLSN(BufHdrGetBlock(bufHdr)))
76
77/* Note: this macro only works on local buffers, not shared ones! */
78#define LocalBufHdrGetBlock(bufHdr) \
79 LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
80
81/* Bits in SyncOneBuffer's return value */
82#define BUF_WRITTEN 0x01
83#define BUF_REUSABLE 0x02
84
85#define RELS_BSEARCH_THRESHOLD 20
86
87/*
88 * This is the size (in number of blocks) above which we scan the entire
89 * buffer pool to remove the buffers for all the pages of the relation
90 * being dropped. For relations below this threshold, we find the buffers
91 * by doing lookups in the BufMapping table.
92 */
93#define BUF_DROP_FULL_SCAN_THRESHOLD (uint64) (NBuffers / 32)
94
95/*
96 * This is separated out from PrivateRefCountEntry to allow for copying all
97 * the data members via struct assignment.
98 */
99typedef struct PrivateRefCountData
100{
101 /*
102 * How many times the buffer has been pinned by this backend.
103 */
105
106 /*
107 * Is the buffer locked by this backend? BUFFER_LOCK_UNLOCK indicates that
108 * the buffer is not locked.
109 */
112
114{
115 /*
116 * Note that this needs to be the same as the entry's corresponding
117 * PrivateRefCountArrayKeys[i], if the entry is stored in the array. We
118 * store it in both places as this is used for the hashtable key and
119 * because it is more convenient (passing around a PrivateRefCountEntry
120 * suffices to identify the buffer) and faster (checking the keys array is
121 * faster when checking many entries, checking the entry is faster if just
122 * checking a single entry).
123 */
125
128
129/* 64 bytes, about the size of a cache line on common systems */
130#define REFCOUNT_ARRAY_ENTRIES 8
131
132/*
133 * Status of buffers to checkpoint for a particular tablespace, used
134 * internally in BufferSync.
135 */
136typedef struct CkptTsStatus
137{
138 /* oid of the tablespace */
140
141 /*
142 * Checkpoint progress for this tablespace. To make progress comparable
143 * between tablespaces the progress is, for each tablespace, measured as a
144 * number between 0 and the total number of to-be-checkpointed pages. Each
145 * page checkpointed in this tablespace increments this space's progress
146 * by progress_slice.
147 */
150
151 /* number of to-be checkpointed pages in this tablespace */
153 /* already processed pages in this tablespace */
155
156 /* current offset in CkptBufferIds for this tablespace */
157 int index;
159
160/*
161 * Type for array used to sort SMgrRelations
162 *
163 * FlushRelationsAllBuffers shares the same comparator function with
164 * DropRelationsAllBuffers. Pointer to this struct and RelFileLocator must be
165 * compatible.
166 */
167typedef struct SMgrSortArray
168{
169 RelFileLocator rlocator; /* This must be the first member */
172
173/* GUC variables */
177bool track_io_timing = false;
178
179/*
180 * How many buffers PrefetchBuffer callers should try to stay ahead of their
181 * ReadBuffer calls by. Zero means "never prefetch". This value is only used
182 * for buffers not belonging to tablespaces that have their
183 * effective_io_concurrency parameter set.
184 */
186
187/*
188 * Like effective_io_concurrency, but used by maintenance code paths that might
189 * benefit from a higher setting because they work on behalf of many sessions.
190 * Overridden by the tablespace setting of the same name.
191 */
193
194/*
195 * Limit on how many blocks should be handled in single I/O operations.
196 * StartReadBuffers() callers should respect it, as should other operations
197 * that call smgr APIs directly. It is computed as the minimum of underlying
198 * GUCs io_combine_limit_guc and io_max_combine_limit.
199 */
203
204/*
205 * GUC variables about triggering kernel writeback for buffers written; OS
206 * dependent defaults are set via the GUC mechanism.
207 */
211
212/* local state for LockBufferForCleanup */
214
215/*
216 * Backend-Private refcount management:
217 *
218 * Each buffer also has a private refcount that keeps track of the number of
219 * times the buffer is pinned in the current process. This is so that the
220 * shared refcount needs to be modified only once if a buffer is pinned more
221 * than once by an individual backend. It's also used to check that no
222 * buffers are still pinned at the end of transactions and when exiting. We
223 * also use this mechanism to track whether this backend has a buffer locked,
224 * and, if so, in what mode.
225 *
226 *
227 * To avoid - as we used to - requiring an array with NBuffers entries to keep
228 * track of local buffers, we use a small sequentially searched array
229 * (PrivateRefCountArrayKeys, with the corresponding data stored in
230 * PrivateRefCountArray) and an overflow hash table (PrivateRefCountHash) to
231 * keep track of backend local pins.
232 *
233 * Until no more than REFCOUNT_ARRAY_ENTRIES buffers are pinned at once, all
234 * refcounts are kept track of in the array; after that, new array entries
235 * displace old ones into the hash table. That way a frequently used entry
236 * can't get "stuck" in the hashtable while infrequent ones clog the array.
237 *
238 * Note that in most scenarios the number of pinned buffers will not exceed
239 * REFCOUNT_ARRAY_ENTRIES.
240 *
241 *
242 * To enter a buffer into the refcount tracking mechanism first reserve a free
243 * entry using ReservePrivateRefCountEntry() and then later, if necessary,
244 * fill it with NewPrivateRefCountEntry(). That split lets us avoid doing
245 * memory allocations in NewPrivateRefCountEntry() which can be important
246 * because in some scenarios it's called with a spinlock held...
247 */
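/*
 * A minimal sketch of the reserve-then-fill protocol described above; the
 * real callers are PinBuffer() and PinBuffer_Locked() below, and this sketch
 * only illustrates the ordering:
 *
 *		ReservePrivateRefCountEntry();			<- may search/evict, no lock held
 *		... acquire the buffer header spinlock, bump the shared refcount ...
 *		ref = NewPrivateRefCountEntry(buffer);	<- cannot fail, no allocation
 *		ref->data.refcount++;
 */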
253static int ReservedRefCountSlot = -1;
255
257
258static void ReservePrivateRefCountEntry(void);
263
264/* ResourceOwner callbacks to hold in-progress I/Os and buffer pins */
265static void ResOwnerReleaseBufferIO(Datum res);
266static char *ResOwnerPrintBufferIO(Datum res);
267static void ResOwnerReleaseBuffer(Datum res);
268static char *ResOwnerPrintBuffer(Datum res);
269
271{
272 .name = "buffer io",
273 .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
274 .release_priority = RELEASE_PRIO_BUFFER_IOS,
275 .ReleaseResource = ResOwnerReleaseBufferIO,
276 .DebugPrint = ResOwnerPrintBufferIO
277};
278
280{
281 .name = "buffer",
282 .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
283 .release_priority = RELEASE_PRIO_BUFFER_PINS,
284 .ReleaseResource = ResOwnerReleaseBuffer,
285 .DebugPrint = ResOwnerPrintBuffer
286};
287
288/*
289 * Ensure that the PrivateRefCountArray has sufficient space to store one more
290 * entry. This has to be called before using NewPrivateRefCountEntry() to fill
291 * a new entry - but it's perfectly fine to not use a reserved entry.
292 */
293static void
295{
296 /* Already reserved (or freed), nothing to do */
297 if (ReservedRefCountSlot != -1)
298 return;
299
300 /*
301 * First search for a free entry in the array; that'll be sufficient in
302 * the majority of cases.
303 */
304 {
305 int i;
306
307 for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
308 {
310 {
312
313 /*
314 * We could return immediately, but iterating till the end of
315 * the array allows compiler-autovectorization.
316 */
317 }
318 }
319
320 if (ReservedRefCountSlot != -1)
321 return;
322 }
323
324 /*
325 * No luck. All array entries are full. Move one array entry into the hash
326 * table.
327 */
328 {
329 /*
330 * Move entry from the current clock position in the array into the
331 * hashtable. Use that slot.
332 */
333 int victim_slot;
336 bool found;
337
338 /* select victim slot */
342
343 /* Better be used, otherwise we shouldn't get here. */
347
348 /* enter victim array entry into hashtable */
352 &found);
353 Assert(!found);
354 /* move data from the entry in the array to the hash entry */
355 hashent->data = victim_entry->data;
356
357 /* clear the now free array slot */
359 victim_entry->buffer = InvalidBuffer;
360
361 /* clear the whole data member, just for future proofing */
362 memset(&victim_entry->data, 0, sizeof(victim_entry->data));
363 victim_entry->data.refcount = 0;
364 victim_entry->data.lockmode = BUFFER_LOCK_UNLOCK;
365
367 }
368}
369
370/*
371 * Fill a previously reserved refcount entry.
372 */
375{
377
378 /* only allowed to be called when a reservation has been made */
380
381 /* use up the reserved entry */
383
384 /* and fill it */
386 res->buffer = buffer;
387 res->data.refcount = 0;
389
390 /* update cache for the next lookup */
392
394
395 return res;
396}
397
398/*
399 * Slow-path for GetPrivateRefCountEntry(). This is big enough to not be worth
400 * inlining. This particularly seems to be true if the compiler is capable of
401 * auto-vectorizing the code, as that imposes additional stack-alignment
402 * requirements etc.
403 */
406{
408 int match = -1;
409 int i;
410
411 /*
412 * First search for references in the array; that'll be sufficient in the
413 * majority of cases.
414 */
415 for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
416 {
418 {
419 match = i;
420 /* see ReservePrivateRefCountEntry() for why we don't return */
421 }
422 }
423
424 if (likely(match != -1))
425 {
426 /* update cache for the next lookup */
428
429 return &PrivateRefCountArray[match];
430 }
431
432 /*
433 * By here we know that the buffer, if already pinned, isn't residing in
434 * the array.
435 *
436 * Only look up the buffer in the hashtable if we've previously overflowed
437 * into it.
438 */
440 return NULL;
441
443
444 if (res == NULL)
445 return NULL;
446 else if (!do_move)
447 {
448 /* caller doesn't want us to move the hash entry into the array */
449 return res;
450 }
451 else
452 {
453 /* move buffer from hashtable into the free array slot */
454 bool found;
456
457 /* Ensure there's a free array slot */
459
460 /* Use up the reserved slot */
464 Assert(free->buffer == InvalidBuffer);
465
466 /* and fill it */
467 free->buffer = buffer;
468 free->data = res->data;
470 /* update cache for the next lookup */
472
474
475
476 /* delete from hashtable */
478 Assert(found);
481
482 return free;
483 }
484}
485
486/*
487 * Return the PrivateRefCount entry for the passed buffer.
488 *
489 * Returns NULL if a buffer doesn't have a refcount entry. Otherwise, if
490 * do_move is true and the entry resides in the hashtable, the entry is
491 * optimized for frequent access by moving it to the array.
492 */
493static inline PrivateRefCountEntry *
495{
498
499 /*
500 * It's very common to look up the same buffer repeatedly. To make that
501 * fast, we have a one-entry cache.
502 *
503 * In contrast to the loop in GetPrivateRefCountEntrySlow(), here it is
504 * faster to check PrivateRefCountArray[].buffer, as in the case of a hit
505 * fewer addresses are computed and fewer cachelines are accessed. Whereas
506 * in GetPrivateRefCountEntrySlow()'s case, checking
507 * PrivateRefCountArrayKeys saves a lot of memory accesses.
508 */
509 if (likely(PrivateRefCountEntryLast != -1) &&
511 {
513 }
514
515 /*
516 * The code for the cached lookup is small enough to be worth inlining
517 * into the caller. In the miss case however, that empirically doesn't
518 * seem worth it.
519 */
521}
522
523/*
524 * Returns how many times the passed buffer is pinned by this backend.
525 *
526 * Only works for shared memory buffers!
527 */
528static inline int32
530{
532
535
536 /*
537 * Not moving the entry - that's ok for the current users, but we might
538 * want to change this one day.
539 */
541
542 if (ref == NULL)
543 return 0;
544 return ref->data.refcount;
545}
546
547/*
548 * Release resources used to track the reference count of a buffer which we no
549 * longer have pinned and don't want to pin again immediately.
550 */
551static void
553{
554 Assert(ref->data.refcount == 0);
555 Assert(ref->data.lockmode == BUFFER_LOCK_UNLOCK);
556
557 if (ref >= &PrivateRefCountArray[0] &&
559 {
560 ref->buffer = InvalidBuffer;
562
563
564 /*
565 * Mark the just used entry as reserved - in many scenarios that
566 * allows us to avoid ever having to search the array/hash for free
567 * entries.
568 */
570 }
571 else
572 {
573 bool found;
575
577 Assert(found);
580 }
581}
582
583/*
584 * BufferIsPinned
585 * True iff the buffer is pinned (also checks for valid buffer number).
586 *
587 * NOTE: what we check here is that *this* backend holds a pin on
588 * the buffer. We do not care whether some other backend does.
589 */
590#define BufferIsPinned(bufnum) \
591( \
592 !BufferIsValid(bufnum) ? \
593 false \
594 : \
595 BufferIsLocal(bufnum) ? \
596 (LocalRefCount[-(bufnum) - 1] > 0) \
597 : \
598 (GetPrivateRefCount(bufnum) > 0) \
599)
600
601
604 ForkNumber forkNum, BlockNumber blockNum,
608 BufferAccessStrategy strategy,
609 uint32 flags,
612 Buffer *buffers,
616 BufferAccessStrategy strategy,
617 uint32 flags,
620 Buffer *buffers,
622static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy,
623 bool skip_if_not_valid);
624static void PinBuffer_Locked(BufferDesc *buf);
625static void UnpinBuffer(BufferDesc *buf);
626static void UnpinBufferNoOwner(BufferDesc *buf);
627static void BufferSync(int flags);
628static int SyncOneBuffer(int buf_id, bool skip_recently_used,
630static void WaitIO(BufferDesc *buf);
631static void AbortBufferIO(Buffer buffer);
632static void shared_buffer_write_error_callback(void *arg);
633static void local_buffer_write_error_callback(void *arg);
634static inline BufferDesc *BufferAlloc(SMgrRelation smgr,
635 char relpersistence,
636 ForkNumber forkNum,
637 BlockNumber blockNum,
638 BufferAccessStrategy strategy,
640static bool AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress);
641static void CheckReadBuffersOperation(ReadBuffersOperation *operation, bool is_complete);
647static void FindAndDropRelationBuffers(RelFileLocator rlocator,
648 ForkNumber forkNum,
653 ForkNumber forkNum, bool permanent);
654static void AtProcExit_Buffers(int code, Datum arg);
655static void CheckForBufferLeaks(void);
656#ifdef USE_ASSERT_CHECKING
658#endif
659static int rlocator_comparator(const void *p1, const void *p2);
660static inline int buffertag_comparator(const BufferTag *ba, const BufferTag *bb);
661static inline int ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b);
662static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg);
663
669static inline void BufferLockDisown(Buffer buffer, BufferDesc *buf_hdr);
674static void BufferLockWakeup(BufferDesc *buf_hdr, bool unlocked);
677
678
679/*
680 * Implementation of PrefetchBuffer() for shared buffers.
681 */
684 ForkNumber forkNum,
685 BlockNumber blockNum)
686{
687 PrefetchBufferResult result = {InvalidBuffer, false};
688 BufferTag newTag; /* identity of requested block */
689 uint32 newHash; /* hash value for newTag */
690 LWLock *newPartitionLock; /* buffer partition lock for it */
691 int buf_id;
692
693 Assert(BlockNumberIsValid(blockNum));
694
695 /* create a tag so we can lookup the buffer */
696 InitBufferTag(&newTag, &smgr_reln->smgr_rlocator.locator,
697 forkNum, blockNum);
698
699 /* determine its hash code and partition lock ID */
702
703 /* see if the block is in the buffer pool already */
705 buf_id = BufTableLookup(&newTag, newHash);
707
708 /* If not in buffers, initiate prefetch */
709 if (buf_id < 0)
710 {
711#ifdef USE_PREFETCH
712 /*
713 * Try to initiate an asynchronous read. This returns false in
714 * recovery if the relation file doesn't exist.
715 */
716 if ((io_direct_flags & IO_DIRECT_DATA) == 0 &&
717 smgrprefetch(smgr_reln, forkNum, blockNum, 1))
718 {
719 result.initiated_io = true;
720 }
721#endif /* USE_PREFETCH */
722 }
723 else
724 {
725 /*
726 * Report the buffer it was in at that time. The caller may be able
727 * to avoid a buffer table lookup, but it's not pinned and it must be
728 * rechecked!
729 */
730 result.recent_buffer = buf_id + 1;
731 }
732
733 /*
734 * If the block *is* in buffers, we do nothing. This is not really ideal:
735 * the block might be just about to be evicted, which would be stupid
736 * since we know we are going to need it soon. But the only easy answer
737 * is to bump the usage_count, which does not seem like a great solution:
738 * when the caller does ultimately touch the block, usage_count would get
739 * bumped again, resulting in too much favoritism for blocks that are
740 * involved in a prefetch sequence. A real fix would involve some
741 * additional per-buffer state, and it's not clear that there's enough of
742 * a problem to justify that.
743 */
744
745 return result;
746}
747
748/*
749 * PrefetchBuffer -- initiate asynchronous read of a block of a relation
750 *
751 * This is named by analogy to ReadBuffer but doesn't actually allocate a
752 * buffer. Instead it tries to ensure that a future ReadBuffer for the given
753 * block will not be delayed by the I/O. Prefetching is optional.
754 *
755 * There are three possible outcomes:
756 *
757 * 1. If the block is already cached, the result includes a valid buffer that
758 * could be used by the caller to avoid the need for a later buffer lookup, but
759 * it's not pinned, so the caller must recheck it.
760 *
761 * 2. If the kernel has been asked to initiate I/O, the initiated_io member is
762 * true. Currently there is no way to know if the data was already cached by
763 * the kernel and therefore didn't really initiate I/O, and no way to know when
764 * the I/O completes other than using synchronous ReadBuffer().
765 *
766 * 3. Otherwise, the buffer wasn't already cached by PostgreSQL, and
767 * USE_PREFETCH is not defined (this build doesn't support prefetching due to
768 * lack of a kernel facility), direct I/O is enabled, or the underlying
769 * relation file wasn't found and we are in recovery. (If the relation file
770 * wasn't found and we are not in recovery, an error is raised).
771 */
774{
776 Assert(BlockNumberIsValid(blockNum));
777
779 {
780 /* see comments in ReadBufferExtended */
784 errmsg("cannot access temporary tables of other sessions")));
785
786 /* pass it off to localbuf.c */
787 return PrefetchLocalBuffer(RelationGetSmgr(reln), forkNum, blockNum);
788 }
789 else
790 {
791 /* pass it to the shared buffer version */
792 return PrefetchSharedBuffer(RelationGetSmgr(reln), forkNum, blockNum);
793 }
794}
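/*
 * Illustrative sketch only (not part of the original file): a hypothetical
 * caller that stays a fixed distance ahead of its reads with
 * PrefetchBuffer(), in the spirit described above.  The relation, fork and
 * prefetch distance are assumptions of the example.
 */
#ifdef NOT_USED
static void
example_prefetching_scan(Relation rel, BlockNumber nblocks)
{
	const BlockNumber prefetch_distance = 8;	/* arbitrary for the example */

	for (BlockNumber blkno = 0; blkno < nblocks; blkno++)
	{
		Buffer		buf;

		/* hint about a block we will need shortly */
		if (blkno + prefetch_distance < nblocks)
			(void) PrefetchBuffer(rel, MAIN_FORKNUM, blkno + prefetch_distance);

		/* read the current block; ideally already cached by now */
		buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, NULL);
		/* ... use BufferGetPage(buf) ... */
		ReleaseBuffer(buf);
	}
}
#endif							/* NOT_USED */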
795
796/*
797 * ReadRecentBuffer -- try to pin a block in a recently observed buffer
798 *
799 * Compared to ReadBuffer(), this avoids a buffer mapping lookup when it's
800 * successful. Return true if the buffer is valid and still has the expected
801 * tag. In that case, the buffer is pinned and the usage count is bumped.
802 */
803bool
805 Buffer recent_buffer)
806{
808 BufferTag tag;
810
811 Assert(BufferIsValid(recent_buffer));
812
815 InitBufferTag(&tag, &rlocator, forkNum, blockNum);
816
817 if (BufferIsLocal(recent_buffer))
818 {
819 int b = -recent_buffer - 1;
820
823
824 /* Is it still valid and holding the right tag? */
825 if ((buf_state & BM_VALID) && BufferTagsEqual(&tag, &bufHdr->tag))
826 {
827 PinLocalBuffer(bufHdr, true);
828
830
831 return true;
832 }
833 }
834 else
835 {
836 bufHdr = GetBufferDescriptor(recent_buffer - 1);
837
838 /*
839 * Is it still valid and holding the right tag? We do an unlocked tag
840 * comparison first, to make it unlikely that we'll increment the
841 * usage counter of the wrong buffer, if someone calls us with a very
842 * out of date recent_buffer. Then we'll check it again if we get the
843 * pin.
844 */
845 if (BufferTagsEqual(&tag, &bufHdr->tag) &&
846 PinBuffer(bufHdr, NULL, true))
847 {
848 if (BufferTagsEqual(&tag, &bufHdr->tag))
849 {
851 return true;
852 }
854 }
855 }
856
857 return false;
858}
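/*
 * Illustrative sketch only: a hypothetical caller that remembers in which
 * buffer a block was recently seen (for example PrefetchBuffer()'s
 * recent_buffer) and retries it with ReadRecentBuffer(), falling back to a
 * regular lookup when the hint is stale.
 */
#ifdef NOT_USED
static Buffer
example_read_with_recent_hint(Relation rel, BlockNumber blkno,
							  Buffer recent_buffer)
{
	/* Fast path: if the hint is still valid, the buffer comes back pinned. */
	if (BufferIsValid(recent_buffer) &&
		ReadRecentBuffer(rel->rd_locator, MAIN_FORKNUM, blkno, recent_buffer))
		return recent_buffer;

	/* Slow path: ordinary lookup through the buffer mapping table. */
	return ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, NULL);
}
#endif							/* NOT_USED */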
859
860/*
861 * ReadBuffer -- a shorthand for ReadBufferExtended, for reading from main
862 * fork with RBM_NORMAL mode and default strategy.
863 */
864Buffer
869
870/*
871 * ReadBufferExtended -- returns a buffer containing the requested
872 * block of the requested relation. If the blknum
873 * requested is P_NEW, extend the relation file and
874 * allocate a new block. (Caller is responsible for
875 * ensuring that only one backend tries to extend a
876 * relation at the same time!)
877 *
878 * Returns: the buffer number for the buffer containing
879 * the block read. The returned buffer has been pinned.
880 * Does not return on error --- elog's instead.
881 *
882 * Assume when this function is called, that reln has been opened already.
883 *
884 * In RBM_NORMAL mode, the page is read from disk, and the page header is
885 * validated. An error is thrown if the page header is not valid. (But
886 * note that an all-zero page is considered "valid"; see
887 * PageIsVerified().)
888 *
889 * RBM_ZERO_ON_ERROR is like the normal mode, but if the page header is not
890 * valid, the page is zeroed instead of throwing an error. This is intended
891 * for non-critical data, where the caller is prepared to repair errors.
892 *
893 * In RBM_ZERO_AND_LOCK mode, if the page isn't in buffer cache already, it's
894 * filled with zeros instead of reading it from disk. Useful when the caller
895 * is going to fill the page from scratch, since this saves I/O and avoids
896 * unnecessary failure if the page-on-disk has corrupt page headers.
897 * The page is returned locked to ensure that the caller has a chance to
898 * initialize the page before it's made visible to others.
899 * Caution: do not use this mode to read a page that is beyond the relation's
900 * current physical EOF; that is likely to cause problems in md.c when
901 * the page is modified and written out. P_NEW is OK, though.
902 *
903 * RBM_ZERO_AND_CLEANUP_LOCK is the same as RBM_ZERO_AND_LOCK, but acquires
904 * a cleanup-strength lock on the page.
905 *
906 * RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here.
907 *
908 * If strategy is not NULL, a nondefault buffer access strategy is used.
909 * See buffer/README for details.
910 */
911inline Buffer
914{
915 Buffer buf;
916
917 /*
918 * Reject attempts to read non-local temporary relations; we would be
919 * likely to get wrong data since we have no visibility into the owning
920 * session's local buffers.
921 */
925 errmsg("cannot access temporary tables of other sessions")));
926
927 /*
928 * Read the buffer, and update pgstat counters to reflect a cache hit or
929 * miss.
930 */
932 forkNum, blockNum, mode, strategy);
933
934 return buf;
935}
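/*
 * Illustrative sketch only: reading with a nondefault buffer access strategy,
 * as mentioned above.  The BAS_BULKREAD ring keeps a large sequential read
 * from flooding the whole buffer pool; the relation and block count are
 * assumptions of the example.
 */
#ifdef NOT_USED
static void
example_bulkread(Relation rel, BlockNumber nblocks)
{
	BufferAccessStrategy strategy = GetAccessStrategy(BAS_BULKREAD);

	for (BlockNumber blkno = 0; blkno < nblocks; blkno++)
	{
		Buffer		buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
											 RBM_NORMAL, strategy);

		/* ... process BufferGetPage(buf) ... */
		ReleaseBuffer(buf);
	}

	FreeAccessStrategy(strategy);
}
#endif							/* NOT_USED */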
936
937
938/*
939 * ReadBufferWithoutRelcache -- like ReadBufferExtended, but doesn't require
940 * a relcache entry for the relation.
941 *
942 * Pass permanent = true for a RELPERSISTENCE_PERMANENT relation, and
943 * permanent = false for a RELPERSISTENCE_UNLOGGED relation. This function
944 * cannot be used for temporary relations (and making that work might be
945 * difficult, unless we only want to read temporary relations for our own
946 * ProcNumber).
947 */
948Buffer
951 BufferAccessStrategy strategy, bool permanent)
952{
953 SMgrRelation smgr = smgropen(rlocator, INVALID_PROC_NUMBER);
954
955 return ReadBuffer_common(NULL, smgr,
957 forkNum, blockNum,
958 mode, strategy);
959}
960
961/*
962 * Convenience wrapper around ExtendBufferedRelBy() extending by one block.
963 */
964Buffer
966 ForkNumber forkNum,
967 BufferAccessStrategy strategy,
968 uint32 flags)
969{
970 Buffer buf;
971 uint32 extend_by = 1;
972
973 ExtendBufferedRelBy(bmr, forkNum, strategy, flags, extend_by,
974 &buf, &extend_by);
975
976 return buf;
977}
978
979/*
980 * Extend relation by multiple blocks.
981 *
982 * Tries to extend the relation by extend_by blocks. Depending on the
983 * availability of resources the relation may end up being extended by a
984 * smaller number of pages (unless an error is thrown, always by at least one
985 * page). *extended_by is updated to the number of pages by which the
986 * relation has actually been extended.
987 *
988 * buffers needs to be an array that is at least extend_by long. Upon
989 * completion, the first extend_by array elements will point to a pinned
990 * buffer.
991 *
992 * If EB_LOCK_FIRST is part of flags, the first returned buffer is
993 * locked. This is useful for callers that want a buffer that is guaranteed to
994 * be empty.
995 */
999 BufferAccessStrategy strategy,
1000 uint32 flags,
1002 Buffer *buffers,
1004{
1005 Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
1006 Assert(bmr.smgr == NULL || bmr.relpersistence != '\0');
1007 Assert(extend_by > 0);
1008
1009 if (bmr.relpersistence == '\0')
1010 bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
1011
1012 return ExtendBufferedRelCommon(bmr, fork, strategy, flags,
1014 buffers, extended_by);
1015}
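/*
 * Illustrative sketch only: extending a relation by several blocks at once.
 * EB_LOCK_FIRST returns the first buffer exclusively locked; the pins on the
 * remaining buffers are dropped again.  The requested page count is an
 * assumption of the example, and fewer pages may actually be added.
 */
#ifdef NOT_USED
static Buffer
example_extend_by(Relation rel, uint32 npages)
{
	Buffer		buffers[16];
	uint32		extended_by = 0;
	BlockNumber first_block;

	Assert(npages > 0 && npages <= lengthof(buffers));

	first_block = ExtendBufferedRelBy(BMR_REL(rel), MAIN_FORKNUM, NULL,
									  EB_LOCK_FIRST, npages,
									  buffers, &extended_by);
	Assert(extended_by > 0);
	Assert(BufferGetBlockNumber(buffers[0]) == first_block);

	/* keep the first (locked) buffer, drop the pins on the rest */
	for (uint32 i = 1; i < extended_by; i++)
		ReleaseBuffer(buffers[i]);

	return buffers[0];
}
#endif							/* NOT_USED */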
1016
1017/*
1018 * Extend the relation so it is at least extend_to blocks large, return buffer
1019 * (extend_to - 1).
1020 *
1021 * This is useful for callers that want to write a specific page, regardless
1022 * of the current size of the relation (e.g. useful for visibilitymap and for
1023 * crash recovery).
1024 */
1025Buffer
1028 BufferAccessStrategy strategy,
1029 uint32 flags,
1032{
1034 uint32 extended_by = 0;
1036 Buffer buffers[64];
1037
1038 Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
1039 Assert(bmr.smgr == NULL || bmr.relpersistence != '\0');
1041
1042 if (bmr.relpersistence == '\0')
1043 bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
1044
1045 /*
1046 * If desired, create the file if it doesn't exist. If
1047 * smgr_cached_nblocks[fork] is positive then it must exist, no need for
1048 * an smgrexists call.
1049 */
1050 if ((flags & EB_CREATE_FORK_IF_NEEDED) &&
1051 (BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] == 0 ||
1052 BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] == InvalidBlockNumber) &&
1054 {
1056
1057 /* recheck, fork might have been created concurrently */
1060
1062 }
1063
1064 /*
1065 * If requested, invalidate size cache, so that smgrnblocks asks the
1066 * kernel.
1067 */
1068 if (flags & EB_CLEAR_SIZE_CACHE)
1069 BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] = InvalidBlockNumber;
1070
1071 /*
1072 * Estimate how many pages we'll need to extend by. This avoids acquiring
1073 * unnecessarily many victim buffers.
1074 */
1076
1077 /*
1078 * Since no-one else can be looking at the page contents yet, there is no
1079 * difference between an exclusive lock and a cleanup-strength lock. Note
1080 * that we pass the original mode to ReadBuffer_common() below, when
1081 * falling back to reading the buffer due to a concurrent relation extension.
1082 */
1084 flags |= EB_LOCK_TARGET;
1085
1086 while (current_size < extend_to)
1087 {
1088 uint32 num_pages = lengthof(buffers);
1090
1091 if ((uint64) current_size + num_pages > extend_to)
1092 num_pages = extend_to - current_size;
1093
1094 first_block = ExtendBufferedRelCommon(bmr, fork, strategy, flags,
1095 num_pages, extend_to,
1096 buffers, &extended_by);
1097
1099 Assert(num_pages != 0 || current_size >= extend_to);
1100
1101 for (uint32 i = 0; i < extended_by; i++)
1102 {
1103 if (first_block + i != extend_to - 1)
1104 ReleaseBuffer(buffers[i]);
1105 else
1106 buffer = buffers[i];
1107 }
1108 }
1109
1110 /*
1111 * It's possible that another backend concurrently extended the relation.
1112 * In that case read the buffer.
1113 *
1114 * XXX: Should we control this via a flag?
1115 */
1116 if (buffer == InvalidBuffer)
1117 {
1118 Assert(extended_by == 0);
1119 buffer = ReadBuffer_common(bmr.rel, BMR_GET_SMGR(bmr), bmr.relpersistence,
1120 fork, extend_to - 1, mode, strategy);
1121 }
1122
1123 return buffer;
1124}
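/*
 * Illustrative sketch only: making sure a specific block exists, in the
 * spirit of the visibilitymap/recovery callers mentioned above.  The flag
 * combination is an assumption of the example; it creates the fork if needed
 * and returns the target block's buffer, pinned.
 */
#ifdef NOT_USED
static Buffer
example_extend_to(Relation rel, BlockNumber target_block)
{
	return ExtendBufferedRelTo(BMR_REL(rel), MAIN_FORKNUM, NULL,
							   EB_CREATE_FORK_IF_NEEDED | EB_CLEAR_SIZE_CACHE,
							   target_block + 1,
							   RBM_ZERO_ON_ERROR);
}
#endif							/* NOT_USED */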
1125
1126/*
1127 * Lock and optionally zero a buffer, as part of the implementation of
1128 * RBM_ZERO_AND_LOCK or RBM_ZERO_AND_CLEANUP_LOCK. The buffer must be already
1129 * pinned. If the buffer is not already valid, it is zeroed and made valid.
1130 */
1131static void
1133{
1135 bool need_to_zero;
1137
1139
1140 if (already_valid)
1141 {
1142 /*
1143 * If the caller already knew the buffer was valid, we can skip some
1144 * header interaction. The caller just wants to lock the buffer.
1145 */
1146 need_to_zero = false;
1147 }
1148 else if (isLocalBuf)
1149 {
1150 /* Simple case for non-shared buffers. */
1152 need_to_zero = StartLocalBufferIO(bufHdr, true, false);
1153 }
1154 else
1155 {
1156 /*
1157 * Take BM_IO_IN_PROGRESS, or discover that BM_VALID has been set
1158 * concurrently. Even though we aren't doing I/O, that ensures that
1159 * we don't zero a page that someone else has pinned. An exclusive
1160 * content lock wouldn't be enough, because readers are allowed to
1161 * drop the content lock after determining that a tuple is visible
1162 * (see buffer access rules in README).
1163 */
1165 need_to_zero = StartBufferIO(bufHdr, true, false);
1166 }
1167
1168 if (need_to_zero)
1169 {
1171
1172 /*
1173 * Grab the buffer content lock before marking the page as valid, to
1174 * make sure that no other backend sees the zeroed page before the
1175 * caller has had a chance to initialize it.
1176 *
1177 * Since no-one else can be looking at the page contents yet, there is
1178 * no difference between an exclusive lock and a cleanup-strength
1179 * lock. (Note that we cannot use LockBuffer() or
1180 * LockBufferForCleanup() here, because they assert that the buffer is
1181 * already valid.)
1182 */
1183 if (!isLocalBuf)
1185
1186 /* Set BM_VALID, terminate IO, and wake up any waiters */
1187 if (isLocalBuf)
1188 TerminateLocalBufferIO(bufHdr, false, BM_VALID, false);
1189 else
1190 TerminateBufferIO(bufHdr, false, BM_VALID, true, false);
1191 }
1192 else if (!isLocalBuf)
1193 {
1194 /*
1195 * The buffer is valid, so we can't zero it. The caller still expects
1196 * the page to be locked on return.
1197 */
1198 if (mode == RBM_ZERO_AND_LOCK)
1200 else
1202 }
1203}
1204
1205/*
1206 * Pin a buffer for a given block. *foundPtr is set to true if the block was
1207 * already present, or false if more work is required to either read it in or
1208 * zero it.
1209 */
1212 SMgrRelation smgr,
1213 char persistence,
1214 ForkNumber forkNum,
1215 BlockNumber blockNum,
1216 BufferAccessStrategy strategy,
1217 bool *foundPtr)
1218{
1222
1223 Assert(blockNum != P_NEW);
1224
1225 /* Persistence should be set before */
1226 Assert((persistence == RELPERSISTENCE_TEMP ||
1227 persistence == RELPERSISTENCE_PERMANENT ||
1228 persistence == RELPERSISTENCE_UNLOGGED));
1229
1230 if (persistence == RELPERSISTENCE_TEMP)
1231 {
1234 }
1235 else
1236 {
1237 io_context = IOContextForStrategy(strategy);
1239 }
1240
1241 TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
1245 smgr->smgr_rlocator.backend);
1246
1247 if (persistence == RELPERSISTENCE_TEMP)
1248 {
1249 bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, foundPtr);
1250 if (*foundPtr)
1252 }
1253 else
1254 {
1255 bufHdr = BufferAlloc(smgr, persistence, forkNum, blockNum,
1256 strategy, foundPtr, io_context);
1257 if (*foundPtr)
1259 }
1260 if (rel)
1261 {
1262 /*
1263 * While pgBufferUsage's "read" counter isn't bumped unless we reach
1264 * WaitReadBuffers() (so, not for hits, and not for buffers that are
1265 * zeroed instead), the per-relation stats always count them.
1266 */
1268 if (*foundPtr)
1270 }
1271 if (*foundPtr)
1272 {
1274 if (VacuumCostActive)
1276
1277 TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
1281 smgr->smgr_rlocator.backend,
1282 true);
1283 }
1284
1286}
1287
1288/*
1289 * ReadBuffer_common -- common logic for all ReadBuffer variants
1290 *
1291 * smgr is required, rel is optional unless using P_NEW.
1292 */
1295 ForkNumber forkNum,
1297 BufferAccessStrategy strategy)
1298{
1299 ReadBuffersOperation operation;
1300 Buffer buffer;
1301 int flags;
1302 char persistence;
1303
1304 /*
1305 * Backward compatibility path, most code should use ExtendBufferedRel()
1306 * instead, as acquiring the extension lock inside ExtendBufferedRel()
1307 * scales a lot better.
1308 */
1309 if (unlikely(blockNum == P_NEW))
1310 {
1312
1313 /*
1314 * Since no-one else can be looking at the page contents yet, there is
1315 * no difference between an exclusive lock and a cleanup-strength
1316 * lock.
1317 */
1319 flags |= EB_LOCK_FIRST;
1320
1321 return ExtendBufferedRel(BMR_REL(rel), forkNum, strategy, flags);
1322 }
1323
1324 if (rel)
1325 persistence = rel->rd_rel->relpersistence;
1326 else
1327 persistence = smgr_persistence;
1328
1331 {
1332 bool found;
1333
1334 buffer = PinBufferForBlock(rel, smgr, persistence,
1335 forkNum, blockNum, strategy, &found);
1336 ZeroAndLockBuffer(buffer, mode, found);
1337 return buffer;
1338 }
1339
1340 /*
1341 * Signal that we are going to immediately wait. If we're immediately
1342 * waiting, there is no benefit in actually executing the IO
1343 * asynchronously; it would just add dispatch overhead.
1344 */
1346 if (mode == RBM_ZERO_ON_ERROR)
1348 operation.smgr = smgr;
1349 operation.rel = rel;
1350 operation.persistence = persistence;
1351 operation.forknum = forkNum;
1352 operation.strategy = strategy;
1353 if (StartReadBuffer(&operation,
1354 &buffer,
1355 blockNum,
1356 flags))
1357 WaitReadBuffers(&operation);
1358
1359 return buffer;
1360}
1361
1364 Buffer *buffers,
1365 BlockNumber blockNum,
1366 int *nblocks,
1367 int flags,
1368 bool allow_forwarding)
1369{
1370 int actual_nblocks = *nblocks;
1371 int maxcombine = 0;
1372 bool did_start_io;
1373
1374 Assert(*nblocks == 1 || allow_forwarding);
1375 Assert(*nblocks > 0);
1376 Assert(*nblocks <= MAX_IO_COMBINE_LIMIT);
1377
1378 for (int i = 0; i < actual_nblocks; ++i)
1379 {
1380 bool found;
1381
1382 if (allow_forwarding && buffers[i] != InvalidBuffer)
1383 {
1385
1386 /*
1387 * This is a buffer that was pinned by an earlier call to
1388 * StartReadBuffers(), but couldn't be handled in one operation at
1389 * that time. The operation was split, and the caller has passed
1390 * an already pinned buffer back to us to handle the rest of the
1391 * operation. It must continue at the expected block number.
1392 */
1393 Assert(BufferGetBlockNumber(buffers[i]) == blockNum + i);
1394
1395 /*
1396 * It might be an already valid buffer (a hit) that followed the
1397 * final contiguous block of an earlier I/O (a miss) marking the
1398 * end of it, or a buffer that some other backend has since made
1399 * valid by performing the I/O for us, in which case we can handle
1400 * it as a hit now. It is safe to check for a BM_VALID flag with
1401 * a relaxed load, because we got a fresh view of it while pinning
1402 * it in the previous call.
1403 *
1404 * On the other hand if we don't see BM_VALID yet, it must be an
1405 * I/O that was split by the previous call and we need to try to
1406 * start a new I/O from this block. We're also racing against any
1407 * other backend that might start the I/O or even manage to mark
1408 * it BM_VALID after this check, but StartBufferIO() will handle
1409 * those cases.
1410 */
1411 if (BufferIsLocal(buffers[i]))
1412 bufHdr = GetLocalBufferDescriptor(-buffers[i] - 1);
1413 else
1414 bufHdr = GetBufferDescriptor(buffers[i] - 1);
1416 found = pg_atomic_read_u64(&bufHdr->state) & BM_VALID;
1417 }
1418 else
1419 {
1420 buffers[i] = PinBufferForBlock(operation->rel,
1421 operation->smgr,
1422 operation->persistence,
1423 operation->forknum,
1424 blockNum + i,
1425 operation->strategy,
1426 &found);
1427 }
1428
1429 if (found)
1430 {
1431 /*
1432 * We have a hit. If it's the first block in the requested range,
1433 * we can return it immediately and report that WaitReadBuffers()
1434 * does not need to be called. If the initial value of *nblocks
1435 * was larger, the caller will have to call again for the rest.
1436 */
1437 if (i == 0)
1438 {
1439 *nblocks = 1;
1440
1441#ifdef USE_ASSERT_CHECKING
1442
1443 /*
1444 * Initialize enough of ReadBuffersOperation to make
1445 * CheckReadBuffersOperation() work. Outside of assertions
1446 * that's not necessary when no IO is issued.
1447 */
1448 operation->buffers = buffers;
1449 operation->blocknum = blockNum;
1450 operation->nblocks = 1;
1451 operation->nblocks_done = 1;
1452 CheckReadBuffersOperation(operation, true);
1453#endif
1454 return false;
1455 }
1456
1457 /*
1458 * Otherwise we already have an I/O to perform, but this block
1459 * can't be included as it is already valid. Split the I/O here.
1460 * There may or may not be more blocks requiring I/O after this
1461 * one, we haven't checked, but they can't be contiguous with this
1462 * one in the way. We'll leave this buffer pinned, forwarding it
1463 * to the next call, avoiding the need to unpin it here and re-pin
1464 * it in the next call.
1465 */
1466 actual_nblocks = i;
1467 break;
1468 }
1469 else
1470 {
1471 /*
1472 * Check how many blocks we can cover with the same IO. The smgr
1473 * implementation might e.g. be limited due to a segment boundary.
1474 */
1475 if (i == 0 && actual_nblocks > 1)
1476 {
1477 maxcombine = smgrmaxcombine(operation->smgr,
1478 operation->forknum,
1479 blockNum);
1481 {
1482 elog(DEBUG2, "limiting nblocks at %u from %u to %u",
1483 blockNum, actual_nblocks, maxcombine);
1485 }
1486 }
1487 }
1488 }
1489 *nblocks = actual_nblocks;
1490
1491 /* Populate information needed for I/O. */
1492 operation->buffers = buffers;
1493 operation->blocknum = blockNum;
1494 operation->flags = flags;
1495 operation->nblocks = actual_nblocks;
1496 operation->nblocks_done = 0;
1497 pgaio_wref_clear(&operation->io_wref);
1498
1499 /*
1500 * When using AIO, start the IO in the background. If not, issue prefetch
1501 * requests if desired by the caller.
1502 *
1503 * The reason we have a dedicated path for IOMETHOD_SYNC here is to
1504 * de-risk the introduction of AIO somewhat. It's a large architectural
1505 * change, with lots of chances for unanticipated performance effects.
1506 *
1507 * Use of IOMETHOD_SYNC already leads to not actually performing IO
1508 * asynchronously, but without the check here we'd execute IO earlier than
1509 * we used to. Eventually this IOMETHOD_SYNC specific path should go away.
1510 */
1511 if (io_method != IOMETHOD_SYNC)
1512 {
1513 /*
1514 * Try to start IO asynchronously. It's possible that no IO needs to
1515 * be started, if another backend already performed the IO.
1516 *
1517 * Note that if an IO is started, it might not cover the entire
1518 * requested range, e.g. because an intermediary block has been read
1519 * in by another backend. In that case any "trailing" buffers we
1520 * already pinned above will be "forwarded" by read_stream.c to the
1521 * next call to StartReadBuffers().
1522 *
1523 * This is signalled to the caller by decrementing *nblocks *and*
1524 * reducing operation->nblocks. The latter is done here, but not below
1525 * WaitReadBuffers(), as in WaitReadBuffers() we can't "shorten" the
1526 * overall read size anymore, we need to retry until done in its
1527 * entirety or until failed.
1528 */
1529 did_start_io = AsyncReadBuffers(operation, nblocks);
1530
1531 operation->nblocks = *nblocks;
1532 }
1533 else
1534 {
1535 operation->flags |= READ_BUFFERS_SYNCHRONOUSLY;
1536
1537 if (flags & READ_BUFFERS_ISSUE_ADVICE)
1538 {
1539 /*
1540 * In theory we should only do this if PinBufferForBlock() had to
1541 * allocate new buffers above. That way, if two calls to
1542 * StartReadBuffers() were made for the same blocks before
1543 * WaitReadBuffers(), only the first would issue the advice.
1544 * That'd be a better simulation of true asynchronous I/O, which
1545 * would only start the I/O once, but isn't done here for
1546 * simplicity.
1547 */
1548 smgrprefetch(operation->smgr,
1549 operation->forknum,
1550 blockNum,
1552 }
1553
1554 /*
1555 * Indicate that WaitReadBuffers() should be called. WaitReadBuffers()
1556 * will initiate the necessary IO.
1557 */
1558 did_start_io = true;
1559 }
1560
1562
1563 return did_start_io;
1564}
1565
1566/*
1567 * Begin reading a range of blocks beginning at blockNum and extending for
1568 * *nblocks. *nblocks and the buffers array are in/out parameters. On entry,
1569 * the buffers elements covered by *nblocks must hold either InvalidBuffer or
1570 * buffers forwarded by an earlier call to StartReadBuffers() that was split
1571 * and is now being continued. On return, *nblocks holds the number of blocks
1572 * accepted by this operation. If it is less than the original number then
1573 * this operation has been split, but buffer elements up to the original
1574 * requested size may hold forwarded buffers to be used for a continuing
1575 * operation. The caller must either start a new I/O beginning at the block
1576 * immediately following the blocks accepted by this call and pass those
1577 * buffers back in, or release them if it chooses not to. It shouldn't make
1578 * any other use of or assumptions about forwarded buffers.
1579 *
1580 * If false is returned, no I/O is necessary and the buffers covered by
1581 * *nblocks on exit are valid and ready to be accessed. If true is returned,
1582 * an I/O has been started, and WaitReadBuffers() must be called with the same
1583 * operation object before the buffers covered by *nblocks on exit can be
1584 * accessed. Along with the operation object, the caller-supplied array of
1585 * buffers must remain valid until WaitReadBuffers() is called, and any
1586 * forwarded buffers must also be preserved for a continuing call unless
1587 * they are explicitly released.
1588 */
1589bool
1591 Buffer *buffers,
1592 BlockNumber blockNum,
1593 int *nblocks,
1594 int flags)
1595{
1596 return StartReadBuffersImpl(operation, buffers, blockNum, nblocks, flags,
1597 true /* expect forwarded buffers */ );
1598}
1599
1600/*
1601 * Single-block version of StartReadBuffers(). This might save a few
1602 * instructions when called from another translation unit, because it is
1603 * specialized for nblocks == 1.
1604 *
1605 * This version does not support "forwarded" buffers: they cannot be created
1606 * by reading only one block and *buffer is ignored on entry.
1607 */
1608bool
1610 Buffer *buffer,
1611 BlockNumber blocknum,
1612 int flags)
1613{
1614 int nblocks = 1;
1615 bool result;
1616
1617 result = StartReadBuffersImpl(operation, buffer, blocknum, &nblocks, flags,
1618 false /* single block, no forwarding */ );
1619 Assert(nblocks == 1); /* single block can't be short */
1620
1621 return result;
1622}
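/*
 * Illustrative sketch only: the split StartReadBuffer()/WaitReadBuffers()
 * protocol, mirroring what ReadBuffer_common() does above.  Passing 0 for
 * the flags is an assumption of the example; the remaining operation fields
 * are filled in by StartReadBuffer() itself.
 */
#ifdef NOT_USED
static Buffer
example_two_step_read(Relation rel, BlockNumber blkno)
{
	ReadBuffersOperation operation;
	Buffer		buf;

	operation.smgr = RelationGetSmgr(rel);
	operation.rel = rel;
	operation.persistence = rel->rd_rel->relpersistence;
	operation.forknum = MAIN_FORKNUM;
	operation.strategy = NULL;

	/* true means an I/O was started and has to be waited for */
	if (StartReadBuffer(&operation, &buf, blkno, 0))
		WaitReadBuffers(&operation);

	return buf;
}
#endif							/* NOT_USED */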
1623
1624/*
1625 * Perform sanity checks on the ReadBuffersOperation.
1626 */
1627static void
1629{
1630#ifdef USE_ASSERT_CHECKING
1631 Assert(operation->nblocks_done <= operation->nblocks);
1632 Assert(!is_complete || operation->nblocks == operation->nblocks_done);
1633
1634 for (int i = 0; i < operation->nblocks; i++)
1635 {
1636 Buffer buffer = operation->buffers[i];
1640
1641 Assert(BufferGetBlockNumber(buffer) == operation->blocknum + i);
1643
1644 if (i < operation->nblocks_done)
1646 }
1647#endif
1648}
1649
1650/* helper for ReadBuffersCanStartIO(), to avoid repetition */
1651static inline bool
1653{
1654 if (BufferIsLocal(buffer))
1656 true, nowait);
1657 else
1658 return StartBufferIO(GetBufferDescriptor(buffer - 1), true, nowait);
1659}
1660
1661/*
1662 * Helper for AsyncReadBuffers that tries to get the buffer ready for IO.
1663 */
1664static inline bool
1666{
1667 /*
1668 * If this backend currently has staged IO, we need to submit the pending
1669 * IO before waiting for the right to issue IO, to avoid the potential for
1670 * deadlocks (and, more commonly, unnecessary delays for other backends).
1671 */
1672 if (!nowait && pgaio_have_staged())
1673 {
1675 return true;
1676
1677 /*
1678 * Unfortunately StartBufferIO() returning false doesn't allow us to
1679 * distinguish between the buffer already being valid and IO already
1680 * being in progress. Since IO already being in progress is quite
1681 * rare, this approach seems fine.
1682 */
1684 }
1685
1686 return ReadBuffersCanStartIOOnce(buffer, nowait);
1687}
1688
1689/*
1690 * Helper for WaitReadBuffers() that processes the results of a readv
1691 * operation, raising an error if necessary.
1692 */
1693static void
1695{
1696 PgAioReturn *aio_ret = &operation->io_return;
1698 int newly_read_blocks = 0;
1699
1700 Assert(pgaio_wref_valid(&operation->io_wref));
1701 Assert(aio_ret->result.status != PGAIO_RS_UNKNOWN);
1702
1703 /*
1704 * SMGR reports the number of blocks successfully read as the result of
1705 * the IO operation. Thus we can simply add that to ->nblocks_done.
1706 */
1707
1708 if (likely(rs != PGAIO_RS_ERROR))
1709 newly_read_blocks = aio_ret->result.result;
1710
1711 if (rs == PGAIO_RS_ERROR || rs == PGAIO_RS_WARNING)
1712 pgaio_result_report(aio_ret->result, &aio_ret->target_data,
1713 rs == PGAIO_RS_ERROR ? ERROR : WARNING);
1714 else if (aio_ret->result.status == PGAIO_RS_PARTIAL)
1715 {
1716 /*
1717 * We'll retry, so we just emit a debug message to the server log (or
1718 * not even that in prod scenarios).
1719 */
1720 pgaio_result_report(aio_ret->result, &aio_ret->target_data, DEBUG1);
1721 elog(DEBUG3, "partial read, will retry");
1722 }
1723
1726
1727 operation->nblocks_done += newly_read_blocks;
1728
1729 Assert(operation->nblocks_done <= operation->nblocks);
1730}
1731
1732void
1734{
1735 PgAioReturn *aio_ret = &operation->io_return;
1738
1739 if (operation->persistence == RELPERSISTENCE_TEMP)
1740 {
1743 }
1744 else
1745 {
1748 }
1749
1750 /*
1751 * If we get here without an IO operation having been issued, the
1752 * io_method == IOMETHOD_SYNC path must have been used. Otherwise the
1753 * caller should not have called WaitReadBuffers().
1754 *
1755 * In the case of IOMETHOD_SYNC, we start the IO in WaitReadBuffers() - as
1756 * we used to before the introduction of AIO. This is done as part of the
1757 * retry logic below; no extra code is required.
1758 *
1759 * This path is expected to eventually go away.
1760 */
1761 if (!pgaio_wref_valid(&operation->io_wref) && io_method != IOMETHOD_SYNC)
1762 elog(ERROR, "waiting for read operation that didn't read");
1763
1764 /*
1765 * To handle partial reads, and IOMETHOD_SYNC, we re-issue IO until we're
1766 * done. We may need multiple retries, not just because we could get
1767 * multiple partial reads, but also because some of the remaining
1768 * to-be-read buffers may have been read in by other backends, limiting
1769 * the IO size.
1770 */
1771 while (true)
1772 {
1774
1775 CheckReadBuffersOperation(operation, false);
1776
1777 /*
1778 * If there is an IO associated with the operation, we may need to
1779 * wait for it.
1780 */
1781 if (pgaio_wref_valid(&operation->io_wref))
1782 {
1783 /*
1784 * Track the time spent waiting for the IO to complete. As
1785 * tracking a wait even if we don't actually need to wait
1786 *
1787 * a) is not cheap, due to the timestamping overhead
1788 *
1789 * b) reports some time as waiting, even if we never waited
1790 *
1791 * we first check if we already know the IO is complete.
1792 */
1793 if (aio_ret->result.status == PGAIO_RS_UNKNOWN &&
1794 !pgaio_wref_check_done(&operation->io_wref))
1795 {
1797
1798 pgaio_wref_wait(&operation->io_wref);
1799
1800 /*
1801 * The IO operation itself was already counted earlier, in
1802 * AsyncReadBuffers(), this just accounts for the wait time.
1803 */
1805 io_start, 0, 0);
1806 }
1807 else
1808 {
1809 Assert(pgaio_wref_check_done(&operation->io_wref));
1810 }
1811
1812 /*
1813 * We now are sure the IO completed. Check the results. This
1814 * includes reporting on errors if there were any.
1815 */
1816 ProcessReadBuffersResult(operation);
1817 }
1818
1819 /*
1820 * Most of the time, the one IO we already started will read in
1821 * everything. But we need to deal with partial reads and buffers not
1822 * needing IO anymore.
1823 */
1824 if (operation->nblocks_done == operation->nblocks)
1825 break;
1826
1828
1829 /*
1830 * This may only complete the IO partially, either because some
1831 * buffers were already valid, or because of a partial read.
1832 *
1833 * NB: In contrast to after the AsyncReadBuffers() call in
1834 * StartReadBuffers(), we do *not* reduce
1835 * ReadBuffersOperation->nblocks here, callers expect the full
1836 * operation to be completed at this point (as more operations may
1837 * have been queued).
1838 */
1840 }
1841
1842 CheckReadBuffersOperation(operation, true);
1843
1844 /* NB: READ_DONE tracepoint was already executed in completion callback */
1845}
1846
1847/*
1848 * Initiate IO for the ReadBuffersOperation
1849 *
1850 * This function only starts a single IO at a time. The size of the IO may be
1851 * limited to below the to-be-read blocks, if one of the buffers has
1852 * concurrently been read in. If the first to-be-read buffer is already valid,
1853 * no IO will be issued.
1854 *
1855 * To support retries after partial reads, the first operation->nblocks_done
1856 * buffers are skipped.
1857 *
1858 * On return *nblocks_progress is updated to reflect the number of buffers
1859 * affected by the call. If the first buffer is valid, *nblocks_progress is
1860 * set to 1 and operation->nblocks_done is incremented.
1861 *
1862 * Returns true if IO was initiated, false if no IO was necessary.
1863 */
1864static bool
1866{
1867 Buffer *buffers = &operation->buffers[0];
1868 int flags = operation->flags;
1869 BlockNumber blocknum = operation->blocknum;
1870 ForkNumber forknum = operation->forknum;
1871 char persistence = operation->persistence;
1872 int16 nblocks_done = operation->nblocks_done;
1873 Buffer *io_buffers = &operation->buffers[nblocks_done];
1874 int io_buffers_len = 0;
1876 uint32 ioh_flags = 0;
1880 bool did_start_io;
1881
1882 /*
1883 * When this IO is executed synchronously, either because the caller will
1884 * immediately block waiting for the IO or because IOMETHOD_SYNC is used,
1885 * the AIO subsystem needs to know.
1886 */
1887 if (flags & READ_BUFFERS_SYNCHRONOUSLY)
1889
1890 if (persistence == RELPERSISTENCE_TEMP)
1891 {
1895 }
1896 else
1897 {
1900 }
1901
1902 /*
1903 * If zero_damaged_pages is enabled, add the READ_BUFFERS_ZERO_ON_ERROR
1904 * flag. The reason for that is that, hopefully, zero_damaged_pages isn't
1905 * set globally, but on a per-session basis. The completion callback,
1906 * which may be run in other processes, e.g. in IO workers, may have a
1907 * different value of the zero_damaged_pages GUC.
1908 *
1909 * XXX: We probably should eventually use a different flag for
1910 * zero_damaged_pages, so we can report different log levels / error codes
1911 * for zero_damaged_pages and ZERO_ON_ERROR.
1912 */
1915
1916 /*
1917 * For the same reason as with zero_damaged_pages we need to use this
1918 * backend's ignore_checksum_failure value.
1919 */
1922
1923
1924 /*
1925 * To be allowed to report stats in the local completion callback we need
1926 * to prepare to report stats now. This ensures we can safely report the
1927 * checksum failure even in a critical section.
1928 */
1930
1931 /*
1932 * Get IO handle before ReadBuffersCanStartIO(), as pgaio_io_acquire()
1933 * might block, which we don't want after setting IO_IN_PROGRESS.
1934 *
1935 * If we need to wait for IO before we can get a handle, submit
1936 * already-staged IO first, so that other backends don't need to wait.
1937 * There wouldn't be a deadlock risk, as pgaio_io_acquire() just needs to
1938 * wait for already submitted IO, which doesn't require additional locks,
1939 * but it could still cause undesirable waits.
1940 *
1941 * A secondary benefit is that this would allow us to measure the time in
1942 * pgaio_io_acquire() without causing undue timer overhead in the common,
1943 * non-blocking, case. However, currently the pgstats infrastructure
1944 * doesn't really allow that, as it a) asserts that an operation can't
1945 * have time without operations b) doesn't have an API to report
1946 * "accumulated" time.
1947 */
1949 if (unlikely(!ioh))
1950 {
1952
1954 }
1955
1956 /*
1957 * Check if we can start IO on the first to-be-read buffer.
1958 *
1959 * If an I/O is already in progress in another backend, we want to wait
1960 * for the outcome: either done, or something went wrong and we will
1961 * retry.
1962 */
1963 if (!ReadBuffersCanStartIO(buffers[nblocks_done], false))
1964 {
1965 /*
1966 * Someone else has already completed this block, we're done.
1967 *
1968 * When IO is necessary, ->nblocks_done is updated in
1969 * ProcessReadBuffersResult(), but that is not called if no IO is
1970 * necessary. Thus update here.
1971 */
1972 operation->nblocks_done += 1;
1973 *nblocks_progress = 1;
1974
1976 pgaio_wref_clear(&operation->io_wref);
1977 did_start_io = false;
1978
1979 /*
1980 * Report and track this as a 'hit' for this backend, even though it
1981 * must have started out as a miss in PinBufferForBlock(). The other
1982 * backend will track this as a 'read'.
1983 */
1984 TRACE_POSTGRESQL_BUFFER_READ_DONE(forknum, blocknum + operation->nblocks_done,
1985 operation->smgr->smgr_rlocator.locator.spcOid,
1986 operation->smgr->smgr_rlocator.locator.dbOid,
1987 operation->smgr->smgr_rlocator.locator.relNumber,
1988 operation->smgr->smgr_rlocator.backend,
1989 true);
1990
1991 if (persistence == RELPERSISTENCE_TEMP)
1993 else
1995
1996 if (operation->rel)
1997 pgstat_count_buffer_hit(operation->rel);
1998
2000
2001 if (VacuumCostActive)
2003 }
2004 else
2005 {
2007
2008 /* We found a buffer that we need to read in. */
2009 Assert(io_buffers[0] == buffers[nblocks_done]);
2010 io_pages[0] = BufferGetBlock(buffers[nblocks_done]);
2011 io_buffers_len = 1;
2012
2013 /*
2014 * How many neighboring-on-disk blocks can we scatter-read into other
2015 * buffers at the same time? In this case we don't wait if we see an
2016 * I/O already in progress. We already set BM_IO_IN_PROGRESS for the
2017 * head block, so we should get on with that I/O as soon as possible.
2018 */
2019 for (int i = nblocks_done + 1; i < operation->nblocks; i++)
2020 {
2021 if (!ReadBuffersCanStartIO(buffers[i], true))
2022 break;
2023 /* Must be consecutive block numbers. */
2024 Assert(BufferGetBlockNumber(buffers[i - 1]) ==
2025 BufferGetBlockNumber(buffers[i]) - 1);
2026 Assert(io_buffers[io_buffers_len] == buffers[i]);
2027
2028 io_pages[io_buffers_len++] = BufferGetBlock(buffers[i]);
2029 }
2030
2031 /* get a reference to wait for in WaitReadBuffers() */
2032 pgaio_io_get_wref(ioh, &operation->io_wref);
2033
2034 /* provide the list of buffers to the completion callbacks */
2036
2038 persistence == RELPERSISTENCE_TEMP ?
2041 flags);
2042
2044
2045 /* ---
2046 * Even though we're trying to issue IO asynchronously, track the time
2047 * in smgrstartreadv():
2048 * - if io_method == IOMETHOD_SYNC, we will always perform the IO
2049 * immediately
2050 * - the io method might not support the IO (e.g. worker IO for a temp
2051 * table)
2052 * ---
2053 */
2055 smgrstartreadv(ioh, operation->smgr, forknum,
2056 blocknum + nblocks_done,
2060
2061 if (persistence == RELPERSISTENCE_TEMP)
2063 else
2065
2066 /*
2067 * Track vacuum cost when issuing IO, not after waiting for it.
2068 * Otherwise we could end up issuing a lot of IO in a short timespan,
2069 * despite a low cost limit.
2070 */
2071 if (VacuumCostActive)
2073
2075 did_start_io = true;
2076 }
2077
2078 return did_start_io;
2079}
2080
2081/*
2082 * BufferAlloc -- subroutine for PinBufferForBlock. Handles lookup of a shared
2083 * buffer. If no buffer exists already, selects a replacement victim and
2084 * evicts the old page, but does NOT read in new page.
2085 *
2086 * "strategy" can be a buffer replacement strategy object, or NULL for
2087 * the default strategy. The selected buffer's usage_count is advanced when
2088 * using the default strategy, but otherwise possibly not (see PinBuffer).
2089 *
2090 * The returned buffer is pinned and is already marked as holding the
2091 * desired page. If it already did have the desired page, *foundPtr is
2092 * set true. Otherwise, *foundPtr is set false.
2093 *
2094 * io_context is passed as an output parameter to avoid calling
2095 * IOContextForStrategy() when there is a shared buffers hit and no IO
2096 * statistics need be captured.
2097 *
2098 * No locks are held either at entry or exit.
2099 */
2101BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
2102 BlockNumber blockNum,
2103 BufferAccessStrategy strategy,
2105{
2106 BufferTag newTag; /* identity of requested block */
2107 uint32 newHash; /* hash value for newTag */
2108 LWLock *newPartitionLock; /* buffer partition lock for it */
2109 int existing_buf_id;
2113 uint64 set_bits = 0;
2114
2115 /* Make sure we will have room to remember the buffer pin */
2118
2119 /* create a tag so we can lookup the buffer */
2120 InitBufferTag(&newTag, &smgr->smgr_rlocator.locator, forkNum, blockNum);
2121
2122 /* determine its hash code and partition lock ID */
2125
2126 /* see if the block is in the buffer pool already */
2129 if (existing_buf_id >= 0)
2130 {
2131 BufferDesc *buf;
2132 bool valid;
2133
2134 /*
2135 * Found it. Now, pin the buffer so no one can steal it from the
2136 * buffer pool, and check to see if the correct data has been loaded
2137 * into the buffer.
2138 */
2140
2141 valid = PinBuffer(buf, strategy, false);
2142
2143 /* Can release the mapping lock as soon as we've pinned it */
2145
2146 *foundPtr = true;
2147
2148 if (!valid)
2149 {
2150 /*
2151 * We can only get here if (a) someone else is still reading in
2152 * the page, (b) a previous read attempt failed, or (c) someone
2153 * called StartReadBuffers() but not yet WaitReadBuffers().
2154 */
2155 *foundPtr = false;
2156 }
2157
2158 return buf;
2159 }
2160
2161 /*
2162 * Didn't find it in the buffer pool. We'll have to initialize a new
2163 * buffer. Remember to unlock the mapping lock while doing the work.
2164 */
2166
2167 /*
2168 * Acquire a victim buffer. Somebody else might try to do the same, we
2169 * don't hold any conflicting locks. If so we'll have to undo our work
2170 * later.
2171 */
2174
2175 /*
2176 * Try to make a hashtable entry for the buffer under its new tag. If
2177 * somebody else inserted another buffer for the tag, we'll release the
2178 * victim buffer we acquired and use the already inserted one.
2179 */
2182 if (existing_buf_id >= 0)
2183 {
2185 bool valid;
2186
2187 /*
2188 * Got a collision. Someone has already done what we were about to do.
2189 * We'll just handle this as if it were found in the buffer pool in
2190 * the first place. First, give up the buffer we were planning to
2191 * use.
2192 *
2193 * We could do this after releasing the partition lock, but then we'd
2194 * have to call ResourceOwnerEnlarge() & ReservePrivateRefCountEntry()
2195 * before acquiring the lock, for the rare case of such a collision.
2196 */
2198
2199 /* remaining code should match code at top of routine */
2200
2202
2203 valid = PinBuffer(existing_buf_hdr, strategy, false);
2204
2205 /* Can release the mapping lock as soon as we've pinned it */
2207
2208 *foundPtr = true;
2209
2210 if (!valid)
2211 {
2212 /*
2213 * We can only get here if (a) someone else is still reading in
2214 * the page, (b) a previous read attempt failed, or (c) someone
2215 * called StartReadBuffers() but not yet WaitReadBuffers().
2216 */
2217 *foundPtr = false;
2218 }
2219
2220 return existing_buf_hdr;
2221 }
2222
2223 /*
2224 * Need to lock the buffer header too in order to change its tag.
2225 */
2227
2228 /* some sanity checks while we hold the buffer header lock */
2231
2232 victim_buf_hdr->tag = newTag;
2233
2234 /*
2235 * Make sure BM_PERMANENT is set for buffers that must be written at every
2236 * checkpoint. Unlogged buffers only need to be written at shutdown
2237 * checkpoints, except for their "init" forks, which need to be treated
2238 * just like permanent relations.
2239 */
2241 if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM)
2243
2245 set_bits, 0, 0);
2246
2248
2249 /*
2250 * Buffer contents are currently invalid.
2251 */
2252 *foundPtr = false;
2253
2254 return victim_buf_hdr;
2255}
2256
2257/*
2258 * InvalidateBuffer -- mark a shared buffer invalid.
2259 *
2260 * The buffer header spinlock must be held at entry. We drop it before
2261 * returning. (This is sane because the caller must have locked the
2262 * buffer in order to be sure it should be dropped.)
2263 *
2264 * This is used only in contexts such as dropping a relation. We assume
2265 * that no other backend could possibly be interested in using the page,
2266 * so the only reason the buffer might be pinned is if someone else is
2267 * trying to write it out. We have to let them finish before we can
2268 * reclaim the buffer.
2269 *
2270 * The buffer could get reclaimed by someone else while we are waiting
2271 * to acquire the necessary locks; if so, don't mess it up.
2272 */
2273static void
2275{
2277 uint32 oldHash; /* hash value for oldTag */
2278 LWLock *oldPartitionLock; /* buffer partition lock for it */
2281
2282 /* Save the original buffer tag before dropping the spinlock */
2283 oldTag = buf->tag;
2284
2286
2287 /*
2288 * Need to compute the old tag's hashcode and partition lock ID. XXX is it
2289 * worth storing the hashcode in BufferDesc so we need not recompute it
2290 * here? Probably not.
2291 */
2294
2295retry:
2296
2297 /*
2298 * Acquire exclusive mapping lock in preparation for changing the buffer's
2299 * association.
2300 */
2302
2303 /* Re-lock the buffer header */
2305
2306 /* If it's changed while we were waiting for lock, do nothing */
2307 if (!BufferTagsEqual(&buf->tag, &oldTag))
2308 {
2311 return;
2312 }
2313
2314 /*
2315 * We assume the reason for it to be pinned is that either we were
2316 * asynchronously reading the page in before erroring out or someone else
2317 * is flushing the page out. Wait for the IO to finish. (This could be
2318 * an infinite loop if the refcount is messed up... it would be nice to
2319 * time out after awhile, but there seems no way to be sure how many loops
2320 * may be needed. Note that if the other guy has pinned the buffer but
2321 * not yet done StartBufferIO, WaitIO will fall through and we'll
2322 * effectively be busy-looping here.)
2323 */
2325 {
2328 /* safety check: should definitely not be our *own* pin */
2330 elog(ERROR, "buffer is pinned in InvalidateBuffer");
2331 WaitIO(buf);
2332 goto retry;
2333 }
2334
2335 /*
2336 * An invalidated buffer should not have any backends waiting to lock the
2337 * buffer, therefore BM_LOCK_WAKE_IN_PROGRESS should not be set.
2338 */
2340
2341 /*
2342 * Clear out the buffer's tag and flags. We must do this to ensure that
2343 * linear scans of the buffer array don't think the buffer is valid.
2344 */
2346 ClearBufferTag(&buf->tag);
2347
2349 0,
2351 0);
2352
2353 /*
2354 * Remove the buffer from the lookup hashtable, if it was in there.
2355 */
2356 if (oldFlags & BM_TAG_VALID)
2358
2359 /*
2360 * Done with mapping lock.
2361 */
2363}
2364
2365/*
2366 * Helper routine for GetVictimBuffer()
2367 *
2368 * Needs to be called on a buffer with a valid tag, pinned, but without the
2369 * buffer header spinlock held.
2370 *
2371 * Returns true if the buffer can be reused, in which case the buffer is only
2372 * pinned by this backend and marked as invalid, false otherwise.
2373 */
2374static bool
2376{
2378 uint32 hash;
2380 BufferTag tag;
2381
2383
2384 /* have buffer pinned, so it's safe to read tag without lock */
2385 tag = buf_hdr->tag;
2386
2387 hash = BufTableHashCode(&tag);
2389
2391
2392 /* lock the buffer header */
2394
2395 /*
2396 * We have the buffer pinned, so nobody else should have been able to
2397 * unset this concurrently.
2398 */
2401 Assert(BufferTagsEqual(&buf_hdr->tag, &tag));
2402
2403 /*
2404 * If somebody else has pinned the buffer since then, or even worse,
2405 * dirtied it, give up on this buffer: it's clearly in use.
2406 */
2408 {
2410
2413
2414 return false;
2415 }
2416
2417 /*
2418 * An invalidated buffer should not have any backends waiting to lock the
2419 * buffer, therefore BM_LOCK_WAKE_IN_PROGRESS should not be set.
2420 */
2422
2423 /*
2424 * Clear out the buffer's tag and flags and usagecount. This is not
2425 * strictly required, as BM_TAG_VALID/BM_VALID needs to be checked before
2426 * doing anything with the buffer. But currently it's beneficial, as the
2427 * cheaper pre-checks used by several linear scans of shared buffers rely
2428 * on the tag (see e.g. FlushDatabaseBuffers()).
2429 */
2430 ClearBufferTag(&buf_hdr->tag);
2432 0,
2434 0);
2435
2437
2438 /* finally delete buffer from the buffer mapping table */
2439 BufTableDelete(&tag, hash);
2440
2442
2447
2448 return true;
2449}
2450
2451static Buffer
2453{
2455 Buffer buf;
2457 bool from_ring;
2458
2459 /*
2460 * Ensure, before we pin a victim buffer, that there's a free refcount
2461 * entry and resource owner slot for the pin.
2462 */
2465
2466 /* we return here if a prospective victim buffer gets used concurrently */
2467again:
2468
2469 /*
2470 * Select a victim buffer. The buffer is returned pinned and owned by
2471 * this backend.
2472 */
2475
2476 /*
2477 * We shouldn't have any other pins for this buffer.
2478 */
2480
2481 /*
2482 * If the buffer was dirty, try to write it out. There is a race
2483 * condition here, in that someone might dirty it after we released the
2484 * buffer header lock above, or even while we are writing it out (since
2485 * our share-lock won't prevent hint-bit updates). We will recheck the
2486 * dirty bit after re-locking the buffer header.
2487 */
2488 if (buf_state & BM_DIRTY)
2489 {
2492
2493 /*
2494 * We need a share-lock on the buffer contents to write it out (else
2495 * we might write invalid data, eg because someone else is compacting
2496 * the page contents while we write). We must use a conditional lock
2497 * acquisition here to avoid deadlock. Even though the buffer was not
2498 * pinned (and therefore surely not locked) when StrategyGetBuffer
2499 * returned it, someone else could have pinned and exclusive-locked it
2500 * by the time we get here. If we try to get the lock unconditionally,
2501 * we'd block waiting for them; if they later block waiting for us,
2502 * deadlock ensues. (This has been observed to happen when two
2503 * backends are both trying to split btree index pages, and the second
2504 * one just happens to be trying to split the page the first one got
2505 * from StrategyGetBuffer.)
2506 */
2508 {
2509 /*
2510 * Someone else has locked the buffer, so give it up and loop back
2511 * to get another one.
2512 */
2514 goto again;
2515 }
2516
2517 /*
2518 * If using a nondefault strategy, and writing the buffer would
2519 * require a WAL flush, let the strategy decide whether to go ahead
2520 * and write/reuse the buffer or to choose another victim. We need a
2521 * lock to inspect the page LSN, so this can't be done inside
2522 * StrategyGetBuffer.
2523 */
2524 if (strategy != NULL)
2525 {
2526 XLogRecPtr lsn;
2527
2528 /* Read the LSN while holding buffer header lock */
2530 lsn = BufferGetLSN(buf_hdr);
2532
2533 if (XLogNeedsFlush(lsn)
2534 && StrategyRejectBuffer(strategy, buf_hdr, from_ring))
2535 {
2538 goto again;
2539 }
2540 }
2541
2542 /* OK, do the I/O */
2545
2547 &buf_hdr->tag);
2548 }
2549
2550
2551 if (buf_state & BM_VALID)
2552 {
2553 /*
2554 * When a BufferAccessStrategy is in use, blocks evicted from shared
2555 * buffers are counted as IOOP_EVICT in the corresponding context
2556 * (e.g. IOCONTEXT_BULKWRITE). Shared buffers are evicted by a
2557 * strategy in two cases: 1) while initially claiming buffers for the
2558 * strategy ring 2) to replace an existing strategy ring buffer
2559 * because it is pinned or in use and cannot be reused.
2560 *
2561 * Blocks evicted from buffers already in the strategy ring are
2562 * counted as IOOP_REUSE in the corresponding strategy context.
2563 *
2564 * At this point, we can accurately count evictions and reuses,
2565 * because we have successfully claimed the valid buffer. Previously,
2566 * we may have been forced to release the buffer due to concurrent
2567 * pinners or erroring out.
2568 */
2570 from_ring ? IOOP_REUSE : IOOP_EVICT, 1, 0);
2571 }
2572
2573 /*
2574 * If the buffer has an entry in the buffer mapping table, delete it. This
2575 * can fail because another backend could have pinned or dirtied the
2576 * buffer.
2577 */
2579 {
2581 goto again;
2582 }
2583
2584 /* a final set of sanity checks */
2585#ifdef USE_ASSERT_CHECKING
2587
2590
2592#endif
2593
2594 return buf;
2595}
2596
2597/*
2598 * Return the maximum number of buffers that a backend should try to pin at once,
2599 * to avoid exceeding its fair share. This is the highest value that
2600 * GetAdditionalPinLimit() could ever return. Note that it may be zero on a
2601 * system with a very small buffer pool relative to max_connections.
2602 */
2603uint32
2605{
2606 return MaxProportionalPins;
2607}
2608
2609/*
2610 * Return the maximum number of additional buffers that this backend should
2611 * pin if it wants to stay under the per-backend limit, considering the number
2612 * of buffers it has already pinned. Unlike LimitAdditionalPins(), the limit
2613 * returned by this function can be zero.
2614 */
2615uint32
2617{
2619
2620 /*
2621 * We get the number of "overflowed" pins for free, but don't know the
2622 * number of pins in PrivateRefCountArray. The cost of calculating that
2623 * exactly doesn't seem worth it, so just assume the max.
2624 */
2626
2627 /* Is this backend already holding more than its fair share? */
2629 return 0;
2630
2632}
2633
2634/*
2635 * Limit the number of pins a batch operation may additionally acquire, to
2636 * avoid running out of pinnable buffers.
2637 *
2638 * One additional pin is always allowed, on the assumption that the operation
2639 * requires at least one to make progress.
2640 */
2641void
2643{
2644 uint32 limit;
2645
2646 if (*additional_pins <= 1)
2647 return;
2648
2649 limit = GetAdditionalPinLimit();
2650 limit = Max(limit, 1);
2651 if (limit < *additional_pins)
2652 *additional_pins = limit;
2653}
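
/*
 * --- Illustrative sketch, not part of bufmgr.c ---
 *
 * One way a hypothetical batch operation could combine GetPinLimit() and
 * LimitAdditionalPins() to decide how many buffers to pin at once. The
 * function name and "desired_nblocks" are invented for illustration; the
 * sketch assumes the includes already present in this file.
 */
static uint32
choose_batch_size(uint32 desired_nblocks)
{
	/* Never plan for more than a backend's overall fair share of pins. */
	desired_nblocks = Min(desired_nblocks, GetPinLimit());

	/* Ask for at least one buffer, so the operation can make progress. */
	desired_nblocks = Max(desired_nblocks, 1);

	/* Clamp further, based on how many pins this backend already holds. */
	LimitAdditionalPins(&desired_nblocks);

	return desired_nblocks;
}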
2654
2655/*
2656 * Logic shared between ExtendBufferedRelBy(), ExtendBufferedRelTo(). Just to
2657 * avoid duplicating the tracing and relpersistence related logic.
2658 */
2659static BlockNumber
2662 BufferAccessStrategy strategy,
2663 uint32 flags,
2666 Buffer *buffers,
2668{
2670
2672 BMR_GET_SMGR(bmr)->smgr_rlocator.locator.spcOid,
2673 BMR_GET_SMGR(bmr)->smgr_rlocator.locator.dbOid,
2674 BMR_GET_SMGR(bmr)->smgr_rlocator.locator.relNumber,
2675 BMR_GET_SMGR(bmr)->smgr_rlocator.backend,
2676 extend_by);
2677
2678 if (bmr.relpersistence == RELPERSISTENCE_TEMP)
2681 buffers, &extend_by);
2682 else
2683 first_block = ExtendBufferedRelShared(bmr, fork, strategy, flags,
2685 buffers, &extend_by);
2687
2689 BMR_GET_SMGR(bmr)->smgr_rlocator.locator.spcOid,
2690 BMR_GET_SMGR(bmr)->smgr_rlocator.locator.dbOid,
2691 BMR_GET_SMGR(bmr)->smgr_rlocator.locator.relNumber,
2692 BMR_GET_SMGR(bmr)->smgr_rlocator.backend,
2693 *extended_by,
2694 first_block);
2695
2696 return first_block;
2697}
2698
2699/*
2700 * Implementation of ExtendBufferedRelBy() and ExtendBufferedRelTo() for
2701 * shared buffers.
2702 */
2703static BlockNumber
2706 BufferAccessStrategy strategy,
2707 uint32 flags,
2710 Buffer *buffers,
2712{
2716
2718
2719 /*
2720 * Acquire victim buffers for extension without holding extension lock.
2721 * Writing out victim buffers is the most expensive part of extending the
2722 * relation, particularly when doing so requires WAL flushes. Zeroing out
2723 * the buffers is also quite expensive, so do that before holding the
2724 * extension lock as well.
2725 *
2726 * These pages are pinned by us and not valid. While we hold the pin they
2727 * can't be acquired as victim buffers by another backend.
2728 */
2729 for (uint32 i = 0; i < extend_by; i++)
2730 {
2732
2733 buffers[i] = GetVictimBuffer(strategy, io_context);
2735
2736 /* new buffers are zero-filled */
2737 MemSet(buf_block, 0, BLCKSZ);
2738 }
2739
2740 /*
2741 * Lock relation against concurrent extensions, unless requested not to.
2742 *
2743 * We use the same extension lock for all forks. That's unnecessarily
2744 * restrictive, but currently extensions for forks don't happen often
2745 * enough to make it worth locking more granularly.
2746 *
2747 * Note that another backend might have extended the relation by the time
2748 * we get the lock.
2749 */
2750 if (!(flags & EB_SKIP_EXTENSION_LOCK))
2752
2753 /*
2754 * If requested, invalidate size cache, so that smgrnblocks asks the
2755 * kernel.
2756 */
2757 if (flags & EB_CLEAR_SIZE_CACHE)
2758 BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] = InvalidBlockNumber;
2759
2761
2762 /*
2763 * Now that we have the accurate relation size, check if the caller wants
2764 * us to extend to only up to a specific size. If there were concurrent
2765 * extensions, we might have acquired too many buffers and need to release
2766 * them.
2767 */
2769 {
2771
2773 extend_by = 0;
2774 else if ((uint64) first_block + extend_by > extend_upto)
2776
2777 for (uint32 i = extend_by; i < orig_extend_by; i++)
2778 {
2779 BufferDesc *buf_hdr = GetBufferDescriptor(buffers[i] - 1);
2780
2782 }
2783
2784 if (extend_by == 0)
2785 {
2786 if (!(flags & EB_SKIP_EXTENSION_LOCK))
2789 return first_block;
2790 }
2791 }
2792
2793 /* Fail if relation is already at maximum possible length */
2795 ereport(ERROR,
2797 errmsg("cannot extend relation %s beyond %u blocks",
2798 relpath(BMR_GET_SMGR(bmr)->smgr_rlocator, fork).str,
2799 MaxBlockNumber)));
2800
2801 /*
2802 * Insert buffers into buffer table, mark as IO_IN_PROGRESS.
2803 *
2804 * This needs to happen before we extend the relation, because as soon as
2805 * we do, other backends can start to read in those pages.
2806 */
2807 for (uint32 i = 0; i < extend_by; i++)
2808 {
2809 Buffer victim_buf = buffers[i];
2811 BufferTag tag;
2812 uint32 hash;
2814 int existing_id;
2815
2816 /* in case we need to pin an existing buffer below */
2819
2820 InitBufferTag(&tag, &BMR_GET_SMGR(bmr)->smgr_rlocator.locator, fork,
2821 first_block + i);
2822 hash = BufTableHashCode(&tag);
2824
2826
2828
2829 /*
2830 * We get here only in the corner case where we are trying to extend
2831 * the relation but we found a pre-existing buffer. This can happen
2832 * because a prior attempt at extending the relation failed, and
2833 * because mdread doesn't complain about reads beyond EOF (when
2834 * zero_damaged_pages is ON) and so a previous attempt to read a block
2835 * beyond EOF could have left a "valid" zero-filled buffer.
2836 *
2837 * This has also been observed when the relation was overwritten by an
2838 * external process. Since the legitimate cases should always have
2839 * left a zero-filled buffer, complain if not PageIsNew.
2840 */
2841 if (existing_id >= 0)
2842 {
2845 bool valid;
2846
2847 /*
2848 * Pin the existing buffer before releasing the partition lock,
2849 * preventing it from being evicted.
2850 */
2851 valid = PinBuffer(existing_hdr, strategy, false);
2852
2855
2858
2859 if (valid && !PageIsNew((Page) buf_block))
2860 ereport(ERROR,
2861 (errmsg("unexpected data beyond EOF in block %u of relation \"%s\"",
2862 existing_hdr->tag.blockNum,
2863 relpath(BMR_GET_SMGR(bmr)->smgr_rlocator, fork).str)));
2864
2865 /*
2866 * We *must* do smgr[zero]extend before succeeding, else the page
2867 * will not be reserved by the kernel, and the next P_NEW call
2868 * will decide to return the same page. Clear the BM_VALID bit,
2869 * do StartBufferIO() and proceed.
2870 *
2871 * Loop to handle the very small possibility that someone re-sets
2872 * BM_VALID between our clearing it and StartBufferIO inspecting
2873 * it.
2874 */
2875 do
2876 {
2878 } while (!StartBufferIO(existing_hdr, true, false));
2879 }
2880 else
2881 {
2883 uint64 set_bits = 0;
2884
2886
2887 /* some sanity checks while we hold the buffer header lock */
2890
2891 victim_buf_hdr->tag = tag;
2892
2894 if (bmr.relpersistence == RELPERSISTENCE_PERMANENT || fork == INIT_FORKNUM)
2896
2898 set_bits, 0,
2899 0);
2900
2902
2903 /* XXX: could combine the locked operations in it with the above */
2904 StartBufferIO(victim_buf_hdr, true, false);
2905 }
2906 }
2907
2909
2910 /*
2911 * Note: if smgrzeroextend fails, we will end up with buffers that are
2912 * allocated but not marked BM_VALID. The next relation extension will
2913 * still select the same block number (because the relation didn't get any
2914 * longer on disk) and so future attempts to extend the relation will find
2915 * the same buffers (if they have not been recycled) but come right back
2916 * here to try smgrzeroextend again.
2917 *
2918 * We don't need to set checksum for all-zero pages.
2919 */
2921
2922 /*
2923 * Release the file-extension lock; it's now OK for someone else to extend
2924 * the relation some more.
2925 *
2926 * We remove IO_IN_PROGRESS after this, as waking up waiting backends can
2927 * take noticeable time.
2928 */
2929 if (!(flags & EB_SKIP_EXTENSION_LOCK))
2931
2933 io_start, 1, extend_by * BLCKSZ);
2934
2935 /* Set BM_VALID, terminate IO, and wake up any waiters */
2936 for (uint32 i = 0; i < extend_by; i++)
2937 {
2938 Buffer buf = buffers[i];
2940 bool lock = false;
2941
2942 if (flags & EB_LOCK_FIRST && i == 0)
2943 lock = true;
2944 else if (flags & EB_LOCK_TARGET)
2945 {
2947 if (first_block + i + 1 == extend_upto)
2948 lock = true;
2949 }
2950
2951 if (lock)
2953
2954 TerminateBufferIO(buf_hdr, false, BM_VALID, true, false);
2955 }
2956
2958
2960
2961 return first_block;
2962}
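
/*
 * --- Illustrative sketch, not part of bufmgr.c ---
 *
 * A caller-side view of the extension path implemented above, assuming the
 * public ExtendBufferedRelBy() signature (relation handle via BMR_REL(),
 * fork, strategy, flags, number of blocks to add, output buffer array and
 * output count). "extend_relation_example", "rel" and "nblocks" are invented
 * names; error handling is omitted.
 */
static void
extend_relation_example(Relation rel, uint32 nblocks)
{
	Buffer		buffers[64];
	uint32		extended_by = 0;

	Assert(nblocks > 0 && nblocks <= lengthof(buffers));

	/* EB_LOCK_FIRST returns the first new page exclusive-locked. */
	(void) ExtendBufferedRelBy(BMR_REL(rel), MAIN_FORKNUM, NULL,
							   EB_LOCK_FIRST, nblocks,
							   buffers, &extended_by);

	/* extended_by reports how many buffers were actually added. */
	for (uint32 i = 0; i < extended_by; i++)
	{
		if (i == 0)
			UnlockReleaseBuffer(buffers[i]);
		else
			ReleaseBuffer(buffers[i]);
	}
}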
2963
2964/*
2965 * BufferIsLockedByMe
2966 *
2967 * Checks if this backend has the buffer locked in any mode.
2968 *
2969 * Buffer must be pinned.
2970 */
2971bool
2973{
2975
2977
2978 if (BufferIsLocal(buffer))
2979 {
2980 /* Content locks are not maintained for local buffers. */
2981 return true;
2982 }
2983 else
2984 {
2986 return BufferLockHeldByMe(bufHdr);
2987 }
2988}
2989
2990/*
2991 * BufferIsLockedByMeInMode
2992 *
2993 * Checks if this backend has the buffer locked in the specified mode.
2994 *
2995 * Buffer must be pinned.
2996 */
2997bool
2999{
3001
3003
3004 if (BufferIsLocal(buffer))
3005 {
3006 /* Content locks are not maintained for local buffers. */
3007 return true;
3008 }
3009 else
3010 {
3013 }
3014}
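
/*
 * --- Illustrative sketch, not part of bufmgr.c ---
 *
 * The typical way callers use the two checks above: as assertions that
 * document and verify their locking assumptions. "modify_page_example" is an
 * invented name.
 */
static void
modify_page_example(Buffer buffer)
{
	/* Modifications require the content lock in exclusive mode. */
	Assert(BufferIsLockedByMeInMode(buffer, BUFFER_LOCK_EXCLUSIVE));

	/* A weaker check, when holding the lock in any mode is sufficient. */
	Assert(BufferIsLockedByMe(buffer));

	/* ... modify the page here ... */
}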
3015
3016/*
3017 * BufferIsDirty
3018 *
3019 * Checks if buffer is already dirty.
3020 *
3021 * Buffer must be pinned and exclusive-locked. (Without an exclusive lock,
3022 * the result may be stale before it's returned.)
3023 */
3024bool
3026{
3028
3030
3031 if (BufferIsLocal(buffer))
3032 {
3033 int bufid = -buffer - 1;
3034
3036 /* Content locks are not maintained for local buffers. */
3037 }
3038 else
3039 {
3042 }
3043
3044 return pg_atomic_read_u64(&bufHdr->state) & BM_DIRTY;
3045}
3046
3047/*
3048 * MarkBufferDirty
3049 *
3050 * Marks buffer contents as dirty (actual write happens later).
3051 *
3052 * Buffer must be pinned and exclusive-locked. (If caller does not hold
3053 * exclusive lock, then somebody could be in process of writing the buffer,
3054 * leading to risk of bad data written to disk.)
3055 */
3056void
3058{
3062
3063 if (!BufferIsValid(buffer))
3064 elog(ERROR, "bad buffer ID: %d", buffer);
3065
3066 if (BufferIsLocal(buffer))
3067 {
3069 return;
3070 }
3071
3073
3076
3077 /*
3078 * NB: We have to wait for the buffer header spinlock to be not held, as
3079 * TerminateBufferIO() relies on the spinlock.
3080 */
3082 for (;;)
3083 {
3086
3088
3091
3093 buf_state))
3094 break;
3095 }
3096
3097 /*
3098 * If the buffer was not dirty already, do vacuum accounting.
3099 */
3100 if (!(old_buf_state & BM_DIRTY))
3101 {
3103 if (VacuumCostActive)
3105 }
3106}
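
/*
 * --- Illustrative sketch, not part of bufmgr.c ---
 *
 * The canonical calling pattern around MarkBufferDirty(): pin, take the
 * content lock exclusively, modify the page, mark it dirty, then release.
 * Real callers on logged relations would also WAL-log the change (inside a
 * critical section) before unlocking. "touch_page_example", "rel" and
 * "blkno" are invented names.
 */
static void
touch_page_example(Relation rel, BlockNumber blkno)
{
	Buffer		buffer = ReadBuffer(rel, blkno);

	LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);

	/* ... modify the page obtained via BufferGetPage(buffer) ... */

	MarkBufferDirty(buffer);

	/* XLogInsert() would go here for a logged relation */

	UnlockReleaseBuffer(buffer);
}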
3107
3108/*
3109 * ReleaseAndReadBuffer -- combine ReleaseBuffer() and ReadBuffer()
3110 *
3111 * Formerly, this saved one cycle of acquiring/releasing the BufMgrLock
3112 * compared to calling the two routines separately. Now it's mainly just
3113 * a convenience function. However, if the passed buffer is valid and
3114 * already contains the desired block, we just return it as-is; and that
3115 * does save considerable work compared to a full release and reacquire.
3116 *
3117 * Note: it is OK to pass buffer == InvalidBuffer, indicating that no old
3118 * buffer actually needs to be released. This case is the same as ReadBuffer,
3119 * but can save some tests in the caller.
3120 */
3121Buffer
3123 Relation relation,
3124 BlockNumber blockNum)
3125{
3126 ForkNumber forkNum = MAIN_FORKNUM;
3128
3129 if (BufferIsValid(buffer))
3130 {
3132 if (BufferIsLocal(buffer))
3133 {
3135 if (bufHdr->tag.blockNum == blockNum &&
3136 BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
3137 BufTagGetForkNum(&bufHdr->tag) == forkNum)
3138 return buffer;
3140 }
3141 else
3142 {
3144 /* we have pin, so it's ok to examine tag without spinlock */
3145 if (bufHdr->tag.blockNum == blockNum &&
3146 BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
3147 BufTagGetForkNum(&bufHdr->tag) == forkNum)
3148 return buffer;
3150 }
3151 }
3152
3153 return ReadBuffer(relation, blockNum);
3154}
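
/*
 * --- Illustrative sketch, not part of bufmgr.c ---
 *
 * ReleaseAndReadBuffer() in a simple forward scan: InvalidBuffer seeds the
 * first iteration, and each call releases the previous pin (if any) before
 * pinning the next block. If the same block were requested again, the
 * existing pin would simply be kept. "scan_example", "rel" and "nblocks"
 * are invented names.
 */
static void
scan_example(Relation rel, BlockNumber nblocks)
{
	Buffer		buf = InvalidBuffer;

	for (BlockNumber blkno = 0; blkno < nblocks; blkno++)
	{
		buf = ReleaseAndReadBuffer(buf, rel, blkno);

		/* ... examine the page under a content lock here ... */
	}

	if (BufferIsValid(buf))
		ReleaseBuffer(buf);
}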
3155
3156/*
3157 * PinBuffer -- make buffer unavailable for replacement.
3158 *
3159 * For the default access strategy, the buffer's usage_count is incremented
3160 * when we first pin it; for other strategies we just make sure the usage_count
3161 * isn't zero. (The idea of the latter is that we don't want synchronized
3162 * heap scans to inflate the count, but we need it to not be zero to discourage
3163 * other backends from stealing buffers from our ring. As long as we cycle
3164 * through the ring faster than the global clock-sweep cycles, buffers in
3165 * our ring won't be chosen as victims for replacement by other backends.)
3166 *
3167 * This should be applied only to shared buffers, never local ones.
3168 *
3169 * Since buffers are pinned/unpinned very frequently, pin buffers without
3170 * taking the buffer header lock; instead update the state variable in a loop
3171 * of CAS operations. Hopefully it's just a single CAS.
3172 *
3173 * Note that ResourceOwnerEnlarge() and ReservePrivateRefCountEntry()
3174 * must have been done already.
3175 *
3176 * Returns true if buffer is BM_VALID, else false. This provision allows
3177 * some callers to avoid an extra spinlock cycle. If skip_if_not_valid is
3178 * true, then a false return value also indicates that the buffer was
3179 * (recently) invalid and has not been pinned.
3180 */
3181static bool
3183 bool skip_if_not_valid)
3184{
3186 bool result;
3188
3191
3192 ref = GetPrivateRefCountEntry(b, true);
3193
3194 if (ref == NULL)
3195 {
3198
3200 for (;;)
3201 {
3203 return false;
3204
3205 /*
3206 * We're not allowed to increase the refcount while the buffer
3207 * header spinlock is held. Wait for the lock to be released.
3208 */
3211
3213
3214 /* increase refcount */
3216
3217 if (strategy == NULL)
3218 {
3219 /* Default case: increase usagecount unless already max. */
3222 }
3223 else
3224 {
3225 /*
3226 * Ring buffers shouldn't evict others from the pool. Thus we
3227 * don't make usagecount more than 1.
3228 */
3231 }
3232
3234 buf_state))
3235 {
3236 result = (buf_state & BM_VALID) != 0;
3237
3239 break;
3240 }
3241 }
3242 }
3243 else
3244 {
3245 /*
3246 * If we previously pinned the buffer, it is likely to be valid, but
3247 * it may not be if StartReadBuffers() was called and
3248 * WaitReadBuffers() hasn't been called yet. We'll check by loading
3249 * the flags without locking. This is racy, but it's OK to return
3250 * false spuriously: when WaitReadBuffers() calls StartBufferIO(),
3251 * it'll see that it's now valid.
3252 *
3253 * Note: We deliberately avoid a Valgrind client request here.
3254 * Individual access methods can optionally superimpose buffer page
3255 * client requests on top of our client requests to enforce that
3256 * buffers are only accessed while locked (and pinned). It's possible
3257 * that the buffer page is legitimately non-accessible here. We
3258 * cannot meddle with that.
3259 */
3260 result = (pg_atomic_read_u64(&buf->state) & BM_VALID) != 0;
3261
3262 Assert(ref->data.refcount > 0);
3263 ref->data.refcount++;
3265 }
3266
3267 return result;
3268}
3269
3270/*
3271 * PinBuffer_Locked -- as above, but caller already locked the buffer header.
3272 * The spinlock is released before return.
3273 *
3274 * As this function is called with the spinlock held, the caller has to
3275 * previously call ReservePrivateRefCountEntry() and
3276 * ResourceOwnerEnlarge(CurrentResourceOwner);
3277 *
3278 * Currently, no callers of this function want to modify the buffer's
3279 * usage_count at all, so there's no need for a strategy parameter.
3280 * Also we don't bother with a BM_VALID test (the caller could check that for
3281 * itself).
3282 *
3283 * Also all callers only ever use this function when it's known that the
3284 * buffer can't have a preexisting pin by this backend. That allows us to skip
3285 * searching the private refcount array & hash, which is a boon, because the
3286 * spinlock is still held.
3287 *
3288 * Note: use of this routine is frequently mandatory, not just an optimization
3289 * to save a spin lock/unlock cycle, because we need to pin a buffer before
3290 * its state can change under us.
3291 */
3292static void
3294{
3296
3297 /*
3298 * As explained, we don't expect any preexisting pins. That allows us to
3299 * manipulate the PrivateRefCount after releasing the spinlock.
3300 */
3302
3303 /*
3304 * Since we hold the buffer spinlock, we can update the buffer state and
3305 * release the lock in one operation.
3306 */
3308
3310 0, 0, 1);
3311
3313}
3314
3315/*
3316 * Support for waking up another backend that is waiting for the cleanup lock
3317 * to be released using BM_PIN_COUNT_WAITER.
3318 *
3319 * See LockBufferForCleanup().
3320 *
3321 * Expected to be called just after releasing a buffer pin (in a BufferDesc,
3322 * not just reducing the backend-local pincount for the buffer).
3323 */
3324static void
3326{
3327 /*
3328 * Acquire the buffer header lock, re-check that there's a waiter. Another
3329 * backend could have unpinned this buffer, and already woken up the
3330 * waiter.
3331 *
3332 * There's no danger of the buffer being replaced after we unpinned it
3333 * above, as it's pinned by the waiter. The waiter removes
3334 * BM_PIN_COUNT_WAITER if it stops waiting for a reason other than this
3335 * backend waking it up.
3336 */
3338
3341 {
3342 /* we just released the last pin other than the waiter's */
3343 int wait_backend_pgprocno = buf->wait_backend_pgprocno;
3344
3347 0);
3348 ProcSendSignal(wait_backend_pgprocno);
3349 }
3350 else
3352}
3353
3354/*
3355 * UnpinBuffer -- make buffer available for replacement.
3356 *
3357 * This should be applied only to shared buffers, never local ones. This
3358 * always adjusts CurrentResourceOwner.
3359 */
3360static void
3368
3369static void
3371{
3374
3376
3377 /* not moving as we're likely deleting it soon anyway */
3378 ref = GetPrivateRefCountEntry(b, false);
3379 Assert(ref != NULL);
3380 Assert(ref->data.refcount > 0);
3381 ref->data.refcount--;
3382 if (ref->data.refcount == 0)
3383 {
3385
3386 /*
3387 * Mark buffer non-accessible to Valgrind.
3388 *
3389 * Note that the buffer may have already been marked non-accessible
3390 * within access method code that enforces that buffers are only
3391 * accessed while a buffer lock is held.
3392 */
3394
3395 /*
3396 * I'd better not still hold the buffer content lock. Can't use
3397 * BufferIsLockedByMe(), as that asserts the buffer is pinned.
3398 */
3400
3401 /* decrement the shared reference count */
3403
3404 /* Support LockBufferForCleanup() */
3407
3409 }
3410}
3411
3412/*
3413 * Set up backend-local tracking of a buffer pinned the first time by this
3414 * backend.
3415 */
3416inline void
3418{
3420
3422 ref->data.refcount++;
3423
3425
3426 /*
3427 * This is the first pin of this page by this backend; mark its page as
3428 * defined to valgrind. While the page contents might not actually be
3429 * valid yet, we don't currently guarantee that such pages are marked
3430 * undefined or non-accessible.
3431 *
3432 * It's not necessarily the prettiest to do this here, but otherwise we'd
3433 * need this block of code in multiple places.
3434 */
3436 BLCKSZ);
3437}
3438
3439#define ST_SORT sort_checkpoint_bufferids
3440#define ST_ELEMENT_TYPE CkptSortItem
3441#define ST_COMPARE(a, b) ckpt_buforder_comparator(a, b)
3442#define ST_SCOPE static
3443#define ST_DEFINE
3444#include "lib/sort_template.h"
3445
3446/*
3447 * BufferSync -- Write out all dirty buffers in the pool.
3448 *
3449 * This is called at checkpoint time to write out all dirty shared buffers.
3450 * The checkpoint request flags should be passed in. If CHECKPOINT_FAST is
3451 * set, we disable delays between writes; if CHECKPOINT_IS_SHUTDOWN,
3452 * CHECKPOINT_END_OF_RECOVERY or CHECKPOINT_FLUSH_UNLOGGED is set, we write
3453 * even unlogged buffers, which are otherwise skipped. The remaining flags
3454 * currently have no effect here.
3455 */
3456static void
3457BufferSync(int flags)
3458{
3460 int buf_id;
3461 int num_to_scan;
3462 int num_spaces;
3463 int num_processed;
3464 int num_written;
3466 Oid last_tsid;
3468 int i;
3469 uint64 mask = BM_DIRTY;
3471
3472 /*
3473 * Unless this is a shutdown checkpoint or we have been explicitly told,
3474 * we write only permanent, dirty buffers. But at shutdown or end of
3475 * recovery, we write all dirty buffers.
3476 */
3479 mask |= BM_PERMANENT;
3480
3481 /*
3482 * Loop over all buffers, and mark the ones that need to be written with
3483 * BM_CHECKPOINT_NEEDED. Count them as we go (num_to_scan), so that we
3484 * can estimate how much work needs to be done.
3485 *
3486 * This allows us to write only those pages that were dirty when the
3487 * checkpoint began, and not those that get dirtied while it proceeds.
3488 * Whenever a page with BM_CHECKPOINT_NEEDED is written out, either by us
3489 * later in this function, or by normal backends or the bgwriter cleaning
3490 * scan, the flag is cleared. Any buffer dirtied after this point won't
3491 * have the flag set.
3492 *
3493 * Note that if we fail to write some buffer, we may leave buffers with
3494 * BM_CHECKPOINT_NEEDED still set. This is OK since any such buffer would
3495 * certainly need to be written for the next checkpoint attempt, too.
3496 */
3497 num_to_scan = 0;
3498 for (buf_id = 0; buf_id < NBuffers; buf_id++)
3499 {
3501 uint64 set_bits = 0;
3502
3503 /*
3504 * Header spinlock is enough to examine BM_DIRTY, see comment in
3505 * SyncOneBuffer.
3506 */
3508
3509 if ((buf_state & mask) == mask)
3510 {
3511 CkptSortItem *item;
3512
3514
3515 item = &CkptBufferIds[num_to_scan++];
3516 item->buf_id = buf_id;
3517 item->tsId = bufHdr->tag.spcOid;
3518 item->relNumber = BufTagGetRelNumber(&bufHdr->tag);
3519 item->forkNum = BufTagGetForkNum(&bufHdr->tag);
3520 item->blockNum = bufHdr->tag.blockNum;
3521 }
3522
3524 set_bits, 0,
3525 0);
3526
3527 /* Check for barrier events in case NBuffers is large. */
3530 }
3531
3532 if (num_to_scan == 0)
3533 return; /* nothing to do */
3534
3536
3538
3539 /*
3540 * Sort buffers that need to be written to reduce the likelihood of random
3541 * IO. The sorting is also important for the implementation of balancing
3542 * writes between tablespaces. Without balancing writes we'd potentially
3543 * end up writing to the tablespaces one-by-one; possibly overloading the
3544 * underlying system.
3545 */
3547
3548 num_spaces = 0;
3549
3550 /*
3551 * Allocate progress status for each tablespace with buffers that need to
3552 * be flushed. This requires the to-be-flushed array to be sorted.
3553 */
3555 for (i = 0; i < num_to_scan; i++)
3556 {
3557 CkptTsStatus *s;
3558 Oid cur_tsid;
3559
3561
3562 /*
3563 * Grow array of per-tablespace status structs, every time a new
3564 * tablespace is found.
3565 */
3567 {
3568 Size sz;
3569
3570 num_spaces++;
3571
3572 /*
3573 * Not worth adding grow-by-power-of-2 logic here - even with a
3574 * few hundred tablespaces this should be fine.
3575 */
3576 sz = sizeof(CkptTsStatus) * num_spaces;
3577
3578 if (per_ts_stat == NULL)
3580 else
3582
3583 s = &per_ts_stat[num_spaces - 1];
3584 memset(s, 0, sizeof(*s));
3585 s->tsId = cur_tsid;
3586
3587 /*
3588 * The first buffer in this tablespace. As CkptBufferIds is sorted
3589 * by tablespace all (s->num_to_scan) buffers in this tablespace
3590 * will follow afterwards.
3591 */
3592 s->index = i;
3593
3594 /*
3595 * progress_slice will be determined once we know how many buffers
3596 * are in each tablespace, i.e. after this loop.
3597 */
3598
3600 }
3601 else
3602 {
3603 s = &per_ts_stat[num_spaces - 1];
3604 }
3605
3606 s->num_to_scan++;
3607
3608 /* Check for barrier events. */
3611 }
3612
3613 Assert(num_spaces > 0);
3614
3615 /*
3616 * Build a min-heap over the write-progress in the individual tablespaces,
3617 * and compute how large a portion of the total progress a single
3618 * processed buffer is.
3619 */
3622 NULL);
3623
3624 for (i = 0; i < num_spaces; i++)
3625 {
3627
3628 ts_stat->progress_slice = (float8) num_to_scan / ts_stat->num_to_scan;
3629
3631 }
3632
3634
3635 /*
3636 * Iterate through to-be-checkpointed buffers and write the ones (still)
3637 * marked with BM_CHECKPOINT_NEEDED. The writes are balanced between
3638 * tablespaces; otherwise the sorting would lead to only one tablespace
3639 * receiving writes at a time, making inefficient use of the hardware.
3640 */
3641 num_processed = 0;
3642 num_written = 0;
3643 while (!binaryheap_empty(ts_heap))
3644 {
3648
3649 buf_id = CkptBufferIds[ts_stat->index].buf_id;
3650 Assert(buf_id != -1);
3651
3652 bufHdr = GetBufferDescriptor(buf_id);
3653
3654 num_processed++;
3655
3656 /*
3657 * We don't need to acquire the lock here, because we're only looking
3658 * at a single bit. It's possible that someone else writes the buffer
3659 * and clears the flag right after we check, but that doesn't matter
3660 * since SyncOneBuffer will then do nothing. However, there is a
3661 * further race condition: it's conceivable that between the time we
3662 * examine the bit here and the time SyncOneBuffer acquires the lock,
3663 * someone else not only wrote the buffer but replaced it with another
3664 * page and dirtied it. In that improbable case, SyncOneBuffer will
3665 * write the buffer though we didn't need to. It doesn't seem worth
3666 * guarding against this, though.
3667 */
3669 {
3670 if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
3671 {
3674 num_written++;
3675 }
3676 }
3677
3678 /*
3679 * Measure progress independently of actually having to flush the buffer
3680 * - otherwise writes become unbalanced.
3681 */
3682 ts_stat->progress += ts_stat->progress_slice;
3683 ts_stat->num_scanned++;
3684 ts_stat->index++;
3685
3686 /* Have all the buffers from the tablespace been processed? */
3687 if (ts_stat->num_scanned == ts_stat->num_to_scan)
3688 {
3690 }
3691 else
3692 {
3693 /* update heap with the new progress */
3695 }
3696
3697 /*
3698 * Sleep to throttle our I/O rate.
3699 *
3700 * (This will check for barrier events even if it doesn't sleep.)
3701 */
3702 CheckpointWriteDelay(flags, (double) num_processed / num_to_scan);
3703 }
3704
3705 /*
3706 * Issue all pending flushes. Only checkpointer calls BufferSync(), so
3707 * IOContext will always be IOCONTEXT_NORMAL.
3708 */
3710
3712 per_ts_stat = NULL;
3714
3715 /*
3716 * Update checkpoint statistics. As noted above, this doesn't include
3717 * buffers written by other backends or bgwriter scan.
3718 */
3720
3722}
3723
3724/*
3725 * BgBufferSync -- Write out some dirty buffers in the pool.
3726 *
3727 * This is called periodically by the background writer process.
3728 *
3729 * Returns true if it's appropriate for the bgwriter process to go into
3730 * low-power hibernation mode. (This happens if the strategy clock-sweep
3731 * has been "lapped" and no buffer allocations have occurred recently,
3732 * or if the bgwriter has been effectively disabled by setting
3733 * bgwriter_lru_maxpages to 0.)
3734 */
3735bool
3737{
3738 /* info obtained from freelist.c */
3739 int strategy_buf_id;
3742
3743 /*
3744 * Information saved between calls so we can determine the strategy
3745 * point's advance rate and avoid scanning already-cleaned buffers.
3746 */
3747 static bool saved_info_valid = false;
3748 static int prev_strategy_buf_id;
3750 static int next_to_clean;
3751 static uint32 next_passes;
3752
3753 /* Moving averages of allocation rate and clean-buffer density */
3754 static float smoothed_alloc = 0;
3755 static float smoothed_density = 10.0;
3756
3757 /* Potentially these could be tunables, but for now, not */
3758 float smoothing_samples = 16;
3759 float scan_whole_pool_milliseconds = 120000.0;
3760
3761 /* Used to compute how far we scan ahead */
3762 long strategy_delta;
3763 int bufs_to_lap;
3764 int bufs_ahead;
3765 float scans_per_alloc;
3768 int min_scan_buffers;
3769
3770 /* Variables for the scanning loop proper */
3771 int num_to_scan;
3772 int num_written;
3773 int reusable_buffers;
3774
3775 /* Variables for final smoothed_density update */
3776 long new_strategy_delta;
3778
3779 /*
3780 * Find out where the clock-sweep currently is, and how many buffer
3781 * allocations have happened since our last call.
3782 */
3784
3785 /* Report buffer alloc counts to pgstat */
3787
3788 /*
3789 * If we're not running the LRU scan, just stop after doing the stats
3790 * stuff. We mark the saved state invalid so that we can recover sanely
3791 * if LRU scan is turned back on later.
3792 */
3793 if (bgwriter_lru_maxpages <= 0)
3794 {
3795 saved_info_valid = false;
3796 return true;
3797 }
3798
3799 /*
3800 * Compute strategy_delta = how many buffers have been scanned by the
3801 * clock-sweep since last time. If first time through, assume none. Then
3802 * see if we are still ahead of the clock-sweep, and if so, how many
3803 * buffers we could scan before we'd catch up with it and "lap" it. Note:
3804 * the weird-looking coding of the xxx_passes comparisons is to avoid bogus
3805 * behavior when the passes counts wrap around.
3806 */
3807 if (saved_info_valid)
3808 {
3810
3813
3814 Assert(strategy_delta >= 0);
3815
3816 if ((int32) (next_passes - strategy_passes) > 0)
3817 {
3818 /* we're one pass ahead of the strategy point */
3820#ifdef BGW_DEBUG
3821 elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
3825#endif
3826 }
3827 else if (next_passes == strategy_passes &&
3829 {
3830 /* on same pass, but ahead or at least not behind */
3832#ifdef BGW_DEBUG
3833 elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
3837#endif
3838 }
3839 else
3840 {
3841 /*
3842 * We're behind, so skip forward to the strategy point and start
3843 * cleaning from there.
3844 */
3845#ifdef BGW_DEBUG
3846 elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
3850#endif
3854 }
3855 }
3856 else
3857 {
3858 /*
3859 * Initializing at startup or after LRU scanning had been off. Always
3860 * start at the strategy point.
3861 */
3862#ifdef BGW_DEBUG
3863 elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
3865#endif
3866 strategy_delta = 0;
3870 }
3871
3872 /* Update saved info for next time */
3875 saved_info_valid = true;
3876
3877 /*
3878 * Compute how many buffers had to be scanned for each new allocation, ie,
3879 * 1/density of reusable buffers, and track a moving average of that.
3880 *
3881 * If the strategy point didn't move, we don't update the density estimate.
3882 */
3883 if (strategy_delta > 0 && recent_alloc > 0)
3884 {
3888 }
3889
3890 /*
3891 * Estimate how many reusable buffers there are between the current
3892 * strategy point and where we've scanned ahead to, based on the smoothed
3893 * density estimate.
3894 */
3897
3898 /*
3899 * Track a moving average of recent buffer allocations. Here, rather than
3900 * a true average we want a fast-attack, slow-decline behavior: we
3901 * immediately follow any increase.
3902 */
3903 if (smoothed_alloc <= (float) recent_alloc)
3905 else
3908
3909 /* Scale the estimate by a GUC to allow more aggressive tuning. */
3911
3912 /*
3913 * If recent_alloc remains at zero for many cycles, smoothed_alloc will
3914 * eventually underflow to zero, and the underflows produce annoying
3915 * kernel warnings on some platforms. Once upcoming_alloc_est has gone to
3916 * zero, there's no point in tracking smaller and smaller values of
3917 * smoothed_alloc, so just reset it to exactly zero to avoid this
3918 * syndrome. It will pop back up as soon as recent_alloc increases.
3919 */
3920 if (upcoming_alloc_est == 0)
3921 smoothed_alloc = 0;
3922
3923 /*
3924 * Even in cases where there's been little or no buffer allocation
3925 * activity, we want to make a small amount of progress through the buffer
3926 * cache so that as many reusable buffers as possible are clean after an
3927 * idle period.
3928 *
3929 * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many times
3930 * the BGW will be called during the scan_whole_pool time; slice the
3931 * buffer pool into that many sections.
3932 */
3934
3936 {
3937#ifdef BGW_DEBUG
3938 elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
3940#endif
3942 }
3943
3944 /*
3945 * Now write out dirty reusable buffers, working forward from the
3946 * next_to_clean point, until we have lapped the strategy scan, or cleaned
3947 * enough buffers to match our estimate of the next cycle's allocation
3948 * requirements, or hit the bgwriter_lru_maxpages limit.
3949 */
3950
3951 num_to_scan = bufs_to_lap;
3952 num_written = 0;
3954
3955 /* Execute the LRU scan */
3956 while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
3957 {
3959 wb_context);
3960
3961 if (++next_to_clean >= NBuffers)
3962 {
3963 next_to_clean = 0;
3964 next_passes++;
3965 }
3966 num_to_scan--;
3967
3968 if (sync_state & BUF_WRITTEN)
3969 {
3972 {
3974 break;
3975 }
3976 }
3977 else if (sync_state & BUF_REUSABLE)
3979 }
3980
3982
3983#ifdef BGW_DEBUG
3984 elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
3987 bufs_to_lap - num_to_scan,
3990#endif
3991
3992 /*
3993 * Consider the above scan as being like a new allocation scan.
3994 * Characterize its density and update the smoothed one based on it. This
3995 * effectively halves the moving average period in cases where both the
3996 * strategy and the background writer are doing some useful scanning,
3997 * which is helpful because a long memory isn't as desirable on the
3998 * density estimates.
3999 */
4000 new_strategy_delta = bufs_to_lap - num_to_scan;
4002 if (new_strategy_delta > 0 && new_recent_alloc > 0)
4003 {
4007
4008#ifdef BGW_DEBUG
4009 elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f",
4012#endif
4013 }
4014
4015 /* Return true if OK to hibernate */
4016 return (bufs_to_lap == 0 && recent_alloc == 0);
4017}
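
/*
 * --- Illustrative sketch, not part of bufmgr.c ---
 *
 * The shape of the two moving averages BgBufferSync() maintains, as
 * described in its comments: the density estimate is a simple exponential
 * moving average of "buffers scanned per allocation", while the allocation
 * estimate follows increases immediately and decays slowly. The exact
 * expressions are elided in this listing, so treat this as an approximation;
 * "update_averages_example" is an invented name.
 */
static void
update_averages_example(float *smoothed_density, float *smoothed_alloc,
						long strategy_delta, uint32 recent_alloc,
						float smoothing_samples)
{
	if (strategy_delta > 0 && recent_alloc > 0)
	{
		float		scans_per_alloc = (float) strategy_delta / recent_alloc;

		*smoothed_density += (scans_per_alloc - *smoothed_density) /
			smoothing_samples;
	}

	if (*smoothed_alloc <= (float) recent_alloc)
		*smoothed_alloc = recent_alloc; /* fast attack */
	else
		*smoothed_alloc += ((float) recent_alloc - *smoothed_alloc) /
			smoothing_samples;	/* slow decline */
}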
4018
4019/*
4020 * SyncOneBuffer -- process a single buffer during syncing.
4021 *
4022 * If skip_recently_used is true, we don't write currently-pinned buffers, nor
4023 * buffers marked recently used, as these are not replacement candidates.
4024 *
4025 * Returns a bitmask containing the following flag bits:
4026 * BUF_WRITTEN: we wrote the buffer.
4027 * BUF_REUSABLE: buffer is available for replacement, ie, it has
4028 * pin count 0 and usage count 0.
4029 *
4030 * (BUF_WRITTEN could be set in error if FlushBuffer finds the buffer clean
4031 * after locking it, but we don't care all that much.)
4032 */
4033static int
4035{
4037 int result = 0;
4039 BufferTag tag;
4040
4041 /* Make sure we can handle the pin */
4044
4045 /*
4046 * Check whether buffer needs writing.
4047 *
4048 * We can make this check without taking the buffer content lock so long
4049 * as we mark pages dirty in access methods *before* logging changes with
4050 * XLogInsert(): if someone marks the buffer dirty just after our check we
4051 * don't worry, because our checkpoint.redo points before the log record for
4052 * upcoming changes and so we are not required to write such a dirty buffer.
4053 */
4055
4058 {
4059 result |= BUF_REUSABLE;
4060 }
4061 else if (skip_recently_used)
4062 {
4063 /* Caller told us not to write recently-used buffers */
4065 return result;
4066 }
4067
4068 if (!(buf_state & BM_VALID) || !(buf_state & BM_DIRTY))
4069 {
4070 /* It's clean, so nothing to do */
4072 return result;
4073 }
4074
4075 /*
4076 * Pin it, share-lock it, write it. (FlushBuffer will do nothing if the
4077 * buffer is clean by the time we've locked it.)
4078 */
4080
4082
4083 tag = bufHdr->tag;
4084
4086
4087 /*
4088 * SyncOneBuffer() is only called by checkpointer and bgwriter, so
4089 * IOContext will always be IOCONTEXT_NORMAL.
4090 */
4092
4093 return result | BUF_WRITTEN;
4094}
4095
4096/*
4097 * AtEOXact_Buffers - clean up at end of transaction.
4098 *
4099 * As of PostgreSQL 8.0, buffer pins should get released by the
4100 * ResourceOwner mechanism. This routine is just a debugging
4101 * cross-check that no pins remain.
4102 */
4103void
4112
4113/*
4114 * Initialize access to shared buffer pool
4115 *
4116 * This is called during backend startup (whether standalone or under the
4117 * postmaster). It sets up for this backend's access to the already-existing
4118 * buffer pool.
4119 */
4120void
4122{
4124
4125 /*
4126 * An advisory limit on the number of pins each backend should hold, based
4127 * on shared_buffers and the maximum number of connections possible.
4128 * That's very pessimistic, but outside toy-sized shared_buffers it should
4129 * allow plenty of pins. LimitAdditionalPins() and
4130 * GetAdditionalPinLimit() can be used to check the remaining balance.
4131 */
4133
4136
4137 hash_ctl.keysize = sizeof(Buffer);
4138 hash_ctl.entrysize = sizeof(PrivateRefCountEntry);
4139
4140 PrivateRefCountHash = hash_create("PrivateRefCount", 100, &hash_ctl,
4142
4143 /*
4144 * AtProcExit_Buffers needs LWLock access, and thereby has to be called at
4145 * the corresponding phase of backend shutdown.
4146 */
4147 Assert(MyProc != NULL);
4149}
4150
4151/*
4152 * During backend exit, ensure that we released all shared-buffer locks and
4153 * assert that we have no remaining pins.
4154 */
4155static void
4157{
4158 UnlockBuffers();
4159
4161
4162 /* localbuf.c needs a chance too */
4164}
4165
4166/*
4167 * CheckForBufferLeaks - ensure this backend holds no buffer pins
4168 *
4169 * As of PostgreSQL 8.0, buffer pins should get released by the
4170 * ResourceOwner mechanism. This routine is just a debugging
4171 * cross-check that no pins remain.
4172 */
4173static void
4175{
4176#ifdef USE_ASSERT_CHECKING
4177 int RefCountErrors = 0;
4179 int i;
4180 char *s;
4181
4182 /* check the array */
4183 for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
4184 {
4186 {
4187 res = &PrivateRefCountArray[i];
4188
4190 elog(WARNING, "buffer refcount leak: %s", s);
4191 pfree(s);
4192
4194 }
4195 }
4196
4197 /* if necessary search the hash */
4199 {
4201
4203 while ((res = (PrivateRefCountEntry *) hash_seq_search(&hstat)) != NULL)
4204 {
4206 elog(WARNING, "buffer refcount leak: %s", s);
4207 pfree(s);
4209 }
4210 }
4211
4212 Assert(RefCountErrors == 0);
4213#endif
4214}
4215
4216#ifdef USE_ASSERT_CHECKING
4217/*
4218 * Check for exclusive-locked catalog buffers. This is the core of
4219 * AssertCouldGetRelation().
4220 *
4221 * A backend would self-deadlock on the content lock if the catalog scan read
4222 * the exclusive-locked buffer. The main threat is exclusive-locked buffers
4223 * of catalogs used in relcache, because a catcache search on any catalog may
4224 * build that catalog's relcache entry. We don't have an inventory of
4225 * catalogs relcache uses, so just check buffers of most catalogs.
4226 *
4227 * It's better to minimize waits while holding an exclusive buffer lock, so it
4228 * would be nice to broaden this check not to be catalog-specific. However,
4229 * bttextcmp() accesses pg_collation, and non-core opclasses might similarly
4230 * read tables. That is deadlock-free as long as there's no loop in the
4231 * dependency graph: modifying table A may cause an opclass to read table B,
4232 * but it must not cause a read of table A.
4233 */
4234void
4236{
4238
4239 /* check the array */
4240 for (int i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
4241 {
4243 {
4244 res = &PrivateRefCountArray[i];
4245
4246 if (res->buffer == InvalidBuffer)
4247 continue;
4248
4250 }
4251 }
4252
4253 /* if necessary search the hash */
4255 {
4257
4259 while ((res = (PrivateRefCountEntry *) hash_seq_search(&hstat)) != NULL)
4260 {
4262 }
4263 }
4264}
4265
4266static void
4268{
4270 BufferTag tag;
4271 Oid relid;
4272
4274 return;
4275
4276 tag = bufHdr->tag;
4277
4278 /*
4279 * This relNumber==relid assumption holds until a catalog experiences
4280 * VACUUM FULL or similar. After a command like that, relNumber will be
4281 * in the normal (non-catalog) range, and we lose the ability to detect
4282 * hazardous access to that catalog. Calling RelidByRelfilenumber() would
4283 * close that gap, but RelidByRelfilenumber() might then deadlock with a
4284 * held lock.
4285 */
4286 relid = tag.relNumber;
4287
4288 if (IsCatalogTextUniqueIndexOid(relid)) /* see comments at the callee */
4289 return;
4290
4292}
4293#endif
4294
4295
4296/*
4297 * Helper routine to issue warnings when a buffer is unexpectedly pinned
4298 */
4299char *
4301{
4302 BufferDesc *buf;
4304 char *result;
4305 ProcNumber backend;
4307
4309 if (BufferIsLocal(buffer))
4310 {
4313 backend = MyProcNumber;
4314 }
4315 else
4316 {
4319 backend = INVALID_PROC_NUMBER;
4320 }
4321
4322 /* theoretically we should lock the bufHdr here */
4323 buf_state = pg_atomic_read_u64(&buf->state);
4324
4325 result = psprintf("[%03d] (rel=%s, blockNum=%u, flags=0x%" PRIx64 ", refcount=%u %d)",
4326 buffer,
4328 BufTagGetForkNum(&buf->tag)).str,
4329 buf->tag.blockNum, buf_state & BUF_FLAG_MASK,
4331 return result;
4332}
4333
4334/*
4335 * CheckPointBuffers
4336 *
4337 * Flush all dirty blocks in buffer pool to disk at checkpoint time.
4338 *
4339 * Note: temporary relations do not participate in checkpoints, so they don't
4340 * need to be flushed.
4341 */
4342void
4344{
4345 BufferSync(flags);
4346}
4347
4348/*
4349 * BufferGetBlockNumber
4350 * Returns the block number associated with a buffer.
4351 *
4352 * Note:
4353 * Assumes that the buffer is valid and pinned, else the
4354 * value may be obsolete immediately...
4355 */
4358{
4360
4362
4363 if (BufferIsLocal(buffer))
4365 else
4367
4368 /* pinned, so OK to read tag without spinlock */
4369 return bufHdr->tag.blockNum;
4370}
4371
4372/*
4373 * BufferGetTag
4374 * Returns the relfilelocator, fork number and block number associated with
4375 * a buffer.
4376 */
4377void
4380{
4382
4383 /* Do the same checks as BufferGetBlockNumber. */
4385
4386 if (BufferIsLocal(buffer))
4388 else
4390
4391 /* pinned, so OK to read tag without spinlock */
4392 *rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
4393 *forknum = BufTagGetForkNum(&bufHdr->tag);
4394 *blknum = bufHdr->tag.blockNum;
4395}
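
/*
 * --- Illustrative sketch, not part of bufmgr.c ---
 *
 * Reading back a pinned buffer's identity with the two accessors above, for
 * example to build a debugging message. "report_buffer_example" is an
 * invented name.
 */
static void
report_buffer_example(Buffer buffer)
{
	RelFileLocator rlocator;
	ForkNumber	forknum;
	BlockNumber blknum;

	/* Both accessors require the buffer to be valid and pinned. */
	BufferGetTag(buffer, &rlocator, &forknum, &blknum);

	elog(DEBUG1, "buffer %d holds block %u (fork %d) of relation %u",
		 buffer, BufferGetBlockNumber(buffer), forknum, rlocator.relNumber);
}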
4396
4397/*
4398 * FlushBuffer
4399 * Physically write out a shared buffer.
4400 *
4401 * NOTE: this actually just passes the buffer contents to the kernel; the
4402 * real write to disk won't happen until the kernel feels like it. This
4403 * is okay from our point of view since we can redo the changes from WAL.
4404 * However, we will need to force the changes to disk via fsync before
4405 * we can checkpoint WAL.
4406 *
4407 * The caller must hold a pin on the buffer and have share-locked the
4408 * buffer contents. (Note: a share-lock does not prevent updates of
4409 * hint bits in the buffer, so the page could change while the write
4410 * is in progress, but we assume that that will not invalidate the data
4411 * written.)
4412 *
4413 * If the caller has an smgr reference for the buffer's relation, pass it
4414 * as the second parameter. If not, pass NULL.
4415 */
4416static void
4419{
4421 ErrorContextCallback errcallback;
4424 char *bufToWrite;
4426
4427 /*
4428 * Try to start an I/O operation. If StartBufferIO returns false, then
4429 * someone else flushed the buffer before we could, so we need not do
4430 * anything.
4431 */
4432 if (!StartBufferIO(buf, false, false))
4433 return;
4434
4435 /* Setup error traceback support for ereport() */
4437 errcallback.arg = buf;
4438 errcallback.previous = error_context_stack;
4439 error_context_stack = &errcallback;
4440
4441 /* Find smgr relation for buffer */
4442 if (reln == NULL)
4444
4446 buf->tag.blockNum,
4447 reln->smgr_rlocator.locator.spcOid,
4448 reln->smgr_rlocator.locator.dbOid,
4449 reln->smgr_rlocator.locator.relNumber);
4450
4452
4453 /*
4454 * Run PageGetLSN while holding header lock, since we don't have the
4455 * buffer locked exclusively in all cases.
4456 */
4458
4459 /* To check if block content changes while flushing. - vadim 01/17/97 */
4461 0, BM_JUST_DIRTIED,
4462 0);
4463
4464 /*
4465 * Force XLOG flush up to buffer's LSN. This implements the basic WAL
4466 * rule that log updates must hit disk before any of the data-file changes
4467 * they describe do.
4468 *
4469 * However, this rule does not apply to unlogged relations, which will be
4470 * lost after a crash anyway. Most unlogged relation pages do not bear
4471 * LSNs since we never emit WAL records for them, and therefore flushing
4472 * up through the buffer LSN would be useless, but harmless. However,
4473 * GiST indexes use LSNs internally to track page-splits, and therefore
4474 * unlogged GiST pages bear "fake" LSNs generated by
4475 * GetFakeLSNForUnloggedRel. It is unlikely but possible that the fake
4476 * LSN counter could advance past the WAL insertion point; and if it did
4477 * happen, attempting to flush WAL through that location would fail, with
4478 * disastrous system-wide consequences. To make sure that can't happen,
4479 * skip the flush if the buffer isn't permanent.
4480 */
4481 if (buf_state & BM_PERMANENT)
4483
4484 /*
4485 * Now it's safe to write the buffer to disk. Note that no one else should
 4486 * have been able to write it while we were busy with log flushing,
4487 * because we got the exclusive right to perform I/O by setting the
4488 * BM_IO_IN_PROGRESS bit.
4489 */
4491
4492 /*
4493 * Update page checksum if desired. Since we have only shared lock on the
4494 * buffer, other processes might be updating hint bits in it, so we must
4495 * copy the page to private storage if we do checksumming.
4496 */
4497 bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
4498
4500
4501 /*
4502 * bufToWrite is either the shared buffer or a copy, as appropriate.
4503 */
4505 BufTagGetForkNum(&buf->tag),
4506 buf->tag.blockNum,
4507 bufToWrite,
4508 false);
4509
4510 /*
4511 * When a strategy is in use, only flushes of dirty buffers already in the
4512 * strategy ring are counted as strategy writes (IOCONTEXT
4513 * [BULKREAD|BULKWRITE|VACUUM] IOOP_WRITE) for the purpose of IO
4514 * statistics tracking.
4515 *
4516 * If a shared buffer initially added to the ring must be flushed before
4517 * being used, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE.
4518 *
4519 * If a shared buffer which was added to the ring later because the
4520 * current strategy buffer is pinned or in use or because all strategy
4521 * buffers were dirty and rejected (for BAS_BULKREAD operations only)
4522 * requires flushing, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE
4523 * (from_ring will be false).
4524 *
4525 * When a strategy is not in use, the write can only be a "regular" write
4526 * of a dirty shared buffer (IOCONTEXT_NORMAL IOOP_WRITE).
4527 */
4530
4532
4533 /*
4534 * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
4535 * end the BM_IO_IN_PROGRESS state.
4536 */
4537 TerminateBufferIO(buf, true, 0, true, false);
4538
4540 buf->tag.blockNum,
4541 reln->smgr_rlocator.locator.spcOid,
4542 reln->smgr_rlocator.locator.dbOid,
4543 reln->smgr_rlocator.locator.relNumber);
4544
4545 /* Pop the error context stack */
4546 error_context_stack = errcallback.previous;
4547}
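/*
 * Illustrative sketch, not part of bufmgr.c: the WAL-before-data ordering
 * that FlushBuffer() enforces, reduced to its core.  Error handling, I/O
 * state management and checksumming are omitted; "flush_one_page_example"
 * is a hypothetical name.
 */
static void
flush_one_page_example(SMgrRelation reln, ForkNumber forknum,
					   BlockNumber blocknum, Page page, bool permanent)
{
	XLogRecPtr	lsn = PageGetLSN(page); /* last WAL record touching this page */

	/*
	 * The WAL rule: log records describing a change must reach disk before
	 * the data page containing that change may be written out.
	 */
	if (permanent)
		XLogFlush(lsn);

	/* Only then pass the page to the kernel (still not durable by itself). */
	smgrwrite(reln, forknum, blocknum, page, false);
}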
4548
4549/*
4550 * Convenience wrapper around FlushBuffer() that locks/unlocks the buffer
4551 * before/after calling FlushBuffer().
4552 */
4553static void
4563
4564/*
4565 * RelationGetNumberOfBlocksInFork
4566 * Determines the current number of pages in the specified relation fork.
4567 *
4568 * Note that the accuracy of the result will depend on the details of the
4569 * relation's storage. For builtin AMs it'll be accurate, but for external AMs
4570 * it might not be.
4571 */
4574{
4575 if (RELKIND_HAS_TABLE_AM(relation->rd_rel->relkind))
4576 {
4577 /*
4578 * Not every table AM uses BLCKSZ wide fixed size blocks. Therefore
4579 * tableam returns the size in bytes - but for the purpose of this
4580 * routine, we want the number of blocks. Therefore divide, rounding
4581 * up.
4582 */
4584
4585 szbytes = table_relation_size(relation, forkNum);
4586
4587 return (szbytes + (BLCKSZ - 1)) / BLCKSZ;
4588 }
4589 else if (RELKIND_HAS_STORAGE(relation->rd_rel->relkind))
4590 {
4591 return smgrnblocks(RelationGetSmgr(relation), forkNum);
4592 }
4593 else
4594 Assert(false);
4595
4596 return 0; /* keep compiler quiet */
4597}
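/*
 * Illustrative sketch, not part of bufmgr.c: the round-up division used
 * above.  With BLCKSZ = 8192, a fork of 81920 bytes is exactly 10 blocks,
 * while 81921 bytes already counts as 11 blocks.
 */
static inline BlockNumber
bytes_to_blocks_example(uint64 szbytes)
{
	return (BlockNumber) ((szbytes + (BLCKSZ - 1)) / BLCKSZ);
}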
4598
4599/*
4600 * BufferIsPermanent
4601 * Determines whether a buffer will potentially still be around after
4602 * a crash. Caller must hold a buffer pin.
4603 */
4604bool
4606{
4608
4609 /* Local buffers are used only for temp relations. */
4610 if (BufferIsLocal(buffer))
4611 return false;
4612
4613 /* Make sure we've got a real buffer, and that we hold a pin on it. */
4616
4617 /*
4618 * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
4619 * need not bother with the buffer header spinlock. Even if someone else
4620 * changes the buffer header state while we're doing this, the state is
4621 * changed atomically, so we'll read the old value or the new value, but
4622 * not random garbage.
4623 */
4625 return (pg_atomic_read_u64(&bufHdr->state) & BM_PERMANENT) != 0;
4626}
4627
4628/*
4629 * BufferGetLSNAtomic
4630 * Retrieves the LSN of the buffer atomically using a buffer header lock.
4631 * This is necessary for some callers who may not have an exclusive lock
4632 * on the buffer.
4633 */
4636{
4637 char *page = BufferGetPage(buffer);
4639 XLogRecPtr lsn;
4640
4641 /*
4642 * If we don't need locking for correctness, fastpath out.
4643 */
4645 return PageGetLSN(page);
4646
4647 /* Make sure we've got a real buffer, and that we hold a pin on it. */
4650
4653 lsn = PageGetLSN(page);
4655
4656 return lsn;
4657}
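/*
 * Illustrative sketch, not part of bufmgr.c: reading a page LSN while
 * holding only a share lock.  Because hint-bit writers may update the LSN
 * under a share lock (see MarkBufferDirtyHint), such readers must go through
 * BufferGetLSNAtomic() rather than PageGetLSN().  "read_lsn_example" is a
 * hypothetical helper.
 */
static XLogRecPtr
read_lsn_example(Buffer buf)
{
	XLogRecPtr	lsn;

	LockBuffer(buf, BUFFER_LOCK_SHARE);
	lsn = BufferGetLSNAtomic(buf);	/* takes the header lock when needed */
	LockBuffer(buf, BUFFER_LOCK_UNLOCK);

	return lsn;
}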
4658
4659/* ---------------------------------------------------------------------
4660 * DropRelationBuffers
4661 *
4662 * This function removes from the buffer pool all the pages of the
4663 * specified relation forks that have block numbers >= firstDelBlock.
4664 * (In particular, with firstDelBlock = 0, all pages are removed.)
4665 * Dirty pages are simply dropped, without bothering to write them
4666 * out first. Therefore, this is NOT rollback-able, and so should be
4667 * used only with extreme caution!
4668 *
4669 * Currently, this is called only from smgr.c when the underlying file
4670 * is about to be deleted or truncated (firstDelBlock is needed for
4671 * the truncation case). The data in the affected pages would therefore
4672 * be deleted momentarily anyway, and there is no point in writing it.
4673 * It is the responsibility of higher-level code to ensure that the
4674 * deletion or truncation does not lose any data that could be needed
4675 * later. It is also the responsibility of higher-level code to ensure
4676 * that no other process could be trying to load more pages of the
4677 * relation into buffers.
4678 * --------------------------------------------------------------------
4679 */
4680void
4683{
4684 int i;
4685 int j;
4686 RelFileLocatorBackend rlocator;
4689
4690 rlocator = smgr_reln->smgr_rlocator;
4691
4692 /* If it's a local relation, it's localbuf.c's problem. */
4693 if (RelFileLocatorBackendIsTemp(rlocator))
4694 {
4695 if (rlocator.backend == MyProcNumber)
4696 DropRelationLocalBuffers(rlocator.locator, forkNum, nforks,
4698
4699 return;
4700 }
4701
4702 /*
4703 * To remove all the pages of the specified relation forks from the buffer
 4704 * pool, we need to scan the entire buffer pool, but we can optimize this by
 4705 * finding the buffers via the BufMapping table, provided we know the exact
 4706 * size of each fork of the relation. The exact size is required to ensure
 4707 * that we don't leave any buffer behind for the relation being dropped, as
 4708 * otherwise the background writer or checkpointer could PANIC while
 4709 * flushing buffers corresponding to files that no longer exist.
 4710 *
 4711 * To know the exact size, we rely on the size cached for each fork by us
 4712 * during recovery, which limits the optimization to recovery and to
 4713 * standbys, but we can easily extend it once we have a shared cache for
 4714 * relation sizes.
4715 *
4716 * In recovery, we cache the value returned by the first lseek(SEEK_END)
 4717 * and future writes keep the cached value up-to-date. See
4718 * smgrextend. It is possible that the value of the first lseek is smaller
4719 * than the actual number of existing blocks in the file due to buggy
4720 * Linux kernels that might not have accounted for the recent write. But
4721 * that should be fine because there must not be any buffers after that
4722 * file size.
4723 */
4724 for (i = 0; i < nforks; i++)
4725 {
4726 /* Get the number of blocks for a relation's fork */
4728
4730 {
4732 break;
4733 }
4734
4735 /* calculate the number of blocks to be invalidated */
4737 }
4738
4739 /*
4740 * We apply the optimization iff the total number of blocks to invalidate
4741 * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
4742 */
4745 {
4746 for (j = 0; j < nforks; j++)
4747 FindAndDropRelationBuffers(rlocator.locator, forkNum[j],
4749 return;
4750 }
4751
4752 for (i = 0; i < NBuffers; i++)
4753 {
4755
4756 /*
4757 * We can make this a tad faster by prechecking the buffer tag before
4758 * we attempt to lock the buffer; this saves a lot of lock
4759 * acquisitions in typical cases. It should be safe because the
4760 * caller must have AccessExclusiveLock on the relation, or some other
4761 * reason to be certain that no one is loading new pages of the rel
4762 * into the buffer pool. (Otherwise we might well miss such pages
4763 * entirely.) Therefore, while the tag might be changing while we
4764 * look at it, it can't be changing *to* a value we care about, only
4765 * *away* from such a value. So false negatives are impossible, and
4766 * false positives are safe because we'll recheck after getting the
4767 * buffer lock.
4768 *
4769 * We could check forkNum and blockNum as well as the rlocator, but
4770 * the incremental win from doing so seems small.
4771 */
4772 if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator))
4773 continue;
4774
4776
4777 for (j = 0; j < nforks; j++)
4778 {
4779 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator) &&
4780 BufTagGetForkNum(&bufHdr->tag) == forkNum[j] &&
4781 bufHdr->tag.blockNum >= firstDelBlock[j])
4782 {
4783 InvalidateBuffer(bufHdr); /* releases spinlock */
4784 break;
4785 }
4786 }
4787 if (j >= nforks)
4789 }
4790}
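/*
 * Illustrative sketch, not part of bufmgr.c: the decision made above between
 * targeted BufMapping lookups and a full buffer-pool scan.  The function
 * name and the sizes_cached flag are hypothetical; the real code compares
 * the summed fork sizes against BUF_DROP_FULL_SCAN_THRESHOLD.
 */
static bool
use_bufmapping_lookup_example(uint64 nBlocksToInvalidate, bool sizes_cached)
{
	/* Targeted lookups only work if the exact size of every fork is known. */
	if (!sizes_cached)
		return false;

	return nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD;
}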
4791
4792/* ---------------------------------------------------------------------
4793 * DropRelationsAllBuffers
4794 *
4795 * This function removes from the buffer pool all the pages of all
4796 * forks of the specified relations. It's equivalent to calling
4797 * DropRelationBuffers once per fork per relation with firstDelBlock = 0.
4798 * --------------------------------------------------------------------
4799 */
4800void
4802{
4803 int i;
4804 int n = 0;
4805 SMgrRelation *rels;
4806 BlockNumber (*block)[MAX_FORKNUM + 1];
4809 bool cached = true;
4810 bool use_bsearch;
4811
4812 if (nlocators == 0)
4813 return;
4814
4815 rels = palloc_array(SMgrRelation, nlocators); /* non-local relations */
4816
4817 /* If it's a local relation, it's localbuf.c's problem. */
4818 for (i = 0; i < nlocators; i++)
4819 {
4820 if (RelFileLocatorBackendIsTemp(smgr_reln[i]->smgr_rlocator))
4821 {
4822 if (smgr_reln[i]->smgr_rlocator.backend == MyProcNumber)
4823 DropRelationAllLocalBuffers(smgr_reln[i]->smgr_rlocator.locator);
4824 }
4825 else
4826 rels[n++] = smgr_reln[i];
4827 }
4828
4829 /*
4830 * If there are no non-local relations, then we're done. Release the
4831 * memory and return.
4832 */
4833 if (n == 0)
4834 {
4835 pfree(rels);
4836 return;
4837 }
4838
4839 /*
 4840 * This is used to remember the number of blocks for all the relation
 4841 * forks.
4842 */
4843 block = (BlockNumber (*)[MAX_FORKNUM + 1])
4844 palloc(sizeof(BlockNumber) * n * (MAX_FORKNUM + 1));
4845
4846 /*
4847 * We can avoid scanning the entire buffer pool if we know the exact size
4848 * of each of the given relation forks. See DropRelationBuffers.
4849 */
4850 for (i = 0; i < n && cached; i++)
4851 {
4852 for (int j = 0; j <= MAX_FORKNUM; j++)
4853 {
4854 /* Get the number of blocks for a relation's fork. */
4855 block[i][j] = smgrnblocks_cached(rels[i], j);
4856
 4857 /* We only need to consider the relation forks that exist. */
4858 if (block[i][j] == InvalidBlockNumber)
4859 {
4860 if (!smgrexists(rels[i], j))
4861 continue;
4862 cached = false;
4863 break;
4864 }
4865
4866 /* calculate the total number of blocks to be invalidated */
4867 nBlocksToInvalidate += block[i][j];
4868 }
4869 }
4870
4871 /*
4872 * We apply the optimization iff the total number of blocks to invalidate
4873 * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
4874 */
4876 {
4877 for (i = 0; i < n; i++)
4878 {
4879 for (int j = 0; j <= MAX_FORKNUM; j++)
4880 {
 4881 /* ignore relation forks that don't exist */
4882 if (!BlockNumberIsValid(block[i][j]))
4883 continue;
4884
4885 /* drop all the buffers for a particular relation fork */
4886 FindAndDropRelationBuffers(rels[i]->smgr_rlocator.locator,
4887 j, block[i][j], 0);
4888 }
4889 }
4890
4891 pfree(block);
4892 pfree(rels);
4893 return;
4894 }
4895
4896 pfree(block);
4897 locators = palloc_array(RelFileLocator, n); /* non-local relations */
4898 for (i = 0; i < n; i++)
4899 locators[i] = rels[i]->smgr_rlocator.locator;
4900
4901 /*
 4902 * For a low number of relations to drop, just use a simple walk-through to
 4903 * save the bsearch overhead. The threshold to use is more a guess than an
 4904 * exactly determined value, as it depends on many factors (CPU and RAM
 4905 * speeds, amount of shared buffers, etc.).
4906 */
4908
4909 /* sort the list of rlocators if necessary */
4910 if (use_bsearch)
4912
4913 for (i = 0; i < NBuffers; i++)
4914 {
4915 RelFileLocator *rlocator = NULL;
4917
4918 /*
4919 * As in DropRelationBuffers, an unlocked precheck should be safe and
4920 * saves some cycles.
4921 */
4922
4923 if (!use_bsearch)
4924 {
4925 int j;
4926
4927 for (j = 0; j < n; j++)
4928 {
4930 {
4931 rlocator = &locators[j];
4932 break;
4933 }
4934 }
4935 }
4936 else
4937 {
4938 RelFileLocator locator;
4939
4940 locator = BufTagGetRelFileLocator(&bufHdr->tag);
4941 rlocator = bsearch(&locator,
4942 locators, n, sizeof(RelFileLocator),
4944 }
4945
4946 /* buffer doesn't belong to any of the given relfilelocators; skip it */
4947 if (rlocator == NULL)
4948 continue;
4949
4951 if (BufTagMatchesRelFileLocator(&bufHdr->tag, rlocator))
4952 InvalidateBuffer(bufHdr); /* releases spinlock */
4953 else
4955 }
4956
4957 pfree(locators);
4958 pfree(rels);
4959}
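/*
 * Illustrative sketch, not part of bufmgr.c: the sort-then-bsearch pattern
 * used above when many relations are dropped at once.  It assumes 'locators'
 * was previously qsort()ed with rlocator_comparator; "find_locator_example"
 * is a hypothetical helper.
 */
static RelFileLocator *
find_locator_example(RelFileLocator *locators, int n, const BufferTag *tag)
{
	RelFileLocator key = BufTagGetRelFileLocator(tag);

	return (RelFileLocator *) bsearch(&key, locators, n,
									  sizeof(RelFileLocator),
									  rlocator_comparator);
}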
4960
4961/* ---------------------------------------------------------------------
4962 * FindAndDropRelationBuffers
4963 *
 4964 * This function performs a lookup in the BufMapping table and removes from
 4965 * the buffer pool all the pages of the specified relation fork that have
 4966 * block numbers >= firstDelBlock. (In particular, with firstDelBlock = 0, all
4967 * pages are removed.)
4968 * --------------------------------------------------------------------
4969 */
4970static void
4974{
4975 BlockNumber curBlock;
4976
4977 for (curBlock = firstDelBlock; curBlock < nForkBlock; curBlock++)
4978 {
4979 uint32 bufHash; /* hash value for tag */
4980 BufferTag bufTag; /* identity of requested block */
4981 LWLock *bufPartitionLock; /* buffer partition lock for it */
4982 int buf_id;
4984
4985 /* create a tag so we can lookup the buffer */
4986 InitBufferTag(&bufTag, &rlocator, forkNum, curBlock);
4987
4988 /* determine its hash code and partition lock ID */
4991
4992 /* Check that it is in the buffer pool. If not, do nothing. */
4994 buf_id = BufTableLookup(&bufTag, bufHash);
4996
4997 if (buf_id < 0)
4998 continue;
4999
5000 bufHdr = GetBufferDescriptor(buf_id);
5001
5002 /*
5003 * We need to lock the buffer header and recheck if the buffer is
5004 * still associated with the same block because the buffer could be
5005 * evicted by some other backend loading blocks for a different
5006 * relation after we release lock on the BufMapping table.
5007 */
5009
5010 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator) &&
5011 BufTagGetForkNum(&bufHdr->tag) == forkNum &&
5012 bufHdr->tag.blockNum >= firstDelBlock)
5013 InvalidateBuffer(bufHdr); /* releases spinlock */
5014 else
5016 }
5017}
5018
5019/* ---------------------------------------------------------------------
5020 * DropDatabaseBuffers
5021 *
5022 * This function removes all the buffers in the buffer cache for a
5023 * particular database. Dirty pages are simply dropped, without
5024 * bothering to write them out first. This is used when we destroy a
5025 * database, to avoid trying to flush data to disk when the directory
5026 * tree no longer exists. Implementation is pretty similar to
5027 * DropRelationBuffers() which is for destroying just one relation.
5028 * --------------------------------------------------------------------
5029 */
5030void
5032{
5033 int i;
5034
5035 /*
5036 * We needn't consider local buffers, since by assumption the target
5037 * database isn't our own.
5038 */
5039
5040 for (i = 0; i < NBuffers; i++)
5041 {
5043
5044 /*
5045 * As in DropRelationBuffers, an unlocked precheck should be safe and
5046 * saves some cycles.
5047 */
5048 if (bufHdr->tag.dbOid != dbid)
5049 continue;
5050
5052 if (bufHdr->tag.dbOid == dbid)
5053 InvalidateBuffer(bufHdr); /* releases spinlock */
5054 else
5056 }
5057}
5058
5059/* ---------------------------------------------------------------------
5060 * FlushRelationBuffers
5061 *
5062 * This function writes all dirty pages of a relation out to disk
5063 * (or more accurately, out to kernel disk buffers), ensuring that the
5064 * kernel has an up-to-date view of the relation.
5065 *
5066 * Generally, the caller should be holding AccessExclusiveLock on the
5067 * target relation to ensure that no other backend is busy dirtying
5068 * more blocks of the relation; the effects can't be expected to last
5069 * after the lock is released.
5070 *
5071 * XXX currently it sequentially searches the buffer pool, should be
5072 * changed to more clever ways of searching. This routine is not
5073 * used in any performance-critical code paths, so it's not worth
5074 * adding additional overhead to normal paths to make it go faster.
5075 * --------------------------------------------------------------------
5076 */
5077void
5079{
5080 int i;
5082 SMgrRelation srel = RelationGetSmgr(rel);
5083
5084 if (RelationUsesLocalBuffers(rel))
5085 {
5086 for (i = 0; i < NLocBuffer; i++)
5087 {
5089
5091 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
5092 ((buf_state = pg_atomic_read_u64(&bufHdr->state)) &
5093 (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
5094 {
5095 ErrorContextCallback errcallback;
5096
5097 /* Setup error traceback support for ereport() */
5099 errcallback.arg = bufHdr;
5100 errcallback.previous = error_context_stack;
5101 error_context_stack = &errcallback;
5102
5103 /* Make sure we can handle the pin */
5106
5107 /*
5108 * Pin/unpin mostly to make valgrind work, but it also seems
5109 * like the right thing to do.
5110 */
5111 PinLocalBuffer(bufHdr, false);
5112
5113
5114 FlushLocalBuffer(bufHdr, srel);
5115
5117
5118 /* Pop the error context stack */
5119 error_context_stack = errcallback.previous;
5120 }
5121 }
5122
5123 return;
5124 }
5125
5126 for (i = 0; i < NBuffers; i++)
5127 {
5129
5131
5132 /*
5133 * As in DropRelationBuffers, an unlocked precheck should be safe and
5134 * saves some cycles.
5135 */
5137 continue;
5138
5139 /* Make sure we can handle the pin */
5142
5144 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
5146 {
5150 }
5151 else
5153 }
5154}
5155
5156/* ---------------------------------------------------------------------
5157 * FlushRelationsAllBuffers
5158 *
5159 * This function flushes out of the buffer pool all the pages of all
5160 * forks of the specified smgr relations. It's equivalent to calling
5161 * FlushRelationBuffers once per relation. The relations are assumed not
5162 * to use local buffers.
5163 * --------------------------------------------------------------------
5164 */
5165void
5167{
5168 int i;
5170 bool use_bsearch;
5171
5172 if (nrels == 0)
5173 return;
5174
5175 /* fill-in array for qsort */
5177
5178 for (i = 0; i < nrels; i++)
5179 {
5180 Assert(!RelFileLocatorBackendIsTemp(smgrs[i]->smgr_rlocator));
5181
5182 srels[i].rlocator = smgrs[i]->smgr_rlocator.locator;
5183 srels[i].srel = smgrs[i];
5184 }
5185
5186 /*
5187 * Save the bsearch overhead for low number of relations to sync. See
5188 * DropRelationsAllBuffers for details.
5189 */
5191
5192 /* sort the list of SMgrRelations if necessary */
5193 if (use_bsearch)
5194 qsort(srels, nrels, sizeof(SMgrSortArray), rlocator_comparator);
5195
5196 for (i = 0; i < NBuffers; i++)
5197 {
5201
5202 /*
5203 * As in DropRelationBuffers, an unlocked precheck should be safe and
5204 * saves some cycles.
5205 */
5206
5207 if (!use_bsearch)
5208 {
5209 int j;
5210
5211 for (j = 0; j < nrels; j++)
5212 {
5213 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srels[j].rlocator))
5214 {
5215 srelent = &srels[j];
5216 break;
5217 }
5218 }
5219 }
5220 else
5221 {
5222 RelFileLocator rlocator;
5223
5224 rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
5225 srelent = bsearch(&rlocator,
5226 srels, nrels, sizeof(SMgrSortArray),
5228 }
5229
5230 /* buffer doesn't belong to any of the given relfilelocators; skip it */
5231 if (srelent == NULL)
5232 continue;
5233
5234 /* Make sure we can handle the pin */
5237
5239 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srelent->rlocator) &&
5241 {
5245 }
5246 else
5248 }
5249
5250 pfree(srels);
5251}
5252
5253/* ---------------------------------------------------------------------
5254 * RelationCopyStorageUsingBuffer
5255 *
 5256 * Copy a fork's data using bufmgr. Same as RelationCopyStorage(), but
 5257 * instead of using smgrread and smgrextend this copies using bufmgr APIs.
 5258 *
 5259 * Refer to the comments atop CreateAndCopyRelationData() for details about
 5260 * the 'permanent' parameter.
5261 * --------------------------------------------------------------------
5262 */
5263static void
5266 ForkNumber forkNum, bool permanent)
5267{
5268 Buffer srcBuf;
5269 Buffer dstBuf;
5270 Page srcPage;
5271 Page dstPage;
5272 bool use_wal;
5273 BlockNumber nblocks;
5274 BlockNumber blkno;
5281
5282 /*
5283 * In general, we want to write WAL whenever wal_level > 'minimal', but we
5284 * can skip it when copying any fork of an unlogged relation other than
5285 * the init fork.
5286 */
5287 use_wal = XLogIsNeeded() && (permanent || forkNum == INIT_FORKNUM);
5288
5289 /* Get number of blocks in the source relation. */
5291 forkNum);
5292
5293 /* Nothing to copy; just return. */
5294 if (nblocks == 0)
5295 return;
5296
5297 /*
 5298 * Bulk-extend the destination relation to the same size as the source
 5299 * relation before starting to copy block by block.
5300 */
5301 memset(buf.data, 0, BLCKSZ);
5302 smgrextend(smgropen(dstlocator, INVALID_PROC_NUMBER), forkNum, nblocks - 1,
5303 buf.data, true);
5304
5305 /* This is a bulk operation, so use buffer access strategies. */
5308
5309 /* Initialize streaming read */
5310 p.current_blocknum = 0;
5311 p.last_exclusive = nblocks;
5313
5314 /*
5315 * It is safe to use batchmode as block_range_read_stream_cb takes no
5316 * locks.
5317 */
5321 src_smgr,
5323 forkNum,
5325 &p,
5326 0);
5327
5328 /* Iterate over each block of the source relation file. */
5329 for (blkno = 0; blkno < nblocks; blkno++)
5330 {
5332
5333 /* Read block from source relation. */
5337
5341 permanent);
5343
5345
5346 /* Copy page data from the source to the destination. */
5349
5350 /* WAL-log the copied page. */
5351 if (use_wal)
5353
5355
5358 }
5361
5364}
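/*
 * Illustrative sketch, not part of bufmgr.c: the streaming-read pattern used
 * above, but driven through a Relation rather than an SMgrRelation.  The
 * callback hands out block numbers in [0, nblocks) and the stream issues
 * reads ahead of the loop.  "scan_all_blocks_example" is hypothetical.
 */
static void
scan_all_blocks_example(Relation rel, BlockNumber nblocks)
{
	BlockRangeReadStreamPrivate p = {.current_blocknum = 0,
									 .last_exclusive = nblocks};
	ReadStream *stream;
	Buffer		buf;

	stream = read_stream_begin_relation(READ_STREAM_FULL, NULL, rel,
										MAIN_FORKNUM,
										block_range_read_stream_cb,
										&p, 0);

	while ((buf = read_stream_next_buffer(stream, NULL)) != InvalidBuffer)
	{
		/* ... inspect BufferGetPage(buf) here ... */
		ReleaseBuffer(buf);
	}

	read_stream_end(stream);
}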
5365
5366/* ---------------------------------------------------------------------
5367 * CreateAndCopyRelationData
5368 *
5369 * Create destination relation storage and copy all forks from the
5370 * source relation to the destination.
5371 *
5372 * Pass permanent as true for permanent relations and false for
5373 * unlogged relations. Currently this API is not supported for
5374 * temporary relations.
5375 * --------------------------------------------------------------------
5376 */
5377void
5379 RelFileLocator dst_rlocator, bool permanent)
5380{
5381 char relpersistence;
5384
5385 /* Set the relpersistence. */
5386 relpersistence = permanent ?
5388
5391
5392 /*
 5393 * Create and copy all forks of the relation. During CREATE DATABASE we
 5394 * have a separate cleanup mechanism which deletes the complete database
 5395 * directory. Therefore, each individual relation doesn't need to be
5396 * registered for cleanup.
5397 */
5398 RelationCreateStorage(dst_rlocator, relpersistence, false);
5399
5400 /* copy main fork. */
5402 permanent);
5403
5404 /* copy those extra forks that exist */
5405 for (ForkNumber forkNum = MAIN_FORKNUM + 1;
5406 forkNum <= MAX_FORKNUM; forkNum++)
5407 {
5408 if (smgrexists(src_rel, forkNum))
5409 {
5410 smgrcreate(dst_rel, forkNum, false);
5411
5412 /*
5413 * WAL log creation if the relation is persistent, or this is the
5414 * init fork of an unlogged relation.
5415 */
5416 if (permanent || forkNum == INIT_FORKNUM)
5417 log_smgrcreate(&dst_rlocator, forkNum);
5418
5419 /* Copy a fork's data, block by block. */
5421 permanent);
5422 }
5423 }
5424}
5425
5426/* ---------------------------------------------------------------------
5427 * FlushDatabaseBuffers
5428 *
5429 * This function writes all dirty pages of a database out to disk
5430 * (or more accurately, out to kernel disk buffers), ensuring that the
5431 * kernel has an up-to-date view of the database.
5432 *
5433 * Generally, the caller should be holding an appropriate lock to ensure
5434 * no other backend is active in the target database; otherwise more
5435 * pages could get dirtied.
5436 *
5437 * Note we don't worry about flushing any pages of temporary relations.
5438 * It's assumed these wouldn't be interesting.
5439 * --------------------------------------------------------------------
5440 */
5441void
5443{
5444 int i;
5446
5447 for (i = 0; i < NBuffers; i++)
5448 {
5450
5452
5453 /*
5454 * As in DropRelationBuffers, an unlocked precheck should be safe and
5455 * saves some cycles.
5456 */
5457 if (bufHdr->tag.dbOid != dbid)
5458 continue;
5459
5460 /* Make sure we can handle the pin */
5463
5465 if (bufHdr->tag.dbOid == dbid &&
5467 {
5471 }
5472 else
5474 }
5475}
5476
5477/*
 5478 * Flush a previously locked (share or exclusive) and pinned buffer to the
 5479 * OS.
5480 */
5481void
5483{
5485
5486 /* currently not needed, but no fundamental reason not to support */
5488
5490
5492
5494
5496}
5497
5498/*
5499 * ReleaseBuffer -- release the pin on a buffer
5500 */
5501void
5503{
5504 if (!BufferIsValid(buffer))
5505 elog(ERROR, "bad buffer ID: %d", buffer);
5506
5507 if (BufferIsLocal(buffer))
5509 else
5511}
5512
5513/*
5514 * UnlockReleaseBuffer -- release the content lock and pin on a buffer
5515 *
5516 * This is just a shorthand for a common combination.
5517 */
5518void
5524
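/*
 * Illustrative sketch, not part of bufmgr.c: the canonical pin/lock/modify
 * sequence that ReleaseBuffer()/UnlockReleaseBuffer() close out.  WAL
 * logging of the change is elided; "touch_page_example" is hypothetical.
 */
static void
touch_page_example(Relation rel, BlockNumber blkno)
{
	Buffer		buf = ReadBuffer(rel, blkno);	/* acquire a pin */

	LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);		/* acquire the content lock */

	START_CRIT_SECTION();
	/* ... modify BufferGetPage(buf) and emit WAL describing the change ... */
	MarkBufferDirty(buf);
	END_CRIT_SECTION();

	UnlockReleaseBuffer(buf);					/* drop the lock, then the pin */
}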
5525/*
5526 * IncrBufferRefCount
5527 * Increment the pin count on a buffer that we have *already* pinned
5528 * at least once.
5529 *
5530 * This function cannot be used on a buffer we do not have pinned,
5531 * because it doesn't change the shared buffer state.
5532 */
5533void
5550
5551/*
5552 * MarkBufferDirtyHint
5553 *
5554 * Mark a buffer dirty for non-critical changes.
5555 *
5556 * This is essentially the same as MarkBufferDirty, except:
5557 *
5558 * 1. The caller does not write WAL; so if checksums are enabled, we may need
5559 * to write an XLOG_FPI_FOR_HINT WAL record to protect against torn pages.
5560 * 2. The caller might have only share-lock instead of exclusive-lock on the
5561 * buffer's content lock.
5562 * 3. This function does not guarantee that the buffer is always marked dirty
5563 * (due to a race condition), so it cannot be used for important changes.
5564 */
5565void
5567{
5569 Page page = BufferGetPage(buffer);
5570
5571 if (!BufferIsValid(buffer))
5572 elog(ERROR, "bad buffer ID: %d", buffer);
5573
5574 if (BufferIsLocal(buffer))
5575 {
5577 return;
5578 }
5579
5581
5583 /* here, either share or exclusive lock is OK */
5585
5586 /*
5587 * This routine might get called many times on the same page, if we are
5588 * making the first scan after commit of an xact that added/deleted many
5589 * tuples. So, be as quick as we can if the buffer is already dirty. We
 5590 * do this by not acquiring the spinlock if it looks like the status bits
 5591 * are already set. Since we make this test unlocked, there's a chance we
 5592 * might fail to notice that the flags have just been cleared, and fail
 5593 * to reset them, due to memory-ordering issues. But since this function
5594 * is only intended to be used in cases where failing to write out the
5595 * data would be harmless anyway, it doesn't really matter.
5596 */
5597 if ((pg_atomic_read_u64(&bufHdr->state) & (BM_DIRTY | BM_JUST_DIRTIED)) !=
5599 {
5601 bool dirtied = false;
5602 bool delayChkptFlags = false;
5604
5605 /*
5606 * If we need to protect hint bit updates from torn writes, WAL-log a
5607 * full page image of the page. This full page image is only necessary
5608 * if the hint bit update is the first change to the page since the
5609 * last checkpoint.
5610 *
5611 * We don't check full_page_writes here because that logic is included
5612 * when we call XLogInsert() since the value changes dynamically.
5613 */
5614 if (XLogHintBitIsNeeded() &&
5616 {
5617 /*
5618 * If we must not write WAL, due to a relfilelocator-specific
5619 * condition or being in recovery, don't dirty the page. We can
 5620 * set the hint, just not dirty the page as a result, so the hint
 5621 * is lost when we evict the page or shut down.
5622 *
5623 * See src/backend/storage/page/README for longer discussion.
5624 */
5625 if (RecoveryInProgress() ||
5627 return;
5628
5629 /*
5630 * If the block is already dirty because we either made a change
5631 * or set a hint already, then we don't need to write a full page
5632 * image. Note that aggressive cleaning of blocks dirtied by hint
5633 * bit setting would increase the call rate. Bulk setting of hint
5634 * bits would reduce the call rate...
5635 *
5636 * We must issue the WAL record before we mark the buffer dirty.
5637 * Otherwise we might write the page before we write the WAL. That
5638 * causes a race condition, since a checkpoint might occur between
5639 * writing the WAL record and marking the buffer dirty. We solve
5640 * that with a kluge, but one that is already in use during
5641 * transaction commit to prevent race conditions. Basically, we
5642 * simply prevent the checkpoint WAL record from being written
5643 * until we have marked the buffer dirty. We don't start the
5644 * checkpoint flush until we have marked dirty, so our checkpoint
 5645 * must flush the change to disk successfully or the checkpoint record
 5646 * never gets written, in which case crash recovery will fix things up.
5647 *
5648 * It's possible we may enter here without an xid, so it is
5649 * essential that CreateCheckPoint waits for virtual transactions
5650 * rather than full transactionids.
5651 */
5654 delayChkptFlags = true;
5656 }
5657
5659
5661
5662 if (!(buf_state & BM_DIRTY))
5663 {
5664 dirtied = true; /* Means "will be dirtied by this action" */
5665
5666 /*
5667 * Set the page LSN if we wrote a backup block. We aren't supposed
5668 * to set this when only holding a share lock but as long as we
5669 * serialise it somehow we're OK. We choose to set LSN while
5670 * holding the buffer header lock, which causes any reader of an
5671 * LSN who holds only a share lock to also obtain a buffer header
5672 * lock before using PageGetLSN(), which is enforced in
5673 * BufferGetLSNAtomic().
5674 *
5675 * If checksums are enabled, you might think we should reset the
5676 * checksum here. That will happen when the page is written
5677 * sometime later in this checkpoint cycle.
5678 */
5679 if (XLogRecPtrIsValid(lsn))
5680 PageSetLSN(page, lsn);
5681 }
5682
5685 0, 0);
5686
5687 if (delayChkptFlags)
5689
5690 if (dirtied)
5691 {
5693 if (VacuumCostActive)
5695 }
5696 }
5697}
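/*
 * Illustrative sketch, not part of bufmgr.c: a typical hint-bit caller.  The
 * hint is applied under (at least) a share lock and the buffer is marked
 * dirty with the non-critical variant; it is acceptable for the hint to be
 * lost.  "set_tuple_hint_example" is hypothetical and would additionally
 * need access/htup_details.h for HeapTupleHeader.
 */
static void
set_tuple_hint_example(Buffer buffer, HeapTupleHeader tuple, uint16 infomask)
{
	tuple->t_infomask |= infomask;		/* the non-critical change itself */
	MarkBufferDirtyHint(buffer, true);	/* may be a no-op under a race */
}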
5698
5699/*
5700 * Release buffer content locks for shared buffers.
5701 *
5702 * Used to clean up after errors.
5703 *
5704 * Currently, we can expect that resource owner cleanup, via
5705 * ResOwnerReleaseBufferPin(), took care of releasing buffer content locks per
5706 * se; the only thing we need to deal with here is clearing any PIN_COUNT
5707 * request that was in progress.
5708 */
5709void
5711{
5713
5714 if (buf)
5715 {
5717 uint64 unset_bits = 0;
5718
5720
5721 /*
5722 * Don't complain if flag bit not set; it could have been reset but we
5723 * got a cancel/die interrupt before getting the signal.
5724 */
5725 if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
5726 buf->wait_backend_pgprocno == MyProcNumber)
5728
5730 0, unset_bits,
5731 0);
5732
5734 }
5735}
5736
5737/*
5738 * Acquire the buffer content lock in the specified mode
5739 *
5740 * If the lock is not available, sleep until it is.
5741 *
5742 * Side effect: cancel/die interrupts are held off until lock release.
5743 *
5744 * This uses almost the same locking approach as lwlock.c's
5745 * LWLockAcquire(). See documentation at the top of lwlock.c for a more
5746 * detailed discussion.
5747 *
 5748 * The reason that this, and most of the other BufferLock* functions, take both
 5749 * the Buffer and the BufferDesc* as parameters is that looking up one from the
 5750 * other repeatedly shows up noticeably in profiles.
5751 *
5752 * Callers should provide a constant for mode, for more efficient code
5753 * generation.
5754 */
5755static inline void
5757{
5758 PrivateRefCountEntry *entry;
5759 int extraWaits = 0;
5760
5761 /*
 5762 * Get the reference to the refcount entry before we take the lock; it
 5763 * seems better to do this while not yet holding the lock.
5764 */
5765 entry = GetPrivateRefCountEntry(buffer, true);
5766
5767 /*
 5768 * We had better not already hold a lock on the buffer.
5769 */
5771
5772 /*
5773 * Lock out cancel/die interrupts until we exit the code section protected
5774 * by the content lock. This ensures that interrupts will not interfere
5775 * with manipulations of data structures in shared memory.
5776 */
5778
5779 for (;;)
5780 {
5781 uint32 wait_event = 0; /* initialized to avoid compiler warning */
5782 bool mustwait;
5783
5784 /*
5785 * Try to grab the lock the first time, we're not in the waitqueue
5786 * yet/anymore.
5787 */
5789
5790 if (likely(!mustwait))
5791 {
5792 break;
5793 }
5794
5795 /*
5796 * Ok, at this point we couldn't grab the lock on the first try. We
5797 * cannot simply queue ourselves to the end of the list and wait to be
5798 * woken up because by now the lock could long have been released.
5799 * Instead add us to the queue and try to grab the lock again. If we
5800 * succeed we need to revert the queuing and be happy, otherwise we
5801 * recheck the lock. If we still couldn't grab it, we know that the
5802 * other locker will see our queue entries when releasing since they
5803 * existed before we checked for the lock.
5804 */
5805
5806 /* add to the queue */
5808
5809 /* we're now guaranteed to be woken up if necessary */
5811
5812 /* ok, grabbed the lock the second time round, need to undo queueing */
5813 if (!mustwait)
5814 {
5816 break;
5817 }
5818
5819 switch (mode)
5820 {
5823 break;
5826 break;
5827 case BUFFER_LOCK_SHARE:
5829 break;
5830 case BUFFER_LOCK_UNLOCK:
5832
5833 }
5835
5836 /*
5837 * Wait until awakened.
5838 *
5839 * It is possible that we get awakened for a reason other than being
5840 * signaled by BufferLockWakeup(). If so, loop back and wait again.
5841 * Once we've gotten the lock, re-increment the sema by the number of
5842 * additional signals received.
5843 */
5844 for (;;)
5845 {
5848 break;
5849 extraWaits++;
5850 }
5851
5853
5854 /* Retrying, allow BufferLockRelease to release waiters again. */
5856 }
5857
5858 /* Remember that we now hold this lock */
5859 entry->data.lockmode = mode;
5860
5861 /*
5862 * Fix the process wait semaphore's count for any absorbed wakeups.
5863 */
5864 while (unlikely(extraWaits-- > 0))
5866}
5867
5868/*
5869 * Release a previously acquired buffer content lock.
5870 */
5871static void
5873{
5876 uint64 sub;
5877
5879
5880 /*
5881 * Release my hold on lock, after that it can immediately be acquired by
5882 * others, even if we still have to wakeup other waiters.
5883 */
5885
5887
5889
5890 /*
5891 * Now okay to allow cancel/die interrupts.
5892 */
5894}
5895
5896
5897/*
5898 * Acquire the content lock for the buffer, but only if we don't have to wait.
5899 *
5900 * It is allowed to try to conditionally acquire a lock on a buffer that this
5901 * backend has already locked, but the lock acquisition will always fail, even
5902 * if the new lock acquisition does not conflict with an already held lock
5903 * (e.g. two share locks). This is because we currently do not have space to
5904 * track multiple lock ownerships of the same buffer within one backend. That
5905 * is ok for the current uses of BufferLockConditional().
5906 */
5907static bool
5909{
5911 bool mustwait;
5912
5913 /*
5914 * As described above, if we're trying to lock a buffer this backend
5915 * already has locked, return false, independent of the existing and
5916 * desired lock level.
5917 */
5918 if (entry->data.lockmode != BUFFER_LOCK_UNLOCK)
5919 return false;
5920
5921 /*
5922 * Lock out cancel/die interrupts until we exit the code section protected
5923 * by the content lock. This ensures that interrupts will not interfere
5924 * with manipulations of data structures in shared memory.
5925 */
5927
5928 /* Check for the lock */
5930
5931 if (mustwait)
5932 {
5933 /* Failed to get lock, so release interrupt holdoff */
5935 }
5936 else
5937 {
5938 entry->data.lockmode = mode;
5939 }
5940
5941 return !mustwait;
5942}
5943
5944/*
5945 * Internal function that tries to atomically acquire the content lock in the
5946 * passed in mode.
5947 *
5948 * This function will not block waiting for a lock to become free - that's the
5949 * caller's job.
5950 *
5951 * Similar to LWLockAttemptLock().
5952 */
5953static inline bool
5955{
5957
5958 /*
5959 * Read once outside the loop, later iterations will get the newer value
5960 * via compare & exchange.
5961 */
5963
5964 /* loop until we've determined whether we could acquire the lock or not */
5965 while (true)
5966 {
5968 bool lock_free;
5969
5971
5973 {
5974 lock_free = (old_state & BM_LOCK_MASK) == 0;
5975 if (lock_free)
5977 }
5979 {
5981 if (lock_free)
5983 }
5984 else
5985 {
5987 if (lock_free)
5989 }
5990
5991 /*
5992 * Attempt to swap in the state we are expecting. If we didn't see
5993 * lock to be free, that's just the old value. If we saw it as free,
5994 * we'll attempt to mark it acquired. The reason that we always swap
5995 * in the value is that this doubles as a memory barrier. We could try
5996 * to be smarter and only swap in values if we saw the lock as free,
 5997 * but benchmarks haven't shown it to be beneficial so far.
5998 *
5999 * Retry if the value changed since we last looked at it.
6000 */
6003 {
6004 if (lock_free)
6005 {
6006 /* Great! Got the lock. */
6007 return false;
6008 }
6009 else
6010 return true; /* somebody else has the lock */
6011 }
6012 }
6013
6015}
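/*
 * Illustrative sketch, not part of bufmgr.c: the shape of the compare-and-
 * exchange loop above, reduced to a single-flag lock.  The exchange is
 * attempted even when the lock looked busy so that it doubles as a memory
 * barrier, as the comment above explains.  EXAMPLE_LOCK_BIT and
 * "try_lock_example" are hypothetical; true means the lock was acquired.
 */
#define EXAMPLE_LOCK_BIT	(UINT64CONST(1) << 60)

static bool
try_lock_example(pg_atomic_uint64 *state)
{
	uint64		old_state = pg_atomic_read_u64(state);

	while (true)
	{
		bool		lock_free = (old_state & EXAMPLE_LOCK_BIT) == 0;
		uint64		desired = lock_free ? (old_state | EXAMPLE_LOCK_BIT) : old_state;

		/* On failure, old_state is refreshed to the current value; retry. */
		if (pg_atomic_compare_exchange_u64(state, &old_state, desired))
			return lock_free;
	}
}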
6016
6017/*
6018 * Add ourselves to the end of the content lock's wait queue.
6019 */
6020static void
6022{
6023 /*
6024 * If we don't have a PGPROC structure, there's no way to wait. This
6025 * should never occur, since MyProc should only be null during shared
6026 * memory initialization.
6027 */
6028 if (MyProc == NULL)
6029 elog(PANIC, "cannot wait without a PGPROC structure");
6030
6032 elog(PANIC, "queueing for lock while waiting on another one");
6033
6035
6036 /* setting the flag is protected by the spinlock */
6038
6039 /*
6040 * These are currently used both for lwlocks and buffer content locks,
6041 * which is acceptable, although not pretty, because a backend can't wait
6042 * for both types of locks at the same time.
6043 */
6046
6047 proclist_push_tail(&buf_hdr->lock_waiters, MyProcNumber, lwWaitLink);
6048
6049 /* Can release the mutex now */
6051}
6052
6053/*
6054 * Remove ourselves from the waitlist.
6055 *
6056 * This is used if we queued ourselves because we thought we needed to sleep
6057 * but, after further checking, we discovered that we don't actually need to
6058 * do so.
6059 */
6060static void
6062{
6063 bool on_waitlist;
6064
6066
6068 if (on_waitlist)
6069 proclist_delete(&buf_hdr->lock_waiters, MyProcNumber, lwWaitLink);
6070
6071 if (proclist_is_empty(&buf_hdr->lock_waiters) &&
6073 {
6075 }
6076
6077 /* XXX: combine with fetch_and above? */
6079
6080 /* clear waiting state again, nice for debugging */
6081 if (on_waitlist)
6083 else
6084 {
6085 int extraWaits = 0;
6086
6087
6088 /*
6089 * Somebody else dequeued us and has or will wake us up. Deal with the
6090 * superfluous absorption of a wakeup.
6091 */
6092
6093 /*
6094 * Clear BM_LOCK_WAKE_IN_PROGRESS if somebody woke us before we
6095 * removed ourselves - they'll have set it.
6096 */
6098
6099 /*
6100 * Now wait for the scheduled wakeup, otherwise our ->lwWaiting would
6101 * get reset at some inconvenient point later. Most of the time this
6102 * will immediately return.
6103 */
6104 for (;;)
6105 {
6108 break;
6109 extraWaits++;
6110 }
6111
6112 /*
6113 * Fix the process wait semaphore's count for any absorbed wakeups.
6114 */
6115 while (extraWaits-- > 0)
6117 }
6118}
6119
6120/*
6121 * Stop treating lock as held by current backend.
6122 *
 6123 * After calling this function it's the caller's responsibility to ensure that
 6124 * the lock gets released, even in case of an error. This is only desirable if
 6125 * the lock is going to be released in a different process than the process
6126 * that acquired it.
6127 */
6128static inline void
6134
6135/*
6136 * Stop treating lock as held by current backend.
6137 *
6138 * This is the code that can be shared between actually releasing a lock
6139 * (BufferLockUnlock()) and just not tracking ownership of the lock anymore
6140 * without releasing the lock (BufferLockDisown()).
6141 */
6142static inline int
6144{
6147
6149 if (ref == NULL)
6150 elog(ERROR, "lock %d is not held", buffer);
6151 mode = ref->data.lockmode;
6152 ref->data.lockmode = BUFFER_LOCK_UNLOCK;
6153
6154 return mode;
6155}
6156
6157/*
 6158 * Wake up all the lockers that currently have a chance to acquire the lock.
6159 *
6160 * wake_exclusive indicates whether exclusive lock waiters should be woken up.
6161 */
6162static void
6164{
6165 bool new_wake_in_progress = false;
6166 bool wake_share_exclusive = true;
6169
6171
6172 /* lock wait list while collecting backends to wake up */
6174
6175 proclist_foreach_modify(iter, &buf_hdr->lock_waiters, lwWaitLink)
6176 {
6177 PGPROC *waiter = GetPGProcByNumber(iter.cur);
6178
6179 /*
6180 * Already woke up a conflicting lock, so skip over this wait list
6181 * entry.
6182 */
6184 continue;
6186 continue;
6187
6188 proclist_delete(&buf_hdr->lock_waiters, iter.cur, lwWaitLink);
6189 proclist_push_tail(&wakeup, iter.cur, lwWaitLink);
6190
6191 /*
6192 * Prevent additional wakeups until retryer gets to run. Backends that
6193 * are just waiting for the lock to become free don't retry
6194 * automatically.
6195 */
6196 new_wake_in_progress = true;
6197
6198 /*
6199 * Signal that the process isn't on the wait list anymore. This allows
6200 * BufferLockDequeueSelf() to remove itself from the waitlist with a
6201 * proclist_delete(), rather than having to check if it has been
6202 * removed from the list.
6203 */
6204 Assert(waiter->lwWaiting == LW_WS_WAITING);
6206
6207 /*
6208 * Don't wakeup further waiters after waking a conflicting waiter.
6209 */
6210 if (waiter->lwWaitMode == BUFFER_LOCK_SHARE)
6211 {
6212 /*
6213 * Share locks conflict with exclusive locks.
6214 */
6215 wake_exclusive = false;
6216 }
6217 else if (waiter->lwWaitMode == BUFFER_LOCK_SHARE_EXCLUSIVE)
6218 {
6219 /*
6220 * Share-exclusive locks conflict with share-exclusive and
6221 * exclusive locks.
6222 */
6223 wake_exclusive = false;
6224 wake_share_exclusive = false;
6225 }
6226 else if (waiter->lwWaitMode == BUFFER_LOCK_EXCLUSIVE)
6227 {
6228 /*
6229 * Exclusive locks conflict with all other locks, there's no point
6230 * in waking up anybody else.
6231 */
6232 break;
6233 }
6234 }
6235
6237
6238 /* unset required flags, and release lock, in one fell swoop */
6239 {
6242
6244 while (true)
6245 {
6247
6248 /* compute desired flags */
6249
6252 else
6254
6255 if (proclist_is_empty(&buf_hdr->lock_waiters))
6257
6258 desired_state &= ~BM_LOCKED; /* release lock */
6259
6262 break;
6263 }
6264 }
6265
6266 /* Awaken any waiters I removed from the queue. */
6267 proclist_foreach_modify(iter, &wakeup, lwWaitLink)
6268 {
6269 PGPROC *waiter = GetPGProcByNumber(iter.cur);
6270
6271 proclist_delete(&wakeup, iter.cur, lwWaitLink);
6272
6273 /*
6274 * Guarantee that lwWaiting being unset only becomes visible once the
 6275 * unlink from the list has completed. Otherwise the target backend
 6276 * could be woken up for some other reason and enqueue for a new lock - if
6277 * that happens before the list unlink happens, the list would end up
6278 * being corrupted.
6279 *
6280 * The barrier pairs with the LockBufHdr() when enqueuing for another
6281 * lock.
6282 */
6284 waiter->lwWaiting = LW_WS_NOT_WAITING;
6285 PGSemaphoreUnlock(waiter->sem);
6286 }
6287}
6288
6289/*
6290 * Compute subtraction from buffer state for a release of a held lock in
6291 * `mode`.
6292 *
6293 * This is separated from BufferLockUnlock() as we want to combine the lock
6294 * release with other atomic operations when possible, leading to the lock
6295 * release being done in multiple places, each needing to compute what to
6296 * subtract from the lock state.
6297 */
6298static inline uint64
6300{
6301 /*
6302 * Turns out that a switch() leads gcc to generate sufficiently worse code
6303 * for this to show up in profiles...
6304 */
6306 return BM_LOCK_VAL_EXCLUSIVE;
6309 else
6310 {
6312 return BM_LOCK_VAL_SHARED;
6313 }
6314
6315 return 0; /* keep compiler quiet */
6316}
6317
6318/*
6319 * Handle work that needs to be done after releasing a lock that was held in
6320 * `mode`, where `lockstate` is the result of the atomic operation modifying
6321 * the state variable.
6322 *
6323 * This is separated from BufferLockUnlock() as we want to combine the lock
6324 * release with other atomic operations when possible, leading to the lock
6325 * release being done in multiple places.
6326 */
6327static void
6329{
6330 bool check_waiters = false;
6331 bool wake_exclusive = false;
6332
6333 /* nobody else can have that kind of lock */
6335
6336 /*
6337 * If we're still waiting for backends to get scheduled, don't wake them
6338 * up again. Otherwise check if we need to look through the waitqueue to
6339 * wake other backends.
6340 */
6343 {
6344 if ((lockstate & BM_LOCK_MASK) == 0)
6345 {
6346 /*
6347 * We released a lock and the lock was, in that moment, free. We
6348 * therefore can wake waiters for any kind of lock.
6349 */
6350 check_waiters = true;
6351 wake_exclusive = true;
6352 }
6354 {
6355 /*
6356 * We released the lock, but another backend still holds a lock.
6357 * We can't have released an exclusive lock, as there couldn't
6358 * have been other lock holders. If we released a share lock, no
6359 * waiters need to be woken up, as there must be other share
6360 * lockers. However, if we held a share-exclusive lock, another
6361 * backend now could acquire a share-exclusive lock.
6362 */
6363 check_waiters = true;
6364 wake_exclusive = false;
6365 }
6366 }
6367
6368 /*
6369 * As waking up waiters requires the spinlock to be acquired, only do so
6370 * if necessary.
6371 */
6372 if (check_waiters)
6374}
6375
6376/*
6377 * BufferLockHeldByMeInMode - test whether my process holds the content lock
6378 * in the specified mode
6379 *
6380 * This is meant as debug support only.
6381 */
6382static bool
6384{
6385 PrivateRefCountEntry *entry =
6387
6388 if (!entry)
6389 return false;
6390 else
6391 return entry->data.lockmode == mode;
6392}
6393
6394/*
6395 * BufferLockHeldByMe - test whether my process holds the content lock in any
6396 * mode
6397 *
6398 * This is meant as debug support only.
6399 */
6400static bool
6402{
6403 PrivateRefCountEntry *entry =
6405
6406 if (!entry)
6407 return false;
6408 else
6409 return entry->data.lockmode != BUFFER_LOCK_UNLOCK;
6410}
6411
6412/*
6413 * Release the content lock for the buffer.
6414 */
6415void
6417{
6419
6421 if (BufferIsLocal(buffer))
6422 return; /* local buffers need no lock */
6423
6426}
6427
6428/*
6429 * Acquire the content_lock for the buffer.
6430 */
6431void
6433{
6435
6436 /*
6437 * We can't wait if we haven't got a PGPROC. This should only occur
6438 * during bootstrap or shared memory initialization. Put an Assert here
6439 * to catch unsafe coding practices.
6440 */
6442
6443 /* handled in LockBuffer() wrapper */
6445
6447 if (BufferIsLocal(buffer))
6448 return; /* local buffers need no lock */
6449
6451
6452 /*
6453 * Test the most frequent lock modes first. While a switch (mode) would be
6454 * nice, at least gcc generates considerably worse code for it.
6455 *
6456 * Call BufferLockAcquire() with a constant argument for mode, to generate
6457 * more efficient code for the different lock modes.
6458 */
6459 if (mode == BUFFER_LOCK_SHARE)
6461 else if (mode == BUFFER_LOCK_EXCLUSIVE)
6465 else
6466 elog(ERROR, "unrecognized buffer lock mode: %d", mode);
6467}
6468
6469/*
6470 * Acquire the content_lock for the buffer, but only if we don't have to wait.
6471 *
6472 * This assumes the caller wants BUFFER_LOCK_EXCLUSIVE mode.
6473 */
6474bool
6476{
6477 BufferDesc *buf;
6478
6480 if (BufferIsLocal(buffer))
6481 return true; /* act as though we got it */
6482
6484
6486}
6487
6488/*
6489 * Verify that this backend is pinning the buffer exactly once.
6490 *
6491 * NOTE: Like in BufferIsPinned(), what we check here is that *this* backend
6492 * holds a pin on the buffer. We do not care whether some other backend does.
6493 */
6494void
6496{
6497 if (BufferIsLocal(buffer))
6498 {
6499 if (LocalRefCount[-buffer - 1] != 1)
6500 elog(ERROR, "incorrect local pin count: %d",
6501 LocalRefCount[-buffer - 1]);
6502 }
6503 else
6504 {
6505 if (GetPrivateRefCount(buffer) != 1)
6506 elog(ERROR, "incorrect local pin count: %d",
6508 }
6509}
6510
6511/*
6512 * LockBufferForCleanup - lock a buffer in preparation for deleting items
6513 *
6514 * Items may be deleted from a disk page only when the caller (a) holds an
6515 * exclusive lock on the buffer and (b) has observed that no other backend
6516 * holds a pin on the buffer. If there is a pin, then the other backend
6517 * might have a pointer into the buffer (for example, a heapscan reference
6518 * to an item --- see README for more details). It's OK if a pin is added
6519 * after the cleanup starts, however; the newly-arrived backend will be
6520 * unable to look at the page until we release the exclusive lock.
6521 *
6522 * To implement this protocol, a would-be deleter must pin the buffer and
6523 * then call LockBufferForCleanup(). LockBufferForCleanup() is similar to
6524 * LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE), except that it loops until
6525 * it has successfully observed pin count = 1.
6526 */
6527void
6529{
6531 TimestampTz waitStart = 0;
6532 bool waiting = false;
6533 bool logged_recovery_conflict = false;
6534
6537
6539
6540 /*
6541 * We do not yet need to be worried about in-progress AIOs holding a pin,
6542 * as we, so far, only support doing reads via AIO and this function can
6543 * only be called once the buffer is valid (i.e. no read can be in
6544 * flight).
6545 */
6546
6547 /* Nobody else to wait for */
6548 if (BufferIsLocal(buffer))
6549 return;
6550
6552
6553 for (;;)
6554 {
6556 uint64 unset_bits = 0;
6557
6558 /* Try to acquire lock */
6561
6564 {
6565 /* Successfully acquired exclusive lock with pincount 1 */
6567
6568 /*
6569 * Emit the log message if recovery conflict on buffer pin was
6570 * resolved but the startup process waited longer than
6571 * deadlock_timeout for it.
6572 */
6575 waitStart, GetCurrentTimestamp(),
6576 NULL, false);
6577
6578 if (waiting)
6579 {
6580 /* reset ps display to remove the suffix if we added one */
6582 waiting = false;
6583 }
6584 return;
6585 }
6586 /* Failed, so mark myself as waiting for pincount 1 */
6588 {
6591 elog(ERROR, "multiple backends attempting to wait for pincount 1");
6592 }
6593 bufHdr->wait_backend_pgprocno = MyProcNumber;
6597 0);
6599
6600 /* Wait to be signaled by UnpinBuffer() */
6601 if (InHotStandby)
6602 {
6603 if (!waiting)
6604 {
6605 /* adjust the process title to indicate that it's waiting */
6606 set_ps_display_suffix("waiting");
6607 waiting = true;
6608 }
6609
6610 /*
6611 * Emit the log message if the startup process is waiting longer
6612 * than deadlock_timeout for recovery conflict on buffer pin.
6613 *
 6614 * Skip this the first time through, because the startup process has
 6615 * not started waiting yet in that case. So, the wait start
 6616 * timestamp is set after this logic.
6617 */
6618 if (waitStart != 0 && !logged_recovery_conflict)
6619 {
6621
6622 if (TimestampDifferenceExceeds(waitStart, now,
6624 {
6626 waitStart, now, NULL, true);
6628 }
6629 }
6630
6631 /*
6632 * Set the wait start timestamp if logging is enabled and first
6633 * time through.
6634 */
6635 if (log_recovery_conflict_waits && waitStart == 0)
6636 waitStart = GetCurrentTimestamp();
6637
6638 /* Publish the bufid that Startup process waits on */
6640 /* Set alarm and then wait to be signaled by UnpinBuffer() */
6642 /* Reset the published bufid */
6644 }
6645 else
6647
6648 /*
6649 * Remove flag marking us as waiter. Normally this will not be set
6650 * anymore, but ProcWaitForSignal() can return for other signals as
6651 * well. We take care to only reset the flag if we're the waiter, as
6652 * theoretically another backend could have started waiting. That's
6653 * impossible with the current usages due to table level locking, but
6654 * better be safe.
6655 */
6657 if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
6658 bufHdr->wait_backend_pgprocno == MyProcNumber)
6660
6662 0, unset_bits,
6663 0);
6664
6666 /* Loop back and try again */
6667 }
6668}
6669
6670/*
6671 * Check called from ProcessRecoveryConflictInterrupts() when Startup process
6672 * requests cancellation of all pin holders that are blocking it.
6673 */
6674bool
6676{
6678
6679 /*
6680 * If we get woken slowly then it's possible that the Startup process was
6681 * already woken by other backends before we got here. Also possible that
6682 * we get here by multiple interrupts or interrupts at inappropriate
6683 * times, so make sure we do nothing if the bufid is not set.
6684 */
6685 if (bufid < 0)
6686 return false;
6687
6688 if (GetPrivateRefCount(bufid + 1) > 0)
6689 return true;
6690
6691 return false;
6692}
6693
6694/*
6695 * ConditionalLockBufferForCleanup - as above, but don't wait to get the lock
6696 *
6697 * We won't loop, but just check once to see if the pin count is OK. If
6698 * not, return false with no lock held.
6699 */
6700bool
6702{
6705 refcount;
6706
6708
6709 /* see AIO related comment in LockBufferForCleanup() */
6710
6711 if (BufferIsLocal(buffer))
6712 {
6713 refcount = LocalRefCount[-buffer - 1];
6714 /* There should be exactly one pin */
6715 Assert(refcount > 0);
6716 if (refcount != 1)
6717 return false;
6718 /* Nobody else to wait for */
6719 return true;
6720 }
6721
6722 /* There should be exactly one local pin */
6723 refcount = GetPrivateRefCount(buffer);
6724 Assert(refcount);
6725 if (refcount != 1)
6726 return false;
6727
6728 /* Try to acquire lock */
6730 return false;
6731
6735
6736 Assert(refcount > 0);
6737 if (refcount == 1)
6738 {
6739 /* Successfully acquired exclusive lock with pincount 1 */
6741 return true;
6742 }
6743
6744 /* Failed, so release the lock */
6747 return false;
6748}
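
/*
 * Illustrative sketch (not part of bufmgr.c): a typical caller pattern for
 * the cleanup-lock variants above.  "rel" and "blkno" are placeholders; the
 * bufmgr calls themselves are the real public API.
 */
static void
example_try_opportunistic_cleanup(Relation rel, BlockNumber blkno)
{
	Buffer		buf;

	/* Pin and read the block; afterwards we hold exactly one local pin. */
	buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, NULL);

	if (ConditionalLockBufferForCleanup(buf))
	{
		/* Pin count was 1, so the exclusive lock we now hold is a cleanup lock. */
		/* ... prune / defragment the page here ... */
		UnlockReleaseBuffer(buf);
	}
	else
	{
		/*
		 * Someone else holds a pin: skip the non-essential work instead of
		 * blocking in LockBufferForCleanup().
		 */
		ReleaseBuffer(buf);
	}
}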
6749
6750/*
6751 * IsBufferCleanupOK - as above, but we already have the lock
6752 *
6753 * Check whether it's OK to perform cleanup on a buffer we've already
6754 * locked. If we observe that the pin count is 1, our exclusive lock
6755 * happens to be a cleanup lock, and we can proceed with anything that
6756 * would have been allowable had we sought a cleanup lock originally.
6757 */
6758bool
6760{
6763
6765
6766 /* see AIO related comment in LockBufferForCleanup() */
6767
6768 if (BufferIsLocal(buffer))
6769 {
6770 /* There should be exactly one pin */
6771 if (LocalRefCount[-buffer - 1] != 1)
6772 return false;
6773 /* Nobody else to wait for */
6774 return true;
6775 }
6776
6777 /* There should be exactly one local pin */
6778 if (GetPrivateRefCount(buffer) != 1)
6779 return false;
6780
6782
6783 /* caller must hold exclusive lock on buffer */
6785
6787
6790 {
6791 /* pincount is OK. */
6793 return true;
6794 }
6795
6797 return false;
6798}
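
/*
 * Illustrative sketch (not part of bufmgr.c): IsBufferCleanupOK() lets a
 * caller that already holds the exclusive content lock check whether that
 * lock currently amounts to a cleanup lock (pin count 1).
 */
static void
example_cleanup_if_possible(Buffer buf)
{
	/* caller has already pinned buf and taken BUFFER_LOCK_EXCLUSIVE */
	if (IsBufferCleanupOK(buf))
	{
		/* ... work requiring a cleanup lock, e.g. removing dead tuples ... */
	}
	else
	{
		/* only ordinary exclusive-lock work is safe here */
	}
}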
6799
6800
6801/*
6802 * Functions for buffer I/O handling
6803 *
6804 * Also note that these are used only for shared buffers, not local ones.
6805 */
6806
6807/*
6808 * WaitIO -- Block until the IO_IN_PROGRESS flag on 'buf' is cleared.
6809 */
6810static void
6812{
6814
6816 for (;;)
6817 {
6820
6821 /*
6822 * It may not be necessary to acquire the spinlock to check the flag
6823 * here, but since this test is essential for correctness, we'd better
6824 * play it safe.
6825 */
6827
6828 /*
6829 * Copy the wait reference while holding the spinlock. This protects
6830 * against a concurrent TerminateBufferIO() in another backend from
6831 * clearing the wref while it's being read.
6832 */
6833 iow = buf->io_wref;
6835
6836 /* no IO in progress, we don't need to wait */
6838 break;
6839
6840 /*
6841 * The buffer has asynchronous IO in progress, wait for it to
6842 * complete.
6843 */
6844 if (pgaio_wref_valid(&iow))
6845 {
6847
6848 /*
6849 * The AIO subsystem internally uses condition variables and thus
6850 * might remove this backend from the BufferDesc's CV. While that
6851 * wouldn't cause a correctness issue (the first CV sleep just
6852 * immediately returns if not already registered), it seems worth
6853 * avoiding unnecessary loop iterations, given that we take care
6854 * to do so at the start of the function.
6855 */
6857 continue;
6858 }
6859
6860 /* wait on BufferDesc->cv, e.g. for concurrent synchronous IO */
6862 }
6864}
6865
6866/*
6867 * StartBufferIO: begin I/O on this buffer
6868 * (Assumptions)
6869 * My process is executing no IO on this buffer
6870 * The buffer is Pinned
6871 *
6872 * In some scenarios multiple backends could attempt the same I/O operation
6873 * concurrently. If someone else has already started I/O on this buffer then
6874 * we will wait for completion of the IO using WaitIO().
6875 *
6876 * Input operations are only attempted on buffers that are not BM_VALID,
6877 * and output operations only on buffers that are BM_VALID and BM_DIRTY,
6878 * so we can always tell if the work is already done.
6879 *
6880 * Returns true if we successfully marked the buffer as I/O busy,
6881 * false if someone else already did the work.
6882 *
6883 * If nowait is true, then we don't wait for an I/O to be finished by another
6884 * backend. In that case, false indicates either that the I/O was already
6885 * finished or that it is still in progress. This is useful for callers that want to
6886 * find out if they can perform the I/O as part of a larger operation, without
6887 * waiting for the answer or distinguishing the reasons why not.
6888 */
6889bool
6891{
6893
6895
6896 for (;;)
6897 {
6899
6901 break;
6903 if (nowait)
6904 return false;
6905 WaitIO(buf);
6906 }
6907
6908 /* Once we get here, there is definitely no I/O active on this buffer */
6909
6910 /* Check if someone else already did the I/O */
6911 if (forInput ? (buf_state & BM_VALID) : !(buf_state & BM_DIRTY))
6912 {
6914 return false;
6915 }
6916
6919 0);
6920
6923
6924 return true;
6925}
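
/*
 * Illustrative sketch (not part of bufmgr.c): the write-side
 * StartBufferIO()/TerminateBufferIO() protocol, greatly simplified from what
 * FlushBuffer() really does.  Assumes the caller holds a pin and a content
 * lock on the buffer; the actual smgr write is elided.
 */
static void
example_write_protocol(BufferDesc *buf_hdr)
{
	/* Returns false if another backend already wrote the page out. */
	if (!StartBufferIO(buf_hdr, false, false))
		return;

	/* ... checksum the page and hand BufHdrGetBlock(buf_hdr) to smgr ... */

	/* Clear BM_DIRTY unless the page was re-dirtied while being written. */
	TerminateBufferIO(buf_hdr, true, 0, true, false);
}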
6926
6927/*
6928 * TerminateBufferIO: release a buffer we were doing I/O on
6929 * (Assumptions)
6930 * My process is executing IO for the buffer
6931 * BM_IO_IN_PROGRESS bit is set for the buffer
6932 * The buffer is Pinned
6933 *
6934 * If clear_dirty is true and BM_JUST_DIRTIED is not set, we clear the
6935 * buffer's BM_DIRTY flag. This is appropriate when terminating a
6936 * successful write. The check on BM_JUST_DIRTIED is necessary to avoid
6937 * marking the buffer clean if it was re-dirtied while we were writing.
6938 *
6939 * set_flag_bits gets ORed into the buffer's flags. It must include
6940 * BM_IO_ERROR in a failure case. For successful completion it could
6941 * be 0, or BM_VALID if we just finished reading in the page.
6942 *
6943 * If forget_owner is true, we release the buffer I/O from the current
6944 * resource owner. (forget_owner=false is used when the resource owner itself
6945 * is being released)
6946 */
6947void
6949 bool forget_owner, bool release_aio)
6950{
6953 int refcount_change = 0;
6954
6956
6959
6960 /* Clear earlier errors, if this IO failed, it'll be marked again */
6962
6965
6966 if (release_aio)
6967 {
6968 /* release ownership by the AIO subsystem */
6970 refcount_change = -1;
6971 pgaio_wref_clear(&buf->io_wref);
6972 }
6973
6977
6978 if (forget_owner)
6981
6983
6984 /*
6985 * Support LockBufferForCleanup()
6986 *
6987 * We may have just released the last pin other than the waiter's. In most
6988 * cases, this backend holds another pin on the buffer. But, if, for
6989 * example, this backend is completing an IO issued by another backend, it
6990 * may be time to wake the waiter.
6991 */
6994}
6995
6996/*
6997 * AbortBufferIO: Clean up active buffer I/O after an error.
6998 *
6999 * All LWLocks & content locks we might have held have been released, but we
7000 * haven't yet released buffer pins, so the buffer is still pinned.
7001 *
7002 * If I/O was in progress, we always set BM_IO_ERROR, even though it's
7003 * possible the error condition wasn't related to the I/O.
7004 *
7005 * Note: this does not remove the buffer I/O from the resource owner.
7006 * That's correct when we're releasing the whole resource owner, but
7007 * beware if you use this in other contexts.
7008 */
7009static void
7011{
7014
7017
7018 if (!(buf_state & BM_VALID))
7019 {
7022 }
7023 else
7024 {
7027
7028 /* Issue notice if this is not the first failure... */
7029 if (buf_state & BM_IO_ERROR)
7030 {
7031 /* Buffer is pinned, so we can read tag without spinlock */
7034 errmsg("could not write block %u of %s",
7035 buf_hdr->tag.blockNum,
7037 BufTagGetForkNum(&buf_hdr->tag)).str),
7038 errdetail("Multiple failures --- write error might be permanent.")));
7039 }
7040 }
7041
7042 TerminateBufferIO(buf_hdr, false, BM_IO_ERROR, false, false);
7043}
7044
7045/*
7046 * Error context callback for errors occurring during shared buffer writes.
7047 */
7048static void
7050{
7052
7053 /* Buffer is pinned, so we can read the tag without locking the spinlock */
7054 if (bufHdr != NULL)
7055 errcontext("writing block %u of relation \"%s\"",
7056 bufHdr->tag.blockNum,
7058 BufTagGetForkNum(&bufHdr->tag)).str);
7059}
7060
7061/*
7062 * Error context callback for errors occurring during local buffer writes.
7063 */
7064static void
7066{
7068
7069 if (bufHdr != NULL)
7070 errcontext("writing block %u of relation \"%s\"",
7071 bufHdr->tag.blockNum,
7074 BufTagGetForkNum(&bufHdr->tag)).str);
7075}
7076
7077/*
7078 * RelFileLocator qsort/bsearch comparator; see RelFileLocatorEquals.
7079 */
7080static int
7081rlocator_comparator(const void *p1, const void *p2)
7082{
7083 RelFileLocator n1 = *(const RelFileLocator *) p1;
7084 RelFileLocator n2 = *(const RelFileLocator *) p2;
7085
7086 if (n1.relNumber < n2.relNumber)
7087 return -1;
7088 else if (n1.relNumber > n2.relNumber)
7089 return 1;
7090
7091 if (n1.dbOid < n2.dbOid)
7092 return -1;
7093 else if (n1.dbOid > n2.dbOid)
7094 return 1;
7095
7096 if (n1.spcOid < n2.spcOid)
7097 return -1;
7098 else if (n1.spcOid > n2.spcOid)
7099 return 1;
7100 else
7101 return 0;
7102}
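
/*
 * Illustrative sketch (not part of bufmgr.c's real callers, which sort once
 * up front): rlocator_comparator() is designed for qsort()/bsearch(), the
 * way DropRelationsAllBuffers() checks whether a buffer's relation is among
 * the relations being dropped.
 */
static bool
example_locator_is_listed(RelFileLocator *locators, int n,
						  const RelFileLocator *key)
{
	/* sort once ... */
	qsort(locators, n, sizeof(RelFileLocator), rlocator_comparator);

	/* ... then each lookup costs O(log n) */
	return bsearch(key, locators, n, sizeof(RelFileLocator),
				   rlocator_comparator) != NULL;
}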
7103
7104/*
7105 * Lock buffer header - set BM_LOCKED in buffer state.
7106 */
7107uint64
7109{
7111
7113
7114 while (true)
7115 {
7116 /*
7117 * Always try once to acquire the lock directly, without setting up
7118 * the spin-delay infrastructure. The work necessary for that shows up
7119 * in profiles and is rarely necessary.
7120 */
7122 if (likely(!(old_buf_state & BM_LOCKED)))
7123 break; /* got lock */
7124
7125 /* and then spin without atomic operations until lock is released */
7126 {
7128
7130
7131 while (old_buf_state & BM_LOCKED)
7132 {
7135 }
7137 }
7138
7139 /*
7140 * Retry. The lock might obviously already be re-acquired by the time
7141 * we're attempting to get it again.
7142 */
7143 }
7144
7145 return old_buf_state | BM_LOCKED;
7146}
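
/*
 * Illustrative sketch (not part of bufmgr.c): the usual pattern for reading
 * buffer state under the header spinlock.  Assumes "buffer" refers to a
 * shared buffer that the caller has pinned.
 */
static bool
example_buffer_is_dirty(Buffer buffer)
{
	BufferDesc *desc = GetBufferDescriptor(buffer - 1);
	uint64		buf_state;
	bool		dirty;

	buf_state = LockBufHdr(desc);	/* sets BM_LOCKED, returns the state */
	dirty = (buf_state & BM_DIRTY) != 0;
	UnlockBufHdr(desc);				/* clears BM_LOCKED again */

	return dirty;
}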
7147
7148/*
7149 * Wait until the BM_LOCKED flag isn't set anymore and return the buffer's
7150 * state at that point.
7151 *
7152 * Obviously the buffer could be locked by the time the value is returned, so
7153 * this is primarily useful in CAS style loops.
7154 */
7157{
7160
7162
7163 buf_state = pg_atomic_read_u64(&buf->state);
7164
7165 while (buf_state & BM_LOCKED)
7166 {
7168 buf_state = pg_atomic_read_u64(&buf->state);
7169 }
7170
7172
7173 return buf_state;
7174}
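
/*
 * Illustrative sketch (not part of bufmgr.c): the CAS-style loop that
 * WaitBufHdrUnlocked() is intended for -- spin while the header spinlock is
 * held, then try to install an updated state without taking the spinlock.
 * "flag" is a placeholder for one of the BM_* state bits.
 */
static void
example_set_flag_cas(BufferDesc *desc, uint64 flag)
{
	uint64		old_buf_state = pg_atomic_read_u64(&desc->state);

	for (;;)
	{
		if (old_buf_state & BM_LOCKED)
			old_buf_state = WaitBufHdrUnlocked(desc);

		if (pg_atomic_compare_exchange_u64(&desc->state, &old_buf_state,
										   old_buf_state | flag))
			break;
		/* CAS failed: old_buf_state now holds the fresh value, so retry. */
	}
}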
7175
7176/*
7177 * BufferTag comparator.
7178 */
7179static inline int
7181{
7182 int ret;
7185
7188
7190
7191 if (ret != 0)
7192 return ret;
7193
7195 return -1;
7197 return 1;
7198
7199 if (ba->blockNum < bb->blockNum)
7200 return -1;
7201 if (ba->blockNum > bb->blockNum)
7202 return 1;
7203
7204 return 0;
7205}
7206
7207/*
7208 * Comparator determining the writeout order in a checkpoint.
7209 *
7210 * It is important that tablespaces are compared first; the logic balancing
7211 * writes between tablespaces relies on it.
7212 */
7213static inline int
7215{
7216 /* compare tablespace */
7217 if (a->tsId < b->tsId)
7218 return -1;
7219 else if (a->tsId > b->tsId)
7220 return 1;
7221 /* compare relation */
7222 if (a->relNumber < b->relNumber)
7223 return -1;
7224 else if (a->relNumber > b->relNumber)
7225 return 1;
7226 /* compare fork */
7227 else if (a->forkNum < b->forkNum)
7228 return -1;
7229 else if (a->forkNum > b->forkNum)
7230 return 1;
7231 /* compare block number */
7232 else if (a->blockNum < b->blockNum)
7233 return -1;
7234 else if (a->blockNum > b->blockNum)
7235 return 1;
7236 /* equal page IDs are unlikely, but not impossible */
7237 return 0;
7238}
7239
7240/*
7241 * Comparator for a Min-Heap over the per-tablespace checkpoint completion
7242 * progress.
7243 */
7244static int
7246{
7249
7250 /* we want a min-heap, so return 1 when a < b */
7251 if (sa->progress < sb->progress)
7252 return 1;
7253 else if (sa->progress == sb->progress)
7254 return 0;
7255 else
7256 return -1;
7257}
7258
7259/*
7260 * Initialize a writeback context, discarding potential previous state.
7261 *
7262 * *max_pending is a pointer instead of an immediate value, so the coalesce
7263 * limits can easily be changed by the GUC mechanism, and so calling code does
7264 * not have to check the current configuration. A value of 0 means that no
7265 * writeback control will be performed.
7266 */
7267void
7268WritebackContextInit(WritebackContext *context, int *max_pending)
7269{
7270 Assert(*max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
7271
7272 context->max_pending = max_pending;
7273 context->nr_pending = 0;
7274}
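
/*
 * Illustrative sketch (not part of bufmgr.c): initializing a writeback
 * context against one of the *_flush_after GUCs.  Because the limit is
 * passed by pointer, a later change of checkpoint_flush_after (e.g. via
 * SIGHUP) takes effect without re-initializing the context.
 */
static WritebackContext example_wb_context;

static void
example_init_writeback(void)
{
	WritebackContextInit(&example_wb_context, &checkpoint_flush_after);
}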
7275
7276/*
7277 * Add buffer to list of pending writeback requests.
7278 */
7279void
7281 BufferTag *tag)
7282{
7283 PendingWriteback *pending;
7284
7285 /*
7286 * As pg_flush_data() doesn't do anything with fsync disabled, there's no
7287 * point in tracking in that case.
7288 */
7290 !enableFsync)
7291 return;
7292
7293 /*
7294 * Add buffer to the pending writeback array, unless writeback control is
7295 * disabled.
7296 */
7297 if (*wb_context->max_pending > 0)
7298 {
7300
7301 pending = &wb_context->pending_writebacks[wb_context->nr_pending++];
7302
7303 pending->tag = *tag;
7304 }
7305
7306 /*
7307 * Perform pending flushes if the writeback limit is exceeded. This
7308 * includes the case where previously an item has been added, but control
7309 * is now disabled.
7310 */
7311 if (wb_context->nr_pending >= *wb_context->max_pending)
7313}
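
/*
 * Illustrative sketch (not part of bufmgr.c): how ScheduleBufferTagForWriteback()
 * and IssuePendingWritebacks() are typically paired.  Each written buffer's tag
 * is queued; once *max_pending tags accumulate they are flushed automatically,
 * and any leftovers are issued explicitly when the scan finishes.
 */
static void
example_writeback_pairing(WritebackContext *wb_context, BufferDesc *buf_hdr)
{
	BufferTag	tag = buf_hdr->tag;

	/* ... after this buffer has been written out via smgr ... */
	ScheduleBufferTagForWriteback(wb_context, IOCONTEXT_NORMAL, &tag);

	/* ... and once the whole scan is complete ... */
	IssuePendingWritebacks(wb_context, IOCONTEXT_NORMAL);
}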
7314
7315#define ST_SORT sort_pending_writebacks
7316#define ST_ELEMENT_TYPE PendingWriteback
7317#define ST_COMPARE(a, b) buffertag_comparator(&a->tag, &b->tag)
7318#define ST_SCOPE static
7319#define ST_DEFINE
7320#include "lib/sort_template.h"
7321
7322/*
7323 * Issue all pending writeback requests, previously scheduled with
7324 * ScheduleBufferTagForWriteback, to the OS.
7325 *
7326 * Because this is only used to improve the OS's IO scheduling, we try to never
7327 * error out - it's just a hint.
7328 */
7329void
7331{
7333 int i;
7334
7335 if (wb_context->nr_pending == 0)
7336 return;
7337
7338 /*
7339 * Executing the writes in-order can make them a lot faster, and allows us to
7340 * merge writeback requests to consecutive blocks into larger writebacks.
7341 */
7342 sort_pending_writebacks(wb_context->pending_writebacks,
7343 wb_context->nr_pending);
7344
7346
7347 /*
7348 * Coalesce neighbouring writes, but nothing else. For that we iterate
7349 * through the now-sorted array of pending flushes, and look forward to
7350 * find all neighbouring (or identical) writes.
7351 */
7352 for (i = 0; i < wb_context->nr_pending; i++)
7353 {
7357 int ahead;
7358 BufferTag tag;
7360 Size nblocks = 1;
7361
7362 cur = &wb_context->pending_writebacks[i];
7363 tag = cur->tag;
7365
7366 /*
7367 * Peek ahead, into following writeback requests, to see if they can
7368 * be combined with the current one.
7369 */
7370 for (ahead = 0; i + ahead + 1 < wb_context->nr_pending; ahead++)
7371 {
7372
7373 next = &wb_context->pending_writebacks[i + ahead + 1];
7374
7375 /* different file, stop */
7377 BufTagGetRelFileLocator(&next->tag)) ||
7378 BufTagGetForkNum(&cur->tag) != BufTagGetForkNum(&next->tag))
7379 break;
7380
7381 /* ok, block queued twice, skip */
7382 if (cur->tag.blockNum == next->tag.blockNum)
7383 continue;
7384
7385 /* only merge consecutive writes */
7386 if (cur->tag.blockNum + 1 != next->tag.blockNum)
7387 break;
7388
7389 nblocks++;
7390 cur = next;
7391 }
7392
7393 i += ahead;
7394
7395 /* and finally tell the kernel to write the data to storage */
7397 smgrwriteback(reln, BufTagGetForkNum(&tag), tag.blockNum, nblocks);
7398 }
7399
7400 /*
7401 * Assume that writeback requests are only issued for buffers containing
7402 * blocks of permanent relations.
7403 */
7405 IOOP_WRITEBACK, io_start, wb_context->nr_pending, 0);
7406
7407 wb_context->nr_pending = 0;
7408}
7409
7410/* ResourceOwner callbacks */
7411
7412static void
7419
7420static char *
7422{
7424
7425 return psprintf("lost track of buffer IO on buffer %d", buffer);
7426}
7427
7428/*
7429 * Release buffer as part of resource owner cleanup. This will only be called
7430 * if the buffer is pinned. If this backend held the content lock at the time
7431 * of the error we also need to release that (note that it is not possible to
7432 * hold a content lock without a pin).
7433 */
7434static void
7436{
7438
7439 /* Like ReleaseBuffer, but don't call ResourceOwnerForgetBuffer */
7440 if (!BufferIsValid(buffer))
7441 elog(ERROR, "bad buffer ID: %d", buffer);
7442
7443 if (BufferIsLocal(buffer))
7445 else
7446 {
7448
7450
7451 /* not having a private refcount would imply resowner corruption */
7452 Assert(ref != NULL);
7453
7454 /*
7455 * If the buffer was locked at the time of the resowner release,
7456 * release the lock now. This should only happen after errors.
7457 */
7458 if (ref->data.lockmode != BUFFER_LOCK_UNLOCK)
7459 {
7461
7462 HOLD_INTERRUPTS(); /* match the upcoming RESUME_INTERRUPTS */
7464 }
7465
7467 }
7468}
7469
7470static char *
7475
7476/*
7477 * Helper function to evict unpinned buffer whose buffer header lock is
7478 * already acquired.
7479 */
7480static bool
7482{
7484 bool result;
7485
7486 *buffer_flushed = false;
7487
7490
7491 if ((buf_state & BM_VALID) == 0)
7492 {
7493 UnlockBufHdr(desc);
7494 return false;
7495 }
7496
7497 /* Check that it's not pinned already. */
7499 {
7500 UnlockBufHdr(desc);
7501 return false;
7502 }
7503
7504 PinBuffer_Locked(desc); /* releases spinlock */
7505
7506 /* If it was dirty, try to clean it once. */
7507 if (buf_state & BM_DIRTY)
7508 {
7510 *buffer_flushed = true;
7511 }
7512
7513 /* This will return false if it becomes dirty or someone else pins it. */
7514 result = InvalidateVictimBuffer(desc);
7515
7516 UnpinBuffer(desc);
7517
7518 return result;
7519}
7520
7521/*
7522 * Try to evict the current block in a shared buffer.
7523 *
7524 * This function is intended for testing/development use only!
7525 *
7526 * To succeed, the buffer must not be pinned on entry, so if the caller had a
7527 * particular block in mind, it might already have been replaced by some other
7528 * block by the time this function runs. It's also unpinned on return, so the
7529 * buffer might be occupied again by the time control is returned, potentially
7530 * even by the same block. This inherent raciness without other interlocking
7531 * makes the function unsuitable for non-testing usage.
7532 *
7533 * *buffer_flushed is set to true if the buffer was dirty and has been
7534 * flushed, false otherwise. However, *buffer_flushed=true does not
7535 * necessarily mean that we flushed the buffer; it could have been flushed by
7536 * someone else.
7537 *
7538 * Returns true if the buffer was valid and it has now been made invalid.
7539 * Returns false if it wasn't valid, if it couldn't be evicted due to a pin,
7540 * or if the buffer becomes dirty again while we're trying to write it out.
7541 */
7542bool
7544{
7545 BufferDesc *desc;
7546
7548
7549 /* Make sure we can pin the buffer. */
7552
7553 desc = GetBufferDescriptor(buf - 1);
7554 LockBufHdr(desc);
7555
7557}
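
/*
 * Illustrative sketch (not part of bufmgr.c): a testing-only caller of
 * EvictUnpinnedBuffer(), similar in spirit to pg_buffercache's eviction
 * functions.  "buf" must identify a shared buffer.
 */
static void
example_evict(Buffer buf)
{
	bool		flushed;

	if (EvictUnpinnedBuffer(buf, &flushed))
		elog(DEBUG1, "evicted buffer %d%s", buf,
			 flushed ? " (flushed first)" : "");
	else
		elog(DEBUG1, "buffer %d could not be evicted", buf);
}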
7558
7559/*
7560 * Try to evict all the shared buffers.
7561 *
7562 * This function is intended for testing/development use only! See
7563 * EvictUnpinnedBuffer().
7564 *
7565 * The buffers_* parameters are mandatory and indicate the total count of
7566 * buffers that:
7567 * - buffers_evicted - were evicted
7568 * - buffers_flushed - were flushed
7569 * - buffers_skipped - could not be evicted
7570 */
7571void
7574{
7575 *buffers_evicted = 0;
7576 *buffers_skipped = 0;
7577 *buffers_flushed = 0;
7578
7579 for (int buf = 1; buf <= NBuffers; buf++)
7580 {
7581 BufferDesc *desc = GetBufferDescriptor(buf - 1);
7583 bool buffer_flushed;
7584
7586
7588 if (!(buf_state & BM_VALID))
7589 continue;
7590
7593
7594 LockBufHdr(desc);
7595
7597 (*buffers_evicted)++;
7598 else
7599 (*buffers_skipped)++;
7600
7601 if (buffer_flushed)
7602 (*buffers_flushed)++;
7603 }
7604}
7605
7606/*
7607 * Try to evict all the shared buffers containing provided relation's pages.
7608 *
7609 * This function is intended for testing/development use only! See
7610 * EvictUnpinnedBuffer().
7611 *
7612 * The caller must hold at least AccessShareLock on the relation to prevent
7613 * the relation from being dropped.
7614 *
7615 * The buffers_* parameters are mandatory and indicate the total count of
7616 * buffers that:
7617 * - buffers_evicted - were evicted
7618 * - buffers_flushed - were flushed
7619 * - buffers_skipped - could not be evicted
7620 */
7621void
7624{
7626
7627 *buffers_skipped = 0;
7628 *buffers_evicted = 0;
7629 *buffers_flushed = 0;
7630
7631 for (int buf = 1; buf <= NBuffers; buf++)
7632 {
7633 BufferDesc *desc = GetBufferDescriptor(buf - 1);
7635 bool buffer_flushed;
7636
7638
7639 /* An unlocked precheck should be safe and saves some cycles. */
7640 if ((buf_state & BM_VALID) == 0 ||
7642 continue;
7643
7644 /* Make sure we can pin the buffer. */
7647
7648 buf_state = LockBufHdr(desc);
7649
7650 /* recheck, could have changed without the lock */
7651 if ((buf_state & BM_VALID) == 0 ||
7653 {
7654 UnlockBufHdr(desc);
7655 continue;
7656 }
7657
7659 (*buffers_evicted)++;
7660 else
7661 (*buffers_skipped)++;
7662
7663 if (buffer_flushed)
7664 (*buffers_flushed)++;
7665 }
7666}
7667
7668/*
7669 * Helper function to mark unpinned buffer dirty whose buffer header lock is
7670 * already acquired.
7671 */
7672static bool
7675{
7677 bool result = false;
7678
7679 *buffer_already_dirty = false;
7680
7683
7684 if ((buf_state & BM_VALID) == 0)
7685 {
7686 UnlockBufHdr(desc);
7687 return false;
7688 }
7689
7690 /* Check that it's not pinned already. */
7692 {
7693 UnlockBufHdr(desc);
7694 return false;
7695 }
7696
7697 /* Pin the buffer and then release the buffer spinlock */
7698 PinBuffer_Locked(desc);
7699
7700 /* If it was not already dirty, mark it as dirty. */
7701 if (!(buf_state & BM_DIRTY))
7702 {
7705 result = true;
7706 BufferLockUnlock(buf, desc);
7707 }
7708 else
7709 *buffer_already_dirty = true;
7710
7711 UnpinBuffer(desc);
7712
7713 return result;
7714}
7715
7716/*
7717 * Try to mark the provided shared buffer as dirty.
7718 *
7719 * This function is intended for testing/development use only!
7720 *
7721 * Same as EvictUnpinnedBuffer() but with MarkBufferDirty() call inside.
7722 *
7723 * The buffer_already_dirty parameter is mandatory and indicates whether the buffer
7724 * could not be dirtied because it is already dirty.
7725 *
7726 * Returns true if the buffer has successfully been marked as dirty.
7727 */
7728bool
7730{
7731 BufferDesc *desc;
7732 bool buffer_dirtied = false;
7733
7735
7736 /* Make sure we can pin the buffer. */
7739
7740 desc = GetBufferDescriptor(buf - 1);
7741 LockBufHdr(desc);
7742
7744 /* Both cannot be true at the same time */
7746
7747 return buffer_dirtied;
7748}
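
/*
 * Illustrative sketch (not part of bufmgr.c): a testing-only caller of
 * MarkDirtyUnpinnedBuffer(), distinguishing the three possible outcomes.
 */
static void
example_mark_dirty(Buffer buf)
{
	bool		already_dirty;

	if (MarkDirtyUnpinnedBuffer(buf, &already_dirty))
		elog(DEBUG1, "buffer %d newly dirtied", buf);
	else if (already_dirty)
		elog(DEBUG1, "buffer %d was already dirty", buf);
	else
		elog(DEBUG1, "buffer %d could not be dirtied (invalid or pinned)", buf);
}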
7749
7750/*
7751 * Try to mark all the shared buffers containing provided relation's pages as
7752 * dirty.
7753 *
7754 * This function is intended for testing/development use only! See
7755 * MarkDirtyUnpinnedBuffer().
7756 *
7757 * The buffers_* parameters are mandatory and indicate the total count of
7758 * buffers that:
7759 * - buffers_dirtied - were dirtied
7760 * - buffers_already_dirty - were already dirty
7761 * - buffers_skipped - could not be dirtied for a reason other than
7762 * the buffer already being dirty.
7763 */
7764void
7769{
7771
7772 *buffers_dirtied = 0;
7774 *buffers_skipped = 0;
7775
7776 for (int buf = 1; buf <= NBuffers; buf++)
7777 {
7778 BufferDesc *desc = GetBufferDescriptor(buf - 1);
7781
7783
7784 /* An unlocked precheck should be safe and saves some cycles. */
7785 if ((buf_state & BM_VALID) == 0 ||
7787 continue;
7788
7789 /* Make sure we can pin the buffer. */
7792
7793 buf_state = LockBufHdr(desc);
7794
7795 /* recheck, could have changed without the lock */
7796 if ((buf_state & BM_VALID) == 0 ||
7798 {
7799 UnlockBufHdr(desc);
7800 continue;
7801 }
7802
7804 (*buffers_dirtied)++;
7805 else if (buffer_already_dirty)
7806 (*buffers_already_dirty)++;
7807 else
7808 (*buffers_skipped)++;
7809 }
7810}
7811
7812/*
7813 * Try to mark all the shared buffers as dirty.
7814 *
7815 * This function is intended for testing/development use only! See
7816 * MarkDirtyUnpinnedBuffer().
7817 *
7818 * See MarkDirtyRelUnpinnedBuffers() above for details about the buffers_*
7819 * parameters.
7820 */
7821void
7825{
7826 *buffers_dirtied = 0;
7828 *buffers_skipped = 0;
7829
7830 for (int buf = 1; buf <= NBuffers; buf++)
7831 {
7832 BufferDesc *desc = GetBufferDescriptor(buf - 1);
7835
7837
7839 if (!(buf_state & BM_VALID))
7840 continue;
7841
7844
7845 LockBufHdr(desc);
7846
7848 (*buffers_dirtied)++;
7849 else if (buffer_already_dirty)
7850 (*buffers_already_dirty)++;
7851 else
7852 (*buffers_skipped)++;
7853 }
7854}
7855
7856/*
7857 * Generic implementation of the AIO handle staging callback for readv/writev
7858 * on local/shared buffers.
7859 *
7860 * Each readv/writev can target multiple buffers. The buffers have already
7861 * been registered with the IO handle.
7862 *
7863 * To make the IO ready for execution ("staging"), we need to ensure that the
7864 * targeted buffers are in an appropriate state while the IO is ongoing. For
7865 * that the AIO subsystem needs to have its own buffer pin, otherwise an error
7866 * in this backend could lead to this backend's buffer pin being released as
7867 * part of error handling, which in turn could lead to the buffer being
7868 * replaced while IO is ongoing.
7869 */
7872{
7873 uint64 *io_data;
7874 uint8 handle_data_len;
7877
7878 io_data = pgaio_io_get_handle_data(ioh, &handle_data_len);
7879
7881
7882 /* iterate over all buffers affected by the vectored readv/writev */
7883 for (int i = 0; i < handle_data_len; i++)
7884 {
7886 BufferDesc *buf_hdr = is_temp ?
7890
7891 /*
7892 * Check that all the buffers are actually ones that could conceivably
7893 * be done in one IO, i.e. are sequential. This is the last
7894 * buffer-aware code before IO is actually executed, and confusion
7895 * about which buffers are targeted by IO can be hard to debug, making
7896 * it worth doing extra-paranoid checks.
7897 */
7898 if (i == 0)
7899 first = buf_hdr->tag;
7900 else
7901 {
7902 Assert(buf_hdr->tag.relNumber == first.relNumber);
7903 Assert(buf_hdr->tag.blockNum == first.blockNum + i);
7904 }
7905
7906 if (is_temp)
7908 else
7910
7911 /* verify the buffer is in the expected state */
7913 if (is_write)
7914 {
7917 }
7918 else
7919 {
7922 }
7923
7924 /* temp buffers don't use BM_IO_IN_PROGRESS */
7925 if (!is_temp)
7927
7929
7930 /*
7931 * Reflect that the buffer is now owned by the AIO subsystem.
7932 *
7933 * For local buffers: This can't be done just via LocalRefCount, as
7934 * one might initially think, as this backend could error out while
7935 * AIO is still in progress, releasing all the pins by the backend
7936 * itself.
7937 *
7938 * This pin is released again in TerminateBufferIO().
7939 */
7940 buf_hdr->io_wref = io_ref;
7941
7942 if (is_temp)
7943 {
7946 }
7947 else
7949
7950 /*
7951 * Ensure the content lock that prevents buffer modifications while
7952 * the buffer is being written out is not released early due to an
7953 * error.
7954 */
7955 if (is_write && !is_temp)
7956 {
7958
7959 /*
7960 * Lock is now owned by AIO subsystem.
7961 */
7963 }
7964
7965 /*
7966 * Stop tracking this buffer via the resowner - the AIO system now
7967 * keeps track.
7968 */
7969 if (!is_temp)
7971 }
7972}
7973
7974/*
7975 * Decode readv errors as encoded by buffer_readv_encode_error().
7976 */
7977static inline void
7979 bool *zeroed_any,
7980 bool *ignored_any,
7984{
7985 uint32 rem_error = result.error_data;
7986
7987 /* see static asserts in buffer_readv_encode_error */
7988#define READV_COUNT_BITS 7
7989#define READV_COUNT_MASK ((1 << READV_COUNT_BITS) - 1)
7990
7991 *zeroed_any = rem_error & 1;
7992 rem_error >>= 1;
7993
7994 *ignored_any = rem_error & 1;
7995 rem_error >>= 1;
7996
7999
8002
8005}
8006
8007/*
8008 * Helper to encode errors for buffer_readv_complete()
8009 *
8010 * Errors are encoded as follows:
8011 * - bit 0 indicates whether any page was zeroed (1) or not (0)
8012 * - bit 1 indicates whether any checksum failure was ignored (1) or not (0)
8013 * - next READV_COUNT_BITS bits indicate the number of errored or zeroed pages
8014 * - next READV_COUNT_BITS bits indicate the number of checksum failures
8015 * - next READV_COUNT_BITS bits indicate the first offset of the first page
8016 * that was errored or zeroed or, if no errors/zeroes, the first ignored
8017 * checksum
8018 */
8019static inline void
8021 bool is_temp,
8022 bool zeroed_any,
8023 bool ignored_any,
8030{
8031
8032 uint8 shift = 0;
8036
8038 "PG_IOV_MAX is bigger than reserved space for error data");
8040 "PGAIO_RESULT_ERROR_BITS is insufficient for buffer_readv");
8041
8042 /*
8043 * We only have space to encode one offset - but luckily that's good
8044 * enough. If there is an error, the error is the interesting offset; the same
8045 * goes for a zeroed buffer vs. an ignored buffer.
8046 */
8047 if (error_count > 0)
8049 else if (zeroed_count > 0)
8051 else
8053
8054 Assert(!zeroed_any || error_count == 0);
8055
8056 result->error_data = 0;
8057
8058 result->error_data |= zeroed_any << shift;
8059 shift += 1;
8060
8061 result->error_data |= ignored_any << shift;
8062 shift += 1;
8063
8064 result->error_data |= ((uint32) zeroed_or_error_count) << shift;
8065 shift += READV_COUNT_BITS;
8066
8067 result->error_data |= ((uint32) checkfail_count) << shift;
8068 shift += READV_COUNT_BITS;
8069
8070 result->error_data |= ((uint32) first_off) << shift;
8071 shift += READV_COUNT_BITS;
8072
8073 result->id = is_temp ? PGAIO_HCB_LOCAL_BUFFER_READV :
8075
8076 if (error_count > 0)
8077 result->status = PGAIO_RS_ERROR;
8078 else
8079 result->status = PGAIO_RS_WARNING;
8080
8081 /*
8082 * The encoding is complicated enough to warrant cross-checking it against
8083 * the decode function.
8084 */
8085#ifdef USE_ASSERT_CHECKING
8086 {
8087 bool zeroed_any_2,
8092
8097 &first_off_2);
8103 }
8104#endif
8105
8106#undef READV_COUNT_BITS
8107#undef READV_COUNT_MASK
8108}
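
/*
 * Illustrative sketch (not part of bufmgr.c): the bit layout used by
 * buffer_readv_encode_error()/buffer_readv_decode_error(), shown as a
 * standalone round trip with the same 7-bit count fields.
 */
#include <assert.h>
#include <stdint.h>

#define EX_COUNT_BITS 7
#define EX_COUNT_MASK ((1u << EX_COUNT_BITS) - 1)

static uint32_t
example_encode(unsigned zeroed_any, unsigned ignored_any,
			   unsigned count, unsigned checkfail_count, unsigned first_off)
{
	uint32_t	e = 0;
	unsigned	shift = 0;

	e |= zeroed_any << shift;
	shift += 1;
	e |= ignored_any << shift;
	shift += 1;
	e |= count << shift;
	shift += EX_COUNT_BITS;
	e |= checkfail_count << shift;
	shift += EX_COUNT_BITS;
	e |= first_off << shift;

	return e;
}

int
main(void)
{
	/* e.g. 3 zeroed pages, 2 ignored checksum failures, first offset 5 */
	uint32_t	e = example_encode(1, 1, 3, 2, 5);

	assert((e & 1) == 1);									/* zeroed_any */
	assert(((e >> 1) & 1) == 1);							/* ignored_any */
	assert(((e >> 2) & EX_COUNT_MASK) == 3);				/* zeroed/error count */
	assert(((e >> (2 + EX_COUNT_BITS)) & EX_COUNT_MASK) == 2);		/* checksum failures */
	assert(((e >> (2 + 2 * EX_COUNT_BITS)) & EX_COUNT_MASK) == 5);	/* first offset */
	return 0;
}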
8109
8110/*
8111 * Helper for AIO readv completion callbacks, supporting both shared and temp
8112 * buffers. Gets called once for each buffer in a multi-page read.
8113 */
8116 uint8 flags, bool failed, bool is_temp,
8117 bool *buffer_invalid,
8118 bool *failed_checksum,
8119 bool *ignored_checksum,
8120 bool *zeroed_buffer)
8121{
8122 BufferDesc *buf_hdr = is_temp ?
8125 BufferTag tag = buf_hdr->tag;
8126 char *bufdata = BufferGetBlock(buffer);
8128 int piv_flags;
8129
8130 /* check that the buffer is in the expected state for a read */
8131#ifdef USE_ASSERT_CHECKING
8132 {
8134
8137 /* temp buffers don't use BM_IO_IN_PROGRESS */
8138 if (!is_temp)
8141 }
8142#endif
8143
8144 *buffer_invalid = false;
8145 *failed_checksum = false;
8146 *ignored_checksum = false;
8147 *zeroed_buffer = false;
8148
8149 /*
8150 * We ask PageIsVerified() to only log the message about checksum errors,
8151 * as the completion might be run in any backend (or IO workers). We will
8152 * report checksum errors in buffer_readv_report().
8153 */
8155
8156 /* the local zero_damaged_pages may differ from the definer's */
8159
8160 /* Check for garbage data. */
8161 if (!failed)
8162 {
8163 /*
8164 * If the buffer is not currently pinned by this backend, e.g. because
8165 * we're completing this IO after an error, the buffer data will have
8166 * been marked as inaccessible when the buffer was unpinned. The AIO
8167 * subsystem holds a pin, but that doesn't prevent the buffer from
8168 * having been marked as inaccessible. The completion might also be
8169 * executed in a different process.
8170 */
8171#ifdef USE_VALGRIND
8172 if (!BufferIsPinned(buffer))
8174#endif
8175
8176 if (!PageIsVerified((Page) bufdata, tag.blockNum, piv_flags,
8178 {
8179 if (flags & READ_BUFFERS_ZERO_ON_ERROR)
8180 {
8181 memset(bufdata, 0, BLCKSZ);
8182 *zeroed_buffer = true;
8183 }
8184 else
8185 {
8186 *buffer_invalid = true;
8187 /* mark buffer as having failed */
8188 failed = true;
8189 }
8190 }
8191 else if (*failed_checksum)
8192 *ignored_checksum = true;
8193
8194 /* undo what we did above */
8195#ifdef USE_VALGRIND
8196 if (!BufferIsPinned(buffer))
8198#endif
8199
8200 /*
8201 * Immediately log a message about the invalid page, but only to the
8202 * server log. The reason to do so immediately is that this may be
8203 * executed in a different backend than the one that originated the
8204 * request. Another reason is that the originator
8205 * might not process the query result immediately (because it is busy
8206 * doing another part of query processing) or at all (e.g. if it was
8207 * cancelled or errored out due to another IO also failing). The
8208 * definer of the IO will emit an ERROR or WARNING when processing the
8209 * IO's results.
8210 *
8211 * To avoid duplicating the code to emit these log messages, we reuse
8212 * buffer_readv_report().
8213 */
8215 {
8216 PgAioResult result_one = {0};
8217
8222 *zeroed_buffer ? 1 : 0,
8223 *failed_checksum ? 1 : 0,
8226 }
8227 }
8228
8229 /* Terminate I/O and set BM_VALID. */
8230 set_flag_bits = failed ? BM_IO_ERROR : BM_VALID;
8231 if (is_temp)
8233 else
8234 TerminateBufferIO(buf_hdr, false, set_flag_bits, false, true);
8235
8236 /*
8237 * Call the BUFFER_READ_DONE tracepoint in the callback, even though the
8238 * callback may not be executed in the same backend that called
8239 * BUFFER_READ_START. The alternative would be to defer calling the
8240 * tracepoint to a later point (e.g. the local completion callback for
8241 * shared buffer reads), which seems even less helpful.
8242 */
8244 tag.blockNum,
8245 tag.spcOid,
8246 tag.dbOid,
8247 tag.relNumber,
8249 false);
8250}
8251
8252/*
8253 * Perform completion handling of a single AIO read. This read may cover
8254 * multiple blocks / buffers.
8255 *
8256 * Shared between shared and local buffers, to reduce code duplication.
8257 */
8260 uint8 cb_data, bool is_temp)
8261{
8262 PgAioResult result = prior_result;
8267 uint8 error_count = 0;
8268 uint8 zeroed_count = 0;
8269 uint8 ignored_count = 0;
8271 uint64 *io_data;
8272 uint8 handle_data_len;
8273
8274 if (is_temp)
8275 {
8276 Assert(td->smgr.is_temp);
8278 }
8279 else
8280 Assert(!td->smgr.is_temp);
8281
8282 /*
8283 * Iterate over all the buffers affected by this IO and call the
8284 * per-buffer completion function for each buffer.
8285 */
8286 io_data = pgaio_io_get_handle_data(ioh, &handle_data_len);
8287 for (uint8 buf_off = 0; buf_off < handle_data_len; buf_off++)
8288 {
8290 bool failed;
8291 bool failed_verification = false;
8292 bool failed_checksum = false;
8293 bool zeroed_buffer = false;
8294 bool ignored_checksum = false;
8295
8297
8298 /*
8299 * If the entire I/O failed on a lower-level, each buffer needs to be
8300 * marked as failed. In case of a partial read, the first few buffers
8301 * may be ok.
8302 */
8303 failed =
8305 || prior_result.result <= buf_off;
8306
8307 buffer_readv_complete_one(td, buf_off, buf, cb_data, failed, is_temp,
8311 &zeroed_buffer);
8312
8313 /*
8314 * Track information about the number of different kinds of error
8315 * conditions across all pages, as there can be multiple pages failing
8316 * verification as part of one IO.
8317 */
8320 if (zeroed_buffer && zeroed_count++ == 0)
8322 if (ignored_checksum && ignored_count++ == 0)
8324 if (failed_checksum)
8326 }
8327
8328 /*
8329 * If the smgr read succeeded [partially] and page verification failed for
8330 * some of the pages, adjust the IO's result state appropriately.
8331 */
8332 if (prior_result.status != PGAIO_RS_ERROR &&
8333 (error_count > 0 || ignored_count > 0 || zeroed_count > 0))
8334 {
8335 buffer_readv_encode_error(&result, is_temp,
8336 zeroed_count > 0, ignored_count > 0,
8340 pgaio_result_report(result, td, DEBUG1);
8341 }
8342
8343 /*
8344 * For shared relations this reporting is done in
8345 * shared_buffer_readv_complete_local().
8346 */
8347 if (is_temp && checkfail_count > 0)
8350
8351 return result;
8352}
8353
8354/*
8355 * AIO error reporting callback for aio_shared_buffer_readv_cb and
8356 * aio_local_buffer_readv_cb.
8357 *
8358 * The error is encoded / decoded in buffer_readv_encode_error() /
8359 * buffer_readv_decode_error().
8360 */
8361static void
8363 int elevel)
8364{
8365 int nblocks = td->smgr.nblocks;
8366 BlockNumber first = td->smgr.blockNum;
8367 BlockNumber last = first + nblocks - 1;
8370 RelPathStr rpath =
8372 bool zeroed_any,
8376 first_off;
8378 const char *msg_one,
8379 *msg_mult,
8380 *det_mult,
8381 *hint_mult;
8382
8386 &first_off);
8387
8388 /*
8389 * Treat a read that had both zeroed buffers *and* ignored checksums as a
8390 * special case; it's too irregular to be emitted the same way as the
8391 * other cases.
8392 */
8393 if (zeroed_any && ignored_any)
8394 {
8396 Assert(nblocks > 1); /* same block can't be both zeroed and ignored */
8397 Assert(result.status != PGAIO_RS_ERROR);
8399
8400 ereport(elevel,
8402 errmsg("zeroing %u page(s) and ignoring %u checksum failure(s) among blocks %u..%u of relation \"%s\"",
8403 affected_count, checkfail_count, first, last, rpath.str),
8404 affected_count > 1 ?
8405 errdetail("Block %u held the first zeroed page.",
8406 first + first_off) : 0,
8407 errhint_plural("See server log for details about the other %d invalid block.",
8408 "See server log for details about the other %d invalid blocks.",
8411 return;
8412 }
8413
8414 /*
8415 * The other messages are highly repetitive. To avoid duplicating a long
8416 * and complicated ereport(), gather the translated format strings
8417 * separately and then do one common ereport.
8418 */
8419 if (result.status == PGAIO_RS_ERROR)
8420 {
8421 Assert(!zeroed_any); /* can't have invalid pages when zeroing them */
8423 msg_one = _("invalid page in block %u of relation \"%s\"");
8424 msg_mult = _("%u invalid pages among blocks %u..%u of relation \"%s\"");
8425 det_mult = _("Block %u held the first invalid page.");
8426 hint_mult = _("See server log for the other %u invalid block(s).");
8427 }
8428 else if (zeroed_any && !ignored_any)
8429 {
8431 msg_one = _("invalid page in block %u of relation \"%s\"; zeroing out page");
8432 msg_mult = _("zeroing out %u invalid pages among blocks %u..%u of relation \"%s\"");
8433 det_mult = _("Block %u held the first zeroed page.");
8434 hint_mult = _("See server log for the other %u zeroed block(s).");
8435 }
8436 else if (!zeroed_any && ignored_any)
8437 {
8439 msg_one = _("ignoring checksum failure in block %u of relation \"%s\"");
8440 msg_mult = _("ignoring %u checksum failures among blocks %u..%u of relation \"%s\"");
8441 det_mult = _("Block %u held the first ignored page.");
8442 hint_mult = _("See server log for the other %u ignored block(s).");
8443 }
8444 else
8446
8447 ereport(elevel,
8449 affected_count == 1 ?
8450 errmsg_internal(msg_one, first + first_off, rpath.str) :
8451 errmsg_internal(msg_mult, affected_count, first, last, rpath.str),
8454}
8455
8456static void
8461
8462static PgAioResult
8468
8469/*
8470 * We need a backend-local completion callback for shared buffers, to be able
8471 * to report checksum errors correctly. Unfortunately that can only safely
8472 * happen if the reporting backend has previously called
8473 * pgstat_prepare_report_checksum_failure(), which we can only guarantee in
8474 * the backend that started the IO. Hence this callback.
8475 */
8476static PgAioResult
8506
8507static void
8512
8513static PgAioResult
8519
8520/* readv callback is passed READ_BUFFERS_* flags as callback data */
8523 .complete_shared = shared_buffer_readv_complete,
8524 /* need a local callback to report checksum failures */
8525 .complete_local = shared_buffer_readv_complete_local,
8526 .report = buffer_readv_report,
8527};
8528
8529/* readv callback is passed READ_BUFFERS_* flags as callback data */
8532
8533 /*
8534 * Note that this, in contrast to the shared_buffers case, uses
8535 * complete_local, as only the issuing backend has access to the required
8536 * data structures. This is important in case the IO completion is
8537 * consumed incidentally by another backend.
8538 */
8539 .complete_local = local_buffer_readv_complete,
8540 .report = buffer_readv_report,
8541};
int io_method
Definition aio.c:74
bool pgaio_wref_valid(PgAioWaitRef *iow)
Definition aio.c:971
PgAioHandle * pgaio_io_acquire(struct ResourceOwnerData *resowner, PgAioReturn *ret)
Definition aio.c:162
void pgaio_wref_clear(PgAioWaitRef *iow)
Definition aio.c:964
void pgaio_io_get_wref(PgAioHandle *ioh, PgAioWaitRef *iow)
Definition aio.c:366
void pgaio_io_set_flag(PgAioHandle *ioh, PgAioHandleFlags flag)
Definition aio.c:330
bool pgaio_have_staged(void)
Definition aio.c:1107
bool pgaio_wref_check_done(PgAioWaitRef *iow)
Definition aio.c:1005
ProcNumber pgaio_io_get_owner(PgAioHandle *ioh)
Definition aio.c:355
void pgaio_submit_staged(void)
Definition aio.c:1123
void pgaio_wref_wait(PgAioWaitRef *iow)
Definition aio.c:991
void pgaio_io_release(PgAioHandle *ioh)
Definition aio.c:240
PgAioHandle * pgaio_io_acquire_nb(struct ResourceOwnerData *resowner, PgAioReturn *ret)
Definition aio.c:188
@ PGAIO_HCB_LOCAL_BUFFER_READV
Definition aio.h:200
@ PGAIO_HCB_SHARED_BUFFER_READV
Definition aio.h:198
@ IOMETHOD_SYNC
Definition aio.h:34
@ PGAIO_HF_SYNCHRONOUS
Definition aio.h:70
@ PGAIO_HF_REFERENCES_LOCAL
Definition aio.h:60
void pgaio_io_set_handle_data_32(PgAioHandle *ioh, uint32 *data, uint8 len)
void pgaio_io_register_callbacks(PgAioHandle *ioh, PgAioHandleCallbackID cb_id, uint8 cb_data)
uint64 * pgaio_io_get_handle_data(PgAioHandle *ioh, uint8 *len)
void pgaio_result_report(PgAioResult result, const PgAioTargetData *target_data, int elevel)
PgAioTargetData * pgaio_io_get_target_data(PgAioHandle *ioh)
Definition aio_target.c:73
#define PGAIO_RESULT_ERROR_BITS
Definition aio_types.h:98
PgAioResultStatus
Definition aio_types.h:79
@ PGAIO_RS_OK
Definition aio_types.h:81
@ PGAIO_RS_UNKNOWN
Definition aio_types.h:80
@ PGAIO_RS_PARTIAL
Definition aio_types.h:82
@ PGAIO_RS_ERROR
Definition aio_types.h:84
@ PGAIO_RS_WARNING
Definition aio_types.h:83
static bool pg_atomic_compare_exchange_u64(volatile pg_atomic_uint64 *ptr, uint64 *expected, uint64 newval)
Definition atomics.h:522
#define pg_write_barrier()
Definition atomics.h:155
static void pg_atomic_unlocked_write_u64(volatile pg_atomic_uint64 *ptr, uint64 val)
Definition atomics.h:494
static uint64 pg_atomic_sub_fetch_u64(volatile pg_atomic_uint64 *ptr, int64 sub_)
Definition atomics.h:578
static uint64 pg_atomic_fetch_and_u64(volatile pg_atomic_uint64 *ptr, uint64 and_)
Definition atomics.h:551
static uint64 pg_atomic_fetch_or_u64(volatile pg_atomic_uint64 *ptr, uint64 or_)
Definition atomics.h:560
static uint64 pg_atomic_read_u64(volatile pg_atomic_uint64 *ptr)
Definition atomics.h:467
static uint64 pg_atomic_fetch_sub_u64(volatile pg_atomic_uint64 *ptr, int64 sub_)
Definition atomics.h:541
bool TimestampDifferenceExceeds(TimestampTz start_time, TimestampTz stop_time, int msec)
Definition timestamp.c:1781
TimestampTz GetCurrentTimestamp(void)
Definition timestamp.c:1645
Datum now(PG_FUNCTION_ARGS)
Definition timestamp.c:1609
int BgWriterDelay
Definition bgwriter.c:58
void binaryheap_build(binaryheap *heap)
Definition binaryheap.c:136
void binaryheap_replace_first(binaryheap *heap, bh_node_type d)
Definition binaryheap.c:253
bh_node_type binaryheap_first(binaryheap *heap)
Definition binaryheap.c:175
bh_node_type binaryheap_remove_first(binaryheap *heap)
Definition binaryheap.c:190
void binaryheap_free(binaryheap *heap)
Definition binaryheap.c:73
void binaryheap_add_unordered(binaryheap *heap, bh_node_type d)
Definition binaryheap.c:114
binaryheap * binaryheap_allocate(int capacity, binaryheap_comparator compare, void *arg)
Definition binaryheap.c:37
#define binaryheap_empty(h)
Definition binaryheap.h:65
uint32 BlockNumber
Definition block.h:31
#define InvalidBlockNumber
Definition block.h:33
static bool BlockNumberIsValid(BlockNumber blockNumber)
Definition block.h:71
#define MaxBlockNumber
Definition block.h:35
static int32 next
Definition blutils.c:225
int Buffer
Definition buf.h:23
#define InvalidBuffer
Definition buf.h:25
#define BufferIsLocal(buffer)
Definition buf.h:37
CkptSortItem * CkptBufferIds
Definition buf_init.c:26
WritebackContext BackendWritebackContext
Definition buf_init.c:25
#define BM_MAX_USAGE_COUNT
static void InitBufferTag(BufferTag *tag, const RelFileLocator *rlocator, ForkNumber forkNum, BlockNumber blockNum)
#define BM_TAG_VALID
#define BM_PERMANENT
#define BUF_USAGECOUNT_MASK
static ForkNumber BufTagGetForkNum(const BufferTag *tag)
#define BM_LOCK_VAL_SHARED
#define BUF_REFCOUNT_ONE
static ConditionVariable * BufferDescriptorGetIOCV(const BufferDesc *bdesc)
static uint64 UnlockBufHdrExt(BufferDesc *desc, uint64 old_buf_state, uint64 set_bits, uint64 unset_bits, int refcount_change)
static bool BufferTagsEqual(const BufferTag *tag1, const BufferTag *tag2)
static RelFileNumber BufTagGetRelNumber(const BufferTag *tag)
static void UnlockBufHdr(BufferDesc *desc)
#define BM_LOCK_VAL_EXCLUSIVE
static bool BufTagMatchesRelFileLocator(const BufferTag *tag, const RelFileLocator *rlocator)
#define BUF_FLAG_MASK
#define BM_PIN_COUNT_WAITER
#define BM_DIRTY
#define BM_LOCK_WAKE_IN_PROGRESS
static void ResourceOwnerRememberBufferIO(ResourceOwner owner, Buffer buffer)
#define BM_LOCKED
#define BM_JUST_DIRTIED
#define BUF_STATE_GET_USAGECOUNT(state)
#define BM_LOCK_MASK
static void ResourceOwnerForgetBufferIO(ResourceOwner owner, Buffer buffer)
#define BM_IO_IN_PROGRESS
static void ClearBufferTag(BufferTag *tag)
static void ResourceOwnerRememberBuffer(ResourceOwner owner, Buffer buffer)
static void ResourceOwnerForgetBuffer(ResourceOwner owner, Buffer buffer)
#define BUF_USAGECOUNT_ONE
#define BUF_STATE_GET_REFCOUNT(state)
static LWLock * BufMappingPartitionLock(uint32 hashcode)
static RelFileLocator BufTagGetRelFileLocator(const BufferTag *tag)
#define BM_VALID
#define BM_LOCK_HAS_WAITERS
#define BM_IO_ERROR
static BufferDesc * GetLocalBufferDescriptor(uint32 id)
static BufferDesc * GetBufferDescriptor(uint32 id)
#define BM_LOCK_VAL_SHARE_EXCLUSIVE
static Buffer BufferDescriptorGetBuffer(const BufferDesc *bdesc)
#define BM_CHECKPOINT_NEEDED
void BufTableDelete(BufferTag *tagPtr, uint32 hashcode)
Definition buf_table.c:148
int BufTableLookup(BufferTag *tagPtr, uint32 hashcode)
Definition buf_table.c:90
uint32 BufTableHashCode(BufferTag *tagPtr)
Definition buf_table.c:78
int BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id)
Definition buf_table.c:118
bool track_io_timing
Definition bufmgr.c:177
static void ResOwnerReleaseBuffer(Datum res)
Definition bufmgr.c:7435
void CheckBufferIsPinnedOnce(Buffer buffer)
Definition bufmgr.c:6495
void FlushRelationsAllBuffers(SMgrRelation *smgrs, int nrels)
Definition bufmgr.c:5166
void IncrBufferRefCount(Buffer buffer)
Definition bufmgr.c:5534
void DropDatabaseBuffers(Oid dbid)
Definition bufmgr.c:5031
static int ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b)
Definition bufmgr.c:7214
static pg_attribute_always_inline PgAioResult buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data, bool is_temp)
Definition bufmgr.c:8259
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition bufmgr.c:4357
static PrivateRefCountEntry * NewPrivateRefCountEntry(Buffer buffer)
Definition bufmgr.c:374
static Buffer PrivateRefCountArrayKeys[REFCOUNT_ARRAY_ENTRIES]
Definition bufmgr.c:248
static bool ReadBuffersCanStartIO(Buffer buffer, bool nowait)
Definition bufmgr.c:1665
void DropRelationBuffers(SMgrRelation smgr_reln, ForkNumber *forkNum, int nforks, BlockNumber *firstDelBlock)
Definition bufmgr.c:4681
Buffer ReleaseAndReadBuffer(Buffer buffer, Relation relation, BlockNumber blockNum)
Definition bufmgr.c:3122
static int ReservedRefCountSlot
Definition bufmgr.c:253
static PgAioResult shared_buffer_readv_complete_local(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
Definition bufmgr.c:8477
static pg_attribute_always_inline bool StartReadBuffersImpl(ReadBuffersOperation *operation, Buffer *buffers, BlockNumber blockNum, int *nblocks, int flags, bool allow_forwarding)
Definition bufmgr.c:1363
static void CheckReadBuffersOperation(ReadBuffersOperation *operation, bool is_complete)
Definition bufmgr.c:1628
PrefetchBufferResult PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
Definition bufmgr.c:773
static uint32 PrivateRefCountClock
Definition bufmgr.c:252
static void FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object, IOContext io_context)
Definition bufmgr.c:4417
static void ResOwnerReleaseBufferIO(Datum res)
Definition bufmgr.c:7413
static PgAioResult local_buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
Definition bufmgr.c:8514
bool StartReadBuffers(ReadBuffersOperation *operation, Buffer *buffers, BlockNumber blockNum, int *nblocks, int flags)
Definition bufmgr.c:1590
void EvictAllUnpinnedBuffers(int32 *buffers_evicted, int32 *buffers_flushed, int32 *buffers_skipped)
Definition bufmgr.c:7572
int io_max_combine_limit
Definition bufmgr.c:202
static void FlushUnlockedBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object, IOContext io_context)
Definition bufmgr.c:4554
const ResourceOwnerDesc buffer_io_resowner_desc
Definition bufmgr.c:270
bool zero_damaged_pages
Definition bufmgr.c:174
#define BUF_DROP_FULL_SCAN_THRESHOLD
Definition bufmgr.c:93
static void PinBuffer_Locked(BufferDesc *buf)
Definition bufmgr.c:3293
void EvictRelUnpinnedBuffers(Relation rel, int32 *buffers_evicted, int32 *buffers_flushed, int32 *buffers_skipped)
Definition bufmgr.c:7622
static pg_attribute_always_inline void buffer_readv_complete_one(PgAioTargetData *td, uint8 buf_off, Buffer buffer, uint8 flags, bool failed, bool is_temp, bool *buffer_invalid, bool *failed_checksum, bool *ignored_checksum, bool *zeroed_buffer)
Definition bufmgr.c:8115
static char * ResOwnerPrintBuffer(Datum res)
Definition bufmgr.c:7471
static void BufferLockAcquire(Buffer buffer, BufferDesc *buf_hdr, BufferLockMode mode)
Definition bufmgr.c:5756
static bool BufferLockAttempt(BufferDesc *buf_hdr, BufferLockMode mode)
Definition bufmgr.c:5954
static int buffertag_comparator(const BufferTag *ba, const BufferTag *bb)
Definition bufmgr.c:7180
bool IsBufferCleanupOK(Buffer buffer)
Definition bufmgr.c:6759
#define BufferGetLSN(bufHdr)
Definition bufmgr.c:75
static char * ResOwnerPrintBufferIO(Datum res)
Definition bufmgr.c:7421
bool BufferIsLockedByMeInMode(Buffer buffer, BufferLockMode mode)
Definition bufmgr.c:2998
static void BufferLockDisown(Buffer buffer, BufferDesc *buf_hdr)
Definition bufmgr.c:6129
Buffer ExtendBufferedRel(BufferManagerRelation bmr, ForkNumber forkNum, BufferAccessStrategy strategy, uint32 flags)
Definition bufmgr.c:965
void AtEOXact_Buffers(bool isCommit)
Definition bufmgr.c:4104
static void AbortBufferIO(Buffer buffer)
Definition bufmgr.c:7010
const PgAioHandleCallbacks aio_shared_buffer_readv_cb
Definition bufmgr.c:8521
static void BufferLockUnlock(Buffer buffer, BufferDesc *buf_hdr)
Definition bufmgr.c:5872
BlockNumber ExtendBufferedRelBy(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, Buffer *buffers, uint32 *extended_by)
Definition bufmgr.c:997
static Buffer ReadBuffer_common(Relation rel, SMgrRelation smgr, char smgr_persistence, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
Definition bufmgr.c:1294
static void BufferLockWakeup(BufferDesc *buf_hdr, bool unlocked)
Definition bufmgr.c:6163
static void ProcessReadBuffersResult(ReadBuffersOperation *operation)
Definition bufmgr.c:1694
pg_noinline uint64 WaitBufHdrUnlocked(BufferDesc *buf)
Definition bufmgr.c:7156
static void ZeroAndLockBuffer(Buffer buffer, ReadBufferMode mode, bool already_valid)
Definition bufmgr.c:1132
static BufferDesc * BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, bool *foundPtr, IOContext io_context)
Definition bufmgr.c:2101
static void CheckForBufferLeaks(void)
Definition bufmgr.c:4174
static bool ReadBuffersCanStartIOOnce(Buffer buffer, bool nowait)
Definition bufmgr.c:1652
void CreateAndCopyRelationData(RelFileLocator src_rlocator, RelFileLocator dst_rlocator, bool permanent)
Definition bufmgr.c:5378
void DropRelationsAllBuffers(SMgrRelation *smgr_reln, int nlocators)
Definition bufmgr.c:4801
static void BufferLockDequeueSelf(BufferDesc *buf_hdr)
Definition bufmgr.c:6061
static int rlocator_comparator(const void *p1, const void *p2)
Definition bufmgr.c:7081
static bool BufferLockHeldByMeInMode(BufferDesc *buf_hdr, BufferLockMode mode)
Definition bufmgr.c:6383
Buffer ExtendBufferedRelTo(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, BlockNumber extend_to, ReadBufferMode mode)
Definition bufmgr.c:1026
const PgAioHandleCallbacks aio_local_buffer_readv_cb
Definition bufmgr.c:8530
static bool InvalidateVictimBuffer(BufferDesc *buf_hdr)
Definition bufmgr.c:2375
static void AtProcExit_Buffers(int code, Datum arg)
Definition bufmgr.c:4156
int io_combine_limit_guc
Definition bufmgr.c:201
static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg)
Definition bufmgr.c:7245
void BufferGetTag(Buffer buffer, RelFileLocator *rlocator, ForkNumber *forknum, BlockNumber *blknum)
Definition bufmgr.c:4378
#define BufHdrGetBlock(bufHdr)
Definition bufmgr.c:74
static bool BufferLockConditional(Buffer buffer, BufferDesc *buf_hdr, BufferLockMode mode)
Definition bufmgr.c:5908
const ResourceOwnerDesc buffer_resowner_desc
Definition bufmgr.c:279
static pg_attribute_always_inline void buffer_stage_common(PgAioHandle *ioh, bool is_write, bool is_temp)
Definition bufmgr.c:7871
void UnlockBuffer(Buffer buffer)
Definition bufmgr.c:6416
#define BUF_REUSABLE
Definition bufmgr.c:83
static void local_buffer_write_error_callback(void *arg)
Definition bufmgr.c:7065
static void BufferSync(int flags)
Definition bufmgr.c:3457
static bool AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress)
Definition bufmgr.c:1865
static void local_buffer_readv_stage(PgAioHandle *ioh, uint8 cb_data)
Definition bufmgr.c:8508
char * DebugPrintBufferRefcount(Buffer buffer)
Definition bufmgr.c:4300
void CheckPointBuffers(int flags)
Definition bufmgr.c:4343
bool BufferIsDirty(Buffer buffer)
Definition bufmgr.c:3025
static uint32 MaxProportionalPins
Definition bufmgr.c:256
static void BufferLockQueueSelf(BufferDesc *buf_hdr, BufferLockMode mode)
Definition bufmgr.c:6021
static BlockNumber ExtendBufferedRelShared(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
Definition bufmgr.c:2704
static int BufferLockDisownInternal(Buffer buffer, BufferDesc *buf_hdr)
Definition bufmgr.c:6143
bool BgBufferSync(WritebackContext *wb_context)
Definition bufmgr.c:3736
uint64 LockBufHdr(BufferDesc *desc)
Definition bufmgr.c:7108
static void WakePinCountWaiter(BufferDesc *buf)
Definition bufmgr.c:3325
bool BufferIsPermanent(Buffer buffer)
Definition bufmgr.c:4605
void MarkDirtyAllUnpinnedBuffers(int32 *buffers_dirtied, int32 *buffers_already_dirty, int32 *buffers_skipped)
Definition bufmgr.c:7822
#define REFCOUNT_ARRAY_ENTRIES
Definition bufmgr.c:130
static void shared_buffer_readv_stage(PgAioHandle *ioh, uint8 cb_data)
Definition bufmgr.c:8457
static void BufferLockProcessRelease(BufferDesc *buf_hdr, BufferLockMode mode, uint64 lockstate)
Definition bufmgr.c:6328
void UnlockBuffers(void)
Definition bufmgr.c:5710
PrefetchBufferResult PrefetchSharedBuffer(SMgrRelation smgr_reln, ForkNumber forkNum, BlockNumber blockNum)
Definition bufmgr.c:683
static PgAioResult shared_buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
Definition bufmgr.c:8463
static Buffer GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
Definition bufmgr.c:2452
bool ConditionalLockBuffer(Buffer buffer)
Definition bufmgr.c:6475
BlockNumber RelationGetNumberOfBlocksInFork(Relation relation, ForkNumber forkNum)
Definition bufmgr.c:4573
int bgwriter_flush_after
Definition bufmgr.c:209
void ReleaseBuffer(Buffer buffer)
Definition bufmgr.c:5502
bool BufferIsLockedByMe(Buffer buffer)
Definition bufmgr.c:2972
static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy, bool skip_if_not_valid)
Definition bufmgr.c:3182
static void FindAndDropRelationBuffers(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber nForkBlock, BlockNumber firstDelBlock)
Definition bufmgr.c:4971
XLogRecPtr BufferGetLSNAtomic(Buffer buffer)
Definition bufmgr.c:4635
void LockBufferInternal(Buffer buffer, BufferLockMode mode)
Definition bufmgr.c:6432
bool HoldingBufferPinThatDelaysRecovery(void)
Definition bufmgr.c:6675
bool MarkDirtyUnpinnedBuffer(Buffer buf, bool *buffer_already_dirty)
Definition bufmgr.c:7729
int checkpoint_flush_after
Definition bufmgr.c:208
void UnlockReleaseBuffer(Buffer buffer)
Definition bufmgr.c:5519
static pg_attribute_always_inline Buffer PinBufferForBlock(Relation rel, SMgrRelation smgr, char persistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, bool *foundPtr)
Definition bufmgr.c:1211
static void UnpinBufferNoOwner(BufferDesc *buf)
Definition bufmgr.c:3370
static void shared_buffer_write_error_callback(void *arg)
Definition bufmgr.c:7049
void ScheduleBufferTagForWriteback(WritebackContext *wb_context, IOContext io_context, BufferTag *tag)
Definition bufmgr.c:7280
void WaitReadBuffers(ReadBuffersOperation *operation)
Definition bufmgr.c:1733
void WritebackContextInit(WritebackContext *context, int *max_pending)
Definition bufmgr.c:7268
void MarkBufferDirty(Buffer buffer)
Definition bufmgr.c:3057
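For orientation only, a minimal sketch of the usual modify-and-dirty pattern around MarkBufferDirty(); touch_block, my_rel and blkno are placeholder names (not symbols from bufmgr.c), and a real caller would normally also emit a WAL record inside the critical section.

/* Hedged sketch; placeholder names: touch_block, my_rel, blkno. */
#include "postgres.h"
#include "miscadmin.h"
#include "storage/bufmgr.h"
#include "utils/rel.h"

static void
touch_block(Relation my_rel, BlockNumber blkno)
{
	Buffer		buf = ReadBuffer(my_rel, blkno);	/* pin the page */

	LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);		/* content changes need the exclusive lock */

	START_CRIT_SECTION();
	/* ... change the page returned by BufferGetPage(buf) here ... */
	MarkBufferDirty(buf);						/* flag the buffer dirty; no I/O is issued here */
	END_CRIT_SECTION();

	UnlockReleaseBuffer(buf);					/* drop the content lock and the pin */
}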
#define BufferIsPinned(bufnum)
Definition bufmgr.c:590
double bgwriter_lru_multiplier
Definition bufmgr.c:176
static bool EvictUnpinnedBufferInternal(BufferDesc *desc, bool *buffer_flushed)
Definition bufmgr.c:7481
int backend_flush_after
Definition bufmgr.c:210
void LimitAdditionalPins(uint32 *additional_pins)
Definition bufmgr.c:2642
static void buffer_readv_report(PgAioResult result, const PgAioTargetData *td, int elevel)
Definition bufmgr.c:8362
static void ReservePrivateRefCountEntry(void)
Definition bufmgr.c:294
static BufferDesc * PinCountWaitBuf
Definition bufmgr.c:213
static pg_noinline PrivateRefCountEntry * GetPrivateRefCountEntrySlow(Buffer buffer, bool do_move)
Definition bufmgr.c:405
static int32 GetPrivateRefCount(Buffer buffer)
Definition bufmgr.c:529
static BlockNumber ExtendBufferedRelCommon(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
Definition bufmgr.c:2660
void LockBufferForCleanup(Buffer buffer)
Definition bufmgr.c:6528
void MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
Definition bufmgr.c:5566
void FlushRelationBuffers(Relation rel)
Definition bufmgr.c:5078
#define READV_COUNT_BITS
static uint64 BufferLockReleaseSub(BufferLockMode mode)
Definition bufmgr.c:6299
void IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context)
Definition bufmgr.c:7330
static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref)
Definition bufmgr.c:552
bool EvictUnpinnedBuffer(Buffer buf, bool *buffer_flushed)
Definition bufmgr.c:7543
Buffer ReadBufferWithoutRelcache(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy, bool permanent)
Definition bufmgr.c:949
bool ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum, Buffer recent_buffer)
Definition bufmgr.c:804
#define RELS_BSEARCH_THRESHOLD
Definition bufmgr.c:85
int maintenance_io_concurrency
Definition bufmgr.c:192
static void UnpinBuffer(BufferDesc *buf)
Definition bufmgr.c:3361
void FlushDatabaseBuffers(Oid dbid)
Definition bufmgr.c:5442
static void InvalidateBuffer(BufferDesc *buf)
Definition bufmgr.c:2274
static void RelationCopyStorageUsingBuffer(RelFileLocator srclocator, RelFileLocator dstlocator, ForkNumber forkNum, bool permanent)
Definition bufmgr.c:5264
int effective_io_concurrency
Definition bufmgr.c:185
static PrivateRefCountEntry * GetPrivateRefCountEntry(Buffer buffer, bool do_move)
Definition bufmgr.c:494
static bool BufferLockHeldByMe(BufferDesc *buf_hdr)
Definition bufmgr.c:6401
void TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint64 set_flag_bits, bool forget_owner, bool release_aio)
Definition bufmgr.c:6948
bool StartBufferIO(BufferDesc *buf, bool forInput, bool nowait)
Definition bufmgr.c:6890
void MarkDirtyRelUnpinnedBuffers(Relation rel, int32 *buffers_dirtied, int32 *buffers_already_dirty, int32 *buffers_skipped)
Definition bufmgr.c:7765
bool StartReadBuffer(ReadBuffersOperation *operation, Buffer *buffer, BlockNumber blocknum, int flags)
Definition bufmgr.c:1609
Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
Definition bufmgr.c:912
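As a hedged illustration of ReadBufferExtended() with a ring-buffer access strategy (so that a large scan does not flood shared buffers): scan_fork, my_rel and nblocks below are placeholder names, not part of bufmgr.c.

/* Hedged sketch; placeholder names: scan_fork, my_rel, nblocks. */
#include "postgres.h"
#include "common/relpath.h"
#include "storage/bufmgr.h"
#include "utils/rel.h"

static void
scan_fork(Relation my_rel, BlockNumber nblocks)
{
	BufferAccessStrategy strategy = GetAccessStrategy(BAS_BULKREAD);

	for (BlockNumber blkno = 0; blkno < nblocks; blkno++)
	{
		Buffer		buf = ReadBufferExtended(my_rel, MAIN_FORKNUM, blkno,
											 RBM_NORMAL, strategy);

		LockBuffer(buf, BUFFER_LOCK_SHARE);
		/* ... process BufferGetPage(buf) ... */
		UnlockReleaseBuffer(buf);
	}

	FreeAccessStrategy(strategy);
}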
static bool MarkDirtyUnpinnedBufferInternal(Buffer buf, BufferDesc *desc, bool *buffer_already_dirty)
Definition bufmgr.c:7673
static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES]
Definition bufmgr.c:249
static void buffer_readv_decode_error(PgAioResult result, bool *zeroed_any, bool *ignored_any, uint8 *zeroed_or_error_count, uint8 *checkfail_count, uint8 *first_off)
Definition bufmgr.c:7978
#define READV_COUNT_MASK
static int PrivateRefCountEntryLast
Definition bufmgr.c:254
int io_combine_limit
Definition bufmgr.c:200
void InitBufferManagerAccess(void)
Definition bufmgr.c:4121
static void buffer_readv_encode_error(PgAioResult *result, bool is_temp, bool zeroed_any, bool ignored_any, uint8 error_count, uint8 zeroed_count, uint8 checkfail_count, uint8 first_error_off, uint8 first_zeroed_off, uint8 first_ignored_off)
Definition bufmgr.c:8020
static int SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
Definition bufmgr.c:4034
uint32 GetAdditionalPinLimit(void)
Definition bufmgr.c:2616
Buffer ReadBuffer(Relation reln, BlockNumber blockNum)
Definition bufmgr.c:865
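For readers skimming this index, a minimal sketch of how the basic read-side entry points combine; inspect_block, my_rel and blkno are placeholder names introduced here, not symbols from bufmgr.c.

/* Hedged sketch; placeholder names: inspect_block, my_rel, blkno. */
#include "postgres.h"
#include "storage/bufmgr.h"
#include "utils/rel.h"

static void
inspect_block(Relation my_rel, BlockNumber blkno)
{
	Buffer		buf;
	Page		page;

	buf = ReadBuffer(my_rel, blkno);		/* find or read the page, and pin it */
	LockBuffer(buf, BUFFER_LOCK_SHARE);		/* lock its contents for reading */

	page = BufferGetPage(buf);
	(void) page;							/* ... examine the page here ... */

	UnlockReleaseBuffer(buf);				/* drop the content lock and the pin */
}

The share lock allows concurrent readers; BUFFER_LOCK_EXCLUSIVE would be required before modifying the page contents.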
void TrackNewBufferPin(Buffer buf)
Definition bufmgr.c:3417
static HTAB * PrivateRefCountHash
Definition bufmgr.c:250
static int32 PrivateRefCountOverflowed
Definition bufmgr.c:251
bool ConditionalLockBufferForCleanup(Buffer buffer)
Definition bufmgr.c:6701
int bgwriter_lru_maxpages
Definition bufmgr.c:175
uint32 GetPinLimit(void)
Definition bufmgr.c:2604
static void WaitIO(BufferDesc *buf)
Definition bufmgr.c:6811
#define BUF_WRITTEN
Definition bufmgr.c:82
void FlushOneBuffer(Buffer buffer)
Definition bufmgr.c:5482
@ BAS_BULKREAD
Definition bufmgr.h:37
@ BAS_BULKWRITE
Definition bufmgr.h:39
#define P_NEW
Definition bufmgr.h:198
#define READ_BUFFERS_ZERO_ON_ERROR
Definition bufmgr.h:122
static Page BufferGetPage(Buffer buffer)
Definition bufmgr.h:466
#define DEFAULT_IO_COMBINE_LIMIT
Definition bufmgr.h:174
static Block BufferGetBlock(Buffer buffer)
Definition bufmgr.h:433
#define READ_BUFFERS_ISSUE_ADVICE
Definition bufmgr.h:124
BufferLockMode
Definition bufmgr.h:204
@ BUFFER_LOCK_SHARE_EXCLUSIVE
Definition bufmgr.h:215
@ BUFFER_LOCK_SHARE
Definition bufmgr.h:210
@ BUFFER_LOCK_EXCLUSIVE
Definition bufmgr.h:220
@ BUFFER_LOCK_UNLOCK
Definition bufmgr.h:205
#define MAX_IO_COMBINE_LIMIT
Definition bufmgr.h:173
#define DEFAULT_EFFECTIVE_IO_CONCURRENCY
Definition bufmgr.h:168
#define READ_BUFFERS_IGNORE_CHECKSUM_FAILURES
Definition bufmgr.h:126
#define DEFAULT_MAINTENANCE_IO_CONCURRENCY
Definition bufmgr.h:169
void * Block
Definition bufmgr.h:26
static void LockBuffer(Buffer buffer, BufferLockMode mode)
Definition bufmgr.h:328
#define BMR_GET_SMGR(bmr)
Definition bufmgr.h:118
@ EB_LOCK_TARGET
Definition bufmgr.h:93
@ EB_CLEAR_SIZE_CACHE
Definition bufmgr.h:90
@ EB_PERFORMING_RECOVERY
Definition bufmgr.h:78
@ EB_CREATE_FORK_IF_NEEDED
Definition bufmgr.h:84
@ EB_SKIP_EXTENSION_LOCK
Definition bufmgr.h:75
@ EB_LOCK_FIRST
Definition bufmgr.h:87
#define READ_BUFFERS_SYNCHRONOUSLY
Definition bufmgr.h:128
ReadBufferMode
Definition bufmgr.h:45
@ RBM_ZERO_ON_ERROR
Definition bufmgr.h:51
@ RBM_ZERO_AND_CLEANUP_LOCK
Definition bufmgr.h:49
@ RBM_ZERO_AND_LOCK
Definition bufmgr.h:47
@ RBM_NORMAL
Definition bufmgr.h:46
#define BMR_REL(p_rel)
Definition bufmgr.h:114
static bool BufferIsValid(Buffer bufnum)
Definition bufmgr.h:417
bool ignore_checksum_failure
Definition bufpage.c:27
char * PageSetChecksumCopy(Page page, BlockNumber blkno)
Definition bufpage.c:1509
bool PageIsVerified(PageData *page, BlockNumber blkno, int flags, bool *checksum_failure_p)
Definition bufpage.c:94
#define PIV_LOG_LOG
Definition bufpage.h:468
static bool PageIsNew(const PageData *page)
Definition bufpage.h:233
static void PageSetLSN(Page page, XLogRecPtr lsn)
Definition bufpage.h:390
PageData * Page
Definition bufpage.h:81
static XLogRecPtr PageGetLSN(const PageData *page)
Definition bufpage.h:385
#define PIV_IGNORE_CHECKSUM_FAILURE
Definition bufpage.h:469
#define pg_noinline
Definition c.h:307
#define likely(x)
Definition c.h:423
uint8_t uint8
Definition c.h:556
#define PG_USED_FOR_ASSERTS_ONLY
Definition c.h:235
#define Max(x, y)
Definition c.h:1013
#define Assert(condition)
Definition c.h:885
double float8
Definition c.h:656
#define pg_attribute_always_inline
Definition c.h:291
int16_t int16
Definition c.h:553
int32_t int32
Definition c.h:554
uint64_t uint64
Definition c.h:559
#define pg_unreachable()
Definition c.h:353
#define unlikely(x)
Definition c.h:424
uint32_t uint32
Definition c.h:558
#define lengthof(array)
Definition c.h:815
#define MemSet(start, val, len)
Definition c.h:1035
#define StaticAssertDecl(condition, errmessage)
Definition c.h:950
size_t Size
Definition c.h:631
bool IsCatalogRelationOid(Oid relid)
Definition catalog.c:121
bool IsCatalogTextUniqueIndexOid(Oid relid)
Definition catalog.c:156
void CheckpointWriteDelay(int flags, double progress)
bool ConditionVariableCancelSleep(void)
void ConditionVariableBroadcast(ConditionVariable *cv)
void ConditionVariablePrepareToSleep(ConditionVariable *cv)
void ConditionVariableSleep(ConditionVariable *cv, uint32 wait_event_info)
int64 TimestampTz
Definition timestamp.h:39
void * hash_search(HTAB *hashp, const void *keyPtr, HASHACTION action, bool *foundPtr)
Definition dynahash.c:952
HTAB * hash_create(const char *tabname, int64 nelem, const HASHCTL *info, int flags)
Definition dynahash.c:358
void * hash_seq_search(HASH_SEQ_STATUS *status)
Definition dynahash.c:1415
void hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
Definition dynahash.c:1380
struct cursor * cur
Definition ecpg.c:29
Datum arg
Definition elog.c:1322
ErrorContextCallback * error_context_stack
Definition elog.c:99
int errcode(int sqlerrcode)
Definition elog.c:874
int errmsg(const char *fmt,...)
Definition elog.c:1093
#define _(x)
Definition elog.c:95
int errdetail_internal(const char *fmt,...) pg_attribute_printf(1
#define errcontext
Definition elog.h:198
int errhint_plural(const char *fmt_singular, const char *fmt_plural, unsigned long n,...) pg_attribute_printf(1
#define DEBUG3
Definition elog.h:28
int errdetail(const char *fmt,...) pg_attribute_printf(1
#define LOG_SERVER_ONLY
Definition elog.h:32
int errmsg_internal(const char *fmt,...) pg_attribute_printf(1
#define WARNING
Definition elog.h:36
#define DEBUG2
Definition elog.h:29
#define PANIC
Definition elog.h:42
#define DEBUG1
Definition elog.h:30
#define ERROR
Definition elog.h:39
#define elog(elevel,...)
Definition elog.h:226
#define ereport(elevel,...)
Definition elog.h:150
int errhint_internal(const char *fmt,...) pg_attribute_printf(1
int io_direct_flags
Definition fd.c:171
#define IO_DIRECT_DATA
Definition fd.h:54
#define palloc_array(type, count)
Definition fe_memutils.h:76
int StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
Definition freelist.c:321
BufferAccessStrategy GetAccessStrategy(BufferAccessStrategyType btype)
Definition freelist.c:461
BufferDesc * StrategyGetBuffer(BufferAccessStrategy strategy, uint64 *buf_state, bool *from_ring)
Definition freelist.c:174
void FreeAccessStrategy(BufferAccessStrategy strategy)
Definition freelist.c:643
IOContext IOContextForStrategy(BufferAccessStrategy strategy)
Definition freelist.c:747
bool StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_ring)
Definition freelist.c:787
volatile sig_atomic_t ProcSignalBarrierPending
Definition globals.c:40
int NBuffers
Definition globals.c:142
bool enableFsync
Definition globals.c:129
ProcNumber MyProcNumber
Definition globals.c:90
int VacuumCostPageMiss
Definition globals.c:152
bool VacuumCostActive
Definition globals.c:158
bool IsUnderPostmaster
Definition globals.c:120
int VacuumCostBalance
Definition globals.c:157
int MaxBackends
Definition globals.c:146
int VacuumCostPageDirty
Definition globals.c:153
int VacuumCostPageHit
Definition globals.c:151
const char * str
@ HASH_FIND
Definition hsearch.h:113
@ HASH_REMOVE
Definition hsearch.h:115
@ HASH_ENTER
Definition hsearch.h:114
#define HASH_ELEM
Definition hsearch.h:95
#define HASH_BLOBS
Definition hsearch.h:97
BufferUsage pgBufferUsage
Definition instrument.c:20
void on_shmem_exit(pg_on_exit_callback function, Datum arg)
Definition ipc.c:372
int b
Definition isn.c:74
int a
Definition isn.c:73
int j
Definition isn.c:78
int i
Definition isn.c:77
void LockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition lmgr.c:424
void UnlockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition lmgr.c:474
int32 * LocalRefCount
Definition localbuf.c:49
void FlushLocalBuffer(BufferDesc *bufHdr, SMgrRelation reln)
Definition localbuf.c:183
void UnpinLocalBuffer(Buffer buffer)
Definition localbuf.c:841
bool StartLocalBufferIO(BufferDesc *bufHdr, bool forInput, bool nowait)
Definition localbuf.c:523
void AtEOXact_LocalBuffers(bool isCommit)
Definition localbuf.c:1003
void AtProcExit_LocalBuffers(void)
Definition localbuf.c:1014
bool PinLocalBuffer(BufferDesc *buf_hdr, bool adjust_usagecount)
Definition localbuf.c:805
void MarkLocalBufferDirty(Buffer buffer)
Definition localbuf.c:491
void DropRelationAllLocalBuffers(RelFileLocator rlocator)
Definition localbuf.c:702
void TerminateLocalBufferIO(BufferDesc *bufHdr, bool clear_dirty, uint64 set_flag_bits, bool release_aio)
Definition localbuf.c:562
int NLocBuffer
Definition localbuf.c:45
PrefetchBufferResult PrefetchLocalBuffer(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum)
Definition localbuf.c:72
BlockNumber ExtendBufferedRelLocal(BufferManagerRelation bmr, ForkNumber fork, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
Definition localbuf.c:346
void UnpinLocalBufferNoOwner(Buffer buffer)
Definition localbuf.c:848
void DropRelationLocalBuffers(RelFileLocator rlocator, ForkNumber *forkNum, int nforks, BlockNumber *firstDelBlock)
Definition localbuf.c:665
BufferDesc * LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, bool *foundPtr)
Definition localbuf.c:119
#define ExclusiveLock
Definition lockdefs.h:42
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition lwlock.c:1176
void LWLockRelease(LWLock *lock)
Definition lwlock.c:1793
@ LW_WS_NOT_WAITING
Definition lwlock.h:30
@ LW_WS_WAITING
Definition lwlock.h:31
@ LW_WS_PENDING_WAKEUP
Definition lwlock.h:32
@ LW_SHARED
Definition lwlock.h:113
@ LW_EXCLUSIVE
Definition lwlock.h:112
void * repalloc(void *pointer, Size size)
Definition mcxt.c:1632
void pfree(void *pointer)
Definition mcxt.c:1616
void * palloc(Size size)
Definition mcxt.c:1387
#define VALGRIND_MAKE_MEM_DEFINED(addr, size)
Definition memdebug.h:26
#define VALGRIND_MAKE_MEM_NOACCESS(addr, size)
Definition memdebug.h:27
#define RESUME_INTERRUPTS()
Definition miscadmin.h:136
#define START_CRIT_SECTION()
Definition miscadmin.h:150
#define CHECK_FOR_INTERRUPTS()
Definition miscadmin.h:123
#define HOLD_INTERRUPTS()
Definition miscadmin.h:134
#define END_CRIT_SECTION()
Definition miscadmin.h:152
#define ERRCODE_DATA_CORRUPTED
static PgChecksumMode mode
static int64 current_size
#define WRITEBACK_MAX_PENDING_FLUSHES
#define DEFAULT_BACKEND_FLUSH_AFTER
#define DEFAULT_CHECKPOINT_FLUSH_AFTER
#define DEFAULT_BGWRITER_FLUSH_AFTER
#define PG_IOV_MAX
Definition pg_iovec.h:47
static char buf[DEFAULT_XLOG_SEG_SIZE]
IOObject
Definition pgstat.h:280
@ IOOBJECT_RELATION
Definition pgstat.h:281
@ IOOBJECT_TEMP_RELATION
Definition pgstat.h:282
#define pgstat_count_buffer_read(rel)
Definition pgstat.h:719
IOContext
Definition pgstat.h:289
@ IOCONTEXT_NORMAL
Definition pgstat.h:293
@ IOOP_EXTEND
Definition pgstat.h:318
@ IOOP_READ
Definition pgstat.h:319
@ IOOP_WRITEBACK
Definition pgstat.h:315
@ IOOP_HIT
Definition pgstat.h:313
@ IOOP_EVICT
Definition pgstat.h:311
@ IOOP_REUSE
Definition pgstat.h:314
@ IOOP_WRITE
Definition pgstat.h:320
#define pgstat_count_buffer_hit(rel)
Definition pgstat.h:724
PgStat_BgWriterStats PendingBgWriterStats
PgStat_CheckpointerStats PendingCheckpointerStats
void pgstat_prepare_report_checksum_failure(Oid dboid)
void pgstat_report_checksum_failures_in_db(Oid dboid, int failurecount)
instr_time pgstat_prepare_io_time(bool track_io_guc)
Definition pgstat_io.c:91
void pgstat_count_io_op(IOObject io_object, IOContext io_context, IOOp io_op, uint32 cnt, uint64 bytes)
Definition pgstat_io.c:68
void pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op, instr_time start_time, uint32 cnt, uint64 bytes)
Definition pgstat_io.c:122
#define qsort(a, b, c, d)
Definition port.h:495
void PGSemaphoreUnlock(PGSemaphore sema)
Definition posix_sema.c:335
void PGSemaphoreLock(PGSemaphore sema)
Definition posix_sema.c:315
static Datum PointerGetDatum(const void *X)
Definition postgres.h:352
uint64_t Datum
Definition postgres.h:70
static Pointer DatumGetPointer(Datum X)
Definition postgres.h:342
static int32 DatumGetInt32(Datum X)
Definition postgres.h:212
#define InvalidOid
unsigned int Oid
static int fb(int x)
#define NUM_AUXILIARY_PROCS
Definition proc.h:527
#define GetPGProcByNumber(n)
Definition proc.h:504
#define DELAY_CHKPT_START
Definition proc.h:136
#define proclist_delete(list, procno, link_member)
Definition proclist.h:187
static void proclist_init(proclist_head *list)
Definition proclist.h:29
#define proclist_push_tail(list, procno, link_member)
Definition proclist.h:191
#define proclist_foreach_modify(iter, lhead, link_member)
Definition proclist.h:206
static bool proclist_is_empty(const proclist_head *list)
Definition proclist.h:38
#define INVALID_PROC_NUMBER
Definition procnumber.h:26
int ProcNumber
Definition procnumber.h:24
void ProcessProcSignalBarrier(void)
Definition procsignal.c:501
void set_ps_display_remove_suffix(void)
Definition ps_status.c:439
void set_ps_display_suffix(const char *suffix)
Definition ps_status.c:387
char * psprintf(const char *fmt,...)
Definition psprintf.c:43
ReadStream * read_stream_begin_smgr_relation(int flags, BufferAccessStrategy strategy, SMgrRelation smgr, char smgr_persistence, ForkNumber forknum, ReadStreamBlockNumberCB callback, void *callback_private_data, size_t per_buffer_data_size)
Buffer read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
void read_stream_end(ReadStream *stream)
BlockNumber block_range_read_stream_cb(ReadStream *stream, void *callback_private_data, void *per_buffer_data)
#define READ_STREAM_USE_BATCHING
Definition read_stream.h:64
#define READ_STREAM_FULL
Definition read_stream.h:43
static unsigned hash(unsigned *uv, int n)
Definition rege_dfa.c:715
static SMgrRelation RelationGetSmgr(Relation rel)
Definition rel.h:576
#define RelationUsesLocalBuffers(relation)
Definition rel.h:646
#define RELATION_IS_OTHER_TEMP(relation)
Definition rel.h:667
#define RelationIsValid(relation)
Definition rel.h:489
#define RelFileLocatorBackendIsTemp(rlocator)
#define RelFileLocatorEquals(locator1, locator2)
ForkNumber
Definition relpath.h:56
@ MAIN_FORKNUM
Definition relpath.h:58
@ INIT_FORKNUM
Definition relpath.h:61
#define MAX_FORKNUM
Definition relpath.h:70
#define relpath(rlocator, forknum)
Definition relpath.h:150
#define relpathbackend(rlocator, backend, forknum)
Definition relpath.h:141
#define relpathperm(rlocator, forknum)
Definition relpath.h:146
ResourceOwner CurrentResourceOwner
Definition resowner.c:173
void ResourceOwnerEnlarge(ResourceOwner owner)
Definition resowner.c:449
#define RELEASE_PRIO_BUFFER_IOS
Definition resowner.h:62
@ RESOURCE_RELEASE_BEFORE_LOCKS
Definition resowner.h:54
#define RELEASE_PRIO_BUFFER_PINS
Definition resowner.h:63
void perform_spin_delay(SpinDelayStatus *status)
Definition s_lock.c:126
void finish_spin_delay(SpinDelayStatus *status)
Definition s_lock.c:186
#define init_local_spin_delay(status)
Definition s_lock.h:749
BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum)
Definition smgr.c:819
void smgrstartreadv(PgAioHandle *ioh, SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void **buffers, BlockNumber nblocks)
Definition smgr.c:753
void smgrwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks)
Definition smgr.c:805
SMgrRelation smgropen(RelFileLocator rlocator, ProcNumber backend)
Definition smgr.c:240
void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
Definition smgr.c:481
BlockNumber smgrnblocks_cached(SMgrRelation reln, ForkNumber forknum)
Definition smgr.c:847
uint32 smgrmaxcombine(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
Definition smgr.c:697
void smgrzeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks, bool skipFsync)
Definition smgr.c:649
void smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync)
Definition smgr.c:620
bool smgrexists(SMgrRelation reln, ForkNumber forknum)
Definition smgr.c:462
bool smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks)
Definition smgr.c:678
static void smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync)
Definition smgr.h:131
#define free(a)
void ProcSendSignal(ProcNumber procNumber)
Definition proc.c:1983
PGPROC * MyProc
Definition proc.c:67
int GetStartupBufferPinWaitBufId(void)
Definition proc.c:757
int DeadlockTimeout
Definition proc.c:58
void SetStartupBufferPinWaitBufId(int bufid)
Definition proc.c:745
void ProcWaitForSignal(uint32 wait_event_info)
Definition proc.c:1971
void ResolveRecoveryConflictWithBufferPin(void)
Definition standby.c:793
bool log_recovery_conflict_waits
Definition standby.c:42
void LogRecoveryConflict(RecoveryConflictReason reason, TimestampTz wait_start, TimestampTz now, VirtualTransactionId *wait_list, bool still_waiting)
Definition standby.c:274
@ RECOVERY_CONFLICT_BUFFERPIN
Definition standby.h:46
bool RelFileLocatorSkippingWAL(RelFileLocator rlocator)
Definition storage.c:573
SMgrRelation RelationCreateStorage(RelFileLocator rlocator, char relpersistence, bool register_delete)
Definition storage.c:122
void log_smgrcreate(const RelFileLocator *rlocator, ForkNumber forkNum)
Definition storage.c:187
BufferTag tag
pg_atomic_uint64 state
int64 shared_blks_dirtied
Definition instrument.h:28
int64 local_blks_hit
Definition instrument.h:30
int64 shared_blks_read
Definition instrument.h:27
int64 shared_blks_written
Definition instrument.h:29
int64 local_blks_read
Definition instrument.h:31
int64 shared_blks_hit
Definition instrument.h:26
int ckpt_bufs_written
Definition xlog.h:178
ForkNumber forkNum
RelFileNumber relNumber
BlockNumber blockNum
float8 progress_slice
Definition bufmgr.c:149
int num_scanned
Definition bufmgr.c:154
float8 progress
Definition bufmgr.c:148
int num_to_scan
Definition bufmgr.c:152
struct ErrorContextCallback * previous
Definition elog.h:297
void(* callback)(void *arg)
Definition elog.h:298
Definition proc.h:176
uint8 lwWaitMode
Definition proc.h:276
int delayChkptFlags
Definition proc.h:252
PGSemaphore sem
Definition proc.h:250
uint8 lwWaiting
Definition proc.h:275
PgAioHandleCallbackStage stage
Definition aio.h:219
uint32 status
Definition aio_types.h:108
uint32 error_data
Definition aio_types.h:111
uint32 id
Definition aio_types.h:105
PgAioResult result
Definition aio_types.h:132
PgStat_Counter buf_written_clean
Definition pgstat.h:246
PgStat_Counter maxwritten_clean
Definition pgstat.h:247
PgStat_Counter buf_alloc
Definition pgstat.h:248
PgStat_Counter buffers_written
Definition pgstat.h:270
Buffer recent_buffer
Definition bufmgr.h:61
BufferLockMode lockmode
Definition bufmgr.c:110
PrivateRefCountData data
Definition bufmgr.c:126
ForkNumber forknum
Definition bufmgr.h:137
PgAioWaitRef io_wref
Definition bufmgr.h:150
SMgrRelation smgr
Definition bufmgr.h:135
BufferAccessStrategy strategy
Definition bufmgr.h:138
BlockNumber blocknum
Definition bufmgr.h:146
PgAioReturn io_return
Definition bufmgr.h:151
RelFileLocator locator
RelFileNumber relNumber
char str[REL_PATH_STR_MAXLEN+1]
Definition relpath.h:123
RelFileLocator rd_locator
Definition rel.h:57
Form_pg_class rd_rel
Definition rel.h:111
const char * name
Definition resowner.h:93
RelFileLocatorBackend smgr_rlocator
Definition smgr.h:38
SMgrRelation srel
Definition bufmgr.c:170
RelFileLocator rlocator
Definition bufmgr.c:169
BlockNumber blockNum
RelFileNumber relNumber
ForkNumber forkNum
static uint64 table_relation_size(Relation rel, ForkNumber forkNumber)
Definition tableam.h:1858
BlockNumber blockNum
Definition aio_types.h:66
RelFileLocator rlocator
Definition aio_types.h:65
struct PgAioTargetData::@128 smgr
BlockNumber nblocks
Definition aio_types.h:67
ForkNumber forkNum
Definition aio_types.h:68
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition wait_event.h:69
static void pgstat_report_wait_end(void)
Definition wait_event.h:85
static volatile sig_atomic_t waiting
static TimestampTz wakeup[NUM_WALRCV_WAKEUPS]
bool RecoveryInProgress(void)
Definition xlog.c:6460
bool XLogNeedsFlush(XLogRecPtr record)
Definition xlog.c:3145
CheckpointStatsData CheckpointStats
Definition xlog.c:212
void XLogFlush(XLogRecPtr record)
Definition xlog.c:2783
#define CHECKPOINT_FLUSH_UNLOGGED
Definition xlog.h:154
#define CHECKPOINT_END_OF_RECOVERY
Definition xlog.h:151
#define CHECKPOINT_IS_SHUTDOWN
Definition xlog.h:150
#define XLogIsNeeded()
Definition xlog.h:111
#define XLogHintBitIsNeeded()
Definition xlog.h:122
#define XLogRecPtrIsValid(r)
Definition xlogdefs.h:29
uint64 XLogRecPtr
Definition xlogdefs.h:21
#define InvalidXLogRecPtr
Definition xlogdefs.h:28
XLogRecPtr XLogSaveBufferForHint(Buffer buffer, bool buffer_std)
XLogRecPtr log_newpage_buffer(Buffer buffer, bool page_std)
#define InHotStandby
Definition xlogutils.h:60