PostgreSQL Source Code git master
Loading...
Searching...
No Matches
bufmgr.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * bufmgr.c
4 * buffer manager interface routines
5 *
6 * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 *
10 * IDENTIFICATION
11 * src/backend/storage/buffer/bufmgr.c
12 *
13 *-------------------------------------------------------------------------
14 */
15/*
16 * Principal entry points:
17 *
18 * ReadBuffer() -- find or create a buffer holding the requested page,
19 * and pin it so that no one can destroy it while this process
20 * is using it.
21 *
22 * StartReadBuffer() -- as above, with separate wait step
23 * StartReadBuffers() -- multiple block version
24 * WaitReadBuffers() -- second step of above
25 *
26 * ReleaseBuffer() -- unpin a buffer
27 *
28 * MarkBufferDirty() -- mark a pinned buffer's contents as "dirty".
29 * The disk write is delayed until buffer replacement or checkpoint.
30 *
31 * See also these files:
32 * freelist.c -- chooses victim for buffer replacement
33 * buf_table.c -- manages the buffer lookup table
34 */
35#include "postgres.h"
36
37#include <sys/file.h>
38#include <unistd.h>
39
40#include "access/tableam.h"
41#include "access/xloginsert.h"
42#include "access/xlogutils.h"
43#ifdef USE_ASSERT_CHECKING
44#include "catalog/pg_tablespace_d.h"
45#endif
46#include "catalog/storage.h"
48#include "common/hashfn.h"
49#include "executor/instrument.h"
50#include "lib/binaryheap.h"
51#include "miscadmin.h"
52#include "pg_trace.h"
53#include "pgstat.h"
54#include "postmaster/bgwriter.h"
55#include "storage/aio.h"
57#include "storage/bufmgr.h"
58#include "storage/fd.h"
59#include "storage/ipc.h"
60#include "storage/lmgr.h"
61#include "storage/proc.h"
62#include "storage/proclist.h"
63#include "storage/procsignal.h"
64#include "storage/read_stream.h"
65#include "storage/smgr.h"
66#include "storage/standby.h"
67#include "utils/memdebug.h"
68#include "utils/ps_status.h"
69#include "utils/rel.h"
70#include "utils/resowner.h"
71#include "utils/timestamp.h"
72#include "utils/wait_event.h"
73
74
75/* Note: these two macros only work on shared buffers, not local ones! */
76#define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
77#define BufferGetLSN(bufHdr) (PageGetLSN(BufHdrGetBlock(bufHdr)))
78
79/* Note: this macro only works on local buffers, not shared ones! */
/*
 * Local buffer ids are negative; -(buf_id + 2) maps them back to a
 * non-negative index into LocalBufferBlockPointers.
 */
80#define LocalBufHdrGetBlock(bufHdr) \
81 LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
82
83/* Bits in SyncOneBuffer's return value */
84#define BUF_WRITTEN 0x01
85#define BUF_REUSABLE 0x02
86
/*
 * NOTE(review): no comment in this listing; presumably the relation count
 * above which sorted-array lookups switch to bsearch() — the uses are not
 * visible in this excerpt, confirm against upstream bufmgr.c.
 */
87#define RELS_BSEARCH_THRESHOLD 20
88
89/*
90 * This is the size (in the number of blocks) above which we scan the
91 * entire buffer pool to remove the buffers for all the pages of relation
92 * being dropped. For the relations with size below this threshold, we find
93 * the buffers by doing lookups in BufMapping table.
94 */
95#define BUF_DROP_FULL_SCAN_THRESHOLD (uint64) (NBuffers / 32)
96
97/*
98 * This is separated out from PrivateRefCountEntry to allow for copying all
99 * the data members via struct assignment.
100 */
102{
103 /*
104 * How many times has the buffer been pinned by this backend.
105 */
107
108 /*
109 * Is the buffer locked by this backend? BUFFER_LOCK_UNLOCK indicates that
110 * the buffer is not locked.
111 */
114
116{
117 /*
118 * Note that this needs to be same as the entry's corresponding
119 * PrivateRefCountArrayKeys[i], if the entry is stored in the array. We
120 * store it in both places as this is used for the hashtable key and
121 * because it is more convenient (passing around a PrivateRefCountEntry
122 * suffices to identify the buffer) and faster (checking the keys array is
123 * faster when checking many entries, checking the entry is faster if just
124 * checking a single entry).
125 */
127
128 char status;
129
132
133#define SH_PREFIX refcount
134#define SH_ELEMENT_TYPE PrivateRefCountEntry
135#define SH_KEY_TYPE Buffer
136#define SH_KEY buffer
137#define SH_HASH_KEY(tb, key) murmurhash32((uint32) (key))
138#define SH_EQUAL(tb, a, b) ((a) == (b))
139#define SH_SCOPE static inline
140#define SH_DECLARE
141#define SH_DEFINE
142#include "lib/simplehash.h"
143
144/* 64 bytes, about the size of a cache line on common systems */
145#define REFCOUNT_ARRAY_ENTRIES 8
146
147/*
148 * Status of buffers to checkpoint for a particular tablespace, used
149 * internally in BufferSync.
150 */
151typedef struct CkptTsStatus
152{
153 /* oid of the tablespace */
155
156 /*
157 * Checkpoint progress for this tablespace. To make progress comparable
158 * between tablespaces the progress is, for each tablespace, measured as a
159 * number between 0 and the total number of to-be-checkpointed pages. Each
160 * page checkpointed in this tablespace increments this space's progress
161 * by progress_slice.
162 */
165
166 /* number of to-be checkpointed pages in this tablespace */
168 /* already processed pages in this tablespace */
170
171 /* current offset in CkptBufferIds for this tablespace */
172 int index;
174
175/*
176 * Type for array used to sort SMgrRelations
177 *
178 * FlushRelationsAllBuffers shares the same comparator function with
179 * DropRelationsAllBuffers. Pointer to this struct and RelFileLocator must be
180 * compatible.
181 */
182typedef struct SMgrSortArray
183{
184 RelFileLocator rlocator; /* This must be the first member */
187
188/* GUC variables */
192bool track_io_timing = false;
193
194/*
195 * How many buffers PrefetchBuffer callers should try to stay ahead of their
196 * ReadBuffer calls by. Zero means "never prefetch". This value is only used
197 * for buffers not belonging to tablespaces that have their
198 * effective_io_concurrency parameter set.
199 */
201
202/*
203 * Like effective_io_concurrency, but used by maintenance code paths that might
204 * benefit from a higher setting because they work on behalf of many sessions.
205 * Overridden by the tablespace setting of the same name.
206 */
208
209/*
210 * Limit on how many blocks should be handled in single I/O operations.
211 * StartReadBuffers() callers should respect it, as should other operations
212 * that call smgr APIs directly. It is computed as the minimum of underlying
213 * GUCs io_combine_limit_guc and io_max_combine_limit.
214 */
218
219/*
220 * GUC variables about triggering kernel writeback for buffers written; OS
221 * dependent defaults are set via the GUC mechanism.
222 */
226
227/* local state for LockBufferForCleanup */
229
230/*
231 * Backend-Private refcount management:
232 *
233 * Each buffer also has a private refcount that keeps track of the number of
234 * times the buffer is pinned in the current process. This is so that the
235 * shared refcount needs to be modified only once if a buffer is pinned more
236 * than once by an individual backend. It's also used to check that no
237 * buffers are still pinned at the end of transactions and when exiting. We
238 * also use this mechanism to track whether this backend has a buffer locked,
239 * and, if so, in what mode.
240 *
241 *
242 * To avoid - as we used to - requiring an array with NBuffers entries to keep
243 * track of local buffers, we use a small sequentially searched array
244 * (PrivateRefCountArrayKeys, with the corresponding data stored in
245 * PrivateRefCountArray) and an overflow hash table (PrivateRefCountHash) to
246 * keep track of backend local pins.
247 *
248 * Until no more than REFCOUNT_ARRAY_ENTRIES buffers are pinned at once, all
249 * refcounts are kept track of in the array; after that, new array entries
250 * displace old ones into the hash table. That way a frequently used entry
251 * can't get "stuck" in the hashtable while infrequent ones clog the array.
252 *
253 * Note that in most scenarios the number of pinned buffers will not exceed
254 * REFCOUNT_ARRAY_ENTRIES.
255 *
256 *
257 * To enter a buffer into the refcount tracking mechanism first reserve a free
258 * entry using ReservePrivateRefCountEntry() and then later, if necessary,
259 * fill it with NewPrivateRefCountEntry(). That split lets us avoid doing
260 * memory allocations in NewPrivateRefCountEntry() which can be important
261 * because in some scenarios it's called with a spinlock held...
262 */
268static int ReservedRefCountSlot = -1;
270
272
273static void ReservePrivateRefCountEntry(void);
278
279/* ResourceOwner callbacks to hold in-progress I/Os and buffer pins */
280static void ResOwnerReleaseBufferIO(Datum res);
281static char *ResOwnerPrintBufferIO(Datum res);
282static void ResOwnerReleaseBuffer(Datum res);
283static char *ResOwnerPrintBuffer(Datum res);
284
286{
287 .name = "buffer io",
288 .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
289 .release_priority = RELEASE_PRIO_BUFFER_IOS,
290 .ReleaseResource = ResOwnerReleaseBufferIO,
291 .DebugPrint = ResOwnerPrintBufferIO
292};
293
295{
296 .name = "buffer",
297 .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
298 .release_priority = RELEASE_PRIO_BUFFER_PINS,
299 .ReleaseResource = ResOwnerReleaseBuffer,
300 .DebugPrint = ResOwnerPrintBuffer
301};
302
303/*
 304 * Ensure that the PrivateRefCountArray has sufficient space to store one more
 305 * entry. This has to be called before using NewPrivateRefCountEntry() to fill
 306 * a new entry - but it's perfectly fine to not use a reserved entry.
 307 */
/*
 * NOTE(review): this doxygen text export is missing upstream lines (the
 * function-name line 309 and several statements, e.g. the free-slot test in
 * the first loop and the victim selection/hash-insert calls). Per the
 * forward declaration above, this is ReservePrivateRefCountEntry(void).
 * Consult upstream src/backend/storage/buffer/bufmgr.c before editing.
 */
308static void
310{
 311 /* Already reserved (or freed), nothing to do */
 312 if (ReservedRefCountSlot != -1)
 313 return;
 314
 315 /*
 316 * First search for a free entry the array, that'll be sufficient in the
 317 * majority of cases.
 318 */
 319 {
 320 int i;
 321
 322 for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
 323 {
 325 {
 327
 328 /*
 329 * We could return immediately, but iterating till the end of
 330 * the array allows compiler-autovectorization.
 331 */
 332 }
 333 }
 334
 335 if (ReservedRefCountSlot != -1)
 336 return;
 337 }
 338
 339 /*
 340 * No luck. All array entries are full. Move one array entry into the hash
 341 * table.
 342 */
 343 {
 344 /*
 345 * Move entry from the current clock position in the array into the
 346 * hashtable. Use that slot.
 347 */
 348 int victim_slot;
 351 bool found;
 352
 353 /* select victim slot */
 357
 358 /* Better be used, otherwise we shouldn't get here. */
 362
 363 /* enter victim array entry into hashtable */
 366 &found);
 367 Assert(!found);
 368 /* move data from the entry in the array to the hash entry */
 369 hashent->data = victim_entry->data;
 370
 371 /* clear the now free array slot */
 373 victim_entry->buffer = InvalidBuffer;
 374
 375 /* clear the whole data member, just for future proofing */
 376 memset(&victim_entry->data, 0, sizeof(victim_entry->data));
 377 victim_entry->data.refcount = 0;
 378 victim_entry->data.lockmode = BUFFER_LOCK_UNLOCK;
 379
 381 }
 382}
383
384/*
 385 * Fill a previously reserved refcount entry.
 386 */
/*
 * NOTE(review): signature and local-declaration lines are missing from this
 * listing (gaps at upstream lines 387-388, 390, 393, 396, 399, 402, 405,
 * 407). Presumably NewPrivateRefCountEntry(Buffer buffer), returning the
 * filled PrivateRefCountEntry * — confirm against upstream bufmgr.c.
 * Visible behavior: consumes the reserved slot, sets buffer/refcount=0,
 * updates the one-entry lookup cache, returns the entry.
 */
389{
 391
 392 /* only allowed to be called when a reservation has been made */
 394
 395 /* use up the reserved entry */
 397
 398 /* and fill it */
 400 res->buffer = buffer;
 401 res->data.refcount = 0;
 403
 404 /* update cache for the next lookup */
 406
 408
 409 return res;
 410}
411
412/*
 413 * Slow-path for GetPrivateRefCountEntry(). This is big enough to not be worth
 414 * inlining. This particularly seems to be true if the compiler is capable of
 415 * auto-vectorizing the code, as that imposes additional stack-alignment
 416 * requirements etc.
 417 */
/*
 * NOTE(review): the function-name line and several statements (array-key
 * comparison, hashtable-presence test, lookup/delete calls, local
 * declarations) are missing from this listing — see the jumps in the
 * embedded upstream line numbers. Per the header comment this is
 * GetPrivateRefCountEntrySlow(buffer, do_move); confirm upstream before
 * editing. Visible behavior: linear-scan the small array first, optionally
 * migrate a hashtable entry into a free array slot when do_move is set.
 */
420{
 422 int match = -1;
 423 int i;
 424
 425 /*
 426 * First search for references in the array, that'll be sufficient in the
 427 * majority of cases.
 428 */
 429 for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
 430 {
 432 {
 433 match = i;
 434 /* see ReservePrivateRefCountEntry() for why we don't return */
 435 }
 436 }
 437
 438 if (likely(match != -1))
 439 {
 440 /* update cache for the next lookup */
 442
 443 return &PrivateRefCountArray[match];
 444 }
 445
 446 /*
 447 * By here we know that the buffer, if already pinned, isn't residing in
 448 * the array.
 449 *
 450 * Only look up the buffer in the hashtable if we've previously overflowed
 451 * into it.
 452 */
 454 return NULL;
 455
 457
 458 if (res == NULL)
 459 return NULL;
 460 else if (!do_move)
 461 {
 462 /* caller doesn't want us to move the hash entry into the array */
 463 return res;
 464 }
 465 else
 466 {
 467 /* move buffer from hashtable into the free array slot */
 470
 471 /* Save data and delete from hashtable while res is still valid */
 472 data = res->data;
 476
 477 /* Ensure there's a free array slot */
 479
 480 /* Use up the reserved slot */
 484 Assert(free->buffer == InvalidBuffer);
 485
 486 /* and fill it */
 487 free->buffer = buffer;
 488 free->data = data;
 490 /* update cache for the next lookup */
 492
 494
 495 return free;
 496 }
 497}
498
499/*
 500 * Return the PrivateRefCount entry for the passed buffer.
 501 *
 502 * Returns NULL if a buffer doesn't have a refcount entry. Otherwise, if
 503 * do_move is true, and the entry resides in the hashtable the entry is
 504 * optimized for frequent access by moving it to the array.
 505 */
/*
 * NOTE(review): the parameter line, the cache-hit body, and the tail call
 * into GetPrivateRefCountEntrySlow() are missing from this listing (gaps at
 * upstream lines 507, 509-510, 523, 525, 533). Only the one-entry-cache
 * fast path commentary survives; confirm details upstream before editing.
 */
506static inline PrivateRefCountEntry *
508{
 511
 512 /*
 513 * It's very common to look up the same buffer repeatedly. To make that
 514 * fast, we have a one-entry cache.
 515 *
 516 * In contrast to the loop in GetPrivateRefCountEntrySlow(), here it
 517 * faster to check PrivateRefCountArray[].buffer, as in the case of a hit
 518 * fewer addresses are computed and fewer cachelines are accessed. Whereas
 519 * in GetPrivateRefCountEntrySlow()'s case, checking
 520 * PrivateRefCountArrayKeys saves a lot of memory accesses.
 521 */
 522 if (likely(PrivateRefCountEntryLast != -1) &&
 524 {
 526 }
 527
 528 /*
 529 * The code for the cached lookup is small enough to be worth inlining
 530 * into the caller. In the miss case however, that empirically doesn't
 531 * seem worth it.
 532 */
 534}
535
536/*
 537 * Returns how many times the passed buffer is pinned by this backend.
 538 *
 539 * Only works for shared memory buffers!
 540 */
/*
 * NOTE(review): the name/parameter lines and the GetPrivateRefCountEntry()
 * call that initializes 'ref' are missing from this listing (gaps at
 * upstream lines 542, 544, 546-547, 553). Used by the BufferIsPinned()
 * macro below as GetPrivateRefCount(bufnum); returns 0 when this backend
 * holds no pin.
 */
541static inline int32
543{
 545
 548
 549 /*
 550 * Not moving the entry - that's ok for the current users, but we might
 551 * want to change this one day.
 552 */
 554
 555 if (ref == NULL)
 556 return 0;
 557 return ref->data.refcount;
 558}
559
560/*
 561 * Release resources used to track the reference count of a buffer which we no
 562 * longer have pinned and don't want to pin again immediately.
 563 */
/*
 * NOTE(review): the name line and several statements are missing from this
 * listing (gaps at upstream lines 565, 571, 574, 582, 586-588) — the
 * function name is not visible here; it takes a PrivateRefCountEntry *ref
 * per the visible body. Visible behavior: an entry that lives inside
 * PrivateRefCountArray is cleared and (per the comment) re-reserved; an
 * overflow/hashtable entry takes the else branch, whose statements are
 * missing — confirm upstream before editing.
 */
564static void
566{
 567 Assert(ref->data.refcount == 0);
 568 Assert(ref->data.lockmode == BUFFER_LOCK_UNLOCK);
 569
 570 if (ref >= &PrivateRefCountArray[0] &&
 572 {
 573 ref->buffer = InvalidBuffer;
 575
 576
 577 /*
 578 * Mark the just used entry as reserved - in many scenarios that
 579 * allows us to avoid ever having to search the array/hash for free
 580 * entries.
 581 */
 583 }
 584 else
 585 {
 589 }
 590}
591
592/*
 593 * BufferIsPinned
 594 * True iff the buffer is pinned (also checks for valid buffer number).
 595 *
 596 * NOTE: what we check here is that *this* backend holds a pin on
 597 * the buffer. We do not care whether some other backend does.
 598 */
/*
 * Local buffers are identified by negative buffer numbers; their per-backend
 * pin counts live in LocalRefCount, indexed by -(bufnum) - 1. Shared buffers
 * go through GetPrivateRefCount() above.
 */
599#define BufferIsPinned(bufnum) \
600( \
 601 !BufferIsValid(bufnum) ? \
 602 false \
 603 : \
 604 BufferIsLocal(bufnum) ? \
 605 (LocalRefCount[-(bufnum) - 1] > 0) \
 606 : \
 607 (GetPrivateRefCount(bufnum) > 0) \
608)
609
610
613 ForkNumber forkNum, BlockNumber blockNum,
617 BufferAccessStrategy strategy,
618 uint32 flags,
621 Buffer *buffers,
625 BufferAccessStrategy strategy,
626 uint32 flags,
629 Buffer *buffers,
631static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy,
632 bool skip_if_not_valid);
633static void PinBuffer_Locked(BufferDesc *buf);
634static void UnpinBuffer(BufferDesc *buf);
635static void UnpinBufferNoOwner(BufferDesc *buf);
636static void BufferSync(int flags);
637static int SyncOneBuffer(int buf_id, bool skip_recently_used,
639static void WaitIO(BufferDesc *buf);
640static void AbortBufferIO(Buffer buffer);
641static void shared_buffer_write_error_callback(void *arg);
642static void local_buffer_write_error_callback(void *arg);
643static inline BufferDesc *BufferAlloc(SMgrRelation smgr,
644 char relpersistence,
645 ForkNumber forkNum,
646 BlockNumber blockNum,
647 BufferAccessStrategy strategy,
649static bool AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress);
650static void CheckReadBuffersOperation(ReadBuffersOperation *operation, bool is_complete);
656static void FindAndDropRelationBuffers(RelFileLocator rlocator,
657 ForkNumber forkNum,
662 ForkNumber forkNum, bool permanent);
663static void AtProcExit_Buffers(int code, Datum arg);
664static void CheckForBufferLeaks(void);
665#ifdef USE_ASSERT_CHECKING
667#endif
668static int rlocator_comparator(const void *p1, const void *p2);
669static inline int buffertag_comparator(const BufferTag *ba, const BufferTag *bb);
670static inline int ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b);
671static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg);
672
678static inline void BufferLockDisown(Buffer buffer, BufferDesc *buf_hdr);
683static void BufferLockWakeup(BufferDesc *buf_hdr, bool unlocked);
686
687
688/*
 689 * Implementation of PrefetchBuffer() for shared buffers.
 690 */
/*
 * NOTE(review): the name/first-parameter lines and the statements computing
 * newHash/newPartitionLock and taking/releasing the mapping partition lock
 * are missing from this listing (gaps at upstream lines 691-692, 709-710,
 * 713, 715). The function is PrefetchSharedBuffer(smgr_reln, forkNum,
 * blockNum) per the call in PrefetchBuffer() below; confirm the missing
 * locking details upstream before editing.
 */
 693 ForkNumber forkNum,
 694 BlockNumber blockNum)
695{
 696 PrefetchBufferResult result = {InvalidBuffer, false};
 697 BufferTag newTag; /* identity of requested block */
 698 uint32 newHash; /* hash value for newTag */
 699 LWLock *newPartitionLock; /* buffer partition lock for it */
 700 int buf_id;
 701
 702 Assert(BlockNumberIsValid(blockNum));
 703
 704 /* create a tag so we can lookup the buffer */
 705 InitBufferTag(&newTag, &smgr_reln->smgr_rlocator.locator,
 706 forkNum, blockNum);
 707
 708 /* determine its hash code and partition lock ID */
 711
 712 /* see if the block is in the buffer pool already */
 714 buf_id = BufTableLookup(&newTag, newHash);
 716
 717 /* If not in buffers, initiate prefetch */
 718 if (buf_id < 0)
 719 {
720#ifdef USE_PREFETCH
 721 /*
 722 * Try to initiate an asynchronous read. This returns false in
 723 * recovery if the relation file doesn't exist.
 724 */
 725 if ((io_direct_flags & IO_DIRECT_DATA) == 0 &&
 726 smgrprefetch(smgr_reln, forkNum, blockNum, 1))
 727 {
 728 result.initiated_io = true;
 729 }
730#endif /* USE_PREFETCH */
 731 }
 732 else
 733 {
 734 /*
 735 * Report the buffer it was in at that time. The caller may be able
 736 * to avoid a buffer table lookup, but it's not pinned and it must be
 737 * rechecked!
 738 */
 739 result.recent_buffer = buf_id + 1;
 740 }
 741
 742 /*
 743 * If the block *is* in buffers, we do nothing. This is not really ideal:
 744 * the block might be just about to be evicted, which would be stupid
 745 * since we know we are going to need it soon. But the only easy answer
 746 * is to bump the usage_count, which does not seem like a great solution:
 747 * when the caller does ultimately touch the block, usage_count would get
 748 * bumped again, resulting in too much favoritism for blocks that are
 749 * involved in a prefetch sequence. A real fix would involve some
 750 * additional per-buffer state, and it's not clear that there's enough of
 751 * a problem to justify that.
 752 */
 753
 754 return result;
 755}
756
757/*
 758 * PrefetchBuffer -- initiate asynchronous read of a block of a relation
 759 *
 760 * This is named by analogy to ReadBuffer but doesn't actually allocate a
 761 * buffer. Instead it tries to ensure that a future ReadBuffer for the given
 762 * block will not be delayed by the I/O. Prefetching is optional.
 763 *
 764 * There are three possible outcomes:
 765 *
 766 * 1. If the block is already cached, the result includes a valid buffer that
 767 * could be used by the caller to avoid the need for a later buffer lookup, but
 768 * it's not pinned, so the caller must recheck it.
 769 *
 770 * 2. If the kernel has been asked to initiate I/O, the initiated_io member is
 771 * true. Currently there is no way to know if the data was already cached by
 772 * the kernel and therefore didn't really initiate I/O, and no way to know when
 773 * the I/O completes other than using synchronous ReadBuffer().
 774 *
 775 * 3. Otherwise, the buffer wasn't already cached by PostgreSQL, and
 776 * USE_PREFETCH is not defined (this build doesn't support prefetching due to
 777 * lack of a kernel facility), direct I/O is enabled, or the underlying
 778 * relation file wasn't found and we are in recovery. (If the relation file
 779 * wasn't found and we are not in recovery, an error is raised).
 780 */
/*
 * NOTE(review): the signature lines and the temp-relation/ereport guard
 * condition are missing from this listing (gaps at upstream lines 781-782,
 * 784, 787, 790-792). Per the header comment this is PrefetchBuffer(reln,
 * forkNum, blockNum); it dispatches to PrefetchLocalBuffer() for this
 * session's temp relations and to PrefetchSharedBuffer() otherwise.
 */
783{
 785 Assert(BlockNumberIsValid(blockNum));
 786
 788 {
 789 /* see comments in ReadBufferExtended */
 793 errmsg("cannot access temporary tables of other sessions")));
 794
 795 /* pass it off to localbuf.c */
 796 return PrefetchLocalBuffer(RelationGetSmgr(reln), forkNum, blockNum);
 797 }
 798 else
 799 {
 800 /* pass it to the shared buffer version */
 801 return PrefetchSharedBuffer(RelationGetSmgr(reln), forkNum, blockNum);
 802 }
803}
804
805/*
 806 * ReadRecentBuffer -- try to pin a block in a recently observed buffer
 807 *
 808 * Compared to ReadBuffer(), this avoids a buffer mapping lookup when it's
 809 * successful. Return true if the buffer is valid and still has the expected
 810 * tag. In that case, the buffer is pinned and the usage count is bumped.
 811 */
/*
 * NOTE(review): the first signature line (rlocator/forkNum/blockNum
 * parameters), local declarations (bufHdr, buf_state), and several
 * statements — including the local buf_state fetch, the pgstat hit
 * counters, and the unpin on the tag-recheck failure path — are missing
 * from this listing (gaps at upstream lines 813, 816, 818, 822-823,
 * 830-831, 838, 859, 862). Confirm upstream before editing.
 */
812bool
 814 Buffer recent_buffer)
815{
 817 BufferTag tag;
 819
 820 Assert(BufferIsValid(recent_buffer));
 821
 824 InitBufferTag(&tag, &rlocator, forkNum, blockNum);
 825
 826 if (BufferIsLocal(recent_buffer))
 827 {
 828 int b = -recent_buffer - 1;
 829
 832
 833 /* Is it still valid and holding the right tag? */
 834 if ((buf_state & BM_VALID) && BufferTagsEqual(&tag, &bufHdr->tag))
 835 {
 836 PinLocalBuffer(bufHdr, true);
 837
 839
 840 return true;
 841 }
 842 }
 843 else
 844 {
 845 bufHdr = GetBufferDescriptor(recent_buffer - 1);
 846
 847 /*
 848 * Is it still valid and holding the right tag? We do an unlocked tag
 849 * comparison first, to make it unlikely that we'll increment the
 850 * usage counter of the wrong buffer, if someone calls us with a very
 851 * out of date recent_buffer. Then we'll check it again if we get the
 852 * pin.
 853 */
 854 if (BufferTagsEqual(&tag, &bufHdr->tag) &&
 855 PinBuffer(bufHdr, NULL, true))
 856 {
 857 if (BufferTagsEqual(&tag, &bufHdr->tag))
 858 {
 860 return true;
 861 }
 863 }
 864 }
 865
 866 return false;
867}
868
869/*
 870 * ReadBuffer -- a shorthand for ReadBufferExtended, for reading from main
 871 * fork with RBM_NORMAL mode and default strategy.
 872 */
/*
 * NOTE(review): the entire definition body (upstream lines 874-877) is
 * missing from this listing — only the return-type line survives. Per the
 * header comment it forwards to ReadBufferExtended(reln, MAIN_FORKNUM,
 * blockNum, RBM_NORMAL, NULL); confirm upstream before editing.
 */
873Buffer
878
879/*
 880 * ReadBufferExtended -- returns a buffer containing the requested
 881 * block of the requested relation. If the blknum
 882 * requested is P_NEW, extend the relation file and
 883 * allocate a new block. (Caller is responsible for
 884 * ensuring that only one backend tries to extend a
 885 * relation at the same time!)
 886 *
 887 * Returns: the buffer number for the buffer containing
 888 * the block read. The returned buffer has been pinned.
 889 * Does not return on error --- elog's instead.
 890 *
 891 * Assume when this function is called, that reln has been opened already.
 892 *
 893 * In RBM_NORMAL mode, the page is read from disk, and the page header is
 894 * validated. An error is thrown if the page header is not valid. (But
 895 * note that an all-zero page is considered "valid"; see
 896 * PageIsVerified().)
 897 *
 898 * RBM_ZERO_ON_ERROR is like the normal mode, but if the page header is not
 899 * valid, the page is zeroed instead of throwing an error. This is intended
 900 * for non-critical data, where the caller is prepared to repair errors.
 901 *
 902 * In RBM_ZERO_AND_LOCK mode, if the page isn't in buffer cache already, it's
 903 * filled with zeros instead of reading it from disk. Useful when the caller
 904 * is going to fill the page from scratch, since this saves I/O and avoids
 905 * unnecessary failure if the page-on-disk has corrupt page headers.
 906 * The page is returned locked to ensure that the caller has a chance to
 907 * initialize the page before it's made visible to others.
 908 * Caution: do not use this mode to read a page that is beyond the relation's
 909 * current physical EOF; that is likely to cause problems in md.c when
 910 * the page is modified and written out. P_NEW is OK, though.
 911 *
 912 * RBM_ZERO_AND_CLEANUP_LOCK is the same as RBM_ZERO_AND_LOCK, but acquires
 913 * a cleanup-strength lock on the page.
 914 *
 915 * RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here.
 916 *
 917 * If strategy is not NULL, a nondefault buffer access strategy is used.
 918 * See buffer/README for details.
 919 */
/*
 * NOTE(review): the signature lines, the temp-relation guard condition with
 * its ereport() call, and the ReadBuffer_common() invocation line are
 * missing from this listing (gaps at upstream lines 921-922, 931-933, 940).
 * Confirm the exact parameter list and guard upstream before editing.
 */
920inline Buffer
923{
 924 Buffer buf;
 925
 926 /*
 927 * Reject attempts to read non-local temporary relations; we would be
 928 * likely to get wrong data since we have no visibility into the owning
 929 * session's local buffers.
 930 */
 934 errmsg("cannot access temporary tables of other sessions")));
 935
 936 /*
 937 * Read the buffer, and update pgstat counters to reflect a cache hit or
 938 * miss.
 939 */
 941 forkNum, blockNum, mode, strategy);
 942
 943 return buf;
944}
945
946
947/*
 948 * ReadBufferWithoutRelcache -- like ReadBufferExtended, but doesn't require
 949 * a relcache entry for the relation.
 950 *
 951 * Pass permanent = true for a RELPERSISTENCE_PERMANENT relation, and
 952 * permanent = false for a RELPERSISTENCE_UNLOGGED relation. This function
 953 * cannot be used for temporary relations (and making that work might be
 954 * difficult, unless we only want to read temporary relations for our own
 955 * ProcNumber).
 956 */
/*
 * NOTE(review): the signature lines and the persistence argument to
 * ReadBuffer_common() are missing from this listing (gaps at upstream lines
 * 958-959, 965). Visible behavior: opens the smgr relation with
 * INVALID_PROC_NUMBER (no owning backend) and delegates to
 * ReadBuffer_common() with a NULL Relation.
 */
957Buffer
 960 BufferAccessStrategy strategy, bool permanent)
961{
 962 SMgrRelation smgr = smgropen(rlocator, INVALID_PROC_NUMBER);
 963
 964 return ReadBuffer_common(NULL, smgr,
 966 forkNum, blockNum,
 967 mode, strategy);
968}
969
970/*
 971 * Convenience wrapper around ExtendBufferedRelBy() extending by one block.
 972 */
/*
 * NOTE(review): the name/first-parameter line (upstream 974) is missing from
 * this listing. Visible behavior: requests a one-block extension and
 * returns the single resulting pinned buffer.
 */
973Buffer
 975 ForkNumber forkNum,
 976 BufferAccessStrategy strategy,
 977 uint32 flags)
978{
 979 Buffer buf;
 980 uint32 extend_by = 1;
 981
 982 ExtendBufferedRelBy(bmr, forkNum, strategy, flags, extend_by,
 983 &buf, &extend_by);
 984
 985 return buf;
986}
987
988/*
 989 * Extend relation by multiple blocks.
 990 *
 991 * Tries to extend the relation by extend_by blocks. Depending on the
 992 * availability of resources the relation may end up being extended by a
 993 * smaller number of pages (unless an error is thrown, always by at least one
 994 * page). *extended_by is updated to the number of pages the relation has been
 995 * extended to.
 996 *
 997 * buffers needs to be an array that is at least extend_by long. Upon
 998 * completion, the first extend_by array elements will point to a pinned
 999 * buffer.
 1000 *
 1001 * If EB_LOCK_FIRST is part of flags, the first returned buffer is
 1002 * locked. This is useful for callers that want a buffer that is guaranteed to
 1003 * be empty.
 1004 */
/*
 * NOTE(review): the return-type/name line, some parameter lines, and the
 * middle arguments of the ExtendBufferedRelCommon() call are missing from
 * this listing (gaps at upstream lines 1005-1007, 1010, 1012, 1022). Per
 * the wrapper above, this is ExtendBufferedRelBy(bmr, fork, strategy,
 * flags, extend_by, buffers, extended_by); confirm upstream before editing.
 */
 1008 BufferAccessStrategy strategy,
 1009 uint32 flags,
 1011 Buffer *buffers,
1013{
 1014 Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
 1015 Assert(bmr.smgr == NULL || bmr.relpersistence != '\0');
 1016 Assert(extend_by > 0);
 1017
 1018 if (bmr.relpersistence == '\0')
 1019 bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
 1020
 1021 return ExtendBufferedRelCommon(bmr, fork, strategy, flags,
 1023 buffers, extended_by);
1024}
1025
1026/*
 1027 * Extend the relation so it is at least extend_to blocks large, return buffer
 1028 * (extend_to - 1).
 1029 *
 1030 * This is useful for callers that want to write a specific page, regardless
 1031 * of the current size of the relation (e.g. useful for visibilitymap and for
 1032 * crash recovery).
 1033 */
/*
 * NOTE(review): the signature lines, several local declarations (buffer,
 * current_size, first_block), the fork-creation statements, the
 * smgrnblocks() size probe, and parts of the extension loop are missing
 * from this listing (gaps at upstream lines 1035-1036, 1039-1040, 1042,
 * 1044, 1049, 1062, 1064, 1067-1068, 1070, 1084, 1092, 1098, 1107).
 * Visible behavior: optionally creates the fork, loops extending in
 * batches of up to lengthof(buffers) pages, releases all but the target
 * buffer, and falls back to ReadBuffer_common() if another backend
 * already extended the relation past extend_to.
 */
1034Buffer
 1037 BufferAccessStrategy strategy,
 1038 uint32 flags,
1041{
 1043 uint32 extended_by = 0;
 1045 Buffer buffers[64];
 1046
 1047 Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
 1048 Assert(bmr.smgr == NULL || bmr.relpersistence != '\0');
 1050
 1051 if (bmr.relpersistence == '\0')
 1052 bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
 1053
 1054 /*
 1055 * If desired, create the file if it doesn't exist. If
 1056 * smgr_cached_nblocks[fork] is positive then it must exist, no need for
 1057 * an smgrexists call.
 1058 */
 1059 if ((flags & EB_CREATE_FORK_IF_NEEDED) &&
 1060 (BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] == 0 ||
 1061 BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] == InvalidBlockNumber) &&
 1063 {
 1065
 1066 /* recheck, fork might have been created concurrently */
 1069
 1071 }
 1072
 1073 /*
 1074 * If requested, invalidate size cache, so that smgrnblocks asks the
 1075 * kernel.
 1076 */
 1077 if (flags & EB_CLEAR_SIZE_CACHE)
 1078 BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] = InvalidBlockNumber;
 1079
 1080 /*
 1081 * Estimate how many pages we'll need to extend by. This avoids acquiring
 1082 * unnecessarily many victim buffers.
 1083 */
 1085
 1086 /*
 1087 * Since no-one else can be looking at the page contents yet, there is no
 1088 * difference between an exclusive lock and a cleanup-strength lock. Note
 1089 * that we pass the original mode to ReadBuffer_common() below, when
 1090 * falling back to reading the buffer to a concurrent relation extension.
 1091 */
 1093 flags |= EB_LOCK_TARGET;
 1094
 1095 while (current_size < extend_to)
 1096 {
 1097 uint32 num_pages = lengthof(buffers);
 1099
 1100 if ((uint64) current_size + num_pages > extend_to)
 1101 num_pages = extend_to - current_size;
 1102
 1103 first_block = ExtendBufferedRelCommon(bmr, fork, strategy, flags,
 1104 num_pages, extend_to,
 1105 buffers, &extended_by);
 1106
 1108 Assert(num_pages != 0 || current_size >= extend_to);
 1109
 1110 for (uint32 i = 0; i < extended_by; i++)
 1111 {
 1112 if (first_block + i != extend_to - 1)
 1113 ReleaseBuffer(buffers[i]);
 1114 else
 1115 buffer = buffers[i];
 1116 }
 1117 }
 1118
 1119 /*
 1120 * It's possible that another backend concurrently extended the relation.
 1121 * In that case read the buffer.
 1122 *
 1123 * XXX: Should we control this via a flag?
 1124 */
 1125 if (buffer == InvalidBuffer)
 1126 {
 1127 Assert(extended_by == 0);
 1128 buffer = ReadBuffer_common(bmr.rel, BMR_GET_SMGR(bmr), bmr.relpersistence,
 1129 fork, extend_to - 1, mode, strategy);
 1130 }
 1131
 1132 return buffer;
1133}
1134
1135/*
 1136 * Lock and optionally zero a buffer, as part of the implementation of
 1137 * RBM_ZERO_AND_LOCK or RBM_ZERO_AND_CLEANUP_LOCK. The buffer must be already
 1138 * pinned. If the buffer is not already valid, it is zeroed and made valid.
 1139 */
/*
 * NOTE(review): the name/parameter line, local declarations (bufHdr,
 * isLocalBuf), the MemSet that actually zeroes the page, and the
 * content-lock acquisition statements are missing from this listing (gaps
 * at upstream lines 1141, 1143, 1145, 1147, 1160, 1173, 1179, 1193, 1208,
 * 1210). The function name is not visible here; it is a static helper
 * taking (buffer, mode, already_valid) per the visible uses — confirm
 * against upstream before editing.
 */
1140static void
1142{
 1144 bool need_to_zero;
 1146
 1148
 1149 if (already_valid)
 1150 {
 1151 /*
 1152 * If the caller already knew the buffer was valid, we can skip some
 1153 * header interaction. The caller just wants to lock the buffer.
 1154 */
 1155 need_to_zero = false;
 1156 }
 1157 else if (isLocalBuf)
 1158 {
 1159 /* Simple case for non-shared buffers. */
 1161 need_to_zero = StartLocalBufferIO(bufHdr, true, false);
 1162 }
 1163 else
 1164 {
 1165 /*
 1166 * Take BM_IO_IN_PROGRESS, or discover that BM_VALID has been set
 1167 * concurrently. Even though we aren't doing I/O, that ensures that
 1168 * we don't zero a page that someone else has pinned. An exclusive
 1169 * content lock wouldn't be enough, because readers are allowed to
 1170 * drop the content lock after determining that a tuple is visible
 1171 * (see buffer access rules in README).
 1172 */
 1174 need_to_zero = StartBufferIO(bufHdr, true, false);
 1175 }
 1176
 1177 if (need_to_zero)
 1178 {
 1180
 1181 /*
 1182 * Grab the buffer content lock before marking the page as valid, to
 1183 * make sure that no other backend sees the zeroed page before the
 1184 * caller has had a chance to initialize it.
 1185 *
 1186 * Since no-one else can be looking at the page contents yet, there is
 1187 * no difference between an exclusive lock and a cleanup-strength
 1188 * lock. (Note that we cannot use LockBuffer() or
 1189 * LockBufferForCleanup() here, because they assert that the buffer is
 1190 * already valid.)
 1191 */
 1192 if (!isLocalBuf)
 1194
 1195 /* Set BM_VALID, terminate IO, and wake up any waiters */
 1196 if (isLocalBuf)
 1197 TerminateLocalBufferIO(bufHdr, false, BM_VALID, false);
 1198 else
 1199 TerminateBufferIO(bufHdr, false, BM_VALID, true, false);
 1200 }
 1201 else if (!isLocalBuf)
 1202 {
 1203 /*
 1204 * The buffer is valid, so we can't zero it. The caller still expects
 1205 * the page to be locked on return.
 1206 */
 1207 if (mode == RBM_ZERO_AND_LOCK)
 1209 else
 1211 }
1212}
1213
1214/*
 1215 * Pin a buffer for a given block. *foundPtr is set to true if the block was
 1216 * already present, or false if more work is required to either read it in or
 1217 * zero it.
 *
 * NOTE(review): the return-type/name lines and several locals (bufHdr,
 * io_context, the stats/trace arguments) are elided in this listing; the
 * gaps in the embedded line numbers mark them.  The function returns the
 * pinned Buffer derived from bufHdr -- confirm against the full source.
 1218 */
 1221 SMgrRelation smgr,
 1222 char persistence,
 1223 ForkNumber forkNum,
 1224 BlockNumber blockNum,
 1225 BufferAccessStrategy strategy,
 1226 bool *foundPtr)
 1227{
 1231
 1232 Assert(blockNum != P_NEW);
 1233
 1234 /* Persistence should be set before */
 1235 Assert((persistence == RELPERSISTENCE_TEMP ||
 1236 persistence == RELPERSISTENCE_PERMANENT ||
 1237 persistence == RELPERSISTENCE_UNLOGGED));
 1238
 1239 if (persistence == RELPERSISTENCE_TEMP)
 1240 {
 1243 }
 1244 else
 1245 {
 /* Strategy (ring-buffer) reads are accounted under their own IOContext. */
 1246 io_context = IOContextForStrategy(strategy);
 1248 }
 1249
 1250 TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
 1254 smgr->smgr_rlocator.backend);
 1255
 1256 if (persistence == RELPERSISTENCE_TEMP)
 1257 {
 1258 bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, foundPtr);
 1259 if (*foundPtr)
 1261 }
 1262 else
 1263 {
 1264 bufHdr = BufferAlloc(smgr, persistence, forkNum, blockNum,
 1265 strategy, foundPtr, io_context);
 1266 if (*foundPtr)
 1268 }
 1269 if (rel)
 1270 {
 1271 /*
 1272 * While pgBufferUsage's "read" counter isn't bumped unless we reach
 1273 * WaitReadBuffers() (so, not for hits, and not for buffers that are
 1274 * zeroed instead), the per-relation stats always count them.
 1275 */
 1277 if (*foundPtr)
 1279 }
 1280 if (*foundPtr)
 1281 {
 /* Hit: charge vacuum cost and fire the read-done probe immediately. */
 1283 if (VacuumCostActive)
 1285
 1286 TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
 1290 smgr->smgr_rlocator.backend,
 1291 true);
 1292 }
 1293
 1295}
1296
1297/*
 1298 * ReadBuffer_common -- common logic for all ReadBuffer variants
 1299 *
 1300 * smgr is required, rel is optional unless using P_NEW.
 *
 * NOTE(review): the static return-type/name lines and some parameters (mode,
 * blockNum, smgr_persistence) are elided from this listing, as are the flag
 * initializations before StartReadBuffer() and the RBM_ZERO_AND_* check near
 * line 1339 -- consult the full bufmgr.c before modifying.
 1301 */
 1304 ForkNumber forkNum,
 1306 BufferAccessStrategy strategy)
 1307{
 1308 ReadBuffersOperation operation;
 1309 Buffer buffer;
 1310 int flags;
 1311 char persistence;
 1312
 1313 /*
 1314 * Backward compatibility path, most code should use ExtendBufferedRel()
 1315 * instead, as acquiring the extension lock inside ExtendBufferedRel()
 1316 * scales a lot better.
 1317 */
 1318 if (unlikely(blockNum == P_NEW))
 1319 {
 1321
 1322 /*
 1323 * Since no-one else can be looking at the page contents yet, there is
 1324 * no difference between an exclusive lock and a cleanup-strength
 1325 * lock.
 1326 */
 1328 flags |= EB_LOCK_FIRST;
 1329
 1330 return ExtendBufferedRel(BMR_REL(rel), forkNum, strategy, flags);
 1331 }
 1332
 /* rel's catalog entry wins; otherwise trust the caller-supplied value. */
 1333 if (rel)
 1334 persistence = rel->rd_rel->relpersistence;
 1335 else
 1336 persistence = smgr_persistence;
 1337
 /* (elided condition: presumably the RBM_ZERO_AND_* modes -- confirm) */
 1340 {
 1341 bool found;
 1342
 1343 buffer = PinBufferForBlock(rel, smgr, persistence,
 1344 forkNum, blockNum, strategy, &found);
 1345 ZeroAndLockBuffer(buffer, mode, found);
 1346 return buffer;
 1347 }
 1348
 1349 /*
 1350 * Signal that we are going to immediately wait. If we're immediately
 1351 * waiting, there is no benefit in actually executing the IO
 1352 * asynchronously, it would just add dispatch overhead.
 1353 */
 1355 if (mode == RBM_ZERO_ON_ERROR)
 1357 operation.smgr = smgr;
 1358 operation.rel = rel;
 1359 operation.persistence = persistence;
 1360 operation.forknum = forkNum;
 1361 operation.strategy = strategy;
 1362 if (StartReadBuffer(&operation,
 1363 &buffer,
 1364 blockNum,
 1365 flags))
 1366 WaitReadBuffers(&operation);
 1367
 1368 return buffer;
 1369}
1370
/*
 * NOTE(review): the "static pg_attribute_always_inline bool
 * StartReadBuffersImpl(ReadBuffersOperation *operation," opening lines are
 * elided from this listing (embedded numbers jump 1370->1373), as are a few
 * interior statements marked by further gaps.  Implementation shared by
 * StartReadBuffers() and StartReadBuffer() below.
 */
 1373 Buffer *buffers,
 1374 BlockNumber blockNum,
 1375 int *nblocks,
 1376 int flags,
 1377 bool allow_forwarding)
 1378{
 1379 int actual_nblocks = *nblocks;
 1380 int maxcombine = 0;
 1381 bool did_start_io;
 1382
 1383 Assert(*nblocks == 1 || allow_forwarding);
 1384 Assert(*nblocks > 0);
 1385 Assert(*nblocks <= MAX_IO_COMBINE_LIMIT);
 1386
 1387 for (int i = 0; i < actual_nblocks; ++i)
 1388 {
 1389 bool found;
 1390
 1391 if (allow_forwarding && buffers[i] != InvalidBuffer)
 1392 {
 1394
 1395 /*
 1396 * This is a buffer that was pinned by an earlier call to
 1397 * StartReadBuffers(), but couldn't be handled in one operation at
 1398 * that time. The operation was split, and the caller has passed
 1399 * an already pinned buffer back to us to handle the rest of the
 1400 * operation. It must continue at the expected block number.
 1401 */
 1402 Assert(BufferGetBlockNumber(buffers[i]) == blockNum + i);
 1403
 1404 /*
 1405 * It might be an already valid buffer (a hit) that followed the
 1406 * final contiguous block of an earlier I/O (a miss) marking the
 1407 * end of it, or a buffer that some other backend has since made
 1408 * valid by performing the I/O for us, in which case we can handle
 1409 * it as a hit now. It is safe to check for a BM_VALID flag with
 1410 * a relaxed load, because we got a fresh view of it while pinning
 1411 * it in the previous call.
 1412 *
 1413 * On the other hand if we don't see BM_VALID yet, it must be an
 1414 * I/O that was split by the previous call and we need to try to
 1415 * start a new I/O from this block. We're also racing against any
 1416 * other backend that might start the I/O or even manage to mark
 1417 * it BM_VALID after this check, but StartBufferIO() will handle
 1418 * those cases.
 1419 */
 1420 if (BufferIsLocal(buffers[i]))
 1421 bufHdr = GetLocalBufferDescriptor(-buffers[i] - 1);
 1422 else
 1423 bufHdr = GetBufferDescriptor(buffers[i] - 1);
 1425 found = pg_atomic_read_u64(&bufHdr->state) & BM_VALID;
 1426 }
 1427 else
 1428 {
 1429 buffers[i] = PinBufferForBlock(operation->rel,
 1430 operation->smgr,
 1431 operation->persistence,
 1432 operation->forknum,
 1433 blockNum + i,
 1434 operation->strategy,
 1435 &found);
 1436 }
 1437
 1438 if (found)
 1439 {
 1440 /*
 1441 * We have a hit. If it's the first block in the requested range,
 1442 * we can return it immediately and report that WaitReadBuffers()
 1443 * does not need to be called. If the initial value of *nblocks
 1444 * was larger, the caller will have to call again for the rest.
 1445 */
 1446 if (i == 0)
 1447 {
 1448 *nblocks = 1;
 1449
 1450#ifdef USE_ASSERT_CHECKING
 1451
 1452 /*
 1453 * Initialize enough of ReadBuffersOperation to make
 1454 * CheckReadBuffersOperation() work. Outside of assertions
 1455 * that's not necessary when no IO is issued.
 1456 */
 1457 operation->buffers = buffers;
 1458 operation->blocknum = blockNum;
 1459 operation->nblocks = 1;
 1460 operation->nblocks_done = 1;
 1461 CheckReadBuffersOperation(operation, true);
 1462#endif
 1463 return false;
 1464 }
 1465
 1466 /*
 1467 * Otherwise we already have an I/O to perform, but this block
 1468 * can't be included as it is already valid. Split the I/O here.
 1469 * There may or may not be more blocks requiring I/O after this
 1470 * one, we haven't checked, but they can't be contiguous with this
 1471 * one in the way. We'll leave this buffer pinned, forwarding it
 1472 * to the next call, avoiding the need to unpin it here and re-pin
 1473 * it in the next call.
 1474 */
 1475 actual_nblocks = i;
 1476 break;
 1477 }
 1478 else
 1479 {
 1480 /*
 1481 * Check how many blocks we can cover with the same IO. The smgr
 1482 * implementation might e.g. be limited due to a segment boundary.
 1483 */
 1484 if (i == 0 && actual_nblocks > 1)
 1485 {
 1486 maxcombine = smgrmaxcombine(operation->smgr,
 1487 operation->forknum,
 1488 blockNum);
 /* (elided condition: presumably maxcombine < actual_nblocks -- confirm) */
 1490 {
 1491 elog(DEBUG2, "limiting nblocks at %u from %u to %u",
 1492 blockNum, actual_nblocks, maxcombine);
 1494 }
 1495 }
 1496 }
 1497 }
 1498 *nblocks = actual_nblocks;
 1499
 1500 /* Populate information needed for I/O. */
 1501 operation->buffers = buffers;
 1502 operation->blocknum = blockNum;
 1503 operation->flags = flags;
 1504 operation->nblocks = actual_nblocks;
 1505 operation->nblocks_done = 0;
 1506 pgaio_wref_clear(&operation->io_wref);
 1507
 1508 /*
 1509 * When using AIO, start the IO in the background. If not, issue prefetch
 1510 * requests if desired by the caller.
 1511 *
 1512 * The reason we have a dedicated path for IOMETHOD_SYNC here is to
 1513 * de-risk the introduction of AIO somewhat. It's a large architectural
 1514 * change, with lots of chances for unanticipated performance effects.
 1515 *
 1516 * Use of IOMETHOD_SYNC already leads to not actually performing IO
 1517 * asynchronously, but without the check here we'd execute IO earlier than
 1518 * we used to. Eventually this IOMETHOD_SYNC specific path should go away.
 1519 */
 1520 if (io_method != IOMETHOD_SYNC)
 1521 {
 1522 /*
 1523 * Try to start IO asynchronously. It's possible that no IO needs to
 1524 * be started, if another backend already performed the IO.
 1525 *
 1526 * Note that if an IO is started, it might not cover the entire
 1527 * requested range, e.g. because an intermediary block has been read
 1528 * in by another backend. In that case any "trailing" buffers we
 1529 * already pinned above will be "forwarded" by read_stream.c to the
 1530 * next call to StartReadBuffers().
 1531 *
 1532 * This is signalled to the caller by decrementing *nblocks *and*
 1533 * reducing operation->nblocks. The latter is done here, but not below
 1534 * WaitReadBuffers(), as in WaitReadBuffers() we can't "shorten" the
 1535 * overall read size anymore, we need to retry until done in its
 1536 * entirety or until failed.
 1537 */
 1538 did_start_io = AsyncReadBuffers(operation, nblocks);
 1539
 1540 operation->nblocks = *nblocks;
 1541 }
 1542 else
 1543 {
 1544 operation->flags |= READ_BUFFERS_SYNCHRONOUSLY;
 1545
 1546 if (flags & READ_BUFFERS_ISSUE_ADVICE)
 1547 {
 1548 /*
 1549 * In theory we should only do this if PinBufferForBlock() had to
 1550 * allocate new buffers above. That way, if two calls to
 1551 * StartReadBuffers() were made for the same blocks before
 1552 * WaitReadBuffers(), only the first would issue the advice.
 1553 * That'd be a better simulation of true asynchronous I/O, which
 1554 * would only start the I/O once, but isn't done here for
 1555 * simplicity.
 1556 */
 1557 smgrprefetch(operation->smgr,
 1558 operation->forknum,
 1559 blockNum,
 1561 }
 1562
 1563 /*
 1564 * Indicate that WaitReadBuffers() should be called. WaitReadBuffers()
 1565 * will initiate the necessary IO.
 1566 */
 1567 did_start_io = true;
 1568 }
 1569
 1571
 1572 return did_start_io;
 1573}
1574
1575/*
 1576 * Begin reading a range of blocks beginning at blockNum and extending for
 1577 * *nblocks. *nblocks and the buffers array are in/out parameters. On entry,
 1578 * the buffers elements covered by *nblocks must hold either InvalidBuffer or
 1579 * buffers forwarded by an earlier call to StartReadBuffers() that was split
 1580 * and is now being continued. On return, *nblocks holds the number of blocks
 1581 * accepted by this operation. If it is less than the original number then
 1582 * this operation has been split, but buffer elements up to the original
 1583 * requested size may hold forwarded buffers to be used for a continuing
 1584 * operation. The caller must either start a new I/O beginning at the block
 1585 * immediately following the blocks accepted by this call and pass those
 1586 * buffers back in, or release them if it chooses not to. It shouldn't make
 1587 * any other use of or assumptions about forwarded buffers.
 1588 *
 1589 * If false is returned, no I/O is necessary and the buffers covered by
 1590 * *nblocks on exit are valid and ready to be accessed. If true is returned,
 1591 * an I/O has been started, and WaitReadBuffers() must be called with the same
 1592 * operation object before the buffers covered by *nblocks on exit can be
 1593 * accessed. Along with the operation object, the caller-supplied array of
 1594 * buffers must remain valid until WaitReadBuffers() is called, and any
 1595 * forwarded buffers must also be preserved for a continuing call unless
 1596 * they are explicitly released.
 1597 */
 1598bool
/* NOTE(review): the "StartReadBuffers(ReadBuffersOperation *operation,"
 * name line is elided from this listing -- thin wrapper over
 * StartReadBuffersImpl() with forwarding enabled. */
 1600 Buffer *buffers,
 1601 BlockNumber blockNum,
 1602 int *nblocks,
 1603 int flags)
 1604{
 1605 return StartReadBuffersImpl(operation, buffers, blockNum, nblocks, flags,
 1606 true /* expect forwarded buffers */ );
 1607}
1608
1609/*
 1610 * Single block version of the StartReadBuffers(). This might save a few
 1611 * instructions when called from another translation unit, because it is
 1612 * specialized for nblocks == 1.
 1613 *
 1614 * This version does not support "forwarded" buffers: they cannot be created
 1615 * by reading only one block and *buffer is ignored on entry.
 1616 */
 1617bool
/* NOTE(review): the "StartReadBuffer(ReadBuffersOperation *operation,"
 * name line is elided from this listing. */
 1619 Buffer *buffer,
 1620 BlockNumber blocknum,
 1621 int flags)
 1622{
 1623 int nblocks = 1;
 1624 bool result;
 1625
 1626 result = StartReadBuffersImpl(operation, buffer, blocknum, &nblocks, flags,
 1627 false /* single block, no forwarding */ );
 1628 Assert(nblocks == 1); /* single block can't be short */
 1629
 1630 return result;
 1631}
1632
1633/*
 1634 * Perform sanity checks on the ReadBuffersOperation.
 *
 * NOTE(review): the parameter line (presumably "(ReadBuffersOperation
 * *operation, bool is_complete)") and some per-buffer assertions are elided
 * from this listing.  Entire body is assertion-only (compiled away outside
 * USE_ASSERT_CHECKING builds).
 1635 */
 1636static void
 1638{
 1639#ifdef USE_ASSERT_CHECKING
 1640 Assert(operation->nblocks_done <= operation->nblocks);
 1641 Assert(!is_complete || operation->nblocks == operation->nblocks_done);
 1642
 1643 for (int i = 0; i < operation->nblocks; i++)
 1644 {
 1645 Buffer buffer = operation->buffers[i];
 1649
 /* Every buffer in the operation must map to consecutive block numbers. */
 1650 Assert(BufferGetBlockNumber(buffer) == operation->blocknum + i);
 1652
 1653 if (i < operation->nblocks_done)
 1655 }
 1656#endif
 1657}
1658
1659/* helper for ReadBuffersCanStartIO(), to avoid repetition */
 1660static inline bool
/* NOTE(review): the "(Buffer buffer, bool nowait)" parameter line is elided
 * from this listing.  Dispatches to the local- or shared-buffer variant of
 * StartBufferIO() for a single attempt, with the caller's nowait setting. */
 1662{
 1663 if (BufferIsLocal(buffer))
 1665 true, nowait);
 1666 else
 1667 return StartBufferIO(GetBufferDescriptor(buffer - 1), true, nowait);
 1668}
1669
1670/*
 1671 * Helper for AsyncReadBuffers that tries to get the buffer ready for IO.
 *
 * NOTE(review): the "(Buffer buffer, bool nowait)" parameter line and the
 * nowait=true fast attempt inside the pgaio_have_staged() branch are elided
 * from this listing (embedded numbers jump 1682->1684 and 1691->1693).
 1672 */
 1673static inline bool
 1675{
 1676 /*
 1677 * If this backend currently has staged IO, we need to submit the pending
 1678 * IO before waiting for the right to issue IO, to avoid the potential for
 1679 * deadlocks (and, more commonly, unnecessary delays for other backends).
 1680 */
 1681 if (!nowait && pgaio_have_staged())
 1682 {
 1684 return true;
 1685
 1686 /*
 1687 * Unfortunately StartBufferIO() returning false doesn't allow to
 1688 * distinguish between the buffer already being valid and IO already
 1689 * being in progress. Since IO already being in progress is quite
 1690 * rare, this approach seems fine.
 1691 */
 1693 }
 1694
 1695 return ReadBuffersCanStartIOOnce(buffer, nowait);
 1696}
1697
1698/*
 1699 * Helper for WaitReadBuffers() that processes the results of a readv
 1700 * operation, raising an error if necessary.
 *
 * NOTE(review): the "(ReadBuffersOperation *operation)" parameter line, the
 * declaration of "rs" (presumably aio_ret->result.status), and the buffer
 * usage accounting near line 1733 are elided from this listing.
 1701 */
 1702static void
 1704{
 1705 PgAioReturn *aio_ret = &operation->io_return;
 1707 int newly_read_blocks = 0;
 1708
 1709 Assert(pgaio_wref_valid(&operation->io_wref));
 1710 Assert(aio_ret->result.status != PGAIO_RS_UNKNOWN);
 1711
 1712 /*
 1713 * SMGR reports the number of blocks successfully read as the result of
 1714 * the IO operation. Thus we can simply add that to ->nblocks_done.
 1715 */
 1716
 1717 if (likely(rs != PGAIO_RS_ERROR))
 1718 newly_read_blocks = aio_ret->result.result;
 1719
 /* ERROR aborts the transaction; WARNING lets the retry loop continue. */
 1720 if (rs == PGAIO_RS_ERROR || rs == PGAIO_RS_WARNING)
 1721 pgaio_result_report(aio_ret->result, &aio_ret->target_data,
 1722 rs == PGAIO_RS_ERROR ? ERROR : WARNING);
 1723 else if (aio_ret->result.status == PGAIO_RS_PARTIAL)
 1724 {
 1725 /*
 1726 * We'll retry, so we just emit a debug message to the server log (or
 1727 * not even that in prod scenarios).
 1728 */
 1729 pgaio_result_report(aio_ret->result, &aio_ret->target_data, DEBUG1)
 1730 elog(DEBUG3, "partial read, will retry");
 1731 }
 1732
 1735
 1736 operation->nblocks_done += newly_read_blocks;
 1737
 1738 Assert(operation->nblocks_done <= operation->nblocks);
 1739}
1740
1741void
/*
 * NOTE(review): the "WaitReadBuffers(ReadBuffersOperation *operation)" name
 * line, several local declarations, the per-persistence stats setup in the
 * two branches below, and the AsyncReadBuffers() retry call near line 1848
 * are elided from this listing -- consult the full bufmgr.c.
 *
 * Second step of the StartReadBuffers() protocol: block until the buffers
 * covered by the operation are valid, retrying partial reads as needed.
 */
 1743{
 1745 PgAioReturn *aio_ret = &operation->io_return;
 1747
 1748 if (operation->persistence == RELPERSISTENCE_TEMP)
 1749 {
 1752 }
 1753 else
 1754 {
 1757 }
 1758
 1759 /*
 1760 * If we get here without an IO operation having been issued, the
 1761 * io_method == IOMETHOD_SYNC path must have been used. Otherwise the
 1762 * caller should not have called WaitReadBuffers().
 1763 *
 1764 * In the case of IOMETHOD_SYNC, we start - as we used to before the
 1765 * introducing of AIO - the IO in WaitReadBuffers(). This is done as part
 1766 * of the retry logic below, no extra code is required.
 1767 *
 1768 * This path is expected to eventually go away.
 1769 */
 1770 if (!pgaio_wref_valid(&operation->io_wref) && io_method != IOMETHOD_SYNC)
 1771 elog(ERROR, "waiting for read operation that didn't read");
 1772
 1773 /*
 1774 * To handle partial reads, and IOMETHOD_SYNC, we re-issue IO until we're
 1775 * done. We may need multiple retries, not just because we could get
 1776 * multiple partial reads, but also because some of the remaining
 1777 * to-be-read buffers may have been read in by other backends, limiting
 1778 * the IO size.
 1779 */
 1780 while (true)
 1781 {
 1783
 1784 CheckReadBuffersOperation(operation, false);
 1785
 1786 /*
 1787 * If there is an IO associated with the operation, we may need to
 1788 * wait for it.
 1789 */
 1790 if (pgaio_wref_valid(&operation->io_wref))
 1791 {
 1792 /*
 1793 * Track the time spent waiting for the IO to complete. As
 1794 * tracking a wait even if we don't actually need to wait
 1795 *
 1796 * a) is not cheap, due to the timestamping overhead
 1797 *
 1798 * b) reports some time as waiting, even if we never waited
 1799 *
 1800 * we first check if we already know the IO is complete.
 1801 */
 1802 if (aio_ret->result.status == PGAIO_RS_UNKNOWN &&
 1803 !pgaio_wref_check_done(&operation->io_wref))
 1804 {
 1806
 1807 pgaio_wref_wait(&operation->io_wref);
 1808
 1809 /*
 1810 * The IO operation itself was already counted earlier, in
 1811 * AsyncReadBuffers(), this just accounts for the wait time.
 1812 */
 1814 io_start, 0, 0);
 1815 }
 1816 else
 1817 {
 1818 Assert(pgaio_wref_check_done(&operation->io_wref));
 1819 }
 1820
 1821 /*
 1822 * We now are sure the IO completed. Check the results. This
 1823 * includes reporting on errors if there were any.
 1824 */
 1825 ProcessReadBuffersResult(operation);
 1826 }
 1827
 1828 /*
 1829 * Most of the time, the one IO we already started, will read in
 1830 * everything. But we need to deal with partial reads and buffers not
 1831 * needing IO anymore.
 1832 */
 1833 if (operation->nblocks_done == operation->nblocks)
 1834 break;
 1835
 1837
 1838 /*
 1839 * This may only complete the IO partially, either because some
 1840 * buffers were already valid, or because of a partial read.
 1841 *
 1842 * NB: In contrast to after the AsyncReadBuffers() call in
 1843 * StartReadBuffers(), we do *not* reduce
 1844 * ReadBuffersOperation->nblocks here, callers expect the full
 1845 * operation to be completed at this point (as more operations may
 1846 * have been queued).
 1847 */
 1849 }
 1850
 1851 CheckReadBuffersOperation(operation, true);
 1852
 1853 /* NB: READ_DONE tracepoint was already executed in completion callback */
 1854}
1855
1856/*
 1857 * Initiate IO for the ReadBuffersOperation
 1858 *
 1859 * This function only starts a single IO at a time. The size of the IO may be
 1860 * limited to below the to-be-read blocks, if one of the buffers has
 1861 * concurrently been read in. If the first to-be-read buffer is already valid,
 1862 * no IO will be issued.
 1863 *
 1864 * To support retries after partial reads, the first operation->nblocks_done
 1865 * buffers are skipped.
 1866 *
 1867 * On return *nblocks_progress is updated to reflect the number of buffers
 1868 * affected by the call. If the first buffer is valid, *nblocks_progress is
 1869 * set to 1 and operation->nblocks_done is incremented.
 1870 *
 1871 * Returns true if IO was initiated, false if no IO was necessary.
 *
 * NOTE(review): this listing elides the signature (presumably
 * "AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress)"),
 * several local declarations (ioh, io_pages, io_object/io_context,
 * io_start), the zero_damaged_pages / ignore_checksum_failure flag setup,
 * the pgaio_io_acquire calls, and various stats calls -- the gaps in the
 * embedded line numbers mark them.  Consult the full bufmgr.c.
 1872 */
 1873static bool
 1875{
 1876 Buffer *buffers = &operation->buffers[0];
 1877 int flags = operation->flags;
 1878 BlockNumber blocknum = operation->blocknum;
 1879 ForkNumber forknum = operation->forknum;
 1880 char persistence = operation->persistence;
 1881 int16 nblocks_done = operation->nblocks_done;
 1882 Buffer *io_buffers = &operation->buffers[nblocks_done];
 1883 int io_buffers_len = 0;
 1885 uint32 ioh_flags = 0;
 1889 bool did_start_io;
 1890
 1891 /*
 1892 * When this IO is executed synchronously, either because the caller will
 1893 * immediately block waiting for the IO or because IOMETHOD_SYNC is used,
 1894 * the AIO subsystem needs to know.
 1895 */
 1896 if (flags & READ_BUFFERS_SYNCHRONOUSLY)
 1898
 1899 if (persistence == RELPERSISTENCE_TEMP)
 1900 {
 1904 }
 1905 else
 1906 {
 1909 }
 1910
 1911 /*
 1912 * If zero_damaged_pages is enabled, add the READ_BUFFERS_ZERO_ON_ERROR
 1913 * flag. The reason for that is that, hopefully, zero_damaged_pages isn't
 1914 * set globally, but on a per-session basis. The completion callback,
 1915 * which may be run in other processes, e.g. in IO workers, may have a
 1916 * different value of the zero_damaged_pages GUC.
 1917 *
 1918 * XXX: We probably should eventually use a different flag for
 1919 * zero_damaged_pages, so we can report different log levels / error codes
 1920 * for zero_damaged_pages and ZERO_ON_ERROR.
 1921 */
 1924
 1925 /*
 1926 * For the same reason as with zero_damaged_pages we need to use this
 1927 * backend's ignore_checksum_failure value.
 1928 */
 1931
 1932
 1933 /*
 1934 * To be allowed to report stats in the local completion callback we need
 1935 * to prepare to report stats now. This ensures we can safely report the
 1936 * checksum failure even in a critical section.
 1937 */
 1939
 1940 /*
 1941 * Get IO handle before ReadBuffersCanStartIO(), as pgaio_io_acquire()
 1942 * might block, which we don't want after setting IO_IN_PROGRESS.
 1943 *
 1944 * If we need to wait for IO before we can get a handle, submit
 1945 * already-staged IO first, so that other backends don't need to wait.
 1946 * There wouldn't be a deadlock risk, as pgaio_io_acquire() just needs to
 1947 * wait for already submitted IO, which doesn't require additional locks,
 1948 * but it could still cause undesirable waits.
 1949 *
 1950 * A secondary benefit is that this would allow us to measure the time in
 1951 * pgaio_io_acquire() without causing undue timer overhead in the common,
 1952 * non-blocking, case. However, currently the pgstats infrastructure
 1953 * doesn't really allow that, as it a) asserts that an operation can't
 1954 * have time without operations b) doesn't have an API to report
 1955 * "accumulated" time.
 1956 */
 1958 if (unlikely(!ioh))
 1959 {
 1961
 1963 }
 1964
 1965 /*
 1966 * Check if we can start IO on the first to-be-read buffer.
 1967 *
 1968 * If an I/O is already in progress in another backend, we want to wait
 1969 * for the outcome: either done, or something went wrong and we will
 1970 * retry.
 1971 */
 1972 if (!ReadBuffersCanStartIO(buffers[nblocks_done], false))
 1973 {
 1974 /*
 1975 * Someone else has already completed this block, we're done.
 1976 *
 1977 * When IO is necessary, ->nblocks_done is updated in
 1978 * ProcessReadBuffersResult(), but that is not called if no IO is
 1979 * necessary. Thus update here.
 1980 */
 1981 operation->nblocks_done += 1;
 1982 *nblocks_progress = 1;
 1983
 /* (elided: presumably releases the unused IO handle -- confirm) */
 1985 pgaio_wref_clear(&operation->io_wref);
 1986 did_start_io = false;
 1987
 1988 /*
 1989 * Report and track this as a 'hit' for this backend, even though it
 1990 * must have started out as a miss in PinBufferForBlock(). The other
 1991 * backend will track this as a 'read'.
 1992 */
 1993 TRACE_POSTGRESQL_BUFFER_READ_DONE(forknum, blocknum + operation->nblocks_done,
 1994 operation->smgr->smgr_rlocator.locator.spcOid,
 1995 operation->smgr->smgr_rlocator.locator.dbOid,
 1996 operation->smgr->smgr_rlocator.locator.relNumber,
 1997 operation->smgr->smgr_rlocator.backend,
 1998 true);
 1999
 2000 if (persistence == RELPERSISTENCE_TEMP)
 2002 else
 2004
 2005 if (operation->rel)
 2006 pgstat_count_buffer_hit(operation->rel);
 2007
 2009
 2010 if (VacuumCostActive)
 2012 }
 2013 else
 2014 {
 2016
 2017 /* We found a buffer that we need to read in. */
 2018 Assert(io_buffers[0] == buffers[nblocks_done]);
 2019 io_pages[0] = BufferGetBlock(buffers[nblocks_done]);
 2020 io_buffers_len = 1;
 2021
 2022 /*
 2023 * How many neighboring-on-disk blocks can we scatter-read into other
 2024 * buffers at the same time? In this case we don't wait if we see an
 2025 * I/O already in progress. We already set BM_IO_IN_PROGRESS for the
 2026 * head block, so we should get on with that I/O as soon as possible.
 2027 */
 2028 for (int i = nblocks_done + 1; i < operation->nblocks; i++)
 2029 {
 2030 if (!ReadBuffersCanStartIO(buffers[i], true))
 2031 break;
 2032 /* Must be consecutive block numbers. */
 2033 Assert(BufferGetBlockNumber(buffers[i - 1]) ==
 2034 BufferGetBlockNumber(buffers[i]) - 1);
 2035 Assert(io_buffers[io_buffers_len] == buffers[i]);
 2036
 2037 io_pages[io_buffers_len++] = BufferGetBlock(buffers[i]);
 2038 }
 2039
 2040 /* get a reference to wait for in WaitReadBuffers() */
 2041 pgaio_io_get_wref(ioh, &operation->io_wref);
 2042
 2043 /* provide the list of buffers to the completion callbacks */
 2045
 2047 persistence == RELPERSISTENCE_TEMP ?
 2050 flags);
 2051
 2053
 2054 /* ---
 2055 * Even though we're trying to issue IO asynchronously, track the time
 2056 * in smgrstartreadv():
 2057 * - if io_method == IOMETHOD_SYNC, we will always perform the IO
 2058 * immediately
 2059 * - the io method might not support the IO (e.g. worker IO for a temp
 2060 * table)
 2061 * ---
 2062 */
 2064 smgrstartreadv(ioh, operation->smgr, forknum,
 2065 blocknum + nblocks_done,
 2069
 2070 if (persistence == RELPERSISTENCE_TEMP)
 2072 else
 2074
 2075 /*
 2076 * Track vacuum cost when issuing IO, not after waiting for it.
 2077 * Otherwise we could end up issuing a lot of IO in a short timespan,
 2078 * despite a low cost limit.
 2079 */
 2080 if (VacuumCostActive)
 2082
 2084 did_start_io = true;
 2085 }
 2086
 2087 return did_start_io;
 2088}
2089
2090/*
 2091 * BufferAlloc -- subroutine for PinBufferForBlock. Handles lookup of a shared
 2092 * buffer. If no buffer exists already, selects a replacement victim and
 2093 * evicts the old page, but does NOT read in new page.
 2094 *
 2095 * "strategy" can be a buffer replacement strategy object, or NULL for
 2096 * the default strategy. The selected buffer's usage_count is advanced when
 2097 * using the default strategy, but otherwise possibly not (see PinBuffer).
 2098 *
 2099 * The returned buffer is pinned and is already marked as holding the
 2100 * desired page. If it already did have the desired page, *foundPtr is
 2101 * set true. Otherwise, *foundPtr is set false.
 2102 *
 2103 * io_context is passed as an output parameter to avoid calling
 2104 * IOContextForStrategy() when there is a shared buffers hit and no IO
 2105 * statistics need be captured.
 2106 *
 2107 * No locks are held either at entry or exit.
 *
 * NOTE(review): the "static pg_attribute_always_inline BufferDesc *" return
 * line, the io_context parameter line, several local declarations
 * (victim_buf_hdr, existing_buf_hdr), and the hash/partition-lock and
 * victim-acquisition calls are elided from this listing -- gaps in the
 * embedded numbering mark them.  Consult the full bufmgr.c before editing.
 2108 */
 2110BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 2111 BlockNumber blockNum,
 2112 BufferAccessStrategy strategy,
 2114{
 2115 BufferTag newTag; /* identity of requested block */
 2116 uint32 newHash; /* hash value for newTag */
 2117 LWLock *newPartitionLock; /* buffer partition lock for it */
 2118 int existing_buf_id;
 2122 uint64 set_bits = 0;
 2123
 2124 /* Make sure we will have room to remember the buffer pin */
 2127
 2128 /* create a tag so we can lookup the buffer */
 2129 InitBufferTag(&newTag, &smgr->smgr_rlocator.locator, forkNum, blockNum);
 2130
 2131 /* determine its hash code and partition lock ID */
 2134
 2135 /* see if the block is in the buffer pool already */
 2138 if (existing_buf_id >= 0)
 2139 {
 2140 BufferDesc *buf;
 2141 bool valid;
 2142
 2143 /*
 2144 * Found it. Now, pin the buffer so no one can steal it from the
 2145 * buffer pool, and check to see if the correct data has been loaded
 2146 * into the buffer.
 2147 */
 2149
 2150 valid = PinBuffer(buf, strategy, false);
 2151
 2152 /* Can release the mapping lock as soon as we've pinned it */
 2154
 2155 *foundPtr = true;
 2156
 2157 if (!valid)
 2158 {
 2159 /*
 2160 * We can only get here if (a) someone else is still reading in
 2161 * the page, (b) a previous read attempt failed, or (c) someone
 2162 * called StartReadBuffers() but not yet WaitReadBuffers().
 2163 */
 2164 *foundPtr = false;
 2165 }
 2166
 2167 return buf;
 2168 }
 2169
 2170 /*
 2171 * Didn't find it in the buffer pool. We'll have to initialize a new
 2172 * buffer. Remember to unlock the mapping lock while doing the work.
 2173 */
 2175
 2176 /*
 2177 * Acquire a victim buffer. Somebody else might try to do the same, we
 2178 * don't hold any conflicting locks. If so we'll have to undo our work
 2179 * later.
 2180 */
 2183
 2184 /*
 2185 * Try to make a hashtable entry for the buffer under its new tag. If
 2186 * somebody else inserted another buffer for the tag, we'll release the
 2187 * victim buffer we acquired and use the already inserted one.
 2188 */
 2191 if (existing_buf_id >= 0)
 2192 {
 2194 bool valid;
 2195
 2196 /*
 2197 * Got a collision. Someone has already done what we were about to do.
 2198 * We'll just handle this as if it were found in the buffer pool in
 2199 * the first place. First, give up the buffer we were planning to
 2200 * use.
 2201 *
 2202 * We could do this after releasing the partition lock, but then we'd
 2203 * have to call ResourceOwnerEnlarge() & ReservePrivateRefCountEntry()
 2204 * before acquiring the lock, for the rare case of such a collision.
 2205 */
 2207
 2208 /* remaining code should match code at top of routine */
 2209
 2211
 2212 valid = PinBuffer(existing_buf_hdr, strategy, false);
 2213
 2214 /* Can release the mapping lock as soon as we've pinned it */
 2216
 2217 *foundPtr = true;
 2218
 2219 if (!valid)
 2220 {
 2221 /*
 2222 * We can only get here if (a) someone else is still reading in
 2223 * the page, (b) a previous read attempt failed, or (c) someone
 2224 * called StartReadBuffers() but not yet WaitReadBuffers().
 2225 */
 2226 *foundPtr = false;
 2227 }
 2228
 2229 return existing_buf_hdr;
 2230 }
 2231
 2232 /*
 2233 * Need to lock the buffer header too in order to change its tag.
 2234 */
 2236
 2237 /* some sanity checks while we hold the buffer header lock */
 2240
 2241 victim_buf_hdr->tag = newTag;
 2242
 2243 /*
 2244 * Make sure BM_PERMANENT is set for buffers that must be written at every
 2245 * checkpoint. Unlogged buffers only need to be written at shutdown
 2246 * checkpoints, except for their "init" forks, which need to be treated
 2247 * just like permanent relations.
 2248 */
 2250 if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM)
 2252
 2254 set_bits, 0, 0);
 2255
 2257
 2258 /*
 2259 * Buffer contents are currently invalid.
 2260 */
 2261 *foundPtr = false;
 2262
 2263 return victim_buf_hdr;
 2264}
2265
2266/*
 2267 * InvalidateBuffer -- mark a shared buffer invalid.
 2268 *
 2269 * The buffer header spinlock must be held at entry. We drop it before
 2270 * returning. (This is sane because the caller must have locked the
 2271 * buffer in order to be sure it should be dropped.)
 2272 *
 2273 * This is used only in contexts such as dropping a relation. We assume
 2274 * that no other backend could possibly be interested in using the page,
 2275 * so the only reason the buffer might be pinned is if someone else is
 2276 * trying to write it out. We have to let them finish before we can
 2277 * reclaim the buffer.
 2278 *
 2279 * The buffer could get reclaimed by someone else while we are waiting
 2280 * to acquire the necessary locks; if so, don't mess it up.
 *
 * NOTE(review): the "(BufferDesc *buf)" parameter line, the declarations of
 * oldTag/oldFlags, the spinlock release/reacquire calls, and the hashtable
 * delete call are elided from this listing -- consult the full bufmgr.c.
 2281 */
 2282static void
 2284{
 2286 uint32 oldHash; /* hash value for oldTag */
 2287 LWLock *oldPartitionLock; /* buffer partition lock for it */
 2290
 2291 /* Save the original buffer tag before dropping the spinlock */
 2292 oldTag = buf->tag;
 2293
 2295
 2296 /*
 2297 * Need to compute the old tag's hashcode and partition lock ID. XXX is it
 2298 * worth storing the hashcode in BufferDesc so we need not recompute it
 2299 * here? Probably not.
 2300 */
 2303
 2304retry:
 2305
 2306 /*
 2307 * Acquire exclusive mapping lock in preparation for changing the buffer's
 2308 * association.
 2309 */
 2311
 2312 /* Re-lock the buffer header */
 2314
 2315 /* If it's changed while we were waiting for lock, do nothing */
 2316 if (!BufferTagsEqual(&buf->tag, &oldTag))
 2317 {
 2320 return;
 2321 }
 2322
 2323 /*
 2324 * We assume the reason for it to be pinned is that either we were
 2325 * asynchronously reading the page in before erroring out or someone else
 2326 * is flushing the page out. Wait for the IO to finish. (This could be
 2327 * an infinite loop if the refcount is messed up... it would be nice to
 2328 * time out after awhile, but there seems no way to be sure how many loops
 2329 * may be needed. Note that if the other guy has pinned the buffer but
 2330 * not yet done StartBufferIO, WaitIO will fall through and we'll
 2331 * effectively be busy-looping here.)
 2332 */
 2334 {
 2337 /* safety check: should definitely not be our *own* pin */
 2339 elog(ERROR, "buffer is pinned in InvalidateBuffer")
 2340 WaitIO(buf);
 2341 goto retry;
 2342 }
 2343
 2344 /*
 2345 * An invalidated buffer should not have any backends waiting to lock the
 2346 * buffer, therefore BM_LOCK_WAKE_IN_PROGRESS should not be set.
 2347 */
 2349
 2350 /*
 2351 * Clear out the buffer's tag and flags. We must do this to ensure that
 2352 * linear scans of the buffer array don't think the buffer is valid.
 2353 */
 2355 ClearBufferTag(&buf->tag);
 2356
 2358 0,
 2360 0);
 2361
 2362 /*
 2363 * Remove the buffer from the lookup hashtable, if it was in there.
 2364 */
 2365 if (oldFlags & BM_TAG_VALID)
 2367
 2368 /*
 2369 * Done with mapping lock.
 2370 */
 2372}
2373
2374/*
2375 * Helper routine for GetVictimBuffer()
2376 *
2377 * Needs to be called on a buffer with a valid tag, pinned, but without the
2378 * buffer header spinlock held.
2379 *
2380 * Returns true if the buffer can be reused, in which case the buffer is only
2381 * pinned by this backend and marked as invalid, false otherwise.
2382 */
/*
 * NOTE(review): doxygen-dump artifact — identifier-bearing lines were
 * stripped (bare numbers mark the gaps) and the signature line is missing.
 * Presumably this is the GetVictimBuffer() helper described above
 * (InvalidateVictimBuffer-style), taking a pinned BufferDesc * —
 * TODO confirm against the original source.
 */
2383static bool
2385{
2387 uint32 hash;
2389 BufferTag tag;
2390
2392
2393 /* have buffer pinned, so it's safe to read tag without lock */
2394 tag = buf_hdr->tag;
2395
2396 hash = BufTableHashCode(&tag);
2398
2400
2401 /* lock the buffer header */
2403
2404 /*
2405 * We have the buffer pinned nobody else should have been able to unset
2406 * this concurrently.
2407 */
2410 Assert(BufferTagsEqual(&buf_hdr->tag, &tag));
2411
2412 /*
2413 * If somebody else pinned the buffer since, or even worse, dirtied it,
2414 * give up on this buffer: It's clearly in use.
2415 */
2417 {
2419
2422
2423 return false;
2424 }
2425
2426 /*
2427 * An invalidated buffer should not have any backends waiting to lock the
2428 * buffer, therefore BM_LOCK_WAKE_IN_PROGRESS should not be set.
2429 */
2431
2432 /*
2433 * Clear out the buffer's tag and flags and usagecount. This is not
2434 * strictly required, as BM_TAG_VALID/BM_VALID needs to be checked before
2435 * doing anything with the buffer. But currently it's beneficial, as the
2436 * cheaper pre-check for several linear scans of shared buffers use the
2437 * tag (see e.g. FlushDatabaseBuffers()).
2438 */
2439 ClearBufferTag(&buf_hdr->tag);
2441 0,
2443 0);
2444
2446
2447 /* finally delete buffer from the buffer mapping table */
2448 BufTableDelete(&tag, hash);
2449
2451
2456
2457 return true;
2458}
2459
/*
 * NOTE(review): doxygen-dump artifact — identifier-bearing lines were
 * stripped (bare numbers mark the gaps); the signature line is missing.
 * Presumably GetVictimBuffer(BufferAccessStrategy strategy, IOContext
 * io_context), per the references to `strategy` and the helper comment
 * above — TODO confirm.  Returns a pinned, invalidated victim buffer.
 */
2460static Buffer
2462{
2464 Buffer buf;
2466 bool from_ring;
2467
2468 /*
2469 * Ensure, before we pin a victim buffer, that there's a free refcount
2470 * entry and resource owner slot for the pin.
2471 */
2474
2475 /* we return here if a prospective victim buffer gets used concurrently */
2476again:
2477
2478 /*
2479 * Select a victim buffer. The buffer is returned pinned and owned by
2480 * this backend.
2481 */
2484
2485 /*
2486 * We shouldn't have any other pins for this buffer.
2487 */
2489
2490 /*
2491 * If the buffer was dirty, try to write it out. There is a race
2492 * condition here, another backend could dirty the buffer between
2493 * StrategyGetBuffer() checking that it is not in use and invalidating the
2494 * buffer below. That's addressed by InvalidateVictimBuffer() verifying
2495 * that the buffer is not dirty.
2496 */
2497 if (buf_state & BM_DIRTY)
2498 {
2501
2502 /*
2503 * We need a share-exclusive lock on the buffer contents to write it
2504 * out (else we might write invalid data, eg because someone else is
2505 * compacting the page contents while we write). We must use a
2506 * conditional lock acquisition here to avoid deadlock. Even though
2507 * the buffer was not pinned (and therefore surely not locked) when
2508 * StrategyGetBuffer returned it, someone else could have pinned and
2509 * (share-)exclusive-locked it by the time we get here. If we try to
2510 * get the lock unconditionally, we'd block waiting for them; if they
2511 * later block waiting for us, deadlock ensues. (This has been
2512 * observed to happen when two backends are both trying to split btree
2513 * index pages, and the second one just happens to be trying to split
2514 * the page the first one got from StrategyGetBuffer.)
2515 */
2517 {
2518 /*
2519 * Someone else has locked the buffer, so give it up and loop back
2520 * to get another one.
2521 */
2523 goto again;
2524 }
2525
2526 /*
2527 * If using a nondefault strategy, and this victim came from the
2528 * strategy ring, let the strategy decide whether to reject it when
2529 * reusing it would require a WAL flush. This only applies to
2530 * permanent buffers; unlogged buffers can have fake LSNs, so
2531 * XLogNeedsFlush() is not meaningful for them.
2532 *
2533 * We need to hold the content lock in at least share-exclusive mode
2534 * to safely inspect the page LSN, so this couldn't have been done
2535 * inside StrategyGetBuffer().
2536 */
2537 if (strategy && from_ring &&
2541 {
2544 goto again;
2545 }
2546
2547 /* OK, do the I/O */
2550
2552 &buf_hdr->tag);
2553 }
2554
2555
2556 if (buf_state & BM_VALID)
2557 {
2558 /*
2559 * When a BufferAccessStrategy is in use, blocks evicted from shared
2560 * buffers are counted as IOOP_EVICT in the corresponding context
2561 * (e.g. IOCONTEXT_BULKWRITE). Shared buffers are evicted by a
2562 * strategy in two cases: 1) while initially claiming buffers for the
2563 * strategy ring 2) to replace an existing strategy ring buffer
2564 * because it is pinned or in use and cannot be reused.
2565 *
2566 * Blocks evicted from buffers already in the strategy ring are
2567 * counted as IOOP_REUSE in the corresponding strategy context.
2568 *
2569 * At this point, we can accurately count evictions and reuses,
2570 * because we have successfully claimed the valid buffer. Previously,
2571 * we may have been forced to release the buffer due to concurrent
2572 * pinners or erroring out.
2573 */
2575 from_ring ? IOOP_REUSE : IOOP_EVICT, 1, 0);
2576 }
2577
2578 /*
2579 * If the buffer has an entry in the buffer mapping table, delete it. This
2580 * can fail because another backend could have pinned or dirtied the
2581 * buffer.
2582 */
2584 {
2586 goto again;
2587 }
2588
2589 /* a final set of sanity checks */
2590#ifdef USE_ASSERT_CHECKING
2592
2595
2597#endif
2598
2599 return buf;
2600}
2601
2602/*
2603 * Return the maximum number of buffers that a backend should try to pin once,
2604 * to avoid exceeding its fair share. This is the highest value that
2605 * GetAdditionalPinLimit() could ever return. Note that it may be zero on a
2606 * system with a very small buffer pool relative to max_connections.
2607 */
/*
 * NOTE(review): doxygen-dump artifact — the signature line was stripped.
 * Presumably GetPinLimit(void), given the header comment above and that the
 * body simply returns the file-global MaxProportionalPins — TODO confirm.
 */
2608uint32
2610{
2611 return MaxProportionalPins;
2612}
2613
2614/*
2615 * Return the maximum number of additional buffers that this backend should
2616 * pin if it wants to stay under the per-backend limit, considering the number
2617 * of buffers it has already pinned. Unlike LimitAdditionalPins(), the limit
2618 * return by this function can be zero.
2619 */
/*
 * NOTE(review): doxygen-dump artifact — the signature and several
 * statements were stripped (bare numbers mark the gaps).  Presumably
 * GetAdditionalPinLimit(void): it is called by the batch-limit helper
 * below and, per its header comment, may return 0 — TODO confirm.
 */
2620uint32
2622{
2624
2625 /*
2626 * We get the number of "overflowed" pins for free, but don't know the
2627 * number of pins in PrivateRefCountArray. The cost of calculating that
2628 * exactly doesn't seem worth it, so just assume the max.
2629 */
2631
2632 /* Is this backend already holding more than its fair share? */
2634 return 0;
2635
2637}
2638
2639/*
2640 * Limit the number of pins a batch operation may additionally acquire, to
2641 * avoid running out of pinnable buffers.
2642 *
2643 * One additional pin is always allowed, on the assumption that the operation
2644 * requires at least one to make progress.
2645 */
/*
 * NOTE(review): doxygen-dump artifact — the signature line was stripped.
 * The body dereferences *additional_pins, so the parameter is presumably
 * `uint32 *additional_pins` (LimitAdditionalPins) — TODO confirm.
 * Clamps *additional_pins to the per-backend fair share, never below 1.
 */
2646void
2648{
2649 uint32 limit;
2650
2651 if (*additional_pins <= 1)
2652 return;
2653
2654 limit = GetAdditionalPinLimit();
2655 limit = Max(limit, 1);
2656 if (limit < *additional_pins)
2657 *additional_pins = limit;
2658}
2659
2660/*
2661 * Logic shared between ExtendBufferedRelBy(), ExtendBufferedRelTo(). Just to
2662 * avoid duplicating the tracing and relpersistence related logic.
2663 */
/*
 * NOTE(review): doxygen-dump artifact — parts of the parameter list and
 * several statements were stripped (bare numbers mark the gaps).  Per the
 * comment above, this is the shared implementation behind
 * ExtendBufferedRelBy()/ExtendBufferedRelTo(); it dispatches to a local- or
 * shared-buffer extension routine and emits trace points around the call.
 * Returns the first newly extended block number.
 */
2664static BlockNumber
2667 BufferAccessStrategy strategy,
2668 uint32 flags,
2671 Buffer *buffers,
2673{
2675
2677 BMR_GET_SMGR(bmr)->smgr_rlocator.locator.spcOid,
2678 BMR_GET_SMGR(bmr)->smgr_rlocator.locator.dbOid,
2679 BMR_GET_SMGR(bmr)->smgr_rlocator.locator.relNumber,
2680 BMR_GET_SMGR(bmr)->smgr_rlocator.backend,
2681 extend_by);
2682
2683 if (bmr.relpersistence == RELPERSISTENCE_TEMP)
2686 buffers, &extend_by);
2687 else
2688 first_block = ExtendBufferedRelShared(bmr, fork, strategy, flags,
2690 buffers, &extend_by);
2692
2694 BMR_GET_SMGR(bmr)->smgr_rlocator.locator.spcOid,
2695 BMR_GET_SMGR(bmr)->smgr_rlocator.locator.dbOid,
2696 BMR_GET_SMGR(bmr)->smgr_rlocator.locator.relNumber,
2697 BMR_GET_SMGR(bmr)->smgr_rlocator.backend,
2698 *extended_by,
2699 first_block);
2700
2701 return first_block;
2702}
2703
2704/*
2705 * Implementation of ExtendBufferedRelBy() and ExtendBufferedRelTo() for
2706 * shared buffers.
2707 */
/*
 * NOTE(review): doxygen-dump artifact — identifier-bearing lines were
 * stripped throughout (bare numbers mark the gaps), including parts of the
 * parameter list.  This is ExtendBufferedRelShared(), named by the caller
 * above: it extends a relation in shared buffers by claiming victim
 * buffers, inserting them in the buffer table marked IO_IN_PROGRESS,
 * zero-extending via smgr, then marking them BM_VALID.
 */
2708static BlockNumber
2711 BufferAccessStrategy strategy,
2712 uint32 flags,
2715 Buffer *buffers,
2717{
2721
2723
2724 /*
2725 * Acquire victim buffers for extension without holding extension lock.
2726 * Writing out victim buffers is the most expensive part of extending the
2727 * relation, particularly when doing so requires WAL flushes. Zeroing out
2728 * the buffers is also quite expensive, so do that before holding the
2729 * extension lock as well.
2730 *
2731 * These pages are pinned by us and not valid. While we hold the pin they
2732 * can't be acquired as victim buffers by another backend.
2733 */
2734 for (uint32 i = 0; i < extend_by; i++)
2735 {
2737
2738 buffers[i] = GetVictimBuffer(strategy, io_context);
2740
2741 /* new buffers are zero-filled */
2742 MemSet(buf_block, 0, BLCKSZ);
2743 }
2744
2745 /*
2746 * Lock relation against concurrent extensions, unless requested not to.
2747 *
2748 * We use the same extension lock for all forks. That's unnecessarily
2749 * restrictive, but currently extensions for forks don't happen often
2750 * enough to make it worth locking more granularly.
2751 *
2752 * Note that another backend might have extended the relation by the time
2753 * we get the lock.
2754 */
2755 if (!(flags & EB_SKIP_EXTENSION_LOCK))
2757
2758 /*
2759 * If requested, invalidate size cache, so that smgrnblocks asks the
2760 * kernel.
2761 */
2762 if (flags & EB_CLEAR_SIZE_CACHE)
2763 BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] = InvalidBlockNumber;
2764
2766
2767 /*
2768 * Now that we have the accurate relation size, check if the caller wants
2769 * us to extend to only up to a specific size. If there were concurrent
2770 * extensions, we might have acquired too many buffers and need to release
2771 * them.
2772 */
2774 {
2776
2778 extend_by = 0;
2779 else if ((uint64) first_block + extend_by > extend_upto)
2781
2782 for (uint32 i = extend_by; i < orig_extend_by; i++)
2783 {
2784 BufferDesc *buf_hdr = GetBufferDescriptor(buffers[i] - 1);
2785
2787 }
2788
2789 if (extend_by == 0)
2790 {
2791 if (!(flags & EB_SKIP_EXTENSION_LOCK))
2794 return first_block;
2795 }
2796 }
2797
2798 /* Fail if relation is already at maximum possible length */
2800 ereport(ERROR,
2802 errmsg("cannot extend relation %s beyond %u blocks",
2803 relpath(BMR_GET_SMGR(bmr)->smgr_rlocator, fork).str,
2804 MaxBlockNumber)));
2805
2806 /*
2807 * Insert buffers into buffer table, mark as IO_IN_PROGRESS.
2808 *
2809 * This needs to happen before we extend the relation, because as soon as
2810 * we do, other backends can start to read in those pages.
2811 */
2812 for (uint32 i = 0; i < extend_by; i++)
2813 {
2814 Buffer victim_buf = buffers[i];
2816 BufferTag tag;
2817 uint32 hash;
2819 int existing_id;
2820
2821 /* in case we need to pin an existing buffer below */
2824
2825 InitBufferTag(&tag, &BMR_GET_SMGR(bmr)->smgr_rlocator.locator, fork,
2826 first_block + i);
2827 hash = BufTableHashCode(&tag);
2829
2831
2833
2834 /*
2835 * We get here only in the corner case where we are trying to extend
2836 * the relation but we found a pre-existing buffer. This can happen
2837 * because a prior attempt at extending the relation failed, and
2838 * because mdread doesn't complain about reads beyond EOF (when
2839 * zero_damaged_pages is ON) and so a previous attempt to read a block
2840 * beyond EOF could have left a "valid" zero-filled buffer.
2841 *
2842 * This has also been observed when relation was overwritten by
2843 * external process. Since the legitimate cases should always have
2844 * left a zero-filled buffer, complain if not PageIsNew.
2845 */
2846 if (existing_id >= 0)
2847 {
2850 bool valid;
2851
2852 /*
2853 * Pin the existing buffer before releasing the partition lock,
2854 * preventing it from being evicted.
2855 */
2856 valid = PinBuffer(existing_hdr, strategy, false);
2857
2860
2863
2864 if (valid && !PageIsNew((Page) buf_block))
2865 ereport(ERROR,
2866 (errmsg("unexpected data beyond EOF in block %u of relation \"%s\"",
2867 existing_hdr->tag.blockNum,
2868 relpath(BMR_GET_SMGR(bmr)->smgr_rlocator, fork).str)));
2869
2870 /*
2871 * We *must* do smgr[zero]extend before succeeding, else the page
2872 * will not be reserved by the kernel, and the next P_NEW call
2873 * will decide to return the same page. Clear the BM_VALID bit,
2874 * do StartBufferIO() and proceed.
2875 *
2876 * Loop to handle the very small possibility that someone re-sets
2877 * BM_VALID between our clearing it and StartBufferIO inspecting
2878 * it.
2879 */
2880 do
2881 {
2883 } while (!StartBufferIO(existing_hdr, true, false));
2884 }
2885 else
2886 {
2888 uint64 set_bits = 0;
2889
2891
2892 /* some sanity checks while we hold the buffer header lock */
2895
2896 victim_buf_hdr->tag = tag;
2897
2899 if (bmr.relpersistence == RELPERSISTENCE_PERMANENT || fork == INIT_FORKNUM)
2901
2903 set_bits, 0,
2904 0);
2905
2907
2908 /* XXX: could combine the locked operations in it with the above */
2909 StartBufferIO(victim_buf_hdr, true, false);
2910 }
2911 }
2912
2914
2915 /*
2916 * Note: if smgrzeroextend fails, we will end up with buffers that are
2917 * allocated but not marked BM_VALID. The next relation extension will
2918 * still select the same block number (because the relation didn't get any
2919 * longer on disk) and so future attempts to extend the relation will find
2920 * the same buffers (if they have not been recycled) but come right back
2921 * here to try smgrzeroextend again.
2922 *
2923 * We don't need to set checksum for all-zero pages.
2924 */
2926
2927 /*
2928 * Release the file-extension lock; it's now OK for someone else to extend
2929 * the relation some more.
2930 *
2931 * We remove IO_IN_PROGRESS after this, as waking up waiting backends can
2932 * take noticeable time.
2933 */
2934 if (!(flags & EB_SKIP_EXTENSION_LOCK))
2936
2938 io_start, 1, extend_by * BLCKSZ);
2939
2940 /* Set BM_VALID, terminate IO, and wake up any waiters */
2941 for (uint32 i = 0; i < extend_by; i++)
2942 {
2943 Buffer buf = buffers[i];
2945 bool lock = false;
2946
2947 if (flags & EB_LOCK_FIRST && i == 0)
2948 lock = true;
2949 else if (flags & EB_LOCK_TARGET)
2950 {
2952 if (first_block + i + 1 == extend_upto)
2953 lock = true;
2954 }
2955
2956 if (lock)
2958
2959 TerminateBufferIO(buf_hdr, false, BM_VALID, true, false);
2960 }
2961
2963
2965
2966 return first_block;
2967}
2968
2969/*
2970 * BufferIsLockedByMe
2971 *
2972 * Checks if this backend has the buffer locked in any mode.
2973 *
2974 * Buffer must be pinned.
2975 */
/*
 * NOTE(review): doxygen-dump artifact — the signature and local
 * declarations were stripped (bare numbers mark the gaps).  Per the header
 * comment above this is BufferIsLockedByMe(Buffer buffer) — TODO confirm.
 * Local buffers trivially report true since they carry no content locks.
 */
2976bool
2978{
2980
2982
2983 if (BufferIsLocal(buffer))
2984 {
2985 /* Content locks are not maintained for local buffers. */
2986 return true;
2987 }
2988 else
2989 {
2991 return BufferLockHeldByMe(bufHdr);
2992 }
2993}
2994
2995/*
2996 * BufferIsLockedByMeInMode
2997 *
2998 * Checks if this backend has the buffer locked in the specified mode.
2999 *
3000 * Buffer must be pinned.
3001 */
/*
 * NOTE(review): doxygen-dump artifact — the signature, local declarations
 * and the shared-buffer return statement were stripped (bare numbers mark
 * the gaps).  Per the header comment above this is
 * BufferIsLockedByMeInMode(Buffer buffer, int mode) — TODO confirm.
 */
3002bool
3004{
3006
3008
3009 if (BufferIsLocal(buffer))
3010 {
3011 /* Content locks are not maintained for local buffers. */
3012 return true;
3013 }
3014 else
3015 {
3018 }
3019}
3020
3021/*
3022 * BufferIsDirty
3023 *
3024 * Checks if buffer is already dirty.
3025 *
3026 * Buffer must be pinned and [share-]exclusive-locked. (Without such a lock,
3027 * the result may be stale before it's returned.)
3028 */
/*
 * NOTE(review): doxygen-dump artifact — the signature and the statements
 * that set `bufHdr` in both branches were stripped (bare numbers mark the
 * gaps).  Per the header comment above this is BufferIsDirty(Buffer buffer)
 * — TODO confirm.  The surviving return reads BM_DIRTY from the atomic
 * buffer state.
 */
3029bool
3031{
3033
3035
3036 if (BufferIsLocal(buffer))
3037 {
3038 int bufid = -buffer - 1;
3039
3041 /* Content locks are not maintained for local buffers. */
3042 }
3043 else
3044 {
3048 }
3049
3050 return pg_atomic_read_u64(&bufHdr->state) & BM_DIRTY;
3051}
3052
3053/*
3054 * MarkBufferDirty
3055 *
3056 * Marks buffer contents as dirty (actual write happens later).
3057 *
3058 * Buffer must be pinned and exclusive-locked. (If caller does not hold
3059 * exclusive lock, then somebody could be in process of writing the buffer,
3060 * leading to risk of bad data written to disk.)
3061 */
/*
 * NOTE(review): doxygen-dump artifact — the signature line and the
 * CAS-loop statements were stripped (bare numbers mark the gaps).  Per the
 * header comment above this is MarkBufferDirty(Buffer buffer) — TODO
 * confirm.  The surviving skeleton shows: validate the buffer, delegate
 * local buffers, then a compare-and-swap retry loop on the buffer state,
 * followed by vacuum-cost accounting on a clean->dirty transition.
 */
3062void
3064{
3068
3069 if (!BufferIsValid(buffer))
3070 elog(ERROR, "bad buffer ID: %d", buffer);
3071
3072 if (BufferIsLocal(buffer))
3073 {
3075 return;
3076 }
3077
3079
3082
3083 /*
3084 * NB: We have to wait for the buffer header spinlock to be not held, as
3085 * TerminateBufferIO() relies on the spinlock.
3086 */
3088 for (;;)
3089 {
3092
3094
3097
3099 buf_state))
3100 break;
3101 }
3102
3103 /*
3104 * If the buffer was not dirty already, do vacuum accounting.
3105 */
3106 if (!(old_buf_state & BM_DIRTY))
3107 {
3109 if (VacuumCostActive)
3111 }
3112}
3113
3114/*
3115 * ReleaseAndReadBuffer -- combine ReleaseBuffer() and ReadBuffer()
3116 *
3117 * Formerly, this saved one cycle of acquiring/releasing the BufMgrLock
3118 * compared to calling the two routines separately. Now it's mainly just
3119 * a convenience function. However, if the passed buffer is valid and
3120 * already contains the desired block, we just return it as-is; and that
3121 * does save considerable work compared to a full release and reacquire.
3122 *
3123 * Note: it is OK to pass buffer == InvalidBuffer, indicating that no old
3124 * buffer actually needs to be released. This case is the same as ReadBuffer,
3125 * but can save some tests in the caller.
3126 */
/*
 * NOTE(review): doxygen-dump artifact — the first parameter line
 * (presumably `Buffer buffer`, per the header comment above), the
 * bufHdr assignments, and the old-buffer release calls were stripped
 * (bare numbers mark the gaps) — TODO confirm against the original.
 * Fast path: if `buffer` already holds the requested block, return it.
 */
3127Buffer
3129 Relation relation,
3130 BlockNumber blockNum)
3131{
3132 ForkNumber forkNum = MAIN_FORKNUM;
3134
3135 if (BufferIsValid(buffer))
3136 {
3138 if (BufferIsLocal(buffer))
3139 {
3141 if (bufHdr->tag.blockNum == blockNum &&
3142 BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
3143 BufTagGetForkNum(&bufHdr->tag) == forkNum)
3144 return buffer;
3146 }
3147 else
3148 {
3150 /* we have pin, so it's ok to examine tag without spinlock */
3151 if (bufHdr->tag.blockNum == blockNum &&
3152 BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
3153 BufTagGetForkNum(&bufHdr->tag) == forkNum)
3154 return buffer;
3156 }
3157 }
3158
3159 return ReadBuffer(relation, blockNum);
3160}
3161
3162/*
3163 * PinBuffer -- make buffer unavailable for replacement.
3164 *
3165 * For the default access strategy, the buffer's usage_count is incremented
3166 * when we first pin it; for other strategies we just make sure the usage_count
3167 * isn't zero. (The idea of the latter is that we don't want synchronized
3168 * heap scans to inflate the count, but we need it to not be zero to discourage
3169 * other backends from stealing buffers from our ring. As long as we cycle
3170 * through the ring faster than the global clock-sweep cycles, buffers in
3171 * our ring won't be chosen as victims for replacement by other backends.)
3172 *
3173 * This should be applied only to shared buffers, never local ones.
3174 *
3175 * Since buffers are pinned/unpinned very frequently, pin buffers without
3176 * taking the buffer header lock; instead update the state variable in loop of
3177 * CAS operations. Hopefully it's just a single CAS.
3178 *
3179 * Note that ResourceOwnerEnlarge() and ReservePrivateRefCountEntry()
3180 * must have been done already.
3181 *
3182 * Returns true if buffer is BM_VALID, else false. This provision allows
3183 * some callers to avoid an extra spinlock cycle. If skip_if_not_valid is
3184 * true, then a false return value also indicates that the buffer was
3185 * (recently) invalid and has not been pinned.
3186 */
/*
 * NOTE(review): doxygen-dump artifact — the first signature line
 * (presumably `PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy,`
 * per the header comment and surviving second line) and the CAS-loop
 * statements were stripped (bare numbers mark the gaps) — TODO confirm.
 * Skeleton: first-pin path does a CAS retry loop on the buffer state;
 * repeat-pin path just bumps the private refcount.
 */
3187static bool
3189 bool skip_if_not_valid)
3190{
3192 bool result;
3194
3197
3198 ref = GetPrivateRefCountEntry(b, true);
3199
3200 if (ref == NULL)
3201 {
3204
3206 for (;;)
3207 {
3209 return false;
3210
3211 /*
3212 * We're not allowed to increase the refcount while the buffer
3213 * header spinlock is held. Wait for the lock to be released.
3214 */
3217
3219
3220 /* increase refcount */
3222
3223 if (strategy == NULL)
3224 {
3225 /* Default case: increase usagecount unless already max. */
3228 }
3229 else
3230 {
3231 /*
3232 * Ring buffers shouldn't evict others from pool. Thus we
3233 * don't make usagecount more than 1.
3234 */
3237 }
3238
3240 buf_state))
3241 {
3242 result = (buf_state & BM_VALID) != 0;
3243
3245 break;
3246 }
3247 }
3248 }
3249 else
3250 {
3251 /*
3252 * If we previously pinned the buffer, it is likely to be valid, but
3253 * it may not be if StartReadBuffers() was called and
3254 * WaitReadBuffers() hasn't been called yet. We'll check by loading
3255 * the flags without locking. This is racy, but it's OK to return
3256 * false spuriously: when WaitReadBuffers() calls StartBufferIO(),
3257 * it'll see that it's now valid.
3258 *
3259 * Note: We deliberately avoid a Valgrind client request here.
3260 * Individual access methods can optionally superimpose buffer page
3261 * client requests on top of our client requests to enforce that
3262 * buffers are only accessed while locked (and pinned). It's possible
3263 * that the buffer page is legitimately non-accessible here. We
3264 * cannot meddle with that.
3265 */
3266 result = (pg_atomic_read_u64(&buf->state) & BM_VALID) != 0;
3267
3268 Assert(ref->data.refcount > 0);
3269 ref->data.refcount++;
3271 }
3272
3273 return result;
3274}
3275
3276/*
3277 * PinBuffer_Locked -- as above, but caller already locked the buffer header.
3278 * The spinlock is released before return.
3279 *
3280 * As this function is called with the spinlock held, the caller has to
3281 * previously call ReservePrivateRefCountEntry() and
3282 * ResourceOwnerEnlarge(CurrentResourceOwner);
3283 *
3284 * Currently, no callers of this function want to modify the buffer's
3285 * usage_count at all, so there's no need for a strategy parameter.
3286 * Also we don't bother with a BM_VALID test (the caller could check that for
3287 * itself).
3288 *
3289 * Also all callers only ever use this function when it's known that the
3290 * buffer can't have a preexisting pin by this backend. That allows us to skip
3291 * searching the private refcount array & hash, which is a boon, because the
3292 * spinlock is still held.
3293 *
3294 * Note: use of this routine is frequently mandatory, not just an optimization
3295 * to save a spin lock/unlock cycle, because we need to pin a buffer before
3296 * its state can change under us.
3297 */
/*
 * NOTE(review): doxygen-dump artifact — the signature (presumably
 * PinBuffer_Locked(BufferDesc *buf), per the header comment above) and
 * most statements were stripped (bare numbers mark the gaps) — TODO
 * confirm against the original source.
 */
3298static void
3300{
3302
3303 /*
3304 * As explained, We don't expect any preexisting pins. That allows us to
3305 * manipulate the PrivateRefCount after releasing the spinlock
3306 */
3308
3309 /*
3310 * Since we hold the buffer spinlock, we can update the buffer state and
3311 * release the lock in one operation.
3312 */
3314
3316 0, 0, 1);
3317
3319}
3320
3321/*
3322 * Support for waking up another backend that is waiting for the cleanup lock
3323 * to be released using BM_PIN_COUNT_WAITER.
3324 *
3325 * See LockBufferForCleanup().
3326 *
3327 * Expected to be called just after releasing a buffer pin (in a BufferDesc,
3328 * not just reducing the backend-local pincount for the buffer).
3329 */
/*
 * NOTE(review): doxygen-dump artifact — the signature (presumably taking a
 * BufferDesc *buf, which the body dereferences) and the re-check condition
 * were stripped (bare numbers mark the gaps) — TODO confirm.  Wakes the
 * backend recorded in wait_backend_pgprocno via ProcSendSignal(); see the
 * BM_PIN_COUNT_WAITER comment block above.
 */
3330static void
3332{
3333 /*
3334 * Acquire the buffer header lock, re-check that there's a waiter. Another
3335 * backend could have unpinned this buffer, and already woken up the
3336 * waiter.
3337 *
3338 * There's no danger of the buffer being replaced after we unpinned it
3339 * above, as it's pinned by the waiter. The waiter removes
3340 * BM_PIN_COUNT_WAITER if it stops waiting for a reason other than this
3341 * backend waking it up.
3342 */
3344
3347 {
3348 /* we just released the last pin other than the waiter's */
3349 int wait_backend_pgprocno = buf->wait_backend_pgprocno;
3350
3353 0);
3354 ProcSendSignal(wait_backend_pgprocno);
3355 }
3356 else
3358}
3359
3360/*
3361 * UnpinBuffer -- make buffer available for replacement.
3362 *
3363 * This should be applied only to shared buffers, never local ones. This
3364 * always adjusts CurrentResourceOwner.
3365 */
/*
 * NOTE(review): doxygen-dump artifact — the entire body of this function
 * (original lines 3367-3374, presumably UnpinBuffer() per the header
 * comment above) was stripped by the extraction; only the storage-class
 * line survives.  TODO recover from the original source.
 */
3366static void
3374
/*
 * NOTE(review): doxygen-dump artifact — the signature and several
 * statements were stripped (bare numbers mark the gaps).  Presumably the
 * resource-owner-agnostic unpin helper (the body decrements the private
 * refcount and, at zero, drops the shared pin) — TODO confirm.
 */
3375static void
3377{
3380
3382
3383 /* not moving as we're likely deleting it soon anyway */
3384 ref = GetPrivateRefCountEntry(b, false);
3385 Assert(ref != NULL);
3386 Assert(ref->data.refcount > 0);
3387 ref->data.refcount--;
3388 if (ref->data.refcount == 0)
3389 {
3391
3392 /*
3393 * Mark buffer non-accessible to Valgrind.
3394 *
3395 * Note that the buffer may have already been marked non-accessible
3396 * within access method code that enforces that buffers are only
3397 * accessed while a buffer lock is held.
3398 */
3400
3401 /*
3402 * I'd better not still hold the buffer content lock. Can't use
3403 * BufferIsLockedByMe(), as that asserts the buffer is pinned.
3404 */
3406
3407 /* decrement the shared reference count */
3409
3410 /* Support LockBufferForCleanup() */
3413
3415 }
3416}
3417
3418/*
3419 * Set up backend-local tracking of a buffer pinned the first time by this
3420 * backend.
3421 */
/*
 * NOTE(review): doxygen-dump artifact — the signature and several
 * statements were stripped (bare numbers mark the gaps).  Per the header
 * comment above, this records backend-local state for a buffer's first pin
 * by this backend (bumps the private refcount and marks the page defined
 * for Valgrind) — TODO confirm the original name/parameters.
 */
3422inline void
3424{
3426
3428 ref->data.refcount++;
3429
3431
3432 /*
3433 * This is the first pin for this page by this backend, mark its page as
3434 * defined to valgrind. While the page contents might not actually be
3435 * valid yet, we don't currently guarantee that such pages are marked
3436 * undefined or non-accessible.
3437 *
3438 * It's not necessarily the prettiest to do this here, but otherwise we'd
3439 * need this block of code in multiple places.
3440 */
3442 BLCKSZ);
3443}
3444
/*
 * Instantiate the generic sort template as a static
 * sort_checkpoint_bufferids() over CkptSortItem, ordered by
 * ckpt_buforder_comparator(); used by BufferSync() below to sort the
 * to-be-checkpointed buffers.
 */
3445#define ST_SORT sort_checkpoint_bufferids
3446#define ST_ELEMENT_TYPE CkptSortItem
3447#define ST_COMPARE(a, b) ckpt_buforder_comparator(a, b)
3448#define ST_SCOPE static
3449#define ST_DEFINE
3450#include "lib/sort_template.h"
3451
3452/*
3453 * BufferSync -- Write out all dirty buffers in the pool.
3454 *
3455 * This is called at checkpoint time to write out all dirty shared buffers.
3456 * The checkpoint request flags should be passed in. If CHECKPOINT_FAST is
3457 * set, we disable delays between writes; if CHECKPOINT_IS_SHUTDOWN,
3458 * CHECKPOINT_END_OF_RECOVERY or CHECKPOINT_FLUSH_UNLOGGED is set, we write
3459 * even unlogged buffers, which are otherwise skipped. The remaining flags
3460 * currently have no effect here.
3461 */
/*
 * NOTE(review): doxygen-dump artifact — identifier-bearing lines were
 * stripped throughout (bare numbers mark the gaps); the signature line
 * survived here.  High-level flow per the surviving code: (1) scan all
 * NBuffers marking dirty buffers BM_CHECKPOINT_NEEDED and collecting them
 * into CkptBufferIds; (2) sort and group them per tablespace; (3) drain a
 * min-heap of per-tablespace progress so writes are balanced across
 * tablespaces, throttled by CheckpointWriteDelay(); (4) flush and update
 * checkpoint stats.
 */
3462static void
3463BufferSync(int flags)
3464{
3466 int buf_id;
3467 int num_to_scan;
3468 int num_spaces;
3469 int num_processed;
3470 int num_written;
3472 Oid last_tsid;
3474 int i;
3475 uint64 mask = BM_DIRTY;
3477
3478 /*
3479 * Unless this is a shutdown checkpoint or we have been explicitly told,
3480 * we write only permanent, dirty buffers. But at shutdown or end of
3481 * recovery, we write all dirty buffers.
3482 */
3485 mask |= BM_PERMANENT;
3486
3487 /*
3488 * Loop over all buffers, and mark the ones that need to be written with
3489 * BM_CHECKPOINT_NEEDED. Count them as we go (num_to_scan), so that we
3490 * can estimate how much work needs to be done.
3491 *
3492 * This allows us to write only those pages that were dirty when the
3493 * checkpoint began, and not those that get dirtied while it proceeds.
3494 * Whenever a page with BM_CHECKPOINT_NEEDED is written out, either by us
3495 * later in this function, or by normal backends or the bgwriter cleaning
3496 * scan, the flag is cleared. Any buffer dirtied after this point won't
3497 * have the flag set.
3498 *
3499 * Note that if we fail to write some buffer, we may leave buffers with
3500 * BM_CHECKPOINT_NEEDED still set. This is OK since any such buffer would
3501 * certainly need to be written for the next checkpoint attempt, too.
3502 */
3503 num_to_scan = 0;
3504 for (buf_id = 0; buf_id < NBuffers; buf_id++)
3505 {
3507 uint64 set_bits = 0;
3508
3509 /*
3510 * Header spinlock is enough to examine BM_DIRTY, see comment in
3511 * SyncOneBuffer.
3512 */
3514
3515 if ((buf_state & mask) == mask)
3516 {
3517 CkptSortItem *item;
3518
3520
3521 item = &CkptBufferIds[num_to_scan++];
3522 item->buf_id = buf_id;
3523 item->tsId = bufHdr->tag.spcOid;
3524 item->relNumber = BufTagGetRelNumber(&bufHdr->tag);
3525 item->forkNum = BufTagGetForkNum(&bufHdr->tag);
3526 item->blockNum = bufHdr->tag.blockNum;
3527 }
3528
3530 set_bits, 0,
3531 0);
3532
3533 /* Check for barrier events in case NBuffers is large. */
3536 }
3537
3538 if (num_to_scan == 0)
3539 return; /* nothing to do */
3540
3542
3544
3545 /*
3546 * Sort buffers that need to be written to reduce the likelihood of random
3547 * IO. The sorting is also important for the implementation of balancing
3548 * writes between tablespaces. Without balancing writes we'd potentially
3549 * end up writing to the tablespaces one-by-one; possibly overloading the
3550 * underlying system.
3551 */
3553
3554 num_spaces = 0;
3555
3556 /*
3557 * Allocate progress status for each tablespace with buffers that need to
3558 * be flushed. This requires the to-be-flushed array to be sorted.
3559 */
3561 for (i = 0; i < num_to_scan; i++)
3562 {
3563 CkptTsStatus *s;
3564 Oid cur_tsid;
3565
3567
3568 /*
3569 * Grow array of per-tablespace status structs, every time a new
3570 * tablespace is found.
3571 */
3573 {
3574 Size sz;
3575
3576 num_spaces++;
3577
3578 /*
3579 * Not worth adding grow-by-power-of-2 logic here - even with a
3580 * few hundred tablespaces this should be fine.
3581 */
3582 sz = sizeof(CkptTsStatus) * num_spaces;
3583
3584 if (per_ts_stat == NULL)
3586 else
3588
3589 s = &per_ts_stat[num_spaces - 1];
3590 memset(s, 0, sizeof(*s));
3591 s->tsId = cur_tsid;
3592
3593 /*
3594 * The first buffer in this tablespace. As CkptBufferIds is sorted
3595 * by tablespace all (s->num_to_scan) buffers in this tablespace
3596 * will follow afterwards.
3597 */
3598 s->index = i;
3599
3600 /*
3601 * progress_slice will be determined once we know how many buffers
3602 * are in each tablespace, i.e. after this loop.
3603 */
3604
3606 }
3607 else
3608 {
3609 s = &per_ts_stat[num_spaces - 1];
3610 }
3611
3612 s->num_to_scan++;
3613
3614 /* Check for barrier events. */
3617 }
3618
3619 Assert(num_spaces > 0);
3620
3621 /*
3622 * Build a min-heap over the write-progress in the individual tablespaces,
3623 * and compute how large a portion of the total progress a single
3624 * processed buffer is.
3625 */
3628 NULL);
3629
3630 for (i = 0; i < num_spaces; i++)
3631 {
3633
3634 ts_stat->progress_slice = (float8) num_to_scan / ts_stat->num_to_scan;
3635
3637 }
3638
3640
3641 /*
3642 * Iterate through to-be-checkpointed buffers and write the ones (still)
3643 * marked with BM_CHECKPOINT_NEEDED. The writes are balanced between
3644 * tablespaces; otherwise the sorting would lead to only one tablespace
3645 * receiving writes at a time, making inefficient use of the hardware.
3646 */
3647 num_processed = 0;
3648 num_written = 0;
3649 while (!binaryheap_empty(ts_heap))
3650 {
3654
3655 buf_id = CkptBufferIds[ts_stat->index].buf_id;
3656 Assert(buf_id != -1);
3657
3658 bufHdr = GetBufferDescriptor(buf_id);
3659
3660 num_processed++;
3661
3662 /*
3663 * We don't need to acquire the lock here, because we're only looking
3664 * at a single bit. It's possible that someone else writes the buffer
3665 * and clears the flag right after we check, but that doesn't matter
3666 * since SyncOneBuffer will then do nothing. However, there is a
3667 * further race condition: it's conceivable that between the time we
3668 * examine the bit here and the time SyncOneBuffer acquires the lock,
3669 * someone else not only wrote the buffer but replaced it with another
3670 * page and dirtied it. In that improbable case, SyncOneBuffer will
3671 * write the buffer though we didn't need to. It doesn't seem worth
3672 * guarding against this, though.
3673 */
3675 {
3676 if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
3677 {
3680 num_written++;
3681 }
3682 }
3683
3684 /*
3685 * Measure progress independent of actually having to flush the buffer
3686 * - otherwise writing become unbalanced.
3687 */
3688 ts_stat->progress += ts_stat->progress_slice;
3689 ts_stat->num_scanned++;
3690 ts_stat->index++;
3691
3692 /* Have all the buffers from the tablespace been processed? */
3693 if (ts_stat->num_scanned == ts_stat->num_to_scan)
3694 {
3696 }
3697 else
3698 {
3699 /* update heap with the new progress */
3701 }
3702
3703 /*
3704 * Sleep to throttle our I/O rate.
3705 *
3706 * (This will check for barrier events even if it doesn't sleep.)
3707 */
3708 CheckpointWriteDelay(flags, (double) num_processed / num_to_scan);
3709 }
3710
3711 /*
3712 * Issue all pending flushes. Only checkpointer calls BufferSync(), so
3713 * IOContext will always be IOCONTEXT_NORMAL.
3714 */
3716
3718 per_ts_stat = NULL;
3720
3721 /*
3722 * Update checkpoint statistics. As noted above, this doesn't include
3723 * buffers written by other backends or bgwriter scan.
3724 */
3726
3728}
3729
3730/*
3731 * BgBufferSync -- Write out some dirty buffers in the pool.
3732 *
3733 * This is called periodically by the background writer process.
3734 *
3735 * Returns true if it's appropriate for the bgwriter process to go into
3736 * low-power hibernation mode. (This happens if the strategy clock-sweep
3737 * has been "lapped" and no buffer allocations have occurred recently,
3738 * or if the bgwriter has been effectively disabled by setting
3739 * bgwriter_lru_maxpages to 0.)
3740 */
3741bool
3743{
 /*
 * NOTE(review): this extraction elides several statements, including the
 * function signature (takes a WritebackContext *, per the call below) and
 * the calls that fetch the strategy clock-sweep position and the recent
 * allocation count. Compare with upstream bufmgr.c before relying on the
 * exact statement sequence.
 */
3744 /* info obtained from freelist.c */
3745 int strategy_buf_id;
3748
3749 /*
3750 * Information saved between calls so we can determine the strategy
3751 * point's advance rate and avoid scanning already-cleaned buffers.
3752 */
3753 static bool saved_info_valid = false;
3754 static int prev_strategy_buf_id;
3756 static int next_to_clean;
3757 static uint32 next_passes;
3758
3759 /* Moving averages of allocation rate and clean-buffer density */
3760 static float smoothed_alloc = 0;
3761 static float smoothed_density = 10.0;
3762
3763 /* Potentially these could be tunables, but for now, not */
 /* smoothing_samples: moving-average window; scan_whole_pool: idle-scan pace */
3764 float smoothing_samples = 16;
3765 float scan_whole_pool_milliseconds = 120000.0;
3766
3767 /* Used to compute how far we scan ahead */
3768 long strategy_delta;
3769 int bufs_to_lap;
3770 int bufs_ahead;
3771 float scans_per_alloc;
3774 int min_scan_buffers;
3775
3776 /* Variables for the scanning loop proper */
3777 int num_to_scan;
3778 int num_written;
3779 int reusable_buffers;
3780
3781 /* Variables for final smoothed_density update */
3782 long new_strategy_delta;
3784
3785 /*
3786 * Find out where the clock-sweep currently is, and how many buffer
3787 * allocations have happened since our last call.
3788 */
3790
3791 /* Report buffer alloc counts to pgstat */
3793
3794 /*
3795 * If we're not running the LRU scan, just stop after doing the stats
3796 * stuff. We mark the saved state invalid so that we can recover sanely
3797 * if LRU scan is turned back on later.
3798 */
3799 if (bgwriter_lru_maxpages <= 0)
3800 {
3801 saved_info_valid = false;
3802 return true;
3803 }
3804
3805 /*
3806 * Compute strategy_delta = how many buffers have been scanned by the
3807 * clock-sweep since last time. If first time through, assume none. Then
3808 * see if we are still ahead of the clock-sweep, and if so, how many
3809 * buffers we could scan before we'd catch up with it and "lap" it. Note:
3810 * weird-looking coding of xxx_passes comparisons are to avoid bogus
3811 * behavior when the passes counts wrap around.
3812 */
3813 if (saved_info_valid)
3814 {
3816
3819
3820 Assert(strategy_delta >= 0);
3821
3822 if ((int32) (next_passes - strategy_passes) > 0)
3823 {
3824 /* we're one pass ahead of the strategy point */
3826#ifdef BGW_DEBUG
3827 elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
3831#endif
3832 }
3833 else if (next_passes == strategy_passes &&
3835 {
3836 /* on same pass, but ahead or at least not behind */
3838#ifdef BGW_DEBUG
3839 elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
3843#endif
3844 }
3845 else
3846 {
3847 /*
3848 * We're behind, so skip forward to the strategy point and start
3849 * cleaning from there.
3850 */
3851#ifdef BGW_DEBUG
3852 elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
3856#endif
3860 }
3861 }
3862 else
3863 {
3864 /*
3865 * Initializing at startup or after LRU scanning had been off. Always
3866 * start at the strategy point.
3867 */
3868#ifdef BGW_DEBUG
3869 elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
3871#endif
3872 strategy_delta = 0;
3876 }
3877
3878 /* Update saved info for next time */
3881 saved_info_valid = true;
3882
3883 /*
3884 * Compute how many buffers had to be scanned for each new allocation, ie,
3885 * 1/density of reusable buffers, and track a moving average of that.
3886 *
3887 * If the strategy point didn't move, we don't update the density estimate
3888 */
3889 if (strategy_delta > 0 && recent_alloc > 0)
3890 {
3894 }
3895
3896 /*
3897 * Estimate how many reusable buffers there are between the current
3898 * strategy point and where we've scanned ahead to, based on the smoothed
3899 * density estimate.
3900 */
3903
3904 /*
3905 * Track a moving average of recent buffer allocations. Here, rather than
3906 * a true average we want a fast-attack, slow-decline behavior: we
3907 * immediately follow any increase.
3908 */
3909 if (smoothed_alloc <= (float) recent_alloc)
3911 else
3914
3915 /* Scale the estimate by a GUC to allow more aggressive tuning. */
3917
3918 /*
3919 * If recent_alloc remains at zero for many cycles, smoothed_alloc will
3920 * eventually underflow to zero, and the underflows produce annoying
3921 * kernel warnings on some platforms. Once upcoming_alloc_est has gone to
3922 * zero, there's no point in tracking smaller and smaller values of
3923 * smoothed_alloc, so just reset it to exactly zero to avoid this
3924 * syndrome. It will pop back up as soon as recent_alloc increases.
3925 */
3926 if (upcoming_alloc_est == 0)
3927 smoothed_alloc = 0;
3928
3929 /*
3930 * Even in cases where there's been little or no buffer allocation
3931 * activity, we want to make a small amount of progress through the buffer
3932 * cache so that as many reusable buffers as possible are clean after an
3933 * idle period.
3934 *
3935 * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many times
3936 * the BGW will be called during the scan_whole_pool time; slice the
3937 * buffer pool into that many sections.
3938 */
3940
3942 {
3943#ifdef BGW_DEBUG
3944 elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
3946#endif
3948 }
3949
3950 /*
3951 * Now write out dirty reusable buffers, working forward from the
3952 * next_to_clean point, until we have lapped the strategy scan, or cleaned
3953 * enough buffers to match our estimate of the next cycle's allocation
3954 * requirements, or hit the bgwriter_lru_maxpages limit.
3955 */
3956
3957 num_to_scan = bufs_to_lap;
3958 num_written = 0;
3960
3961 /* Execute the LRU scan */
3962 while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
3963 {
 /* SyncOneBuffer returns a BUF_WRITTEN/BUF_REUSABLE bitmask (see below) */
3965 wb_context);
3966
 /* next_to_clean wraps around the pool; count a completed pass */
3967 if (++next_to_clean >= NBuffers)
3968 {
3969 next_to_clean = 0;
3970 next_passes++;
3971 }
3972 num_to_scan--;
3973
3974 if (sync_state & BUF_WRITTEN)
3975 {
3978 {
3980 break;
3981 }
3982 }
3983 else if (sync_state & BUF_REUSABLE)
3985 }
3986
3988
3989#ifdef BGW_DEBUG
3990 elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
3993 bufs_to_lap - num_to_scan,
3996#endif
3997
3998 /*
3999 * Consider the above scan as being like a new allocation scan.
4000 * Characterize its density and update the smoothed one based on it. This
4001 * effectively halves the moving average period in cases where both the
4002 * strategy and the background writer are doing some useful scanning,
4003 * which is helpful because a long memory isn't as desirable on the
4004 * density estimates.
4005 */
4006 new_strategy_delta = bufs_to_lap - num_to_scan;
4008 if (new_strategy_delta > 0 && new_recent_alloc > 0)
4009 {
4013
4014#ifdef BGW_DEBUG
4015 elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f",
4018#endif
4019 }
4020
4021 /* Return true if OK to hibernate */
4022 return (bufs_to_lap == 0 && recent_alloc == 0);
4023}
4024
4025/*
4026 * SyncOneBuffer -- process a single buffer during syncing.
4027 *
4028 * If skip_recently_used is true, we don't write currently-pinned buffers, nor
4029 * buffers marked recently used, as these are not replacement candidates.
4030 *
4031 * Returns a bitmask containing the following flag bits:
4032 * BUF_WRITTEN: we wrote the buffer.
4033 * BUF_REUSABLE: buffer is available for replacement, ie, it has
4034 * pin count 0 and usage count 0.
4035 *
4036 * (BUF_WRITTEN could be set in error if FlushBuffer finds the buffer clean
4037 * after locking it, but we don't care all that much.)
4038 */
4039static int
4041{
 /*
 * NOTE(review): the signature (buf_id, skip_recently_used, wb_context per
 * the callers above) and several statements — buffer-header lock/unlock,
 * pin and content-lock calls — are elided in this extraction; consult
 * upstream bufmgr.c for the full body.
 */
4043 int result = 0; /* accumulates BUF_WRITTEN / BUF_REUSABLE bits */
4045 BufferTag tag;
4046
4047 /* Make sure we can handle the pin */
4050
4051 /*
4052 * Check whether buffer needs writing.
4053 *
4054 * We can make this check without taking the buffer content lock so long
4055 * as we mark pages dirty in access methods *before* logging changes with
4056 * XLogInsert(): if someone marks the buffer dirty just after our check we
4057 * don't worry because our checkpoint.redo points before log record for
4058 * upcoming changes and so we are not required to write such dirty buffer.
4059 */
4061
4064 {
4065 result |= BUF_REUSABLE;
4066 }
4067 else if (skip_recently_used)
4068 {
4069 /* Caller told us not to write recently-used buffers */
4071 return result;
4072 }
4073
4074 if (!(buf_state & BM_VALID) || !(buf_state & BM_DIRTY))
4075 {
4076 /* It's clean, so nothing to do */
4078 return result;
4079 }
4080
4081 /*
4082 * Pin it, share-exclusive-lock it, write it. (FlushBuffer will do
4083 * nothing if the buffer is clean by the time we've locked it.)
4084 */
4086
4088
 /* Copy the tag while still pinned, for the writeback scheduling below. */
4089 tag = bufHdr->tag;
4090
4092
4093 /*
4094 * SyncOneBuffer() is only called by checkpointer and bgwriter, so
4095 * IOContext will always be IOCONTEXT_NORMAL.
4096 */
4098
4099 return result | BUF_WRITTEN;
4100}
4101
4102/*
4103 * AtEOXact_Buffers - clean up at end of transaction.
4104 *
4105 * As of PostgreSQL 8.0, buffer pins should get released by the
4106 * ResourceOwner mechanism. This routine is just a debugging
4107 * cross-check that no pins remain.
4108 */
4109void
4118
4119/*
4120 * Initialize access to shared buffer pool
4121 *
4122 * This is called during backend startup (whether standalone or under the
4123 * postmaster). It sets up for this backend's access to the already-existing
4124 * buffer pool.
4125 */
4126void
4128{
 /*
 * NOTE(review): the statements computing the pin limit and initializing
 * the private refcount bookkeeping (lines 4136-4142 upstream) are elided
 * in this extraction.
 */
4129 /*
4130 * An advisory limit on the number of pins each backend should hold, based
4131 * on shared_buffers and the maximum number of connections possible.
4132 * That's very pessimistic, but outside toy-sized shared_buffers it should
4133 * allow plenty of pins. LimitAdditionalPins() and
4134 * GetAdditionalPinLimit() can be used to check the remaining balance.
4135 */
4137
4140
4142
4143 /*
4144 * AtProcExit_Buffers needs LWLock access, and thereby has to be called at
4145 * the corresponding phase of backend shutdown.
4146 */
 /* Requires an initialized PGPROC; callback registration line is elided. */
4147 Assert(MyProc != NULL);
4149}
4150
4151/*
4152 * During backend exit, ensure that we released all shared-buffer locks and
4153 * assert that we have no remaining pins.
4154 */
4155static void
4157{
 /* Backend-exit cleanup: release buffer locks, then check for leaked pins. */
4158 UnlockBuffers();
4159
 /* Presumably CheckForBufferLeaks() — the call line is elided here. */
4161
4162 /* localbuf.c needs a chance too */
4164}
4165
4166/*
4167 * CheckForBufferLeaks - ensure this backend holds no buffer pins
4168 *
4169 * As of PostgreSQL 8.0, buffer pins should get released by the
4170 * ResourceOwner mechanism. This routine is just a debugging
4171 * cross-check that no pins remain.
4172 */
4173static void
4175{
4176#ifdef USE_ASSERT_CHECKING
 /* Count of leaked pins found; must be zero at the Assert below. */
4177 int RefCountErrors = 0;
4179 int i;
4180 char *s;
4181
 /* Scan the small fixed-size refcount array first... */
4182 /* check the array */
4183 for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
4184 {
4186 {
4187 res = &PrivateRefCountArray[i];
4188
4190 elog(WARNING, "buffer refcount leak: %s", s);
4191 pfree(s);
4192
4194 }
4195 }
4196
 /* ...then the overflow hash, if any entries spilled into it. */
4197 /* if necessary search the hash */
4199 {
4200 refcount_iterator iter;
4201
4203 while ((res = refcount_iterate(PrivateRefCountHash, &iter)) != NULL)
4204 {
4206 elog(WARNING, "buffer refcount leak: %s", s);
4207 pfree(s);
4209 }
4210 }
4211
4212 Assert(RefCountErrors == 0);
4213#endif
4214}
4215
4216#ifdef USE_ASSERT_CHECKING
4217/*
4218 * Check for exclusive-locked catalog buffers. This is the core of
4219 * AssertCouldGetRelation().
4220 *
4221 * A backend would self-deadlock on the content lock if the catalog scan read
4222 * the exclusive-locked buffer. The main threat is exclusive-locked buffers
4223 * of catalogs used in relcache, because a catcache search on any catalog may
4224 * build that catalog's relcache entry. We don't have an inventory of
4225 * catalogs relcache uses, so just check buffers of most catalogs.
4226 *
4227 * It's better to minimize waits while holding an exclusive buffer lock, so it
4228 * would be nice to broaden this check not to be catalog-specific. However,
4229 * bttextcmp() accesses pg_collation, and non-core opclasses might similarly
4230 * read tables. That is deadlock-free as long as there's no loop in the
4231 * dependency graph: modifying table A may cause an opclass to read table B,
4232 * but it must not cause a read of table A.
4233 */
4234void
4236{
 /*
 * Walk every private refcount entry and check it with
 * AssertNotCatalogBufferLock (the per-entry call lines are elided in this
 * extraction; see the helper below).
 */
4238
4239 /* check the array */
4240 for (int i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
4241 {
4243 {
4244 res = &PrivateRefCountArray[i];
4245
4246 if (res->buffer == InvalidBuffer)
4247 continue;
4249
4250 }
4251 }
4252
4253 /* if necessary search the hash */
4255 {
4256 refcount_iterator iter;
4257
4259 while ((res = refcount_iterate(PrivateRefCountHash, &iter)) != NULL)
4260 {
4262 }
4263 }
4264}
4265
4266static void
4268{
 /*
 * Check one pinned buffer: if it is an exclusive-locked buffer of a
 * catalog relation, fail an assertion (the lock-mode test and final
 * Assert lines are elided in this extraction).
 */
4270 BufferTag tag;
4271 Oid relid;
4272
 /* Early exit (condition elided) — presumably when not exclusive-locked. */
4274 return;
4275
4276 tag = bufHdr->tag;
4277
4278 /*
4279 * This relNumber==relid assumption holds until a catalog experiences
4280 * VACUUM FULL or similar. After a command like that, relNumber will be
4281 * in the normal (non-catalog) range, and we lose the ability to detect
4282 * hazardous access to that catalog. Calling RelidByRelfilenumber() would
4283 * close that gap, but RelidByRelfilenumber() might then deadlock with a
4284 * held lock.
4285 */
4286 relid = tag.relNumber;
4287
4288 if (IsCatalogTextUniqueIndexOid(relid)) /* see comments at the callee */
4289 return;
4290
4292}
4293#endif
4294
4295
4296/*
4297 * Helper routine to issue warnings when a buffer is unexpectedly pinned
4298 */
4299char *
4301{
 /*
 * Build a palloc'd human-readable description of a buffer and its
 * refcounts, for leak warnings. The lines fetching the descriptor and the
 * local refcount are elided in this extraction.
 */
4302 BufferDesc *buf;
4304 char *result;
4305 ProcNumber backend;
4307
4309 if (BufferIsLocal(buffer))
4310 {
4313 backend = MyProcNumber;
4314 }
4315 else
4316 {
4319 backend = INVALID_PROC_NUMBER;
4320 }
4321
4322 /* theoretically we should lock the bufHdr here */
4323 buf_state = pg_atomic_read_u64(&buf->state);
4324
 /* Output shape: "[id] (rel=..., blockNum=..., flags=0x..., refcount=N M)" */
4325 result = psprintf("[%03d] (rel=%s, blockNum=%u, flags=0x%" PRIx64 ", refcount=%u %d)",
4326 buffer,
4328 BufTagGetForkNum(&buf->tag)).str,
4329 buf->tag.blockNum, buf_state & BUF_FLAG_MASK,
4331 return result;
4332}
4333
4334/*
4335 * CheckPointBuffers
4336 *
4337 * Flush all dirty blocks in buffer pool to disk at checkpoint time.
4338 *
4339 * Note: temporary relations do not participate in checkpoints, so they don't
4340 * need to be flushed.
4341 */
4342void
4344{
 /* All the work happens in BufferSync(); temp relations need no flushing. */
4345 BufferSync(flags);
4346}
4347
4348/*
4349 * BufferGetBlockNumber
4350 * Returns the block number associated with a buffer.
4351 *
4352 * Note:
4353 * Assumes that the buffer is valid and pinned, else the
4354 * value may be obsolete immediately...
4355 */
4358{
 /*
 * NOTE(review): the declaration of bufHdr and the local/shared descriptor
 * lookups on the if/else arms are elided in this extraction.
 */
4360
4362
4363 if (BufferIsLocal(buffer))
4365 else
4367
4368 /* pinned, so OK to read tag without spinlock */
4369 return bufHdr->tag.blockNum;
4370}
4371
4372/*
4373 * BufferGetTag
4374 * Returns the relfilelocator, fork number and block number associated with
4375 * a buffer.
4376 */
4377void
4380{
 /*
 * Fill the three output parameters from the pinned buffer's tag. The
 * descriptor-lookup lines on the if/else arms are elided here.
 */
4382
4383 /* Do the same checks as BufferGetBlockNumber. */
4385
4386 if (BufferIsLocal(buffer))
4388 else
4390
4391 /* pinned, so OK to read tag without spinlock */
4392 *rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
4393 *forknum = BufTagGetForkNum(&bufHdr->tag);
4394 *blknum = bufHdr->tag.blockNum;
4395}
4396
4397/*
4398 * FlushBuffer
4399 * Physically write out a shared buffer.
4400 *
4401 * NOTE: this actually just passes the buffer contents to the kernel; the
4402 * real write to disk won't happen until the kernel feels like it. This
4403 * is okay from our point of view since we can redo the changes from WAL.
4404 * However, we will need to force the changes to disk via fsync before
4405 * we can checkpoint WAL.
4406 *
4407 * The caller must hold a pin on the buffer and have
4408 * (share-)exclusively-locked the buffer contents.
4409 *
4410 * If the caller has an smgr reference for the buffer's relation, pass it
4411 * as the second parameter. If not, pass NULL.
4412 */
4413static void
4416{
 /*
 * NOTE(review): the signature (buf, reln, io_object, io_context per the
 * header comment and calls below) and several statements — LSN read,
 * XLogFlush call, smgrwrite call head, pgstat accounting — are elided in
 * this extraction; consult upstream bufmgr.c for the full body.
 */
4418 ErrorContextCallback errcallback;
4421 char *bufToWrite;
4422
4425
4426 /*
4427 * Try to start an I/O operation. If StartBufferIO returns false, then
4428 * someone else flushed the buffer before we could, so we need not do
4429 * anything.
4430 */
4431 if (!StartBufferIO(buf, false, false))
4432 return;
4433
 /* Register buf so error reports can identify the page being written. */
4434 /* Setup error traceback support for ereport() */
4436 errcallback.arg = buf;
4437 errcallback.previous = error_context_stack;
4438 error_context_stack = &errcallback;
4439
4440 /* Find smgr relation for buffer */
4441 if (reln == NULL)
4443
4445 buf->tag.blockNum,
4446 reln->smgr_rlocator.locator.spcOid,
4447 reln->smgr_rlocator.locator.dbOid,
4448 reln->smgr_rlocator.locator.relNumber);
4449
4450 /*
4451 * As we hold at least a share-exclusive lock on the buffer, the LSN
4452 * cannot change during the flush (and thus can't be torn).
4453 */
4455
4456 /*
4457 * Force XLOG flush up to buffer's LSN. This implements the basic WAL
4458 * rule that log updates must hit disk before any of the data-file changes
4459 * they describe do.
4460 *
4461 * However, this rule does not apply to unlogged relations, which will be
4462 * lost after a crash anyway. Most unlogged relation pages do not bear
4463 * LSNs since we never emit WAL records for them, and therefore flushing
4464 * up through the buffer LSN would be useless, but harmless. However,
4465 * some index AMs use LSNs internally to detect concurrent page
4466 * modifications, and therefore unlogged index pages bear "fake" LSNs
4467 * generated by XLogGetFakeLSN. It is unlikely but possible that the fake
4468 * LSN counter could advance past the WAL insertion point; and if it did
4469 * happen, attempting to flush WAL through that location would fail, with
4470 * disastrous system-wide consequences. To make sure that can't happen,
4471 * skip the flush if the buffer isn't permanent.
4472 */
4473 if (pg_atomic_read_u64(&buf->state) & BM_PERMANENT)
4475
4476 /*
4477 * Now it's safe to write the buffer to disk. Note that no one else should
4478 * have been able to write it, while we were busy with log flushing,
4479 * because we got the exclusive right to perform I/O by setting the
4480 * BM_IO_IN_PROGRESS bit.
4481 */
4483
4484 /*
4485 * Update page checksum if desired. Since we have only shared lock on the
4486 * buffer, other processes might be updating hint bits in it, so we must
4487 * copy the page to private storage if we do checksumming.
4488 */
4489 bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
4490
4492
4493 /*
4494 * bufToWrite is either the shared buffer or a copy, as appropriate.
4495 */
4497 BufTagGetForkNum(&buf->tag),
4498 buf->tag.blockNum,
4499 bufToWrite,
4500 false);
4501
4502 /*
4503 * When a strategy is in use, only flushes of dirty buffers already in the
4504 * strategy ring are counted as strategy writes (IOCONTEXT
4505 * [BULKREAD|BULKWRITE|VACUUM] IOOP_WRITE) for the purpose of IO
4506 * statistics tracking.
4507 *
4508 * If a shared buffer initially added to the ring must be flushed before
4509 * being used, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE.
4510 *
4511 * If a shared buffer which was added to the ring later because the
4512 * current strategy buffer is pinned or in use or because all strategy
4513 * buffers were dirty and rejected (for BAS_BULKREAD operations only)
4514 * requires flushing, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE
4515 * (from_ring will be false).
4516 *
4517 * When a strategy is not in use, the write can only be a "regular" write
4518 * of a dirty shared buffer (IOCONTEXT_NORMAL IOOP_WRITE).
4519 */
4522
4524
4525 /*
4526 * Mark the buffer as clean and end the BM_IO_IN_PROGRESS state.
4527 */
4528 TerminateBufferIO(buf, true, 0, true, false);
4530
4531 buf->tag.blockNum,
4532 reln->smgr_rlocator.locator.spcOid,
4533 reln->smgr_rlocator.locator.dbOid,
4534 reln->smgr_rlocator.locator.relNumber);
4535
4536 /* Pop the error context stack */
4537 error_context_stack = errcallback.previous;
4538}
4539
4540/*
4541 * Convenience wrapper around FlushBuffer() that locks/unlocks the buffer
4542 * before/after calling FlushBuffer().
4543 */
4544static void
4554
4555/*
4556 * RelationGetNumberOfBlocksInFork
4557 * Determines the current number of pages in the specified relation fork.
4558 *
4559 * Note that the accuracy of the result will depend on the details of the
4560 * relation's storage. For builtin AMs it'll be accurate, but for external AMs
4561 * it might not be.
4562 */
4565{
 /* (Signature elided: takes a Relation and a ForkNumber per the calls below.) */
4566 if (RELKIND_HAS_TABLE_AM(relation->rd_rel->relkind))
4567 {
4568 /*
4569 * Not every table AM uses BLCKSZ wide fixed size blocks. Therefore
4570 * tableam returns the size in bytes - but for the purpose of this
4571 * routine, we want the number of blocks. Therefore divide, rounding
4572 * up.
4573 */
4575
4576 szbytes = table_relation_size(relation, forkNum);
4577
 /* Ceiling division: partial trailing block counts as a full block. */
4578 return (szbytes + (BLCKSZ - 1)) / BLCKSZ;
4579 }
4580 else if (RELKIND_HAS_STORAGE(relation->rd_rel->relkind))
4581 {
4582 return smgrnblocks(RelationGetSmgr(relation), forkNum);
4583 }
4584 else
 /* Relkinds without storage should never be passed in. */
4585 Assert(false);
4586
4587 return 0; /* keep compiler quiet */
4588}
4589
4590/*
4591 * BufferIsPermanent
4592 * Determines whether a buffer will potentially still be around after
4593 * a crash. Caller must hold a buffer pin.
4594 */
4595bool
4597{
 /* (bufHdr declaration and the validity/pin Asserts are elided here.) */
4599
4600 /* Local buffers are used only for temp relations. */
4601 if (BufferIsLocal(buffer))
4602 return false;
4603
4604 /* Make sure we've got a real buffer, and that we hold a pin on it. */
4607
4608 /*
4609 * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
4610 * need not bother with the buffer header spinlock. Even if someone else
4611 * changes the buffer header state while we're doing this, the state is
4612 * changed atomically, so we'll read the old value or the new value, but
4613 * not random garbage.
4614 */
4616 return (pg_atomic_read_u64(&bufHdr->state) & BM_PERMANENT) != 0;
4617}
4618
4619/*
4620 * BufferGetLSNAtomic
4621 * Retrieves the LSN of the buffer atomically.
4622 *
4623 * This is necessary for some callers who may only hold a share lock on
4624 * the buffer. A share lock allows a concurrent backend to set hint bits
4625 * on the page, which in turn may require a WAL record to be emitted.
4626 *
4627 * On platforms with 8 byte atomic reads/writes, we don't need to do any
4628 * additional locking. On platforms not supporting such 8 byte atomic
4629 * reads/writes, we need to actually take the header lock.
4630 */
4633{
 /* (Signature elided: returns XLogRecPtr, takes a Buffer — see header comment.) */
4634 /* Make sure we've got a real buffer, and that we hold a pin on it. */
4637
4638#ifdef PG_HAVE_8BYTE_SINGLE_COPY_ATOMICITY
 /* 8-byte reads are atomic here, so the LSN can be read directly (line elided). */
4640#else
4641 {
4642 char *page = BufferGetPage(buffer);
4644 XLogRecPtr lsn;
4645
4646 /*
4647 * If we don't need locking for correctness, fastpath out.
4648 */
4650 return PageGetLSN(page);
4651
 /*
 * Otherwise read the LSN under the buffer header lock — the
 * acquire/release calls around PageGetLSN are elided in this listing.
 */
4654 lsn = PageGetLSN(page);
4656
4657 return lsn;
4658 }
4659#endif
4660}
4661
4662/* ---------------------------------------------------------------------
4663 * DropRelationBuffers
4664 *
4665 * This function removes from the buffer pool all the pages of the
4666 * specified relation forks that have block numbers >= firstDelBlock.
4667 * (In particular, with firstDelBlock = 0, all pages are removed.)
4668 * Dirty pages are simply dropped, without bothering to write them
4669 * out first. Therefore, this is NOT rollback-able, and so should be
4670 * used only with extreme caution!
4671 *
4672 * Currently, this is called only from smgr.c when the underlying file
4673 * is about to be deleted or truncated (firstDelBlock is needed for
4674 * the truncation case). The data in the affected pages would therefore
4675 * be deleted momentarily anyway, and there is no point in writing it.
4676 * It is the responsibility of higher-level code to ensure that the
4677 * deletion or truncation does not lose any data that could be needed
4678 * later. It is also the responsibility of higher-level code to ensure
4679 * that no other process could be trying to load more pages of the
4680 * relation into buffers.
4681 * --------------------------------------------------------------------
4682 */
4683void
4686{
 /*
 * NOTE(review): the signature (smgr_reln, forkNum[], nforks,
 * firstDelBlock[] per the uses below) and several statements — the
 * nBlocksToInvalidate accumulator, smgrnblocks_cached calls, and the
 * threshold test — are elided in this extraction.
 */
4687 int i;
4688 int j;
4689 RelFileLocatorBackend rlocator;
4692
4693 rlocator = smgr_reln->smgr_rlocator;
4694
4695 /* If it's a local relation, it's localbuf.c's problem. */
4696 if (RelFileLocatorBackendIsTemp(rlocator))
4697 {
4698 if (rlocator.backend == MyProcNumber)
4699 DropRelationLocalBuffers(rlocator.locator, forkNum, nforks,
4701
4702 return;
4703 }
4704
4705 /*
4706 * To remove all the pages of the specified relation forks from the buffer
4707 * pool, we need to scan the entire buffer pool but we can optimize it by
4708 * finding the buffers from BufMapping table provided we know the exact
4709 * size of each fork of the relation. The exact size is required to ensure
4710 * that we don't leave any buffer for the relation being dropped as
4711 * otherwise the background writer or checkpointer can lead to a PANIC
4712 * error while flushing buffers corresponding to files that don't exist.
4713 *
4714 * To know the exact size, we rely on the size cached for each fork by us
4715 * during recovery which limits the optimization to recovery and on
4716 * standbys but we can easily extend it once we have shared cache for
4717 * relation size.
4718 *
4719 * In recovery, we cache the value returned by the first lseek(SEEK_END)
4720 * and the future writes keeps the cached value up-to-date. See
4721 * smgrextend. It is possible that the value of the first lseek is smaller
4722 * than the actual number of existing blocks in the file due to buggy
4723 * Linux kernels that might not have accounted for the recent write. But
4724 * that should be fine because there must not be any buffers after that
4725 * file size.
4726 */
4727 for (i = 0; i < nforks; i++)
4728 {
4729 /* Get the number of blocks for a relation's fork */
4731
 /* A fork with unknown (uncached) size forces the full-pool scan below. */
4733 {
4735 break;
4736 }
4737
4738 /* calculate the number of blocks to be invalidated */
4740 }
4741
4742 /*
4743 * We apply the optimization iff the total number of blocks to invalidate
4744 * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
4745 */
4748 {
4749 for (j = 0; j < nforks; j++)
4750 FindAndDropRelationBuffers(rlocator.locator, forkNum[j],
4752 return;
4753 }
4754
 /* Fallback: scan every shared buffer. */
4755 for (i = 0; i < NBuffers; i++)
4756 {
4758
4759 /*
4760 * We can make this a tad faster by prechecking the buffer tag before
4761 * we attempt to lock the buffer; this saves a lot of lock
4762 * acquisitions in typical cases. It should be safe because the
4763 * caller must have AccessExclusiveLock on the relation, or some other
4764 * reason to be certain that no one is loading new pages of the rel
4765 * into the buffer pool. (Otherwise we might well miss such pages
4766 * entirely.) Therefore, while the tag might be changing while we
4767 * look at it, it can't be changing *to* a value we care about, only
4768 * *away* from such a value. So false negatives are impossible, and
4769 * false positives are safe because we'll recheck after getting the
4770 * buffer lock.
4771 *
4772 * We could check forkNum and blockNum as well as the rlocator, but
4773 * the incremental win from doing so seems small.
4774 */
4775 if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator))
4776 continue;
4777
4779
4780 for (j = 0; j < nforks; j++)
4781 {
4782 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator) &&
4783 BufTagGetForkNum(&bufHdr->tag) == forkNum[j] &&
4784 bufHdr->tag.blockNum >= firstDelBlock[j])
4785 {
4786 InvalidateBuffer(bufHdr); /* releases spinlock */
4787 break;
4788 }
4789 }
 /* No fork matched: the header lock is still held (unlock line elided). */
4790 if (j >= nforks)
4792 }
4793}
4794
4795/* ---------------------------------------------------------------------
4796 * DropRelationsAllBuffers
4797 *
4798 * This function removes from the buffer pool all the pages of all
4799 * forks of the specified relations. It's equivalent to calling
4800 * DropRelationBuffers once per fork per relation with firstDelBlock = 0.
4801 * --------------------------------------------------------------------
4802 */
4803void
4805{
 /*
 * NOTE(review): the signature (smgr_reln array plus nlocators, per the
 * loop below), the nBlocksToInvalidate/locators declarations, and a few
 * statements (threshold test, sort call, header lock/unlock) are elided
 * in this extraction.
 */
4806 int i;
4807 int n = 0;
4808 SMgrRelation *rels;
4809 BlockNumber (*block)[MAX_FORKNUM + 1];
4812 bool cached = true;
4813 bool use_bsearch;
4814
4815 if (nlocators == 0)
4816 return;
4817
4818 rels = palloc_array(SMgrRelation, nlocators); /* non-local relations */
4819
 /* Handle temp relations locally; collect shared relations into rels[]. */
4820 /* If it's a local relation, it's localbuf.c's problem. */
4821 for (i = 0; i < nlocators; i++)
4822 {
4823 if (RelFileLocatorBackendIsTemp(smgr_reln[i]->smgr_rlocator))
4824 {
4825 if (smgr_reln[i]->smgr_rlocator.backend == MyProcNumber)
4826 DropRelationAllLocalBuffers(smgr_reln[i]->smgr_rlocator.locator);
4827 }
4828 else
4829 rels[n++] = smgr_reln[i];
4830 }
4831
4832 /*
4833 * If there are no non-local relations, then we're done. Release the
4834 * memory and return.
4835 */
4836 if (n == 0)
4837 {
4838 pfree(rels);
4839 return;
4840 }
4841
4842 /*
4843 * This is used to remember the number of blocks for all the relations
4844 * forks.
4845 */
4846 block = (BlockNumber (*)[MAX_FORKNUM + 1])
4847 palloc(sizeof(BlockNumber) * n * (MAX_FORKNUM + 1));
4848
4849 /*
4850 * We can avoid scanning the entire buffer pool if we know the exact size
4851 * of each of the given relation forks. See DropRelationBuffers.
4852 */
4853 for (i = 0; i < n && cached; i++)
4854 {
4855 for (int j = 0; j <= MAX_FORKNUM; j++)
4856 {
4857 /* Get the number of blocks for a relation's fork. */
4858 block[i][j] = smgrnblocks_cached(rels[i], j);
4859
4860 /* We need to only consider the relation forks that exists. */
4861 if (block[i][j] == InvalidBlockNumber)
4862 {
4863 if (!smgrexists(rels[i], j))
4864 continue;
4865 cached = false;
4866 break;
4867 }
4868
4869 /* calculate the total number of blocks to be invalidated */
4870 nBlocksToInvalidate += block[i][j];
4871 }
4872 }
4873
4874 /*
4875 * We apply the optimization iff the total number of blocks to invalidate
4876 * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
4877 */
4879 {
4880 for (i = 0; i < n; i++)
4881 {
4882 for (int j = 0; j <= MAX_FORKNUM; j++)
4883 {
4884 /* ignore relation forks that doesn't exist */
4885 if (!BlockNumberIsValid(block[i][j]))
4886 continue;
4887
4888 /* drop all the buffers for a particular relation fork */
4889 FindAndDropRelationBuffers(rels[i]->smgr_rlocator.locator,
4890 j, block[i][j], 0);
4891 }
4892 }
4893
4894 pfree(block);
4895 pfree(rels);
4896 return;
4897 }
4898
 /* Size cache unusable: fall through to a full buffer-pool scan. */
4899 pfree(block);
4900 locators = palloc_array(RelFileLocator, n); /* non-local relations */
4901 for (i = 0; i < n; i++)
4902 locators[i] = rels[i]->smgr_rlocator.locator;
4903
4904 /*
4905 * For low number of relations to drop just use a simple walk through, to
4906 * save the bsearch overhead. The threshold to use is rather a guess than
4907 * an exactly determined value, as it depends on many factors (CPU and RAM
4908 * speeds, amount of shared buffers etc.).
4909 */
4911
4912 /* sort the list of rlocators if necessary */
4913 if (use_bsearch)
4915
4916 for (i = 0; i < NBuffers; i++)
4917 {
4918 RelFileLocator *rlocator = NULL;
4920
4921 /*
4922 * As in DropRelationBuffers, an unlocked precheck should be safe and
4923 * saves some cycles.
4924 */
4925
4926 if (!use_bsearch)
4927 {
4928 int j;
4929
4930 for (j = 0; j < n; j++)
4931 {
4933 {
4934 rlocator = &locators[j];
4935 break;
4936 }
4937 }
4938 }
4939 else
4940 {
4941 RelFileLocator locator;
4942
4943 locator = BufTagGetRelFileLocator(&bufHdr->tag);
4944 rlocator = bsearch(&locator,
4945 locators, n, sizeof(RelFileLocator),
4947 }
4948
4949 /* buffer doesn't belong to any of the given relfilelocators; skip it */
4950 if (rlocator == NULL)
4951 continue;
4952
 /* Recheck under the buffer header lock (acquire line elided). */
4954 if (BufTagMatchesRelFileLocator(&bufHdr->tag, rlocator))
4955 InvalidateBuffer(bufHdr); /* releases spinlock */
4956 else
4958 }
4959
4960 pfree(locators);
4961 pfree(rels);
4962}
4963
4964/* ---------------------------------------------------------------------
4965 * FindAndDropRelationBuffers
4966 *
4967 * This function performs look up in BufMapping table and removes from the
4968 * buffer pool all the pages of the specified relation fork that has block
4969 * number >= firstDelBlock. (In particular, with firstDelBlock = 0, all
4970 * pages are removed.)
4971 * --------------------------------------------------------------------
4972 */
4973static void
/*
 * NOTE(review): the parameter lines are missing from this listing; from the
 * body they presumably are (RelFileLocator rlocator, ForkNumber forkNum,
 * BlockNumber nForkBlock, BlockNumber firstDelBlock) -- confirm.
 */
4977{
4978 BlockNumber curBlock;
4979
4980 for (curBlock = firstDelBlock; curBlock < nForkBlock; curBlock++)
4981 {
4982 uint32 bufHash; /* hash value for tag */
4983 BufferTag bufTag; /* identity of requested block */
4984 LWLock *bufPartitionLock; /* buffer partition lock for it */
4985 int buf_id;
4987
4988 /* create a tag so we can lookup the buffer */
4989 InitBufferTag(&bufTag, &rlocator, forkNum, curBlock);
4990
4991 /* determine its hash code and partition lock ID */
/* NOTE(review): BufTableHashCode/BufMappingPartitionLock lines are missing */
4994
4995 /* Check that it is in the buffer pool. If not, do nothing. */
/* Lookup is done while holding the mapping partition lock (acquire/release
 * lines appear stripped from this listing). */
4997 buf_id = BufTableLookup(&bufTag, bufHash);
4999
5000 if (buf_id < 0)
5001 continue;
5002
5003 bufHdr = GetBufferDescriptor(buf_id);
5004
5005 /*
5006 * We need to lock the buffer header and recheck if the buffer is
5007 * still associated with the same block because the buffer could be
5008 * evicted by some other backend loading blocks for a different
5009 * relation after we release lock on the BufMapping table.
5010 */
5012
5013 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator) &&
5014 BufTagGetForkNum(&bufHdr->tag) == forkNum &&
5015 bufHdr->tag.blockNum >= firstDelBlock)
5016 InvalidateBuffer(bufHdr); /* releases spinlock */
5017 else
/* NOTE(review): the UnlockBufHdr() call line is missing from this listing */
5019 }
5020}
5021
5022/* ---------------------------------------------------------------------
5023 * DropDatabaseBuffers
5024 *
5025 * This function removes all the buffers in the buffer cache for a
5026 * particular database. Dirty pages are simply dropped, without
5027 * bothering to write them out first. This is used when we destroy a
5028 * database, to avoid trying to flush data to disk when the directory
5029 * tree no longer exists. Implementation is pretty similar to
5030 * DropRelationBuffers() which is for destroying just one relation.
5031 * --------------------------------------------------------------------
5032 */
5033void
/* NOTE(review): signature line missing; presumably DropDatabaseBuffers(Oid dbid) */
5035{
5036 int i;
5037
5038 /*
5039 * We needn't consider local buffers, since by assumption the target
5040 * database isn't our own.
5041 */
5042
5043 for (i = 0; i < NBuffers; i++)
5044 {
/* NOTE(review): the GetBufferDescriptor(i) line is missing from this listing */
5046
5047 /*
5048 * As in DropRelationBuffers, an unlocked precheck should be safe and
5049 * saves some cycles.
5050 */
5051 if (bufHdr->tag.dbOid != dbid)
5052 continue;
5053
/* Recheck the dbOid under the buffer header lock before invalidating */
5055 if (bufHdr->tag.dbOid == dbid)
5056 InvalidateBuffer(bufHdr); /* releases spinlock */
5057 else
/* NOTE(review): the UnlockBufHdr() call line is missing from this listing */
5059 }
5060}
5061
5062/* ---------------------------------------------------------------------
5063 * FlushRelationBuffers
5064 *
5065 * This function writes all dirty pages of a relation out to disk
5066 * (or more accurately, out to kernel disk buffers), ensuring that the
5067 * kernel has an up-to-date view of the relation.
5068 *
5069 * Generally, the caller should be holding AccessExclusiveLock on the
5070 * target relation to ensure that no other backend is busy dirtying
5071 * more blocks of the relation; the effects can't be expected to last
5072 * after the lock is released.
5073 *
5074 * XXX currently it sequentially searches the buffer pool, should be
5075 * changed to more clever ways of searching. This routine is not
5076 * used in any performance-critical code paths, so it's not worth
5077 * adding additional overhead to normal paths to make it go faster.
5078 * --------------------------------------------------------------------
5079 */
5080void
/* NOTE(review): signature line missing; presumably FlushRelationBuffers(Relation rel) */
5082{
5083 int i;
5085 SMgrRelation srel = RelationGetSmgr(rel);
5086
5087 if (RelationUsesLocalBuffers(rel))
5088 {
5089 for (i = 0; i < NLocBuffer; i++)
5090 {
/* NOTE(review): the GetLocalBufferDescriptor(i) line is missing here */
5092
/* Flush only local buffers that are both valid and dirty */
5094 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
5095 ((buf_state = pg_atomic_read_u64(&bufHdr->state)) &
5096 (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
5097 {
5098 ErrorContextCallback errcallback;
5099
5100 /* Setup error traceback support for ereport() */
5102 errcallback.arg = bufHdr;
5103 errcallback.previous = error_context_stack;
5104 error_context_stack = &errcallback;
5105
5106 /* Make sure we can handle the pin */
/* NOTE(review): ReservePrivateRefCountEntry/ResourceOwner lines missing */
5109
5110 /*
5111 * Pin/unpin mostly to make valgrind work, but it also seems
5112 * like the right thing to do.
5113 */
5114 PinLocalBuffer(bufHdr, false);
5116
5117 FlushLocalBuffer(bufHdr, srel);
5118
/* NOTE(review): the UnpinLocalBuffer() call line is missing here */
5120
5121 /* Pop the error context stack */
5122 error_context_stack = errcallback.previous;
5123 }
5124 }
5125
5126 return;
5127 }
5128
5129 for (i = 0; i < NBuffers; i++)
5130 {
/* NOTE(review): descriptor fetch and interrupt-check lines are missing */
5132
5134
5135 /*
5136 * As in DropRelationBuffers, an unlocked precheck should be safe and
5137 * saves some cycles.
5138 */
/* NOTE(review): the unlocked tag-match precheck condition line is missing */
5140 continue;
5141
5142 /* Make sure we can handle the pin */
5145
/* Lock the buffer header and flush if it still matches and is dirty */
5147 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
5149 {
/* NOTE(review): pin/flush/unpin call lines are missing from this listing */
5153 }
5154 else
/* NOTE(review): the UnlockBufHdr() call line is missing from this listing */
5156 }
5157}
5158
5159/* ---------------------------------------------------------------------
5160 * FlushRelationsAllBuffers
5161 *
5162 * This function flushes out of the buffer pool all the pages of all
5163 * forks of the specified smgr relations. It's equivalent to calling
5164 * FlushRelationBuffers once per relation. The relations are assumed not
5165 * to use local buffers.
5166 * --------------------------------------------------------------------
5167 */
5168void
/* NOTE(review): signature line missing; presumably
 * FlushRelationsAllBuffers(SMgrRelation *smgrs, int nrels) */
5170{
5171 int i;
5173 bool use_bsearch;
5174
5175 if (nrels == 0)
5176 return;
5177
5178 /* fill-in array for qsort */
/* NOTE(review): the srels = palloc_array(...) line is missing here */
5180
5181 for (i = 0; i < nrels; i++)
5182 {
5183 Assert(!RelFileLocatorBackendIsTemp(smgrs[i]->smgr_rlocator));
5184
5185 srels[i].rlocator = smgrs[i]->smgr_rlocator.locator;
5186 srels[i].srel = smgrs[i];
5187 }
5188
5189 /*
5190 * Save the bsearch overhead for low number of relations to sync. See
5191 * DropRelationsAllBuffers for details.
5192 */
/* NOTE(review): the use_bsearch assignment line is missing from this listing */
5194
5195 /* sort the list of SMgrRelations if necessary */
5196 if (use_bsearch)
5197 qsort(srels, nrels, sizeof(SMgrSortArray), rlocator_comparator);
5198
5199 for (i = 0; i < NBuffers; i++)
5200 {
/* NOTE(review): srelent/bufHdr declaration lines are missing here */
5204
5205 /*
5206 * As in DropRelationBuffers, an unlocked precheck should be safe and
5207 * saves some cycles.
5208 */
5209
5210 if (!use_bsearch)
5211 {
5212 int j;
5213
5214 for (j = 0; j < nrels; j++)
5215 {
5216 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srels[j].rlocator))
5217 {
5218 srelent = &srels[j];
5219 break;
5220 }
5221 }
5222 }
5223 else
5224 {
5225 RelFileLocator rlocator;
5226
5227 rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
5228 srelent = bsearch(&rlocator,
5229 srels, nrels, sizeof(SMgrSortArray),
/* NOTE(review): the comparator argument line is missing from this listing */
5231 }
5232
5233 /* buffer doesn't belong to any of the given relfilelocators; skip it */
5234 if (srelent == NULL)
5235 continue;
5236
5237 /* Make sure we can handle the pin */
5240
/* Lock header, recheck the tag, and flush if still dirty */
5242 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srelent->rlocator) &&
5244 {
/* NOTE(review): pin/flush/unpin call lines are missing from this listing */
5248 }
5249 else
/* NOTE(review): the UnlockBufHdr() call line is missing from this listing */
5251 }
5252
5253 pfree(srels);
5254}
5255
5256/* ---------------------------------------------------------------------
5257 * RelationCopyStorageUsingBuffer
5258 *
5259 * Copy fork's data using bufmgr. Same as RelationCopyStorage but instead
5260 * of using smgrread and smgrextend this will copy using bufmgr APIs.
5261 *
5262 * Refer comments atop CreateAndCopyRelationData() for details about
5263 * 'permanent' parameter.
5264 * --------------------------------------------------------------------
5265 */
5266static void
/* NOTE(review): parameter lines partially missing; presumably
 * (RelFileLocator srclocator, RelFileLocator dstlocator, ForkNumber forkNum,
 * bool permanent) -- confirm against original source. */
5269 ForkNumber forkNum, bool permanent)
5270{
5271 Buffer srcBuf;
5272 Buffer dstBuf;
5273 Page srcPage;
5274 Page dstPage;
5275 bool use_wal;
5276 BlockNumber nblocks;
5277 BlockNumber blkno;
/* NOTE(review): additional declaration lines (buf, strategies, stream state)
 * are missing from this listing */
5284
5285 /*
5286 * In general, we want to write WAL whenever wal_level > 'minimal', but we
5287 * can skip it when copying any fork of an unlogged relation other than
5288 * the init fork.
5289 */
5290 use_wal = XLogIsNeeded() && (permanent || forkNum == INIT_FORKNUM);
5291
5292 /* Get number of blocks in the source relation. */
/* NOTE(review): the smgrnblocks(...) call line is missing here */
5294 forkNum);
5295
5296 /* Nothing to copy; just return. */
5297 if (nblocks == 0)
5298 return;
5299
5300 /*
5301 * Bulk extend the destination relation of the same size as the source
5302 * relation before starting to copy block by block.
5303 */
5304 memset(buf.data, 0, BLCKSZ);
5305 smgrextend(smgropen(dstlocator, INVALID_PROC_NUMBER), forkNum, nblocks - 1,
5306 buf.data, true);
5307
5308 /* This is a bulk operation, so use buffer access strategies. */
/* NOTE(review): the GetAccessStrategy(BAS_BULKREAD/BAS_BULKWRITE) lines are
 * missing from this listing */
5311
5312 /* Initialize streaming read */
5313 p.current_blocknum = 0;
5314 p.last_exclusive = nblocks;
5316
5317 /*
5318 * It is safe to use batchmode as block_range_read_stream_cb takes no
5319 * locks.
5320 */
/* NOTE(review): the read_stream_begin_smgr_relation(...) call is partially
 * missing; only some argument lines are visible below */
5324 src_smgr,
5326 forkNum,
5328 &p,
5329 0);
5330
5331 /* Iterate over each block of the source relation file. */
5332 for (blkno = 0; blkno < nblocks; blkno++)
5333 {
5335
5336 /* Read block from source relation. */
5340
5344 permanent);
5346
5348
5349 /* Copy page data from the source to the destination. */
5352
5353 /* WAL-log the copied page. */
5354 if (use_wal)
5356
5358
5361 }
/* NOTE(review): stream end / strategy free lines are missing here */
5364
5367}
5368
5369/* ---------------------------------------------------------------------
5370 * CreateAndCopyRelationData
5371 *
5372 * Create destination relation storage and copy all forks from the
5373 * source relation to the destination.
5374 *
5375 * Pass permanent as true for permanent relations and false for
5376 * unlogged relations. Currently this API is not supported for
5377 * temporary relations.
5378 * --------------------------------------------------------------------
5379 */
5380void
/* NOTE(review): first parameter line missing; presumably
 * CreateAndCopyRelationData(RelFileLocator src_rlocator, ...) */
5382 RelFileLocator dst_rlocator, bool permanent)
5383{
5384 char relpersistence;
/* NOTE(review): src_rel/dst_rel declaration lines are missing here */
5387
5388 /* Set the relpersistence. */
5389 relpersistence = permanent ?
/* NOTE(review): the RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED
 * alternatives line is missing from this listing */
5391
/* NOTE(review): the smgropen() calls for src/dst are missing here */
5394
5395 /*
5396 * Create and copy all forks of the relation. During create database we
5397 * have a separate cleanup mechanism which deletes complete database
5398 * directory. Therefore, each individual relation doesn't need to be
5399 * registered for cleanup.
5400 */
5401 RelationCreateStorage(dst_rlocator, relpersistence, false);
5402
5403 /* copy main fork. */
/* NOTE(review): the RelationCopyStorageUsingBuffer(...) call line is missing */
5405 permanent);
5406
5407 /* copy those extra forks that exist */
5408 for (ForkNumber forkNum = MAIN_FORKNUM + 1;
5409 forkNum <= MAX_FORKNUM; forkNum++)
5410 {
5411 if (smgrexists(src_rel, forkNum))
5412 {
5413 smgrcreate(dst_rel, forkNum, false);
5414
5415 /*
5416 * WAL log creation if the relation is persistent, or this is the
5417 * init fork of an unlogged relation.
5418 */
5419 if (permanent || forkNum == INIT_FORKNUM)
5420 log_smgrcreate(&dst_rlocator, forkNum);
5421
5422 /* Copy a fork's data, block by block. */
/* NOTE(review): the RelationCopyStorageUsingBuffer(...) call line is missing */
5424 permanent);
5425 }
5426 }
5427}
5428
5429/* ---------------------------------------------------------------------
5430 * FlushDatabaseBuffers
5431 *
5432 * This function writes all dirty pages of a database out to disk
5433 * (or more accurately, out to kernel disk buffers), ensuring that the
5434 * kernel has an up-to-date view of the database.
5435 *
5436 * Generally, the caller should be holding an appropriate lock to ensure
5437 * no other backend is active in the target database; otherwise more
5438 * pages could get dirtied.
5439 *
5440 * Note we don't worry about flushing any pages of temporary relations.
5441 * It's assumed these wouldn't be interesting.
5442 * --------------------------------------------------------------------
5443 */
5444void
/* NOTE(review): signature line missing; presumably FlushDatabaseBuffers(Oid dbid) */
5446{
5447 int i;
/* NOTE(review): the BufferDesc *bufHdr declaration line is missing here */
5449
5450 for (i = 0; i < NBuffers; i++)
5451 {
5453
5455
5456 /*
5457 * As in DropRelationBuffers, an unlocked precheck should be safe and
5458 * saves some cycles.
5459 */
5460 if (bufHdr->tag.dbOid != dbid)
5461 continue;
5462
5463 /* Make sure we can handle the pin */
5466
/* Lock header, recheck dbOid, and flush if still dirty */
5468 if (bufHdr->tag.dbOid == dbid &&
5470 {
/* NOTE(review): pin/flush/unpin call lines are missing from this listing */
5474 }
5475 else
/* NOTE(review): the UnlockBufHdr() call line is missing from this listing */
5477 }
5478}
5479
5480/*
5481 * Flush a previously, share-exclusively or exclusively, locked and pinned
5482 * buffer to the OS.
5483 */
5484void
/* NOTE(review): the signature and nearly the whole body of this function were
 * stripped by the extraction; per the preceding comment it flushes a locked,
 * pinned buffer to the OS -- consult the original source. */
5486{
5488
5489 /* currently not needed, but no fundamental reason not to support */
5491
5493
5495
5497
5499}
5500
5501/*
5502 * ReleaseBuffer -- release the pin on a buffer
5503 */
5504void
/* NOTE(review): signature line missing; presumably ReleaseBuffer(Buffer buffer) */
5506{
5507 if (!BufferIsValid(buffer))
5508 elog(ERROR, "bad buffer ID: %d", buffer);
5509
/* Dispatch to the local- or shared-buffer unpin path (the call lines were
 * stripped from this listing). */
5510 if (BufferIsLocal(buffer))
5512 else
5514}
5515
5516/*
5517 * UnlockReleaseBuffer -- release the content lock and pin on a buffer
5518 *
5519 * This is just a shorthand for a common combination.
5520 */
5521void
/* NOTE(review): the entire body of UnlockReleaseBuffer was stripped by the
 * extraction; per the preceding comment it releases the content lock and the
 * pin -- consult the original source. */
5527
5528/*
5529 * IncrBufferRefCount
5530 * Increment the pin count on a buffer that we have *already* pinned
5531 * at least once.
5532 *
5533 * This function cannot be used on a buffer we do not have pinned,
5534 * because it doesn't change the shared buffer state.
5535 */
5536void
/* NOTE(review): the entire body of IncrBufferRefCount was stripped by the
 * extraction; per the preceding comment it increments the local pin count on
 * an already-pinned buffer -- consult the original source. */
5553
5554/*
5555 * Shared-buffer only helper for MarkBufferDirtyHint() and
5556 * BufferSetHintBits16().
5557 *
5558 * This is separated out because it turns out that the repeated checks for
5559 * local buffers, repeated GetBufferDescriptor() and repeated reading of the
5560 * buffer's state sufficiently hurts the performance of BufferSetHintBits16().
5561 */
5562static inline void
/* NOTE(review): the function name and first parameter lines are missing; from
 * the preceding comment this is the shared-buffer helper for
 * MarkBufferDirtyHint()/BufferSetHintBits16() -- confirm exact signature. */
5564 bool buffer_std)
5565{
5566 Page page = BufferGetPage(buffer);
5567
5569
5570 /* here, either share-exclusive or exclusive lock is OK */
5573
5574 /*
5575 * This routine might get called many times on the same page, if we are
5576 * making the first scan after commit of an xact that added/deleted many
5577 * tuples. So, be as quick as we can if the buffer is already dirty.
5578 *
5579 * As we are holding (at least) a share-exclusive lock, nobody could have
5580 * cleaned or dirtied the page concurrently, so we can just rely on the
5581 * previously fetched value here without any danger of races.
5582 */
5583 if (unlikely(!(lockstate & BM_DIRTY)))
5584 {
5586 bool wal_log = false;
5588
5589 /*
5590 * If we need to protect hint bit updates from torn writes, WAL-log a
5591 * full page image of the page. This full page image is only necessary
5592 * if the hint bit update is the first change to the page since the
5593 * last checkpoint.
5594 *
5595 * We don't check full_page_writes here because that logic is included
5596 * when we call XLogInsert() since the value changes dynamically.
5597 */
/* NOTE(review): the condition line (presumably a checksums/wal_log_hints
 * test) is missing from this listing */
5599 {
5600 /*
5601 * If we must not write WAL, due to a relfilelocator-specific
5602 * condition or being in recovery, don't dirty the page. We can
5603 * set the hint, just not dirty the page as a result so the hint
5604 * is lost when we evict the page or shutdown.
5605 *
5606 * See src/backend/storage/page/README for longer discussion.
5607 */
5608 if (RecoveryInProgress() ||
5610 return;
5611
5612 wal_log = true;
5613 }
5614
5615 /*
5616 * We must mark the page dirty before we emit the WAL record, as per
5617 * the usual rules, to ensure that BufferSync()/SyncOneBuffer() try to
5618 * flush the buffer, even if we haven't inserted the WAL record yet.
5619 * As we hold at least a share-exclusive lock, checkpoints will wait
5620 * for this backend to be done with the buffer before continuing. If
5621 * we did it the other way round, a checkpoint could start between
5622 * writing the WAL record and marking the buffer dirty.
5623 */
5625
5626 /*
5627 * It should not be possible for the buffer to already be dirty, see
5628 * comment above.
5629 */
/* NOTE(review): the atomic state-modify call setting BM_DIRTY is partially
 * missing; only trailing argument lines are visible below */
5633 BM_DIRTY,
5634 0, 0);
5635
5636 /*
5637 * If the block is already dirty because we either made a change or
5638 * set a hint already, then we don't need to write a full page image.
5639 * Note that aggressive cleaning of blocks dirtied by hint bit setting
5640 * would increase the call rate. Bulk setting of hint bits would
5641 * reduce the call rate...
5642 */
5643 if (wal_log)
/* NOTE(review): the XLogSaveBufferForHint(...) call line is missing here */
5645
5646 if (XLogRecPtrIsValid(lsn))
5647 {
5648 /*
5649 * Set the page LSN if we wrote a backup block. To allow backends
5650 * that only hold a share lock on the buffer to read the LSN in a
5651 * tear-free manner, we set the page LSN while holding the buffer
5652 * header lock. This allows any reader of an LSN who holds only a
5653 * share lock to also obtain a buffer header lock before using
5654 * PageGetLSN() to read the LSN in a tear free way. This is done
5655 * in BufferGetLSNAtomic().
5656 *
5657 * If checksums are enabled, you might think we should reset the
5658 * checksum here. That will happen when the page is written
5659 * sometime later in this checkpoint cycle.
5660 */
5662 PageSetLSN(page, lsn);
5664 }
5665
/* Account the hint-bit dirtying against vacuum cost, if active */
5667 if (VacuumCostActive)
5669 }
5670}
5671
5672/*
5673 * MarkBufferDirtyHint
5674 *
5675 * Mark a buffer dirty for non-critical changes.
5676 *
5677 * This is essentially the same as MarkBufferDirty, except:
5678 *
5679 * 1. The caller does not write WAL; so if checksums are enabled, we may need
5680 * to write an XLOG_FPI_FOR_HINT WAL record to protect against torn pages.
5681 * 2. The caller might have only a share-exclusive-lock instead of an
5682 * exclusive-lock on the buffer's content lock.
5683 * 3. This function does not guarantee that the buffer is always marked dirty
5684 * (it e.g. can't always on a hot standby), so it cannot be used for
5685 * important changes.
5686 */
5687inline void
/* NOTE(review): signature line missing; presumably
 * MarkBufferDirtyHint(Buffer buffer, bool buffer_std) */
5689{
5691
5693
5694 if (!BufferIsValid(buffer))
5695 elog(ERROR, "bad buffer ID: %d", buffer);
5696
/* Local buffers have their own dirty-hint path (call line stripped) */
5697 if (BufferIsLocal(buffer))
5698 {
5700 return;
5701 }
5702
/* Shared buffers delegate to the inline helper, passing the current state */
5704 pg_atomic_read_u64(&bufHdr->state),
5705 buffer_std);
5706}
5707
5708/*
5709 * Release buffer content locks for shared buffers.
5710 *
5711 * Used to clean up after errors.
5712 *
5713 * Currently, we can expect that resource owner cleanup, via
5714 * ResOwnerReleaseBufferPin(), took care of releasing buffer content locks per
5715 * se; the only thing we need to deal with here is clearing any PIN_COUNT
5716 * request that was in progress.
5717 */
5718void
/* NOTE(review): signature line missing; presumably UnlockBuffers(void) */
5720{
/* NOTE(review): the PinCountWaitBuf fetch line is missing here */
5722
5723 if (buf)
5724 {
5726 uint64 unset_bits = 0;
5727
/* NOTE(review): the buffer-header lock / state read line is missing here */
5729
5730 /*
5731 * Don't complain if flag bit not set; it could have been reset but we
5732 * got a cancel/die interrupt before getting the signal.
5733 */
5734 if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
5735 buf->wait_backend_pgprocno == MyProcNumber)
/* NOTE(review): the line setting unset_bits to BM_PIN_COUNT_WAITER is
 * missing from this listing */
5737
/* Clear the collected bits and release the header lock atomically */
5739 0, unset_bits,
5740 0);
5741
5743 }
5744}
5745
5746/*
5747 * Acquire the buffer content lock in the specified mode
5748 *
5749 * If the lock is not available, sleep until it is.
5750 *
5751 * Side effect: cancel/die interrupts are held off until lock release.
5752 *
5753 * This uses almost the same locking approach as lwlock.c's
5754 * LWLockAcquire(). See documentation at the top of lwlock.c for a more
5755 * detailed discussion.
5756 *
5757 * The reason that this, and most of the other BufferLock* functions, get both
5758 * the Buffer and BufferDesc* as parameters, is that looking up one from the
5759 * other repeatedly shows up noticeably in profiles.
5760 *
5761 * Callers should provide a constant for mode, for more efficient code
5762 * generation.
5763 */
5764static inline void
/* NOTE(review): signature line missing; from the preceding comment this
 * acquires the buffer content lock in a given mode, taking both Buffer and
 * BufferDesc* plus the mode -- confirm exact parameters. */
5766{
5767 PrivateRefCountEntry *entry;
5768 int extraWaits = 0;
5769
5770 /*
5771 * Get reference to the refcount entry before we hold the lock, it seems
5772 * better to do before holding the lock.
5773 */
5774 entry = GetPrivateRefCountEntry(buffer, true);
5775
5776 /*
5777 * We better not already hold a lock on the buffer.
5778 */
/* NOTE(review): the Assert(entry->data.lockmode == BUFFER_LOCK_UNLOCK)-style
 * line is missing from this listing */
5780
5781 /*
5782 * Lock out cancel/die interrupts until we exit the code section protected
5783 * by the content lock. This ensures that interrupts will not interfere
5784 * with manipulations of data structures in shared memory.
5785 */
5787
5788 for (;;)
5789 {
5790 uint32 wait_event = 0; /* initialized to avoid compiler warning */
5791 bool mustwait;
5792
5793 /*
5794 * Try to grab the lock the first time, we're not in the waitqueue
5795 * yet/anymore.
5796 */
/* NOTE(review): the BufferLockAttempt(...) call line is missing here */
5798
5799 if (likely(!mustwait))
5800 {
5801 break;
5802 }
5803
5804 /*
5805 * Ok, at this point we couldn't grab the lock on the first try. We
5806 * cannot simply queue ourselves to the end of the list and wait to be
5807 * woken up because by now the lock could long have been released.
5808 * Instead add us to the queue and try to grab the lock again. If we
5809 * succeed we need to revert the queuing and be happy, otherwise we
5810 * recheck the lock. If we still couldn't grab it, we know that the
5811 * other locker will see our queue entries when releasing since they
5812 * existed before we checked for the lock.
5813 */
5814
5815 /* add to the queue */
5817
5818 /* we're now guaranteed to be woken up if necessary */
5820
5821 /* ok, grabbed the lock the second time round, need to undo queueing */
5822 if (!mustwait)
5823 {
/* NOTE(review): the BufferLockDequeueSelf(...) call line is missing here */
5825 break;
5826 }
5827
/* Map the requested lock mode to its wait event for pg_stat_activity */
5828 switch (mode)
5829 {
5832 break;
5835 break;
5836 case BUFFER_LOCK_SHARE:
5838 break;
5839 case BUFFER_LOCK_UNLOCK:
5841
5842 }
5844
5845 /*
5846 * Wait until awakened.
5847 *
5848 * It is possible that we get awakened for a reason other than being
5849 * signaled by BufferLockWakeup(). If so, loop back and wait again.
5850 * Once we've gotten the lock, re-increment the sema by the number of
5851 * additional signals received.
5852 */
5853 for (;;)
5854 {
/* NOTE(review): the PGSemaphoreLock/lwWaiting check lines are missing here */
5857 break;
5858 extraWaits++;
5859 }
5860
5862
5863 /* Retrying, allow BufferLockRelease to release waiters again. */
5865 }
5866
5867 /* Remember that we now hold this lock */
5868 entry->data.lockmode = mode;
5869
5870 /*
5871 * Fix the process wait semaphore's count for any absorbed wakeups.
5872 */
5873 while (unlikely(extraWaits-- > 0))
5875}
5876
5877/*
5878 * Release a previously acquired buffer content lock.
5879 */
5880static void
/* NOTE(review): signature and several body lines missing; from the preceding
 * comment this releases a previously acquired buffer content lock. */
5882{
5885 uint64 sub;
5886
5888
5889 /*
5890 * Release my hold on lock, after that it can immediately be acquired by
5891 * others, even if we still have to wakeup other waiters.
5892 */
/* NOTE(review): the atomic fetch-sub and post-release wakeup call lines are
 * missing from this listing */
5894
5896
5898
5899 /*
5900 * Now okay to allow cancel/die interrupts.
5901 */
5903}
5904
5905
5906/*
5907 * Acquire the content lock for the buffer, but only if we don't have to wait.
5908 *
5909 * It is allowed to try to conditionally acquire a lock on a buffer that this
5910 * backend has already locked, but the lock acquisition will always fail, even
5911 * if the new lock acquisition does not conflict with an already held lock
5912 * (e.g. two share locks). This is because we currently do not have space to
5913 * track multiple lock ownerships of the same buffer within one backend. That
5914 * is ok for the current uses of BufferLockConditional().
5915 */
5916static bool
/* NOTE(review): signature line missing; from the preceding comment this is
 * BufferLockConditional -- acquire the content lock only if no wait needed. */
5918{
/* NOTE(review): the entry = GetPrivateRefCountEntry(...) line is missing */
5920 bool mustwait;
5921
5922 /*
5923 * As described above, if we're trying to lock a buffer this backend
5924 * already has locked, return false, independent of the existing and
5925 * desired lock level.
5926 */
5927 if (entry->data.lockmode != BUFFER_LOCK_UNLOCK)
5928 return false;
5929
5930 /*
5931 * Lock out cancel/die interrupts until we exit the code section protected
5932 * by the content lock. This ensures that interrupts will not interfere
5933 * with manipulations of data structures in shared memory.
5934 */
5936
5937 /* Check for the lock */
/* NOTE(review): the BufferLockAttempt(...) call line is missing here */
5939
5940 if (mustwait)
5941 {
5942 /* Failed to get lock, so release interrupt holdoff */
5944 }
5945 else
5946 {
5947 entry->data.lockmode = mode;
5948 }
5949
5950 return !mustwait;
5951}
5952
5953/*
5954 * Internal function that tries to atomically acquire the content lock in the
5955 * passed in mode.
5956 *
5957 * This function will not block waiting for a lock to become free - that's the
5958 * caller's job.
5959 *
5960 * Similar to LWLockAttemptLock().
5961 */
5962static inline bool
/* NOTE(review): signature line missing; from the preceding comment this
 * atomically attempts to acquire the content lock in the requested mode and
 * returns whether the caller must wait (similar to LWLockAttemptLock). */
5964{
/* NOTE(review): the old_state declaration/initial read lines are missing */
5966
5967 /*
5968 * Read once outside the loop, later iterations will get the newer value
5969 * via compare & exchange.
5970 */
5972
5973 /* loop until we've determined whether we could acquire the lock or not */
5974 while (true)
5975 {
5977 bool lock_free;
5978
5980
/* NOTE(review): the per-mode condition lines and the desired_state
 * computations are missing; only the branch skeleton is visible */
5982 {
5983 lock_free = (old_state & BM_LOCK_MASK) == 0;
5984 if (lock_free)
5986 }
5988 {
5990 if (lock_free)
5992 }
5993 else
5994 {
5996 if (lock_free)
5998 }
5999
6000 /*
6001 * Attempt to swap in the state we are expecting. If we didn't see
6002 * lock to be free, that's just the old value. If we saw it as free,
6003 * we'll attempt to mark it acquired. The reason that we always swap
6004 * in the value is that this doubles as a memory barrier. We could try
6005 * to be smarter and only swap in values if we saw the lock as free,
6006 * but benchmark haven't shown it as beneficial so far.
6007 *
6008 * Retry if the value changed since we last looked at it.
6009 */
/* NOTE(review): the compare_exchange call lines are missing here */
6012 {
6013 if (lock_free)
6014 {
6015 /* Great! Got the lock. */
6016 return false;
6017 }
6018 else
6019 return true; /* somebody else has the lock */
6020 }
6021 }
6022
6024}
6025
6026/*
6027 * Add ourselves to the end of the content lock's wait queue.
6028 */
6029static void
/* NOTE(review): signature line missing; from the preceding comment this adds
 * the current backend to the end of the content lock's wait queue. */
6031{
6032 /*
6033 * If we don't have a PGPROC structure, there's no way to wait. This
6034 * should never occur, since MyProc should only be null during shared
6035 * memory initialization.
6036 */
6037 if (MyProc == NULL)
6038 elog(PANIC, "cannot wait without a PGPROC structure");
6039
/* NOTE(review): the lwWaiting-state check line is missing here */
6041 elog(PANIC, "queueing for lock while waiting on another one");
6042
/* NOTE(review): the LockBufHdr(...) call line is missing here */
6044
6045 /* setting the flag is protected by the spinlock */
6047
6048 /*
6049 * These are currently used both for lwlocks and buffer content locks,
6050 * which is acceptable, although not pretty, because a backend can't wait
6051 * for both types of locks at the same time.
6052 */
/* NOTE(review): the MyProc->lwWaiting / lwWaitMode assignment lines are
 * missing from this listing */
6055
6056 proclist_push_tail(&buf_hdr->lock_waiters, MyProcNumber, lwWaitLink);
6057
6058 /* Can release the mutex now */
6060}
6061
6062/*
6063 * Remove ourselves from the waitlist.
6064 *
6065 * This is used if we queued ourselves because we thought we needed to sleep
6066 * but, after further checking, we discovered that we don't actually need to
6067 * do so.
6068 */
6069static void
/* NOTE(review): signature line missing; from the preceding comment this
 * removes the current backend from the wait list after deciding it does not
 * need to sleep after all. */
6071{
6072 bool on_waitlist;
6073
/* NOTE(review): the header-lock acquisition / on_waitlist computation lines
 * are missing from this listing */
6075
6077 if (on_waitlist)
6078 proclist_delete(&buf_hdr->lock_waiters, MyProcNumber, lwWaitLink);
6079
6080 if (proclist_is_empty(&buf_hdr->lock_waiters) &&
6082 {
/* NOTE(review): the flag-clearing line is missing here */
6084 }
6085
6086 /* XXX: combine with fetch_and above? */
6088
6089 /* clear waiting state again, nice for debugging */
6090 if (on_waitlist)
6092 else
6093 {
6094 int extraWaits = 0;
6095
6096
6097 /*
6098 * Somebody else dequeued us and has or will wake us up. Deal with the
6099 * superfluous absorption of a wakeup.
6100 */
6101
6102 /*
6103 * Clear BM_LOCK_WAKE_IN_PROGRESS if somebody woke us before we
6104 * removed ourselves - they'll have set it.
6105 */
6107
6108 /*
6109 * Now wait for the scheduled wakeup, otherwise our ->lwWaiting would
6110 * get reset at some inconvenient point later. Most of the time this
6111 * will immediately return.
6112 */
6113 for (;;)
6114 {
/* NOTE(review): the PGSemaphoreLock/lwWaiting check lines are missing here */
6117 break;
6118 extraWaits++;
6119 }
6120
6121 /*
6122 * Fix the process wait semaphore's count for any absorbed wakeups.
6123 */
6124 while (extraWaits-- > 0)
6126 }
6127}
6128
6129/*
6130 * Stop treating lock as held by current backend.
6131 *
6132 * After calling this function it's the callers responsibility to ensure that
6133 * the lock gets released, even in case of an error. This only is desirable if
6134 * the lock is going to be released in a different process than the process
6135 * that acquired it.
6136 */
6137static inline void
/* NOTE(review): the entire body was stripped by the extraction; per the
 * preceding comment it stops treating the lock as held by the current
 * backend -- consult the original source. */
6143
6144/*
6145 * Stop treating lock as held by current backend.
6146 *
6147 * This is the code that can be shared between actually releasing a lock
6148 * (BufferLockUnlock()) and just not tracking ownership of the lock anymore
6149 * without releasing the lock (BufferLockDisown()).
6150 */
6151static inline int
/* NOTE(review): signature line missing; from the preceding comment this is
 * the code shared between BufferLockUnlock() and BufferLockDisown(); it
 * returns the lock mode that was held. */
6153{
/* NOTE(review): ref/mode declaration lines are missing here */
6156
/* NOTE(review): the ref = GetPrivateRefCountEntry(...) line is missing */
6158 if (ref == NULL)
6159 elog(ERROR, "lock %d is not held", buffer);
6160 mode = ref->data.lockmode;
6161 ref->data.lockmode = BUFFER_LOCK_UNLOCK;
6162
6163 return mode;
6164}
6165
6166/*
6167 * Wakeup all the lockers that currently have a chance to acquire the lock.
6168 *
6169 * wake_exclusive indicates whether exclusive lock waiters should be woken up.
6170 */
6171static void
/* NOTE(review): signature line missing; from the preceding comment this
 * wakes all lockers that currently have a chance to acquire the lock;
 * wake_exclusive controls whether exclusive waiters are woken. */
6173{
6174 bool new_wake_in_progress = false;
6175 bool wake_share_exclusive = true;
/* NOTE(review): iterator and wakeup-proclist declaration lines are missing */
6178
6180
6181 /* lock wait list while collecting backends to wake up */
6183
6184 proclist_foreach_modify(iter, &buf_hdr->lock_waiters, lwWaitLink)
6185 {
6186 PGPROC *waiter = GetPGProcByNumber(iter.cur);
6187
6188 /*
6189 * Already woke up a conflicting lock, so skip over this wait list
6190 * entry.
6191 */
/* NOTE(review): the two skip-condition lines are missing here */
6193 continue;
6195 continue;
6196
6197 proclist_delete(&buf_hdr->lock_waiters, iter.cur, lwWaitLink);
6198 proclist_push_tail(&wakeup, iter.cur, lwWaitLink);
6199
6200 /*
6201 * Prevent additional wakeups until retryer gets to run. Backends that
6202 * are just waiting for the lock to become free don't retry
6203 * automatically.
6204 */
6205 new_wake_in_progress = true;
6206
6207 /*
6208 * Signal that the process isn't on the wait list anymore. This allows
6209 * BufferLockDequeueSelf() to remove itself from the waitlist with a
6210 * proclist_delete(), rather than having to check if it has been
6211 * removed from the list.
6212 */
6213 Assert(waiter->lwWaiting == LW_WS_WAITING);
6215
6216 /*
6217 * Don't wakeup further waiters after waking a conflicting waiter.
6218 */
6219 if (waiter->lwWaitMode == BUFFER_LOCK_SHARE)
6220 {
6221 /*
6222 * Share locks conflict with exclusive locks.
6223 */
6224 wake_exclusive = false;
6225 }
6226 else if (waiter->lwWaitMode == BUFFER_LOCK_SHARE_EXCLUSIVE)
6227 {
6228 /*
6229 * Share-exclusive locks conflict with share-exclusive and
6230 * exclusive locks.
6231 */
6232 wake_exclusive = false;
6233 wake_share_exclusive = false;
6234 }
6235 else if (waiter->lwWaitMode == BUFFER_LOCK_EXCLUSIVE)
6236 {
6237 /*
6238 * Exclusive locks conflict with all other locks, there's no point
6239 * in waking up anybody else.
6240 */
6241 break;
6242 }
6243 }
6244
6246
6247 /* unset required flags, and release lock, in one fell swoop */
6248 {
/* NOTE(review): old_state/desired_state declaration lines are missing here */
6251
6253 while (true)
6254 {
6256
6257 /* compute desired flags */
6258
/* NOTE(review): the BM_LOCK_WAKE_IN_PROGRESS set/clear condition lines are
 * missing from this listing */
6261 else
6263
6264 if (proclist_is_empty(&buf_hdr->lock_waiters))
6266
6267 desired_state &= ~BM_LOCKED; /* release lock */
6268
/* NOTE(review): the compare_exchange call lines are missing here */
6271 break;
6272 }
6273 }
6274
6275 /* Awaken any waiters I removed from the queue. */
6276 proclist_foreach_modify(iter, &wakeup, lwWaitLink)
6277 {
6278 PGPROC *waiter = GetPGProcByNumber(iter.cur);
6279
6280 proclist_delete(&wakeup, iter.cur, lwWaitLink);
6281
6282 /*
6283 * Guarantee that lwWaiting being unset only becomes visible once the
6284 * unlink from the link has completed. Otherwise the target backend
6285 * could be woken up for other reason and enqueue for a new lock - if
6286 * that happens before the list unlink happens, the list would end up
6287 * being corrupted.
6288 *
6289 * The barrier pairs with the LockBufHdr() when enqueuing for another
6290 * lock.
6291 */
6293 waiter->lwWaiting = LW_WS_NOT_WAITING;
6294 PGSemaphoreUnlock(waiter->sem);
6295 }
6296}
6297
6298/*
6299 * Compute subtraction from buffer state for a release of a held lock in
6300 * `mode`.
6301 *
6302 * This is separated from BufferLockUnlock() as we want to combine the lock
6303 * release with other atomic operations when possible, leading to the lock
6304 * release being done in multiple places, each needing to compute what to
6305 * subtract from the lock state.
6306 */
6307static inline uint64
6309{
6310 /*
6311 * Turns out that a switch() leads gcc to generate sufficiently worse code
6312 * for this to show up in profiles...
6313 */
6315 return BM_LOCK_VAL_EXCLUSIVE;
6318 else
6319 {
6321 return BM_LOCK_VAL_SHARED;
6322 }
6323
6324 return 0; /* keep compiler quiet */
6325}
6326
6327/*
6328 * Handle work that needs to be done after releasing a lock that was held in
6329 * `mode`, where `lockstate` is the result of the atomic operation modifying
6330 * the state variable.
6331 *
6332 * This is separated from BufferLockUnlock() as we want to combine the lock
6333 * release with other atomic operations when possible, leading to the lock
6334 * release being done in multiple places.
6335 */
6336static void
6338{
6339 bool check_waiters = false;
6340 bool wake_exclusive = false;
6341
6342 /* nobody else can have that kind of lock */
6344
6345 /*
6346 * If we're still waiting for backends to get scheduled, don't wake them
6347 * up again. Otherwise check if we need to look through the waitqueue to
6348 * wake other backends.
6349 */
6352 {
6353 if ((lockstate & BM_LOCK_MASK) == 0)
6354 {
6355 /*
6356 * We released a lock and the lock was, in that moment, free. We
6357 * therefore can wake waiters for any kind of lock.
6358 */
6359 check_waiters = true;
6360 wake_exclusive = true;
6361 }
6363 {
6364 /*
6365 * We released the lock, but another backend still holds a lock.
6366 * We can't have released an exclusive lock, as there couldn't
6367 * have been other lock holders. If we released a share lock, no
6368 * waiters need to be woken up, as there must be other share
6369 * lockers. However, if we held a share-exclusive lock, another
6370 * backend now could acquire a share-exclusive lock.
6371 */
6372 check_waiters = true;
6373 wake_exclusive = false;
6374 }
6375 }
6376
6377 /*
6378 * As waking up waiters requires the spinlock to be acquired, only do so
6379 * if necessary.
6380 */
6381 if (check_waiters)
6383}
6384
6385/*
6386 * BufferLockHeldByMeInMode - test whether my process holds the content lock
6387 * in the specified mode
6388 *
6389 * This is meant as debug support only.
6390 */
6391static bool
6393{
6394 PrivateRefCountEntry *entry =
6396
6397 if (!entry)
6398 return false;
6399 else
6400 return entry->data.lockmode == mode;
6401}
6402
6403/*
6404 * BufferLockHeldByMe - test whether my process holds the content lock in any
6405 * mode
6406 *
6407 * This is meant as debug support only.
6408 */
6409static bool
6411{
6412 PrivateRefCountEntry *entry =
6414
6415 if (!entry)
6416 return false;
6417 else
6418 return entry->data.lockmode != BUFFER_LOCK_UNLOCK;
6419}
6420
6421/*
6422 * Release the content lock for the buffer.
6423 */
6424void
6426{
6428
6430 if (BufferIsLocal(buffer))
6431 return; /* local buffers need no lock */
6432
6435}
6436
6437/*
6438 * Acquire the content_lock for the buffer.
6439 */
6440void
6442{
6444
6445 /*
6446 * We can't wait if we haven't got a PGPROC. This should only occur
6447 * during bootstrap or shared memory initialization. Put an Assert here
6448 * to catch unsafe coding practices.
6449 */
6451
6452 /* handled in LockBuffer() wrapper */
6454
6456 if (BufferIsLocal(buffer))
6457 return; /* local buffers need no lock */
6458
6460
6461 /*
6462 * Test the most frequent lock modes first. While a switch (mode) would be
6463 * nice, at least gcc generates considerably worse code for it.
6464 *
6465 * Call BufferLockAcquire() with a constant argument for mode, to generate
6466 * more efficient code for the different lock modes.
6467 */
6468 if (mode == BUFFER_LOCK_SHARE)
6470 else if (mode == BUFFER_LOCK_EXCLUSIVE)
6474 else
6475 elog(ERROR, "unrecognized buffer lock mode: %d", mode);
6476}
6477
6478/*
6479 * Acquire the content_lock for the buffer, but only if we don't have to wait.
6480 *
6481 * This assumes the caller wants BUFFER_LOCK_EXCLUSIVE mode.
6482 */
6483bool
6485{
6486 BufferDesc *buf;
6487
6489 if (BufferIsLocal(buffer))
6490 return true; /* act as though we got it */
6491
6493
6495}
6496
6497/*
6498 * Verify that this backend is pinning the buffer exactly once.
6499 *
6500 * NOTE: Like in BufferIsPinned(), what we check here is that *this* backend
6501 * holds a pin on the buffer. We do not care whether some other backend does.
6502 */
6503void
6505{
6506 if (BufferIsLocal(buffer))
6507 {
6508 if (LocalRefCount[-buffer - 1] != 1)
6509 elog(ERROR, "incorrect local pin count: %d",
6510 LocalRefCount[-buffer - 1]);
6511 }
6512 else
6513 {
6514 if (GetPrivateRefCount(buffer) != 1)
6515 elog(ERROR, "incorrect local pin count: %d",
6517 }
6518}
6519
6520/*
6521 * LockBufferForCleanup - lock a buffer in preparation for deleting items
6522 *
6523 * Items may be deleted from a disk page only when the caller (a) holds an
6524 * exclusive lock on the buffer and (b) has observed that no other backend
6525 * holds a pin on the buffer. If there is a pin, then the other backend
6526 * might have a pointer into the buffer (for example, a heapscan reference
6527 * to an item --- see README for more details). It's OK if a pin is added
6528 * after the cleanup starts, however; the newly-arrived backend will be
6529 * unable to look at the page until we release the exclusive lock.
6530 *
6531 * To implement this protocol, a would-be deleter must pin the buffer and
6532 * then call LockBufferForCleanup(). LockBufferForCleanup() is similar to
6533 * LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE), except that it loops until
6534 * it has successfully observed pin count = 1.
6535 */
6536void
6538{
6540 TimestampTz waitStart = 0;
6541 bool waiting = false;
6542 bool logged_recovery_conflict = false;
6543
6546
6548
6549 /*
6550 * We do not yet need to be worried about in-progress AIOs holding a pin,
6551 * as we, so far, only support doing reads via AIO and this function can
6552 * only be called once the buffer is valid (i.e. no read can be in
6553 * flight).
6554 */
6555
6556 /* Nobody else to wait for */
6557 if (BufferIsLocal(buffer))
6558 return;
6559
6561
6562 for (;;)
6563 {
6565 uint64 unset_bits = 0;
6566
6567 /* Try to acquire lock */
6570
6573 {
6574 /* Successfully acquired exclusive lock with pincount 1 */
6576
6577 /*
6578 * Emit the log message if recovery conflict on buffer pin was
6579 * resolved but the startup process waited longer than
6580 * deadlock_timeout for it.
6581 */
6584 waitStart, GetCurrentTimestamp(),
6585 NULL, false);
6586
6587 if (waiting)
6588 {
6589 /* reset ps display to remove the suffix if we added one */
6591 waiting = false;
6592 }
6593 return;
6594 }
6595 /* Failed, so mark myself as waiting for pincount 1 */
6597 {
6600 elog(ERROR, "multiple backends attempting to wait for pincount 1");
6601 }
6602 bufHdr->wait_backend_pgprocno = MyProcNumber;
6606 0);
6608
6609 /* Wait to be signaled by UnpinBuffer() */
6610 if (InHotStandby)
6611 {
6612 if (!waiting)
6613 {
6614 /* adjust the process title to indicate that it's waiting */
6615 set_ps_display_suffix("waiting");
6616 waiting = true;
6617 }
6618
6619 /*
6620 * Emit the log message if the startup process is waiting longer
6621 * than deadlock_timeout for recovery conflict on buffer pin.
6622 *
6623 * Skip this if first time through because the startup process has
6624 * not started waiting yet in this case. So, the wait start
6625 * timestamp is set after this logic.
6626 */
6627 if (waitStart != 0 && !logged_recovery_conflict)
6628 {
6630
6631 if (TimestampDifferenceExceeds(waitStart, now,
6633 {
6635 waitStart, now, NULL, true);
6637 }
6638 }
6639
6640 /*
6641 * Set the wait start timestamp if logging is enabled and first
6642 * time through.
6643 */
6644 if (log_recovery_conflict_waits && waitStart == 0)
6645 waitStart = GetCurrentTimestamp();
6646
6647 /* Publish the bufid that Startup process waits on */
6649 /* Set alarm and then wait to be signaled by UnpinBuffer() */
6651 /* Reset the published bufid */
6653 }
6654 else
6656
6657 /*
6658 * Remove flag marking us as waiter. Normally this will not be set
6659 * anymore, but ProcWaitForSignal() can return for other signals as
6660 * well. We take care to only reset the flag if we're the waiter, as
6661 * theoretically another backend could have started waiting. That's
6662 * impossible with the current usages due to table level locking, but
6663 * better be safe.
6664 */
6666 if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
6667 bufHdr->wait_backend_pgprocno == MyProcNumber)
6669
6671 0, unset_bits,
6672 0);
6673
6675 /* Loop back and try again */
6676 }
6677}
6678
6679/*
6680 * Check called from ProcessRecoveryConflictInterrupts() when Startup process
6681 * requests cancellation of all pin holders that are blocking it.
6682 */
6683bool
6685{
6687
6688 /*
6689 * If we get woken slowly then it's possible that the Startup process was
6690 * already woken by other backends before we got here. Also possible that
6691 * we get here by multiple interrupts or interrupts at inappropriate
6692 * times, so make sure we do nothing if the bufid is not set.
6693 */
6694 if (bufid < 0)
6695 return false;
6696
6697 if (GetPrivateRefCount(bufid + 1) > 0)
6698 return true;
6699
6700 return false;
6701}
6702
6703/*
6704 * ConditionalLockBufferForCleanup - as above, but don't wait to get the lock
6705 *
6706 * We won't loop, but just check once to see if the pin count is OK. If
6707 * not, return false with no lock held.
6708 */
6709bool
6711{
6714 refcount;
6715
6717
6718 /* see AIO related comment in LockBufferForCleanup() */
6719
6720 if (BufferIsLocal(buffer))
6721 {
6722 refcount = LocalRefCount[-buffer - 1];
6723 /* There should be exactly one pin */
6724 Assert(refcount > 0);
6725 if (refcount != 1)
6726 return false;
6727 /* Nobody else to wait for */
6728 return true;
6729 }
6730
6731 /* There should be exactly one local pin */
6732 refcount = GetPrivateRefCount(buffer);
6733 Assert(refcount);
6734 if (refcount != 1)
6735 return false;
6736
6737 /* Try to acquire lock */
6739 return false;
6740
6744
6745 Assert(refcount > 0);
6746 if (refcount == 1)
6747 {
6748 /* Successfully acquired exclusive lock with pincount 1 */
6750 return true;
6751 }
6752
6753 /* Failed, so release the lock */
6756 return false;
6757}
6758
6759/*
6760 * IsBufferCleanupOK - as above, but we already have the lock
6761 *
6762 * Check whether it's OK to perform cleanup on a buffer we've already
6763 * locked. If we observe that the pin count is 1, our exclusive lock
6764 * happens to be a cleanup lock, and we can proceed with anything that
6765 * would have been allowable had we sought a cleanup lock originally.
6766 */
6767bool
6769{
6772
6774
6775 /* see AIO related comment in LockBufferForCleanup() */
6776
6777 if (BufferIsLocal(buffer))
6778 {
6779 /* There should be exactly one pin */
6780 if (LocalRefCount[-buffer - 1] != 1)
6781 return false;
6782 /* Nobody else to wait for */
6783 return true;
6784 }
6785
6786 /* There should be exactly one local pin */
6787 if (GetPrivateRefCount(buffer) != 1)
6788 return false;
6789
6791
6792 /* caller must hold exclusive lock on buffer */
6794
6796
6799 {
6800 /* pincount is OK. */
6802 return true;
6803 }
6804
6806 return false;
6807}
6808
6809/*
6810 * Helper for BufferBeginSetHintBits() and BufferSetHintBits16().
6811 *
6812 * This checks if the current lock mode already suffices to allow hint bits
6813 * being set and, if not, whether the current lock can be upgraded.
6814 *
6815 * Updates *lockstate when returning true.
6816 */
6817static inline bool
6819{
6823
6825
6826 if (ref == NULL)
6827 elog(ERROR, "buffer is not pinned");
6828
6829 mode = ref->data.lockmode;
6830 if (mode == BUFFER_LOCK_UNLOCK)
6831 elog(ERROR, "buffer is not locked");
6832
6833 /* we're done if we are already holding a sufficient lock level */
6835 {
6837 return true;
6838 }
6839
6840 /*
6841 * We are only holding a share lock right now, try to upgrade it to
6842 * SHARE_EXCLUSIVE.
6843 */
6845
6847 while (true)
6848 {
6850
6852
6853 /*
6854 * Can't upgrade if somebody else holds the lock in exclusive or
6855 * share-exclusive mode.
6856 */
6858 {
6859 return false;
6860 }
6861
6862 /* currently held lock state */
6864
6865 /* new lock level */
6867
6870 {
6871 ref->data.lockmode = BUFFER_LOCK_SHARE_EXCLUSIVE;
6873
6874 return true;
6875 }
6876 }
6877}
6878
6879/*
6880 * Try to acquire the right to set hint bits on the buffer.
6881 *
6882 * To be allowed to set hint bits, this backend needs to hold either a
6883 * share-exclusive or an exclusive lock. In case this backend only holds a
6884 * share lock, this function will try to upgrade the lock to
6885 * share-exclusive. The caller is only allowed to set hint bits if true is
6886 * returned.
6887 *
6888 * Once BufferBeginSetHintBits() has returned true, hint bits may be set
6889 * without further calls to BufferBeginSetHintBits(), until the buffer is
6890 * unlocked.
6891 *
6892 *
6893 * Requiring a share-exclusive lock to set hint bits prevents setting hint
6894 * bits on buffers that are currently being written out, which could corrupt
6895 * the checksum on the page. Flushing buffers also requires a share-exclusive
6896 * lock.
6897 *
6898 * Due to a lock >= share-exclusive being required to set hint bits, only one
6899 * backend can set hint bits at a time. Allowing multiple backends to set hint
6900 * bits would require more complicated locking: For setting hint bits we'd
6901 * need to store the count of backends currently setting hint bits, for I/O we
6902 * would need another lock-level conflicting with the hint-setting
6903 * lock-level. Given that the share-exclusive lock for setting hint bits is
6904 * only held for a short time, that backends often would just set the same
6905 * hint bits and that the cost of occasionally not setting hint bits in hotly
6906 * accessed pages is fairly low, this seems like an acceptable tradeoff.
6907 */
6908bool
6910{
6913
6914 if (BufferIsLocal(buffer))
6915 {
6916 /*
6917 * NB: Will need to check if there is a write in progress, once it is
6918 * possible for writes to be done asynchronously.
6919 */
6920 return true;
6921 }
6922
6924
6926}
6927
6928/*
6929 * End a phase of setting hint bits on this buffer, started with
6930 * BufferBeginSetHintBits().
6931 *
6932 * This would strictly speaking not be required (i.e. the caller could do
6933 * MarkBufferDirtyHint() if so desired), but allows us to perform some sanity
6934 * checks.
6935 */
6936void
6946
6947/*
6948 * Try to set hint bits on a single 16bit value in a buffer.
6949 *
6950 * If hint bits are allowed to be set, set *ptr = val, try to mark the buffer
6951 * dirty and return true. Otherwise false is returned.
6952 *
6953 * *ptr needs to be a pointer to memory within the buffer.
6954 *
6955 * This is a bit faster than BufferBeginSetHintBits() /
6956 * BufferFinishSetHintBits() when setting hints once in a buffer, but slower
6957 * than the former when setting hint bits multiple times in the same buffer.
6958 */
6959bool
6961{
6964#ifdef USE_ASSERT_CHECKING
6965 char *page;
6966
6967 /* verify that the address is on the page */
6968 page = BufferGetPage(buffer);
6969 Assert((char *) ptr >= page && (char *) ptr < (page + BLCKSZ));
6970#endif
6971
6972 if (BufferIsLocal(buffer))
6973 {
6974 *ptr = val;
6975
6977
6978 return true;
6979 }
6980
6982
6984 {
6985 *ptr = val;
6986
6988
6989 return true;
6990 }
6991
6992 return false;
6993}
6994
6995
6996/*
6997 * Functions for buffer I/O handling
6998 *
6999 * Also note that these are used only for shared buffers, not local ones.
7000 */
7001
7002/*
7003 * WaitIO -- Block until the IO_IN_PROGRESS flag on 'buf' is cleared.
7004 */
7005static void
7007{
7009
7011 for (;;)
7012 {
7015
7016 /*
7017 * It may not be necessary to acquire the spinlock to check the flag
7018 * here, but since this test is essential for correctness, we'd better
7019 * play it safe.
7020 */
7022
7023 /*
7024 * Copy the wait reference while holding the spinlock. This protects
7025 * against a concurrent TerminateBufferIO() in another backend from
7026 * clearing the wref while it's being read.
7027 */
7028 iow = buf->io_wref;
7030
7031 /* no IO in progress, we don't need to wait */
7033 break;
7034
7035 /*
7036 * The buffer has asynchronous IO in progress, wait for it to
7037 * complete.
7038 */
7039 if (pgaio_wref_valid(&iow))
7040 {
7042
7043 /*
7044 * The AIO subsystem internally uses condition variables and thus
7045 * might remove this backend from the BufferDesc's CV. While that
7046 * wouldn't cause a correctness issue (the first CV sleep just
7047 * immediately returns if not already registered), it seems worth
7048 * avoiding unnecessary loop iterations, given that we take care
7049 * to do so at the start of the function.
7050 */
7052 continue;
7053 }
7054
7055 /* wait on BufferDesc->cv, e.g. for concurrent synchronous IO */
7057 }
7059}
7060
7061/*
7062 * StartBufferIO: begin I/O on this buffer
7063 * (Assumptions)
7064 * My process is executing no IO on this buffer
7065 * The buffer is Pinned
7066 *
7067 * In some scenarios multiple backends could attempt the same I/O operation
7068 * concurrently. If someone else has already started I/O on this buffer then
7069 * we will wait for completion of the IO using WaitIO().
7070 *
7071 * Input operations are only attempted on buffers that are not BM_VALID,
7072 * and output operations only on buffers that are BM_VALID and BM_DIRTY,
7073 * so we can always tell if the work is already done.
7074 *
7075 * Returns true if we successfully marked the buffer as I/O busy,
7076 * false if someone else already did the work.
7077 *
7078 * If nowait is true, then we don't wait for an I/O to be finished by another
7079 * backend. In that case, false indicates either that the I/O was already
7080 * finished, or is still in progress. This is useful for callers that want to
7081 * find out if they can perform the I/O as part of a larger operation, without
7082 * waiting for the answer or distinguishing the reasons why not.
7083 */
7084bool
7086{
7088
7090
7091 for (;;)
7092 {
7094
7096 break;
7098 if (nowait)
7099 return false;
7100 WaitIO(buf);
7101 }
7102
7103 /* Once we get here, there is definitely no I/O active on this buffer */
7104
7105 /* Check if someone else already did the I/O */
7106 if (forInput ? (buf_state & BM_VALID) : !(buf_state & BM_DIRTY))
7107 {
7109 return false;
7110 }
7111
7114 0);
7115
7118
7119 return true;
7120}
7121
7122/*
7123 * TerminateBufferIO: release a buffer we were doing I/O on
7124 * (Assumptions)
7125 * My process is executing IO for the buffer
7126 * BM_IO_IN_PROGRESS bit is set for the buffer
7127 * The buffer is Pinned
7128 *
7129 * If clear_dirty is true, we clear the buffer's BM_DIRTY flag. This is
7130 * appropriate when terminating a successful write.
7131 *
7132 * set_flag_bits gets ORed into the buffer's flags. It must include
7133 * BM_IO_ERROR in a failure case. For successful completion it could
7134 * be 0, or BM_VALID if we just finished reading in the page.
7135 *
7136 * If forget_owner is true, we release the buffer I/O from the current
7137 * resource owner. (forget_owner=false is used when the resource owner itself
7138 * is being released)
7139 */
7140void
7142 bool forget_owner, bool release_aio)
7143{
7146 int refcount_change = 0;
7147
7149
7152
7153 /* Clear earlier errors, if this IO failed, it'll be marked again */
7155
7156 if (clear_dirty)
7158
7159 if (release_aio)
7160 {
7161 /* release ownership by the AIO subsystem */
7163 refcount_change = -1;
7164 pgaio_wref_clear(&buf->io_wref);
7165 }
7166
7170
7171 if (forget_owner)
7174
7176
7177 /*
7178 * Support LockBufferForCleanup()
7179 *
7180 * We may have just released the last pin other than the waiter's. In most
7181 * cases, this backend holds another pin on the buffer. But, if, for
7182 * example, this backend is completing an IO issued by another backend, it
7183 * may be time to wake the waiter.
7184 */
7187}
7188
7189/*
7190 * AbortBufferIO: Clean up active buffer I/O after an error.
7191 *
7192 * All LWLocks & content locks we might have held have been released, but we
7193 * haven't yet released buffer pins, so the buffer is still pinned.
7194 *
7195 * If I/O was in progress, we always set BM_IO_ERROR, even though it's
7196 * possible the error condition wasn't related to the I/O.
7197 *
7198 * Note: this does not remove the buffer I/O from the resource owner.
7199 * That's correct when we're releasing the whole resource owner, but
7200 * beware if you use this in other contexts.
7201 */
7202static void
7204{
7207
7210
7211 if (!(buf_state & BM_VALID))
7212 {
7215 }
7216 else
7217 {
7220
7221 /* Issue notice if this is not the first failure... */
7222 if (buf_state & BM_IO_ERROR)
7223 {
7224 /* Buffer is pinned, so we can read tag without spinlock */
7227 errmsg("could not write block %u of %s",
7228 buf_hdr->tag.blockNum,
7230 BufTagGetForkNum(&buf_hdr->tag)).str),
7231 errdetail("Multiple failures --- write error might be permanent.")));
7232 }
7233 }
7234
7235 TerminateBufferIO(buf_hdr, false, BM_IO_ERROR, false, false);
7236}
7237
7238/*
7239 * Error context callback for errors occurring during shared buffer writes.
7240 */
7241static void
7243{
7245
7246 /* Buffer is pinned, so we can read the tag without locking the spinlock */
7247 if (bufHdr != NULL)
7248 errcontext("writing block %u of relation \"%s\"",
7249 bufHdr->tag.blockNum,
7251 BufTagGetForkNum(&bufHdr->tag)).str);
7252}
7253
7254/*
7255 * Error context callback for errors occurring during local buffer writes.
7256 */
7257static void
7259{
7261
7262 if (bufHdr != NULL)
7263 errcontext("writing block %u of relation \"%s\"",
7264 bufHdr->tag.blockNum,
7267 BufTagGetForkNum(&bufHdr->tag)).str);
7268}
7269
7270/*
7271 * RelFileLocator qsort/bsearch comparator; see RelFileLocatorEquals.
7272 */
7273static int
7274rlocator_comparator(const void *p1, const void *p2)
7275{
7276 RelFileLocator n1 = *(const RelFileLocator *) p1;
7277 RelFileLocator n2 = *(const RelFileLocator *) p2;
7278
7279 if (n1.relNumber < n2.relNumber)
7280 return -1;
7281 else if (n1.relNumber > n2.relNumber)
7282 return 1;
7283
7284 if (n1.dbOid < n2.dbOid)
7285 return -1;
7286 else if (n1.dbOid > n2.dbOid)
7287 return 1;
7288
7289 if (n1.spcOid < n2.spcOid)
7290 return -1;
7291 else if (n1.spcOid > n2.spcOid)
7292 return 1;
7293 else
7294 return 0;
7295}
7296
7297/*
7298 * Lock buffer header - set BM_LOCKED in buffer state.
7299 */
7300uint64
7302{
7304
7306
7307 while (true)
7308 {
7309 /*
7310 * Always try once to acquire the lock directly, without setting up
7311 * the spin-delay infrastructure. The work necessary for that shows up
7312 * in profiles and is rarely necessary.
7313 */
7315 if (likely(!(old_buf_state & BM_LOCKED)))
7316 break; /* got lock */
7317
7318 /* and then spin without atomic operations until lock is released */
7319 {
7321
7323
7324 while (old_buf_state & BM_LOCKED)
7325 {
7328 }
7330 }
7331
7332 /*
7333 * Retry. The lock might obviously already be re-acquired by the time
7334 * we're attempting to get it again.
7335 */
7336 }
7337
7338 return old_buf_state | BM_LOCKED;
7339}
7340
7341/*
7342 * Wait until the BM_LOCKED flag isn't set anymore and return the buffer's
7343 * state at that point.
7344 *
7345 * Obviously the buffer could be locked by the time the value is returned, so
7346 * this is primarily useful in CAS style loops.
7347 */
7350{
7353
7355
7356 buf_state = pg_atomic_read_u64(&buf->state);
7357
7358 while (buf_state & BM_LOCKED)
7359 {
7361 buf_state = pg_atomic_read_u64(&buf->state);
7362 }
7363
7365
7366 return buf_state;
7367}
7368
7369/*
7370 * BufferTag comparator.
7371 */
7372static inline int
7374{
7375 int ret;
7378
7381
7383
7384 if (ret != 0)
7385 return ret;
7386
7388 return -1;
7390 return 1;
7391
7392 if (ba->blockNum < bb->blockNum)
7393 return -1;
7394 if (ba->blockNum > bb->blockNum)
7395 return 1;
7396
7397 return 0;
7398}
7399
7400/*
7401 * Comparator determining the writeout order in a checkpoint.
7402 *
7403 * It is important that tablespaces are compared first, the logic balancing
7404 * writes between tablespaces relies on it.
7405 */
7406static inline int
7408{
7409 /* compare tablespace */
7410 if (a->tsId < b->tsId)
7411 return -1;
7412 else if (a->tsId > b->tsId)
7413 return 1;
7414 /* compare relation */
7415 if (a->relNumber < b->relNumber)
7416 return -1;
7417 else if (a->relNumber > b->relNumber)
7418 return 1;
7419 /* compare fork */
7420 else if (a->forkNum < b->forkNum)
7421 return -1;
7422 else if (a->forkNum > b->forkNum)
7423 return 1;
7424 /* compare block number */
7425 else if (a->blockNum < b->blockNum)
7426 return -1;
7427 else if (a->blockNum > b->blockNum)
7428 return 1;
7429 /* equal page IDs are unlikely, but not impossible */
7430 return 0;
7431}
7432
7433/*
7434 * Comparator for a Min-Heap over the per-tablespace checkpoint completion
7435 * progress.
7436 */
7437static int
7439{
7442
7443 /* we want a min-heap, so return 1 for the a < b */
7444 if (sa->progress < sb->progress)
7445 return 1;
7446 else if (sa->progress == sb->progress)
7447 return 0;
7448 else
7449 return -1;
7450}
7451
7452/*
7453 * Initialize a writeback context, discarding potential previous state.
7454 *
7455 * *max_pending is a pointer instead of an immediate value, so the coalesce
7456 * limits can easily changed by the GUC mechanism, and so calling code does
7457 * not have to check the current configuration. A value of 0 means that no
7458 * writeback control will be performed.
7459 */
7460void
7461WritebackContextInit(WritebackContext *context, int *max_pending)
7462{
7463 Assert(*max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
7464
7465 context->max_pending = max_pending;
7466 context->nr_pending = 0;
7467}
7468
7469/*
7470 * Add buffer to list of pending writeback requests.
7471 */
7472void
7474 BufferTag *tag)
7475{
7476 PendingWriteback *pending;
7477
7478 /*
7479 * As pg_flush_data() doesn't do anything with fsync disabled, there's no
7480 * point in tracking in that case.
7481 */
7483 !enableFsync)
7484 return;
7485
7486 /*
7487 * Add buffer to the pending writeback array, unless writeback control is
7488 * disabled.
7489 */
7490 if (*wb_context->max_pending > 0)
7491 {
7493
7494 pending = &wb_context->pending_writebacks[wb_context->nr_pending++];
7495
7496 pending->tag = *tag;
7497 }
7498
7499 /*
7500 * Perform pending flushes if the writeback limit is exceeded. This
7501 * includes the case where previously an item has been added, but control
7502 * is now disabled.
7503 */
7504 if (wb_context->nr_pending >= *wb_context->max_pending)
7506}
7507
7508#define ST_SORT sort_pending_writebacks
7509#define ST_ELEMENT_TYPE PendingWriteback
7510#define ST_COMPARE(a, b) buffertag_comparator(&a->tag, &b->tag)
7511#define ST_SCOPE static
7512#define ST_DEFINE
7513#include "lib/sort_template.h"
7514
7515/*
7516 * Issue all pending writeback requests, previously scheduled with
7517 * ScheduleBufferTagForWriteback, to the OS.
7518 *
7519 * Because this is only used to improve the OSs IO scheduling we try to never
7520 * error out - it's just a hint.
7521 */
7522void
7524{
7526 int i;
7527
7528 if (wb_context->nr_pending == 0)
7529 return;
7530
7531 /*
7532 * Executing the writes in-order can make them a lot faster, and allows to
7533 * merge writeback requests to consecutive blocks into larger writebacks.
7534 */
7535 sort_pending_writebacks(wb_context->pending_writebacks,
7536 wb_context->nr_pending);
7537
7539
7540 /*
7541 * Coalesce neighbouring writes, but nothing else. For that we iterate
7542 * through the, now sorted, array of pending flushes, and look forward to
7543 * find all neighbouring (or identical) writes.
7544 */
7545 for (i = 0; i < wb_context->nr_pending; i++)
7546 {
7550 int ahead;
7551 BufferTag tag;
7553 Size nblocks = 1;
7554
7555 cur = &wb_context->pending_writebacks[i];
7556 tag = cur->tag;
7558
7559 /*
7560 * Peek ahead, into following writeback requests, to see if they can
7561 * be combined with the current one.
7562 */
7563 for (ahead = 0; i + ahead + 1 < wb_context->nr_pending; ahead++)
7564 {
7565
7566 next = &wb_context->pending_writebacks[i + ahead + 1];
7567
7568 /* different file, stop */
7570 BufTagGetRelFileLocator(&next->tag)) ||
7571 BufTagGetForkNum(&cur->tag) != BufTagGetForkNum(&next->tag))
7572 break;
7573
7574 /* ok, block queued twice, skip */
7575 if (cur->tag.blockNum == next->tag.blockNum)
7576 continue;
7577
7578 /* only merge consecutive writes */
7579 if (cur->tag.blockNum + 1 != next->tag.blockNum)
7580 break;
7581
7582 nblocks++;
7583 cur = next;
7584 }
7585
7586 i += ahead;
7587
7588 /* and finally tell the kernel to write the data to storage */
7590 smgrwriteback(reln, BufTagGetForkNum(&tag), tag.blockNum, nblocks);
7591 }
7592
7593 /*
7594 * Assume that writeback requests are only issued for buffers containing
7595 * blocks of permanent relations.
7596 */
7598 IOOP_WRITEBACK, io_start, wb_context->nr_pending, 0);
7599
7600 wb_context->nr_pending = 0;
7601}
7602
7603/* ResourceOwner callbacks */
7604
7605static void
7612
7613static char *
7615{
7617
7618 return psprintf("lost track of buffer IO on buffer %d", buffer);
7619}
7620
7621/*
7622 * Release buffer as part of resource owner cleanup. This will only be called
7623 * if the buffer is pinned. If this backend held the content lock at the time
7624 * of the error we also need to release that (note that it is not possible to
7625 * hold a content lock without a pin).
7626 */
7627static void
7629{
7631
7632 /* Like ReleaseBuffer, but don't call ResourceOwnerForgetBuffer */
7633 if (!BufferIsValid(buffer))
7634 elog(ERROR, "bad buffer ID: %d", buffer);
7635
7636 if (BufferIsLocal(buffer))
7638 else
7639 {
7641
7643
7644 /* not having a private refcount would imply resowner corruption */
7645 Assert(ref != NULL);
7646
7647 /*
7648 * If the buffer was locked at the time of the resowner release,
7649 * release the lock now. This should only happen after errors.
7650 */
7651 if (ref->data.lockmode != BUFFER_LOCK_UNLOCK)
7652 {
7654
7655 HOLD_INTERRUPTS(); /* match the upcoming RESUME_INTERRUPTS */
7657 }
7658
7660 }
7661}
7662
7663static char *
7668
7669/*
7670 * Helper function to evict unpinned buffer whose buffer header lock is
7671 * already acquired.
7672 */
7673static bool
7675{
7677 bool result;
7678
7679 *buffer_flushed = false;
7680
7683
7684 if ((buf_state & BM_VALID) == 0)
7685 {
7686 UnlockBufHdr(desc);
7687 return false;
7688 }
7689
7690 /* Check that it's not pinned already. */
7692 {
7693 UnlockBufHdr(desc);
7694 return false;
7695 }
7696
7697 PinBuffer_Locked(desc); /* releases spinlock */
7698
7699 /* If it was dirty, try to clean it once. */
7700 if (buf_state & BM_DIRTY)
7701 {
7703 *buffer_flushed = true;
7704 }
7705
7706 /* This will return false if it becomes dirty or someone else pins it. */
7707 result = InvalidateVictimBuffer(desc);
7708
7709 UnpinBuffer(desc);
7710
7711 return result;
7712}
7713
7714/*
7715 * Try to evict the current block in a shared buffer.
7716 *
7717 * This function is intended for testing/development use only!
7718 *
7719 * To succeed, the buffer must not be pinned on entry, so if the caller had a
7720 * particular block in mind, it might already have been replaced by some other
7721 * block by the time this function runs. It's also unpinned on return, so the
7722 * buffer might be occupied again by the time control is returned, potentially
7723 * even by the same block. This inherent raciness without other interlocking
7724 * makes the function unsuitable for non-testing usage.
7725 *
7726 * *buffer_flushed is set to true if the buffer was dirty and has been
7727 * flushed, false otherwise. However, *buffer_flushed=true does not
7728 * necessarily mean that we flushed the buffer, it could have been flushed by
7729 * someone else.
7730 *
7731 * Returns true if the buffer was valid and it has now been made invalid.
7732 * Returns false if it wasn't valid, if it couldn't be evicted due to a pin,
7733 * or if the buffer becomes dirty again while we're trying to write it out.
7734 */
7735bool
7737{
7738 BufferDesc *desc;
7739
7741
7742 /* Make sure we can pin the buffer. */
7745
7746 desc = GetBufferDescriptor(buf - 1);
7747 LockBufHdr(desc);
7748
7750}
7751
7752/*
7753 * Try to evict all the shared buffers.
7754 *
7755 * This function is intended for testing/development use only! See
7756 * EvictUnpinnedBuffer().
7757 *
7758 * The buffers_* parameters are mandatory and indicate the total count of
7759 * buffers that:
7760 * - buffers_evicted - were evicted
7761 * - buffers_flushed - were flushed
7762 * - buffers_skipped - could not be evicted
7763 */
7764void
7767{
7768 *buffers_evicted = 0;
7769 *buffers_skipped = 0;
7770 *buffers_flushed = 0;
7771
7772 for (int buf = 1; buf <= NBuffers; buf++)
7773 {
7774 BufferDesc *desc = GetBufferDescriptor(buf - 1);
7776 bool buffer_flushed;
7777
7779
7781 if (!(buf_state & BM_VALID))
7782 continue;
7783
7786
7787 LockBufHdr(desc);
7788
7790 (*buffers_evicted)++;
7791 else
7792 (*buffers_skipped)++;
7793
7794 if (buffer_flushed)
7795 (*buffers_flushed)++;
7796 }
7797}
7798
7799/*
7800 * Try to evict all the shared buffers containing provided relation's pages.
7801 *
7802 * This function is intended for testing/development use only! See
7803 * EvictUnpinnedBuffer().
7804 *
7805 * The caller must hold at least AccessShareLock on the relation to prevent
7806 * the relation from being dropped.
7807 *
7808 * The buffers_* parameters are mandatory and indicate the total count of
7809 * buffers that:
7810 * - buffers_evicted - were evicted
7811 * - buffers_flushed - were flushed
7812 * - buffers_skipped - could not be evicted
7813 */
7814void
7817{
7819
7820 *buffers_skipped = 0;
7821 *buffers_evicted = 0;
7822 *buffers_flushed = 0;
7823
7824 for (int buf = 1; buf <= NBuffers; buf++)
7825 {
7826 BufferDesc *desc = GetBufferDescriptor(buf - 1);
7828 bool buffer_flushed;
7829
7831
7832 /* An unlocked precheck should be safe and saves some cycles. */
7833 if ((buf_state & BM_VALID) == 0 ||
7835 continue;
7836
7837 /* Make sure we can pin the buffer. */
7840
7841 buf_state = LockBufHdr(desc);
7842
7843 /* recheck, could have changed without the lock */
7844 if ((buf_state & BM_VALID) == 0 ||
7846 {
7847 UnlockBufHdr(desc);
7848 continue;
7849 }
7850
7852 (*buffers_evicted)++;
7853 else
7854 (*buffers_skipped)++;
7855
7856 if (buffer_flushed)
7857 (*buffers_flushed)++;
7858 }
7859}
7860
7861/*
7862 * Helper function to mark unpinned buffer dirty whose buffer header lock is
7863 * already acquired.
7864 */
7865static bool
7868{
7870 bool result = false;
7871
7872 *buffer_already_dirty = false;
7873
7876
7877 if ((buf_state & BM_VALID) == 0)
7878 {
7879 UnlockBufHdr(desc);
7880 return false;
7881 }
7882
7883 /* Check that it's not pinned already. */
7885 {
7886 UnlockBufHdr(desc);
7887 return false;
7888 }
7889
7890 /* Pin the buffer and then release the buffer spinlock */
7891 PinBuffer_Locked(desc);
7892
7893 /* If it was not already dirty, mark it as dirty. */
7894 if (!(buf_state & BM_DIRTY))
7895 {
7898 result = true;
7899 BufferLockUnlock(buf, desc);
7900 }
7901 else
7902 *buffer_already_dirty = true;
7903
7904 UnpinBuffer(desc);
7905
7906 return result;
7907}
7908
7909/*
7910 * Try to mark the provided shared buffer as dirty.
7911 *
7912 * This function is intended for testing/development use only!
7913 *
7914 * Same as EvictUnpinnedBuffer() but with MarkBufferDirty() call inside.
7915 *
7916 * The buffer_already_dirty parameter is mandatory and indicate if the buffer
7917 * could not be dirtied because it is already dirty.
7918 *
7919 * Returns true if the buffer has successfully been marked as dirty.
7920 */
7921bool
7923{
7924 BufferDesc *desc;
7925 bool buffer_dirtied = false;
7926
7928
7929 /* Make sure we can pin the buffer. */
7932
7933 desc = GetBufferDescriptor(buf - 1);
7934 LockBufHdr(desc);
7935
7937 /* Both can not be true at the same time */
7939
7940 return buffer_dirtied;
7941}
7942
7943/*
7944 * Try to mark all the shared buffers containing provided relation's pages as
7945 * dirty.
7946 *
7947 * This function is intended for testing/development use only! See
7948 * MarkDirtyUnpinnedBuffer().
7949 *
7950 * The buffers_* parameters are mandatory and indicate the total count of
7951 * buffers that:
7952 * - buffers_dirtied - were dirtied
7953 * - buffers_already_dirty - were already dirty
7954 * - buffers_skipped - could not be dirtied because of a reason different
7955 * than a buffer being already dirty.
7956 */
7957void
7962{
7964
7965 *buffers_dirtied = 0;
7967 *buffers_skipped = 0;
7968
7969 for (int buf = 1; buf <= NBuffers; buf++)
7970 {
7971 BufferDesc *desc = GetBufferDescriptor(buf - 1);
7974
7976
7977 /* An unlocked precheck should be safe and saves some cycles. */
7978 if ((buf_state & BM_VALID) == 0 ||
7980 continue;
7981
7982 /* Make sure we can pin the buffer. */
7985
7986 buf_state = LockBufHdr(desc);
7987
7988 /* recheck, could have changed without the lock */
7989 if ((buf_state & BM_VALID) == 0 ||
7991 {
7992 UnlockBufHdr(desc);
7993 continue;
7994 }
7995
7997 (*buffers_dirtied)++;
7998 else if (buffer_already_dirty)
7999 (*buffers_already_dirty)++;
8000 else
8001 (*buffers_skipped)++;
8002 }
8003}
8004
8005/*
8006 * Try to mark all the shared buffers as dirty.
8007 *
8008 * This function is intended for testing/development use only! See
8009 * MarkDirtyUnpinnedBuffer().
8010 *
8011 * See MarkDirtyRelUnpinnedBuffers() above for details about the buffers_*
8012 * parameters.
8013 */
8014void
8018{
8019 *buffers_dirtied = 0;
8021 *buffers_skipped = 0;
8022
8023 for (int buf = 1; buf <= NBuffers; buf++)
8024 {
8025 BufferDesc *desc = GetBufferDescriptor(buf - 1);
8028
8030
8032 if (!(buf_state & BM_VALID))
8033 continue;
8034
8037
8038 LockBufHdr(desc);
8039
8041 (*buffers_dirtied)++;
8042 else if (buffer_already_dirty)
8043 (*buffers_already_dirty)++;
8044 else
8045 (*buffers_skipped)++;
8046 }
8047}
8048
8049/*
8050 * Generic implementation of the AIO handle staging callback for readv/writev
8051 * on local/shared buffers.
8052 *
8053 * Each readv/writev can target multiple buffers. The buffers have already
8054 * been registered with the IO handle.
8055 *
8056 * To make the IO ready for execution ("staging"), we need to ensure that the
8057 * targeted buffers are in an appropriate state while the IO is ongoing. For
8058 * that the AIO subsystem needs to have its own buffer pin, otherwise an error
8059 * in this backend could lead to this backend's buffer pin being released as
8060 * part of error handling, which in turn could lead to the buffer being
8061 * replaced while IO is ongoing.
8062 */
8065{
8066 uint64 *io_data;
8067 uint8 handle_data_len;
8070
8071 io_data = pgaio_io_get_handle_data(ioh, &handle_data_len);
8072
8074
8075 /* iterate over all buffers affected by the vectored readv/writev */
8076 for (int i = 0; i < handle_data_len; i++)
8077 {
8079 BufferDesc *buf_hdr = is_temp ?
8083
8084 /*
8085 * Check that all the buffers are actually ones that could conceivably
8086 * be done in one IO, i.e. are sequential. This is the last
8087 * buffer-aware code before IO is actually executed and confusion
8088 * about which buffers are targeted by IO can be hard to debug, making
8089 * it worth doing extra-paranoid checks.
8090 */
8091 if (i == 0)
8092 first = buf_hdr->tag;
8093 else
8094 {
8095 Assert(buf_hdr->tag.relNumber == first.relNumber);
8096 Assert(buf_hdr->tag.blockNum == first.blockNum + i);
8097 }
8098
8099 if (is_temp)
8101 else
8103
8104 /* verify the buffer is in the expected state */
8106 if (is_write)
8107 {
8110 }
8111 else
8112 {
8115 }
8116
8117 /* temp buffers don't use BM_IO_IN_PROGRESS */
8118 if (!is_temp)
8120
8122
8123 /*
8124 * Reflect that the buffer is now owned by the AIO subsystem.
8125 *
8126 * For local buffers: This can't be done just via LocalRefCount, as
8127 * one might initially think, as this backend could error out while
8128 * AIO is still in progress, releasing all the pins by the backend
8129 * itself.
8130 *
8131 * This pin is released again in TerminateBufferIO().
8132 */
8133 buf_hdr->io_wref = io_ref;
8134
8135 if (is_temp)
8136 {
8139 }
8140 else
8142
8143 /*
8144 * Ensure the content lock that prevents buffer modifications while
8145 * the buffer is being written out is not released early due to an
8146 * error.
8147 */
8148 if (is_write && !is_temp)
8149 {
8151
8152 /*
8153 * Lock is now owned by AIO subsystem.
8154 */
8156 }
8157
8158 /*
8159 * Stop tracking this buffer via the resowner - the AIO system now
8160 * keeps track.
8161 */
8162 if (!is_temp)
8164 }
8165}
8166
8167/*
8168 * Decode readv errors as encoded by buffer_readv_encode_error().
8169 */
8170static inline void
8172 bool *zeroed_any,
8173 bool *ignored_any,
8177{
8178 uint32 rem_error = result.error_data;
8179
8180 /* see static asserts in buffer_readv_encode_error */
8181#define READV_COUNT_BITS 7
8182#define READV_COUNT_MASK ((1 << READV_COUNT_BITS) - 1)
8183
8184 *zeroed_any = rem_error & 1;
8185 rem_error >>= 1;
8186
8187 *ignored_any = rem_error & 1;
8188 rem_error >>= 1;
8189
8192
8195
8198}
8199
8200/*
8201 * Helper to encode errors for buffer_readv_complete()
8202 *
8203 * Errors are encoded as follows:
8204 * - bit 0 indicates whether any page was zeroed (1) or not (0)
8205 * - bit 1 indicates whether any checksum failure was ignored (1) or not (0)
8206 * - next READV_COUNT_BITS bits indicate the number of errored or zeroed pages
8207 * - next READV_COUNT_BITS bits indicate the number of checksum failures
8208 * - next READV_COUNT_BITS bits indicate the first offset of the first page
8209 * that was errored or zeroed or, if no errors/zeroes, the first ignored
8210 * checksum
8211 */
8212static inline void
8214 bool is_temp,
8215 bool zeroed_any,
8216 bool ignored_any,
8223{
8224
8225 uint8 shift = 0;
8229
8231 "PG_IOV_MAX is bigger than reserved space for error data");
8233 "PGAIO_RESULT_ERROR_BITS is insufficient for buffer_readv");
8234
8235 /*
8236 * We only have space to encode one offset - but luckily that's good
8237 * enough. If there is an error, the error is the interesting offset, same
8238 * with a zeroed buffer vs an ignored buffer.
8239 */
8240 if (error_count > 0)
8242 else if (zeroed_count > 0)
8244 else
8246
8247 Assert(!zeroed_any || error_count == 0);
8248
8249 result->error_data = 0;
8250
8251 result->error_data |= zeroed_any << shift;
8252 shift += 1;
8253
8254 result->error_data |= ignored_any << shift;
8255 shift += 1;
8256
8257 result->error_data |= ((uint32) zeroed_or_error_count) << shift;
8258 shift += READV_COUNT_BITS;
8259
8260 result->error_data |= ((uint32) checkfail_count) << shift;
8261 shift += READV_COUNT_BITS;
8262
8263 result->error_data |= ((uint32) first_off) << shift;
8264 shift += READV_COUNT_BITS;
8265
8266 result->id = is_temp ? PGAIO_HCB_LOCAL_BUFFER_READV :
8268
8269 if (error_count > 0)
8270 result->status = PGAIO_RS_ERROR;
8271 else
8272 result->status = PGAIO_RS_WARNING;
8273
8274 /*
8275 * The encoding is complicated enough to warrant cross-checking it against
8276 * the decode function.
8277 */
8278#ifdef USE_ASSERT_CHECKING
8279 {
8280 bool zeroed_any_2,
8285
8290 &first_off_2);
8296 }
8297#endif
8298
8299#undef READV_COUNT_BITS
8300#undef READV_COUNT_MASK
8301}
8302
8303/*
8304 * Helper for AIO readv completion callbacks, supporting both shared and temp
8305 * buffers. Gets called once for each buffer in a multi-page read.
8306 */
8309 uint8 flags, bool failed, bool is_temp,
8310 bool *buffer_invalid,
8311 bool *failed_checksum,
8312 bool *ignored_checksum,
8313 bool *zeroed_buffer)
8314{
8315 BufferDesc *buf_hdr = is_temp ?
8318 BufferTag tag = buf_hdr->tag;
8319 char *bufdata = BufferGetBlock(buffer);
8321 int piv_flags;
8322
8323 /* check that the buffer is in the expected state for a read */
8324#ifdef USE_ASSERT_CHECKING
8325 {
8327
8330 /* temp buffers don't use BM_IO_IN_PROGRESS */
8331 if (!is_temp)
8334 }
8335#endif
8336
8337 *buffer_invalid = false;
8338 *failed_checksum = false;
8339 *ignored_checksum = false;
8340 *zeroed_buffer = false;
8341
8342 /*
8343 * We ask PageIsVerified() to only log the message about checksum errors,
8344 * as the completion might be run in any backend (or IO workers). We will
8345 * report checksum errors in buffer_readv_report().
8346 */
8348
8349 /* the local zero_damaged_pages may differ from the definer's */
8352
8353 /* Check for garbage data. */
8354 if (!failed)
8355 {
8356 /*
8357 * If the buffer is not currently pinned by this backend, e.g. because
8358 * we're completing this IO after an error, the buffer data will have
8359 * been marked as inaccessible when the buffer was unpinned. The AIO
8360 * subsystem holds a pin, but that doesn't prevent the buffer from
8361 * having been marked as inaccessible. The completion might also be
8362 * executed in a different process.
8363 */
8364#ifdef USE_VALGRIND
8365 if (!BufferIsPinned(buffer))
8367#endif
8368
8369 if (!PageIsVerified((Page) bufdata, tag.blockNum, piv_flags,
8371 {
8372 if (flags & READ_BUFFERS_ZERO_ON_ERROR)
8373 {
8374 memset(bufdata, 0, BLCKSZ);
8375 *zeroed_buffer = true;
8376 }
8377 else
8378 {
8379 *buffer_invalid = true;
8380 /* mark buffer as having failed */
8381 failed = true;
8382 }
8383 }
8384 else if (*failed_checksum)
8385 *ignored_checksum = true;
8386
8387 /* undo what we did above */
8388#ifdef USE_VALGRIND
8389 if (!BufferIsPinned(buffer))
8391#endif
8392
8393 /*
8394 * Immediately log a message about the invalid page, but only to the
8395 * server log. The reason to do so immediately is that this may be
8396 * executed in a different backend than the one that originated the
8397 * request. The reason to do so immediately is that the originator
8398 * might not process the query result immediately (because it is busy
8399 * doing another part of query processing) or at all (e.g. if it was
8400 * cancelled or errored out due to another IO also failing). The
8401 * definer of the IO will emit an ERROR or WARNING when processing the
8402 * IO's results
8403 *
8404 * To avoid duplicating the code to emit these log messages, we reuse
8405 * buffer_readv_report().
8406 */
8408 {
8409 PgAioResult result_one = {0};
8410
8415 *zeroed_buffer ? 1 : 0,
8416 *failed_checksum ? 1 : 0,
8419 }
8420 }
8421
8422 /* Terminate I/O and set BM_VALID. */
8423 set_flag_bits = failed ? BM_IO_ERROR : BM_VALID;
8424 if (is_temp)
8426 else
8427 TerminateBufferIO(buf_hdr, false, set_flag_bits, false, true);
8428
8429 /*
8430 * Call the BUFFER_READ_DONE tracepoint in the callback, even though the
8431 * callback may not be executed in the same backend that called
8432 * BUFFER_READ_START. The alternative would be to defer calling the
8433 * tracepoint to a later point (e.g. the local completion callback for
8434 * shared buffer reads), which seems even less helpful.
8435 */
8437 tag.blockNum,
8438 tag.spcOid,
8439 tag.dbOid,
8440 tag.relNumber,
8442 false);
8443}
8444
8445/*
8446 * Perform completion handling of a single AIO read. This read may cover
8447 * multiple blocks / buffers.
8448 *
8449 * Shared between shared and local buffers, to reduce code duplication.
8450 */
8453 uint8 cb_data, bool is_temp)
8454{
8455 PgAioResult result = prior_result;
8460 uint8 error_count = 0;
8461 uint8 zeroed_count = 0;
8462 uint8 ignored_count = 0;
8464 uint64 *io_data;
8465 uint8 handle_data_len;
8466
8467 if (is_temp)
8468 {
8469 Assert(td->smgr.is_temp);
8471 }
8472 else
8473 Assert(!td->smgr.is_temp);
8474
8475 /*
8476 * Iterate over all the buffers affected by this IO and call the
8477 * per-buffer completion function for each buffer.
8478 */
8479 io_data = pgaio_io_get_handle_data(ioh, &handle_data_len);
8480 for (uint8 buf_off = 0; buf_off < handle_data_len; buf_off++)
8481 {
8483 bool failed;
8484 bool failed_verification = false;
8485 bool failed_checksum = false;
8486 bool zeroed_buffer = false;
8487 bool ignored_checksum = false;
8488
8490
8491 /*
8492 * If the entire I/O failed on a lower-level, each buffer needs to be
8493 * marked as failed. In case of a partial read, the first few buffers
8494 * may be ok.
8495 */
8496 failed =
8498 || prior_result.result <= buf_off;
8499
8500 buffer_readv_complete_one(td, buf_off, buf, cb_data, failed, is_temp,
8504 &zeroed_buffer);
8505
8506 /*
8507 * Track information about the number of different kinds of error
8508 * conditions across all pages, as there can be multiple pages failing
8509 * verification as part of one IO.
8510 */
8513 if (zeroed_buffer && zeroed_count++ == 0)
8515 if (ignored_checksum && ignored_count++ == 0)
8517 if (failed_checksum)
8519 }
8520
8521 /*
8522 * If the smgr read succeeded [partially] and page verification failed for
8523 * some of the pages, adjust the IO's result state appropriately.
8524 */
8525 if (prior_result.status != PGAIO_RS_ERROR &&
8526 (error_count > 0 || ignored_count > 0 || zeroed_count > 0))
8527 {
8528 buffer_readv_encode_error(&result, is_temp,
8529 zeroed_count > 0, ignored_count > 0,
8533 pgaio_result_report(result, td, DEBUG1);
8534 }
8535
8536 /*
8537 * For shared relations this reporting is done in
8538 * shared_buffer_readv_complete_local().
8539 */
8540 if (is_temp && checkfail_count > 0)
8543
8544 return result;
8545}
8546
8547/*
8548 * AIO error reporting callback for aio_shared_buffer_readv_cb and
8549 * aio_local_buffer_readv_cb.
8550 *
8551 * The error is encoded / decoded in buffer_readv_encode_error() /
8552 * buffer_readv_decode_error().
8553 */
8554static void
8556 int elevel)
8557{
8558 int nblocks = td->smgr.nblocks;
8559 BlockNumber first = td->smgr.blockNum;
8560 BlockNumber last = first + nblocks - 1;
8563 RelPathStr rpath =
8565 bool zeroed_any,
8569 first_off;
8571 const char *msg_one,
8572 *msg_mult,
8573 *det_mult,
8574 *hint_mult;
8575
8579 &first_off);
8580
8581 /*
8582 * Treat a read that had both zeroed buffers *and* ignored checksums as a
8583 * special case, it's too irregular to be emitted the same way as the
8584 * other cases.
8585 */
8586 if (zeroed_any && ignored_any)
8587 {
8589 Assert(nblocks > 1); /* same block can't be both zeroed and ignored */
8590 Assert(result.status != PGAIO_RS_ERROR);
8592
8593 ereport(elevel,
8595 errmsg("zeroing %u page(s) and ignoring %u checksum failure(s) among blocks %u..%u of relation \"%s\"",
8596 affected_count, checkfail_count, first, last, rpath.str),
8597 affected_count > 1 ?
8598 errdetail("Block %u held the first zeroed page.",
8599 first + first_off) : 0,
8600 errhint_plural("See server log for details about the other %d invalid block.",
8601 "See server log for details about the other %d invalid blocks.",
8604 return;
8605 }
8606
8607 /*
8608 * The other messages are highly repetitive. To avoid duplicating a long
8609 * and complicated ereport(), gather the translated format strings
8610 * separately and then do one common ereport.
8611 */
8612 if (result.status == PGAIO_RS_ERROR)
8613 {
8614 Assert(!zeroed_any); /* can't have invalid pages when zeroing them */
8616 msg_one = _("invalid page in block %u of relation \"%s\"");
8617 msg_mult = _("%u invalid pages among blocks %u..%u of relation \"%s\"");
8618 det_mult = _("Block %u held the first invalid page.");
8619 hint_mult = _("See server log for the other %u invalid block(s).");
8620 }
8621 else if (zeroed_any && !ignored_any)
8622 {
8624 msg_one = _("invalid page in block %u of relation \"%s\"; zeroing out page");
8625 msg_mult = _("zeroing out %u invalid pages among blocks %u..%u of relation \"%s\"");
8626 det_mult = _("Block %u held the first zeroed page.");
8627 hint_mult = _("See server log for the other %u zeroed block(s).");
8628 }
8629 else if (!zeroed_any && ignored_any)
8630 {
8632 msg_one = _("ignoring checksum failure in block %u of relation \"%s\"");
8633 msg_mult = _("ignoring %u checksum failures among blocks %u..%u of relation \"%s\"");
8634 det_mult = _("Block %u held the first ignored page.");
8635 hint_mult = _("See server log for the other %u ignored block(s).");
8636 }
8637 else
8639
8640 ereport(elevel,
8642 affected_count == 1 ?
8643 errmsg_internal(msg_one, first + first_off, rpath.str) :
8644 errmsg_internal(msg_mult, affected_count, first, last, rpath.str),
8647}
8648
8649static void
8654
8655static PgAioResult
8661
8662/*
8663 * We need a backend-local completion callback for shared buffers, to be able
8664 * to report checksum errors correctly. Unfortunately that can only safely
8665 * happen if the reporting backend has previously called
8666 * pgstat_prepare_report_checksum_failure(), which we can only guarantee in
8667 * the backend that started the IO. Hence this callback.
8668 */
8669static PgAioResult
8699
8700static void
8705
8706static PgAioResult
8712
8713/* readv callback is passed READ_BUFFERS_* flags as callback data */
8716 .complete_shared = shared_buffer_readv_complete,
8717 /* need a local callback to report checksum failures */
8718 .complete_local = shared_buffer_readv_complete_local,
8719 .report = buffer_readv_report,
8720};
8721
8722/* readv callback is passed READ_BUFFERS_* flags as callback data */
8725
8726 /*
8727 * Note that this, in contrast to the shared_buffers case, uses
8728 * complete_local, as only the issuing backend has access to the required
8729 * datastructures. This is important in case the IO completion may be
8730 * consumed incidentally by another backend.
8731 */
8732 .complete_local = local_buffer_readv_complete,
8733 .report = buffer_readv_report,
8734};
int io_method
Definition aio.c:74
bool pgaio_wref_valid(PgAioWaitRef *iow)
Definition aio.c:971
PgAioHandle * pgaio_io_acquire(struct ResourceOwnerData *resowner, PgAioReturn *ret)
Definition aio.c:162
void pgaio_wref_clear(PgAioWaitRef *iow)
Definition aio.c:964
void pgaio_io_get_wref(PgAioHandle *ioh, PgAioWaitRef *iow)
Definition aio.c:366
void pgaio_io_set_flag(PgAioHandle *ioh, PgAioHandleFlags flag)
Definition aio.c:330
bool pgaio_have_staged(void)
Definition aio.c:1107
bool pgaio_wref_check_done(PgAioWaitRef *iow)
Definition aio.c:1005
ProcNumber pgaio_io_get_owner(PgAioHandle *ioh)
Definition aio.c:355
void pgaio_submit_staged(void)
Definition aio.c:1123
void pgaio_wref_wait(PgAioWaitRef *iow)
Definition aio.c:991
void pgaio_io_release(PgAioHandle *ioh)
Definition aio.c:240
PgAioHandle * pgaio_io_acquire_nb(struct ResourceOwnerData *resowner, PgAioReturn *ret)
Definition aio.c:188
@ PGAIO_HCB_LOCAL_BUFFER_READV
Definition aio.h:200
@ PGAIO_HCB_SHARED_BUFFER_READV
Definition aio.h:198
@ IOMETHOD_SYNC
Definition aio.h:34
@ PGAIO_HF_SYNCHRONOUS
Definition aio.h:70
@ PGAIO_HF_REFERENCES_LOCAL
Definition aio.h:60
void pgaio_io_set_handle_data_32(PgAioHandle *ioh, uint32 *data, uint8 len)
void pgaio_io_register_callbacks(PgAioHandle *ioh, PgAioHandleCallbackID cb_id, uint8 cb_data)
uint64 * pgaio_io_get_handle_data(PgAioHandle *ioh, uint8 *len)
void pgaio_result_report(PgAioResult result, const PgAioTargetData *target_data, int elevel)
PgAioTargetData * pgaio_io_get_target_data(PgAioHandle *ioh)
Definition aio_target.c:73
#define PGAIO_RESULT_ERROR_BITS
Definition aio_types.h:98
PgAioResultStatus
Definition aio_types.h:79
@ PGAIO_RS_OK
Definition aio_types.h:81
@ PGAIO_RS_UNKNOWN
Definition aio_types.h:80
@ PGAIO_RS_PARTIAL
Definition aio_types.h:82
@ PGAIO_RS_ERROR
Definition aio_types.h:84
@ PGAIO_RS_WARNING
Definition aio_types.h:83
static bool pg_atomic_compare_exchange_u64(volatile pg_atomic_uint64 *ptr, uint64 *expected, uint64 newval)
Definition atomics.h:522
#define pg_write_barrier()
Definition atomics.h:155
static void pg_atomic_unlocked_write_u64(volatile pg_atomic_uint64 *ptr, uint64 val)
Definition atomics.h:494
static uint64 pg_atomic_sub_fetch_u64(volatile pg_atomic_uint64 *ptr, int64 sub_)
Definition atomics.h:578
static uint64 pg_atomic_fetch_and_u64(volatile pg_atomic_uint64 *ptr, uint64 and_)
Definition atomics.h:551
static uint64 pg_atomic_fetch_or_u64(volatile pg_atomic_uint64 *ptr, uint64 or_)
Definition atomics.h:560
static uint64 pg_atomic_read_u64(volatile pg_atomic_uint64 *ptr)
Definition atomics.h:467
static uint64 pg_atomic_fetch_sub_u64(volatile pg_atomic_uint64 *ptr, int64 sub_)
Definition atomics.h:541
bool TimestampDifferenceExceeds(TimestampTz start_time, TimestampTz stop_time, int msec)
Definition timestamp.c:1772
TimestampTz GetCurrentTimestamp(void)
Definition timestamp.c:1636
Datum now(PG_FUNCTION_ARGS)
Definition timestamp.c:1600
int BgWriterDelay
Definition bgwriter.c:59
void binaryheap_build(binaryheap *heap)
Definition binaryheap.c:136
void binaryheap_replace_first(binaryheap *heap, bh_node_type d)
Definition binaryheap.c:253
bh_node_type binaryheap_first(binaryheap *heap)
Definition binaryheap.c:175
bh_node_type binaryheap_remove_first(binaryheap *heap)
Definition binaryheap.c:190
void binaryheap_free(binaryheap *heap)
Definition binaryheap.c:73
void binaryheap_add_unordered(binaryheap *heap, bh_node_type d)
Definition binaryheap.c:114
binaryheap * binaryheap_allocate(int capacity, binaryheap_comparator compare, void *arg)
Definition binaryheap.c:37
#define binaryheap_empty(h)
Definition binaryheap.h:65
uint32 BlockNumber
Definition block.h:31
#define InvalidBlockNumber
Definition block.h:33
static bool BlockNumberIsValid(BlockNumber blockNumber)
Definition block.h:71
#define MaxBlockNumber
Definition block.h:35
static int32 next
Definition blutils.c:225
int Buffer
Definition buf.h:23
#define InvalidBuffer
Definition buf.h:25
#define BufferIsLocal(buffer)
Definition buf.h:37
CkptSortItem * CkptBufferIds
Definition buf_init.c:26
WritebackContext BackendWritebackContext
Definition buf_init.c:25
#define BM_MAX_USAGE_COUNT
static void InitBufferTag(BufferTag *tag, const RelFileLocator *rlocator, ForkNumber forkNum, BlockNumber blockNum)
#define BM_TAG_VALID
#define BM_PERMANENT
#define BUF_USAGECOUNT_MASK
static ForkNumber BufTagGetForkNum(const BufferTag *tag)
#define BM_LOCK_VAL_SHARED
#define BUF_REFCOUNT_ONE
static ConditionVariable * BufferDescriptorGetIOCV(const BufferDesc *bdesc)
static uint64 UnlockBufHdrExt(BufferDesc *desc, uint64 old_buf_state, uint64 set_bits, uint64 unset_bits, int refcount_change)
static bool BufferTagsEqual(const BufferTag *tag1, const BufferTag *tag2)
static RelFileNumber BufTagGetRelNumber(const BufferTag *tag)
static void UnlockBufHdr(BufferDesc *desc)
#define BM_LOCK_VAL_EXCLUSIVE
static bool BufTagMatchesRelFileLocator(const BufferTag *tag, const RelFileLocator *rlocator)
#define BUF_FLAG_MASK
#define BM_PIN_COUNT_WAITER
#define BM_DIRTY
#define BM_LOCK_WAKE_IN_PROGRESS
static void ResourceOwnerRememberBufferIO(ResourceOwner owner, Buffer buffer)
#define BM_LOCKED
#define BUF_STATE_GET_USAGECOUNT(state)
#define BM_LOCK_MASK
static void ResourceOwnerForgetBufferIO(ResourceOwner owner, Buffer buffer)
#define BM_IO_IN_PROGRESS
static void ClearBufferTag(BufferTag *tag)
static void ResourceOwnerRememberBuffer(ResourceOwner owner, Buffer buffer)
static void ResourceOwnerForgetBuffer(ResourceOwner owner, Buffer buffer)
#define BUF_USAGECOUNT_ONE
#define BUF_STATE_GET_REFCOUNT(state)
static LWLock * BufMappingPartitionLock(uint32 hashcode)
static RelFileLocator BufTagGetRelFileLocator(const BufferTag *tag)
#define BM_VALID
#define BM_LOCK_HAS_WAITERS
#define BM_IO_ERROR
static BufferDesc * GetLocalBufferDescriptor(uint32 id)
static BufferDesc * GetBufferDescriptor(uint32 id)
#define BM_LOCK_VAL_SHARE_EXCLUSIVE
static Buffer BufferDescriptorGetBuffer(const BufferDesc *bdesc)
#define BM_CHECKPOINT_NEEDED
void BufTableDelete(BufferTag *tagPtr, uint32 hashcode)
Definition buf_table.c:148
int BufTableLookup(BufferTag *tagPtr, uint32 hashcode)
Definition buf_table.c:90
uint32 BufTableHashCode(BufferTag *tagPtr)
Definition buf_table.c:78
int BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id)
Definition buf_table.c:118
bool track_io_timing
Definition bufmgr.c:192
static void ResOwnerReleaseBuffer(Datum res)
Definition bufmgr.c:7628
void CheckBufferIsPinnedOnce(Buffer buffer)
Definition bufmgr.c:6504
void FlushRelationsAllBuffers(SMgrRelation *smgrs, int nrels)
Definition bufmgr.c:5169
void IncrBufferRefCount(Buffer buffer)
Definition bufmgr.c:5537
static void MarkSharedBufferDirtyHint(Buffer buffer, BufferDesc *bufHdr, uint64 lockstate, bool buffer_std)
Definition bufmgr.c:5563
void DropDatabaseBuffers(Oid dbid)
Definition bufmgr.c:5034
bool BufferSetHintBits16(uint16 *ptr, uint16 val, Buffer buffer)
Definition bufmgr.c:6960
static int ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b)
Definition bufmgr.c:7407
static pg_attribute_always_inline PgAioResult buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data, bool is_temp)
Definition bufmgr.c:8452
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition bufmgr.c:4357
static PrivateRefCountEntry * NewPrivateRefCountEntry(Buffer buffer)
Definition bufmgr.c:388
static Buffer PrivateRefCountArrayKeys[REFCOUNT_ARRAY_ENTRIES]
Definition bufmgr.c:263
void BufferFinishSetHintBits(Buffer buffer, bool mark_dirty, bool buffer_std)
Definition bufmgr.c:6937
static bool ReadBuffersCanStartIO(Buffer buffer, bool nowait)
Definition bufmgr.c:1674
void DropRelationBuffers(SMgrRelation smgr_reln, ForkNumber *forkNum, int nforks, BlockNumber *firstDelBlock)
Definition bufmgr.c:4684
Buffer ReleaseAndReadBuffer(Buffer buffer, Relation relation, BlockNumber blockNum)
Definition bufmgr.c:3128
static int ReservedRefCountSlot
Definition bufmgr.c:268
static PgAioResult shared_buffer_readv_complete_local(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
Definition bufmgr.c:8670
static pg_attribute_always_inline bool StartReadBuffersImpl(ReadBuffersOperation *operation, Buffer *buffers, BlockNumber blockNum, int *nblocks, int flags, bool allow_forwarding)
Definition bufmgr.c:1372
static void CheckReadBuffersOperation(ReadBuffersOperation *operation, bool is_complete)
Definition bufmgr.c:1637
PrefetchBufferResult PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
Definition bufmgr.c:782
static uint32 PrivateRefCountClock
Definition bufmgr.c:267
static void FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object, IOContext io_context)
Definition bufmgr.c:4414
static void ResOwnerReleaseBufferIO(Datum res)
Definition bufmgr.c:7606
static PgAioResult local_buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
Definition bufmgr.c:8707
bool StartReadBuffers(ReadBuffersOperation *operation, Buffer *buffers, BlockNumber blockNum, int *nblocks, int flags)
Definition bufmgr.c:1599
void EvictAllUnpinnedBuffers(int32 *buffers_evicted, int32 *buffers_flushed, int32 *buffers_skipped)
Definition bufmgr.c:7765
int io_max_combine_limit
Definition bufmgr.c:217
static void FlushUnlockedBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object, IOContext io_context)
Definition bufmgr.c:4545
const ResourceOwnerDesc buffer_io_resowner_desc
Definition bufmgr.c:285
bool zero_damaged_pages
Definition bufmgr.c:189
#define BUF_DROP_FULL_SCAN_THRESHOLD
Definition bufmgr.c:95
static void PinBuffer_Locked(BufferDesc *buf)
Definition bufmgr.c:3299
void EvictRelUnpinnedBuffers(Relation rel, int32 *buffers_evicted, int32 *buffers_flushed, int32 *buffers_skipped)
Definition bufmgr.c:7815
static pg_attribute_always_inline void buffer_readv_complete_one(PgAioTargetData *td, uint8 buf_off, Buffer buffer, uint8 flags, bool failed, bool is_temp, bool *buffer_invalid, bool *failed_checksum, bool *ignored_checksum, bool *zeroed_buffer)
Definition bufmgr.c:8308
static char * ResOwnerPrintBuffer(Datum res)
Definition bufmgr.c:7664
static void BufferLockAcquire(Buffer buffer, BufferDesc *buf_hdr, BufferLockMode mode)
Definition bufmgr.c:5765
static bool BufferLockAttempt(BufferDesc *buf_hdr, BufferLockMode mode)
Definition bufmgr.c:5963
static int buffertag_comparator(const BufferTag *ba, const BufferTag *bb)
Definition bufmgr.c:7373
bool IsBufferCleanupOK(Buffer buffer)
Definition bufmgr.c:6768
#define BufferGetLSN(bufHdr)
Definition bufmgr.c:77
static char * ResOwnerPrintBufferIO(Datum res)
Definition bufmgr.c:7614
bool BufferIsLockedByMeInMode(Buffer buffer, BufferLockMode mode)
Definition bufmgr.c:3003
static void BufferLockDisown(Buffer buffer, BufferDesc *buf_hdr)
Definition bufmgr.c:6138
Buffer ExtendBufferedRel(BufferManagerRelation bmr, ForkNumber forkNum, BufferAccessStrategy strategy, uint32 flags)
Definition bufmgr.c:974
void AtEOXact_Buffers(bool isCommit)
Definition bufmgr.c:4110
static void AbortBufferIO(Buffer buffer)
Definition bufmgr.c:7203
const PgAioHandleCallbacks aio_shared_buffer_readv_cb
Definition bufmgr.c:8714
static void BufferLockUnlock(Buffer buffer, BufferDesc *buf_hdr)
Definition bufmgr.c:5881
BlockNumber ExtendBufferedRelBy(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, Buffer *buffers, uint32 *extended_by)
Definition bufmgr.c:1006
static Buffer ReadBuffer_common(Relation rel, SMgrRelation smgr, char smgr_persistence, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
Definition bufmgr.c:1303
static void BufferLockWakeup(BufferDesc *buf_hdr, bool unlocked)
Definition bufmgr.c:6172
static void ProcessReadBuffersResult(ReadBuffersOperation *operation)
Definition bufmgr.c:1703
pg_noinline uint64 WaitBufHdrUnlocked(BufferDesc *buf)
Definition bufmgr.c:7349
static void ZeroAndLockBuffer(Buffer buffer, ReadBufferMode mode, bool already_valid)
Definition bufmgr.c:1141
static BufferDesc * BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, bool *foundPtr, IOContext io_context)
Definition bufmgr.c:2110
static void CheckForBufferLeaks(void)
Definition bufmgr.c:4174
static bool ReadBuffersCanStartIOOnce(Buffer buffer, bool nowait)
Definition bufmgr.c:1661
void CreateAndCopyRelationData(RelFileLocator src_rlocator, RelFileLocator dst_rlocator, bool permanent)
Definition bufmgr.c:5381
void DropRelationsAllBuffers(SMgrRelation *smgr_reln, int nlocators)
Definition bufmgr.c:4804
static void BufferLockDequeueSelf(BufferDesc *buf_hdr)
Definition bufmgr.c:6070
static int rlocator_comparator(const void *p1, const void *p2)
Definition bufmgr.c:7274
static bool BufferLockHeldByMeInMode(BufferDesc *buf_hdr, BufferLockMode mode)
Definition bufmgr.c:6392
Buffer ExtendBufferedRelTo(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, BlockNumber extend_to, ReadBufferMode mode)
Definition bufmgr.c:1035
const PgAioHandleCallbacks aio_local_buffer_readv_cb
Definition bufmgr.c:8723
static bool InvalidateVictimBuffer(BufferDesc *buf_hdr)
Definition bufmgr.c:2384
static void AtProcExit_Buffers(int code, Datum arg)
Definition bufmgr.c:4156
int io_combine_limit_guc
Definition bufmgr.c:216
static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg)
Definition bufmgr.c:7438
void BufferGetTag(Buffer buffer, RelFileLocator *rlocator, ForkNumber *forknum, BlockNumber *blknum)
Definition bufmgr.c:4378
#define BufHdrGetBlock(bufHdr)
Definition bufmgr.c:76
static bool BufferLockConditional(Buffer buffer, BufferDesc *buf_hdr, BufferLockMode mode)
Definition bufmgr.c:5917
const ResourceOwnerDesc buffer_resowner_desc
Definition bufmgr.c:294
static refcount_hash * PrivateRefCountHash
Definition bufmgr.c:265
static pg_attribute_always_inline void buffer_stage_common(PgAioHandle *ioh, bool is_write, bool is_temp)
Definition bufmgr.c:8064
void UnlockBuffer(Buffer buffer)
Definition bufmgr.c:6425
#define BUF_REUSABLE
Definition bufmgr.c:85
static void local_buffer_write_error_callback(void *arg)
Definition bufmgr.c:7258
static void BufferSync(int flags)
Definition bufmgr.c:3463
static bool AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress)
Definition bufmgr.c:1874
static void local_buffer_readv_stage(PgAioHandle *ioh, uint8 cb_data)
Definition bufmgr.c:8701
char * DebugPrintBufferRefcount(Buffer buffer)
Definition bufmgr.c:4300
void CheckPointBuffers(int flags)
Definition bufmgr.c:4343
bool BufferIsDirty(Buffer buffer)
Definition bufmgr.c:3030
static uint32 MaxProportionalPins
Definition bufmgr.c:271
static void BufferLockQueueSelf(BufferDesc *buf_hdr, BufferLockMode mode)
Definition bufmgr.c:6030
static BlockNumber ExtendBufferedRelShared(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
Definition bufmgr.c:2709
static int BufferLockDisownInternal(Buffer buffer, BufferDesc *buf_hdr)
Definition bufmgr.c:6152
bool BgBufferSync(WritebackContext *wb_context)
Definition bufmgr.c:3742
uint64 LockBufHdr(BufferDesc *desc)
Definition bufmgr.c:7301
static void WakePinCountWaiter(BufferDesc *buf)
Definition bufmgr.c:3331
bool BufferIsPermanent(Buffer buffer)
Definition bufmgr.c:4596
void MarkDirtyAllUnpinnedBuffers(int32 *buffers_dirtied, int32 *buffers_already_dirty, int32 *buffers_skipped)
Definition bufmgr.c:8015
#define REFCOUNT_ARRAY_ENTRIES
Definition bufmgr.c:145
static void shared_buffer_readv_stage(PgAioHandle *ioh, uint8 cb_data)
Definition bufmgr.c:8650
static void BufferLockProcessRelease(BufferDesc *buf_hdr, BufferLockMode mode, uint64 lockstate)
Definition bufmgr.c:6337
void UnlockBuffers(void)
Definition bufmgr.c:5719
PrefetchBufferResult PrefetchSharedBuffer(SMgrRelation smgr_reln, ForkNumber forkNum, BlockNumber blockNum)
Definition bufmgr.c:692
static PgAioResult shared_buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
Definition bufmgr.c:8656
static Buffer GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
Definition bufmgr.c:2461
bool ConditionalLockBuffer(Buffer buffer)
Definition bufmgr.c:6484
BlockNumber RelationGetNumberOfBlocksInFork(Relation relation, ForkNumber forkNum)
Definition bufmgr.c:4564
int bgwriter_flush_after
Definition bufmgr.c:224
void ReleaseBuffer(Buffer buffer)
Definition bufmgr.c:5505
bool BufferIsLockedByMe(Buffer buffer)
Definition bufmgr.c:2977
static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy, bool skip_if_not_valid)
Definition bufmgr.c:3188
static void FindAndDropRelationBuffers(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber nForkBlock, BlockNumber firstDelBlock)
Definition bufmgr.c:4974
XLogRecPtr BufferGetLSNAtomic(Buffer buffer)
Definition bufmgr.c:4632
void LockBufferInternal(Buffer buffer, BufferLockMode mode)
Definition bufmgr.c:6441
bool HoldingBufferPinThatDelaysRecovery(void)
Definition bufmgr.c:6684
bool MarkDirtyUnpinnedBuffer(Buffer buf, bool *buffer_already_dirty)
Definition bufmgr.c:7922
int checkpoint_flush_after
Definition bufmgr.c:223
void UnlockReleaseBuffer(Buffer buffer)
Definition bufmgr.c:5522
static pg_attribute_always_inline Buffer PinBufferForBlock(Relation rel, SMgrRelation smgr, char persistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, bool *foundPtr)
Definition bufmgr.c:1220
static void UnpinBufferNoOwner(BufferDesc *buf)
Definition bufmgr.c:3376
static void shared_buffer_write_error_callback(void *arg)
Definition bufmgr.c:7242
void ScheduleBufferTagForWriteback(WritebackContext *wb_context, IOContext io_context, BufferTag *tag)
Definition bufmgr.c:7473
void WaitReadBuffers(ReadBuffersOperation *operation)
Definition bufmgr.c:1742
void WritebackContextInit(WritebackContext *context, int *max_pending)
Definition bufmgr.c:7461
void MarkBufferDirty(Buffer buffer)
Definition bufmgr.c:3063
#define BufferIsPinned(bufnum)
Definition bufmgr.c:599
double bgwriter_lru_multiplier
Definition bufmgr.c:191
static bool EvictUnpinnedBufferInternal(BufferDesc *desc, bool *buffer_flushed)
Definition bufmgr.c:7674
bool BufferBeginSetHintBits(Buffer buffer)
Definition bufmgr.c:6909
int backend_flush_after
Definition bufmgr.c:225
void LimitAdditionalPins(uint32 *additional_pins)
Definition bufmgr.c:2647
static void buffer_readv_report(PgAioResult result, const PgAioTargetData *td, int elevel)
Definition bufmgr.c:8555
static void ReservePrivateRefCountEntry(void)
Definition bufmgr.c:309
static BufferDesc * PinCountWaitBuf
Definition bufmgr.c:228
static pg_noinline PrivateRefCountEntry * GetPrivateRefCountEntrySlow(Buffer buffer, bool do_move)
Definition bufmgr.c:419
static int32 GetPrivateRefCount(Buffer buffer)
Definition bufmgr.c:542
static BlockNumber ExtendBufferedRelCommon(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
Definition bufmgr.c:2665
void LockBufferForCleanup(Buffer buffer)
Definition bufmgr.c:6537
static bool SharedBufferBeginSetHintBits(Buffer buffer, BufferDesc *buf_hdr, uint64 *lockstate)
Definition bufmgr.c:6818
void MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
Definition bufmgr.c:5688
void FlushRelationBuffers(Relation rel)
Definition bufmgr.c:5081
#define READV_COUNT_BITS
static uint64 BufferLockReleaseSub(BufferLockMode mode)
Definition bufmgr.c:6308
void IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context)
Definition bufmgr.c:7523
static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref)
Definition bufmgr.c:565
bool EvictUnpinnedBuffer(Buffer buf, bool *buffer_flushed)
Definition bufmgr.c:7736
Buffer ReadBufferWithoutRelcache(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy, bool permanent)
Definition bufmgr.c:958
bool ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum, Buffer recent_buffer)
Definition bufmgr.c:813
#define RELS_BSEARCH_THRESHOLD
Definition bufmgr.c:87
int maintenance_io_concurrency
Definition bufmgr.c:207
static void UnpinBuffer(BufferDesc *buf)
Definition bufmgr.c:3367
void FlushDatabaseBuffers(Oid dbid)
Definition bufmgr.c:5445
static void InvalidateBuffer(BufferDesc *buf)
Definition bufmgr.c:2283
static void RelationCopyStorageUsingBuffer(RelFileLocator srclocator, RelFileLocator dstlocator, ForkNumber forkNum, bool permanent)
Definition bufmgr.c:5267
int effective_io_concurrency
Definition bufmgr.c:200
static PrivateRefCountEntry * GetPrivateRefCountEntry(Buffer buffer, bool do_move)
Definition bufmgr.c:507
static bool BufferLockHeldByMe(BufferDesc *buf_hdr)
Definition bufmgr.c:6410
void TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint64 set_flag_bits, bool forget_owner, bool release_aio)
Definition bufmgr.c:7141
bool StartBufferIO(BufferDesc *buf, bool forInput, bool nowait)
Definition bufmgr.c:7085
void MarkDirtyRelUnpinnedBuffers(Relation rel, int32 *buffers_dirtied, int32 *buffers_already_dirty, int32 *buffers_skipped)
Definition bufmgr.c:7958
bool StartReadBuffer(ReadBuffersOperation *operation, Buffer *buffer, BlockNumber blocknum, int flags)
Definition bufmgr.c:1618
Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
Definition bufmgr.c:921
static bool MarkDirtyUnpinnedBufferInternal(Buffer buf, BufferDesc *desc, bool *buffer_already_dirty)
Definition bufmgr.c:7866
static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES]
Definition bufmgr.c:264
static void buffer_readv_decode_error(PgAioResult result, bool *zeroed_any, bool *ignored_any, uint8 *zeroed_or_error_count, uint8 *checkfail_count, uint8 *first_off)
Definition bufmgr.c:8171
#define READV_COUNT_MASK
static int PrivateRefCountEntryLast
Definition bufmgr.c:269
int io_combine_limit
Definition bufmgr.c:215
void InitBufferManagerAccess(void)
Definition bufmgr.c:4127
static void buffer_readv_encode_error(PgAioResult *result, bool is_temp, bool zeroed_any, bool ignored_any, uint8 error_count, uint8 zeroed_count, uint8 checkfail_count, uint8 first_error_off, uint8 first_zeroed_off, uint8 first_ignored_off)
Definition bufmgr.c:8213
static int SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
Definition bufmgr.c:4040
uint32 GetAdditionalPinLimit(void)
Definition bufmgr.c:2621
Buffer ReadBuffer(Relation reln, BlockNumber blockNum)
Definition bufmgr.c:874
void TrackNewBufferPin(Buffer buf)
Definition bufmgr.c:3423
static int32 PrivateRefCountOverflowed
Definition bufmgr.c:266
bool ConditionalLockBufferForCleanup(Buffer buffer)
Definition bufmgr.c:6710
int bgwriter_lru_maxpages
Definition bufmgr.c:190
uint32 GetPinLimit(void)
Definition bufmgr.c:2609
static void WaitIO(BufferDesc *buf)
Definition bufmgr.c:7006
#define BUF_WRITTEN
Definition bufmgr.c:84
void FlushOneBuffer(Buffer buffer)
Definition bufmgr.c:5485
@ BAS_BULKREAD
Definition bufmgr.h:37
@ BAS_BULKWRITE
Definition bufmgr.h:39
#define P_NEW
Definition bufmgr.h:198
#define READ_BUFFERS_ZERO_ON_ERROR
Definition bufmgr.h:122
static Page BufferGetPage(Buffer buffer)
Definition bufmgr.h:470
#define DEFAULT_IO_COMBINE_LIMIT
Definition bufmgr.h:174
static Block BufferGetBlock(Buffer buffer)
Definition bufmgr.h:437
#define READ_BUFFERS_ISSUE_ADVICE
Definition bufmgr.h:124
BufferLockMode
Definition bufmgr.h:204
@ BUFFER_LOCK_SHARE_EXCLUSIVE
Definition bufmgr.h:215
@ BUFFER_LOCK_SHARE
Definition bufmgr.h:210
@ BUFFER_LOCK_EXCLUSIVE
Definition bufmgr.h:220
@ BUFFER_LOCK_UNLOCK
Definition bufmgr.h:205
#define MAX_IO_COMBINE_LIMIT
Definition bufmgr.h:173
#define DEFAULT_EFFECTIVE_IO_CONCURRENCY
Definition bufmgr.h:168
#define READ_BUFFERS_IGNORE_CHECKSUM_FAILURES
Definition bufmgr.h:126
#define DEFAULT_MAINTENANCE_IO_CONCURRENCY
Definition bufmgr.h:169
void * Block
Definition bufmgr.h:26
static void LockBuffer(Buffer buffer, BufferLockMode mode)
Definition bufmgr.h:332
#define BMR_GET_SMGR(bmr)
Definition bufmgr.h:118
@ EB_LOCK_TARGET
Definition bufmgr.h:93
@ EB_CLEAR_SIZE_CACHE
Definition bufmgr.h:90
@ EB_PERFORMING_RECOVERY
Definition bufmgr.h:78
@ EB_CREATE_FORK_IF_NEEDED
Definition bufmgr.h:84
@ EB_SKIP_EXTENSION_LOCK
Definition bufmgr.h:75
@ EB_LOCK_FIRST
Definition bufmgr.h:87
#define READ_BUFFERS_SYNCHRONOUSLY
Definition bufmgr.h:128
ReadBufferMode
Definition bufmgr.h:45
@ RBM_ZERO_ON_ERROR
Definition bufmgr.h:51
@ RBM_ZERO_AND_CLEANUP_LOCK
Definition bufmgr.h:49
@ RBM_ZERO_AND_LOCK
Definition bufmgr.h:47
@ RBM_NORMAL
Definition bufmgr.h:46
#define BMR_REL(p_rel)
Definition bufmgr.h:114
static bool BufferIsValid(Buffer bufnum)
Definition bufmgr.h:421
bool ignore_checksum_failure
Definition bufpage.c:27
char * PageSetChecksumCopy(Page page, BlockNumber blkno)
Definition bufpage.c:1509
bool PageIsVerified(PageData *page, BlockNumber blkno, int flags, bool *checksum_failure_p)
Definition bufpage.c:94
#define PIV_LOG_LOG
Definition bufpage.h:501
static bool PageIsNew(const PageData *page)
Definition bufpage.h:259
static void PageSetLSN(Page page, XLogRecPtr lsn)
Definition bufpage.h:417
PageData * Page
Definition bufpage.h:81
static XLogRecPtr PageGetLSN(const PageData *page)
Definition bufpage.h:411
#define PIV_IGNORE_CHECKSUM_FAILURE
Definition bufpage.h:502
#define pg_noinline
Definition c.h:315
#define likely(x)
Definition c.h:431
uint8_t uint8
Definition c.h:616
#define PG_USED_FOR_ASSERTS_ONLY
Definition c.h:243
#define Max(x, y)
Definition c.h:1087
#define Assert(condition)
Definition c.h:945
double float8
Definition c.h:716
#define pg_attribute_always_inline
Definition c.h:299
int16_t int16
Definition c.h:613
int32_t int32
Definition c.h:614
uint64_t uint64
Definition c.h:619
uint16_t uint16
Definition c.h:617
#define pg_unreachable()
Definition c.h:361
#define unlikely(x)
Definition c.h:432
uint32_t uint32
Definition c.h:618
#define lengthof(array)
Definition c.h:875
#define MemSet(start, val, len)
Definition c.h:1109
#define StaticAssertDecl(condition, errmessage)
Definition c.h:1010
size_t Size
Definition c.h:691
bool IsCatalogRelationOid(Oid relid)
Definition catalog.c:121
bool IsCatalogTextUniqueIndexOid(Oid relid)
Definition catalog.c:156
void CheckpointWriteDelay(int flags, double progress)
bool ConditionVariableCancelSleep(void)
void ConditionVariableBroadcast(ConditionVariable *cv)
void ConditionVariablePrepareToSleep(ConditionVariable *cv)
void ConditionVariableSleep(ConditionVariable *cv, uint32 wait_event_info)
int64 TimestampTz
Definition timestamp.h:39
struct cursor * cur
Definition ecpg.c:29
Datum arg
Definition elog.c:1322
ErrorContextCallback * error_context_stack
Definition elog.c:99
int errcode(int sqlerrcode)
Definition elog.c:874
#define _(x)
Definition elog.c:95
int int errdetail_internal(const char *fmt,...) pg_attribute_printf(1
#define errcontext
Definition elog.h:198
int int int errhint_plural(const char *fmt_singular, const char *fmt_plural, unsigned long n,...) pg_attribute_printf(1
#define DEBUG3
Definition elog.h:28
int errdetail(const char *fmt,...) pg_attribute_printf(1
#define LOG_SERVER_ONLY
Definition elog.h:32
int int errmsg_internal(const char *fmt,...) pg_attribute_printf(1
#define WARNING
Definition elog.h:36
#define DEBUG2
Definition elog.h:29
#define PANIC
Definition elog.h:42
#define DEBUG1
Definition elog.h:30
#define ERROR
Definition elog.h:39
#define elog(elevel,...)
Definition elog.h:226
#define ereport(elevel,...)
Definition elog.h:150
int int errhint_internal(const char *fmt,...) pg_attribute_printf(1
int io_direct_flags
Definition fd.c:172
#define IO_DIRECT_DATA
Definition fd.h:54
#define palloc_array(type, count)
Definition fe_memutils.h:76
int StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
Definition freelist.c:321
BufferAccessStrategy GetAccessStrategy(BufferAccessStrategyType btype)
Definition freelist.c:461
BufferDesc * StrategyGetBuffer(BufferAccessStrategy strategy, uint64 *buf_state, bool *from_ring)
Definition freelist.c:174
void FreeAccessStrategy(BufferAccessStrategy strategy)
Definition freelist.c:643
IOContext IOContextForStrategy(BufferAccessStrategy strategy)
Definition freelist.c:747
bool StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_ring)
Definition freelist.c:787
volatile sig_atomic_t ProcSignalBarrierPending
Definition globals.c:40
int NBuffers
Definition globals.c:142
bool enableFsync
Definition globals.c:129
ProcNumber MyProcNumber
Definition globals.c:90
int VacuumCostPageMiss
Definition globals.c:152
bool VacuumCostActive
Definition globals.c:158
bool IsUnderPostmaster
Definition globals.c:120
int VacuumCostBalance
Definition globals.c:157
int MaxBackends
Definition globals.c:146
int VacuumCostPageDirty
Definition globals.c:153
int VacuumCostPageHit
Definition globals.c:151
const char * str
long val
Definition informix.c:689
BufferUsage pgBufferUsage
Definition instrument.c:20
void on_shmem_exit(pg_on_exit_callback function, Datum arg)
Definition ipc.c:372
int b
Definition isn.c:74
int a
Definition isn.c:73
int j
Definition isn.c:78
int i
Definition isn.c:77
void LockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition lmgr.c:424
void UnlockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition lmgr.c:474
int32 * LocalRefCount
Definition localbuf.c:49
void FlushLocalBuffer(BufferDesc *bufHdr, SMgrRelation reln)
Definition localbuf.c:183
void UnpinLocalBuffer(Buffer buffer)
Definition localbuf.c:841
bool StartLocalBufferIO(BufferDesc *bufHdr, bool forInput, bool nowait)
Definition localbuf.c:523
void AtEOXact_LocalBuffers(bool isCommit)
Definition localbuf.c:1003
void AtProcExit_LocalBuffers(void)
Definition localbuf.c:1014
bool PinLocalBuffer(BufferDesc *buf_hdr, bool adjust_usagecount)
Definition localbuf.c:805
void MarkLocalBufferDirty(Buffer buffer)
Definition localbuf.c:491
void DropRelationAllLocalBuffers(RelFileLocator rlocator)
Definition localbuf.c:702
void TerminateLocalBufferIO(BufferDesc *bufHdr, bool clear_dirty, uint64 set_flag_bits, bool release_aio)
Definition localbuf.c:562
int NLocBuffer
Definition localbuf.c:45
PrefetchBufferResult PrefetchLocalBuffer(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum)
Definition localbuf.c:72
BlockNumber ExtendBufferedRelLocal(BufferManagerRelation bmr, ForkNumber fork, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
Definition localbuf.c:346
void UnpinLocalBufferNoOwner(Buffer buffer)
Definition localbuf.c:848
void DropRelationLocalBuffers(RelFileLocator rlocator, ForkNumber *forkNum, int nforks, BlockNumber *firstDelBlock)
Definition localbuf.c:665
BufferDesc * LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, bool *foundPtr)
Definition localbuf.c:119
#define ExclusiveLock
Definition lockdefs.h:42
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition lwlock.c:1177
void LWLockRelease(LWLock *lock)
Definition lwlock.c:1794
@ LW_WS_NOT_WAITING
Definition lwlock.h:30
@ LW_WS_WAITING
Definition lwlock.h:31
@ LW_WS_PENDING_WAKEUP
Definition lwlock.h:32
@ LW_SHARED
Definition lwlock.h:113
@ LW_EXCLUSIVE
Definition lwlock.h:112
void * repalloc(void *pointer, Size size)
Definition mcxt.c:1632
void pfree(void *pointer)
Definition mcxt.c:1616
void * palloc(Size size)
Definition mcxt.c:1387
MemoryContext CurrentMemoryContext
Definition mcxt.c:160
#define VALGRIND_MAKE_MEM_DEFINED(addr, size)
Definition memdebug.h:26
#define VALGRIND_MAKE_MEM_NOACCESS(addr, size)
Definition memdebug.h:27
#define RESUME_INTERRUPTS()
Definition miscadmin.h:136
#define START_CRIT_SECTION()
Definition miscadmin.h:150
#define CHECK_FOR_INTERRUPTS()
Definition miscadmin.h:123
#define HOLD_INTERRUPTS()
Definition miscadmin.h:134
#define END_CRIT_SECTION()
Definition miscadmin.h:152
static char * errmsg
#define ERRCODE_DATA_CORRUPTED
static PgChecksumMode mode
static int64 current_size
#define WRITEBACK_MAX_PENDING_FLUSHES
#define DEFAULT_BACKEND_FLUSH_AFTER
#define DEFAULT_CHECKPOINT_FLUSH_AFTER
#define DEFAULT_BGWRITER_FLUSH_AFTER
const void * data
#define PG_IOV_MAX
Definition pg_iovec.h:47
static char buf[DEFAULT_XLOG_SEG_SIZE]
IOObject
Definition pgstat.h:279
@ IOOBJECT_RELATION
Definition pgstat.h:280
@ IOOBJECT_TEMP_RELATION
Definition pgstat.h:281
#define pgstat_count_buffer_read(rel)
Definition pgstat.h:718
IOContext
Definition pgstat.h:288
@ IOCONTEXT_NORMAL
Definition pgstat.h:292
@ IOOP_EXTEND
Definition pgstat.h:317
@ IOOP_READ
Definition pgstat.h:318
@ IOOP_WRITEBACK
Definition pgstat.h:314
@ IOOP_HIT
Definition pgstat.h:312
@ IOOP_EVICT
Definition pgstat.h:310
@ IOOP_REUSE
Definition pgstat.h:313
@ IOOP_WRITE
Definition pgstat.h:319
#define pgstat_count_buffer_hit(rel)
Definition pgstat.h:723
PgStat_BgWriterStats PendingBgWriterStats
PgStat_CheckpointerStats PendingCheckpointerStats
void pgstat_prepare_report_checksum_failure(Oid dboid)
void pgstat_report_checksum_failures_in_db(Oid dboid, int failurecount)
instr_time pgstat_prepare_io_time(bool track_io_guc)
Definition pgstat_io.c:91
void pgstat_count_io_op(IOObject io_object, IOContext io_context, IOOp io_op, uint32 cnt, uint64 bytes)
Definition pgstat_io.c:68
void pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op, instr_time start_time, uint32 cnt, uint64 bytes)
Definition pgstat_io.c:122
#define qsort(a, b, c, d)
Definition port.h:495
void PGSemaphoreUnlock(PGSemaphore sema)
Definition posix_sema.c:335
void PGSemaphoreLock(PGSemaphore sema)
Definition posix_sema.c:315
static Datum PointerGetDatum(const void *X)
Definition postgres.h:342
uint64_t Datum
Definition postgres.h:70
static Pointer DatumGetPointer(Datum X)
Definition postgres.h:332
static int32 DatumGetInt32(Datum X)
Definition postgres.h:202
#define InvalidOid
unsigned int Oid
static int fb(int x)
#define NUM_AUXILIARY_PROCS
Definition proc.h:524
#define GetPGProcByNumber(n)
Definition proc.h:501
#define proclist_delete(list, procno, link_member)
Definition proclist.h:187
static void proclist_init(proclist_head *list)
Definition proclist.h:29
#define proclist_push_tail(list, procno, link_member)
Definition proclist.h:191
#define proclist_foreach_modify(iter, lhead, link_member)
Definition proclist.h:206
static bool proclist_is_empty(const proclist_head *list)
Definition proclist.h:38
#define INVALID_PROC_NUMBER
Definition procnumber.h:26
int ProcNumber
Definition procnumber.h:24
void ProcessProcSignalBarrier(void)
Definition procsignal.c:502
void set_ps_display_remove_suffix(void)
Definition ps_status.c:439
void set_ps_display_suffix(const char *suffix)
Definition ps_status.c:387
char * psprintf(const char *fmt,...)
Definition psprintf.c:43
ReadStream * read_stream_begin_smgr_relation(int flags, BufferAccessStrategy strategy, SMgrRelation smgr, char smgr_persistence, ForkNumber forknum, ReadStreamBlockNumberCB callback, void *callback_private_data, size_t per_buffer_data_size)
Buffer read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
void read_stream_end(ReadStream *stream)
BlockNumber block_range_read_stream_cb(ReadStream *stream, void *callback_private_data, void *per_buffer_data)
#define READ_STREAM_USE_BATCHING
Definition read_stream.h:64
#define READ_STREAM_FULL
Definition read_stream.h:43
static unsigned hash(unsigned *uv, int n)
Definition rege_dfa.c:715
static SMgrRelation RelationGetSmgr(Relation rel)
Definition rel.h:576
#define RelationUsesLocalBuffers(relation)
Definition rel.h:646
#define RELATION_IS_OTHER_TEMP(relation)
Definition rel.h:667
#define RelationIsValid(relation)
Definition rel.h:489
#define RelFileLocatorBackendIsTemp(rlocator)
#define RelFileLocatorEquals(locator1, locator2)
ForkNumber
Definition relpath.h:56
@ MAIN_FORKNUM
Definition relpath.h:58
@ INIT_FORKNUM
Definition relpath.h:61
#define MAX_FORKNUM
Definition relpath.h:70
#define relpath(rlocator, forknum)
Definition relpath.h:150
#define relpathbackend(rlocator, backend, forknum)
Definition relpath.h:141
#define relpathperm(rlocator, forknum)
Definition relpath.h:146
ResourceOwner CurrentResourceOwner
Definition resowner.c:173
void ResourceOwnerEnlarge(ResourceOwner owner)
Definition resowner.c:449
#define RELEASE_PRIO_BUFFER_IOS
Definition resowner.h:62
@ RESOURCE_RELEASE_BEFORE_LOCKS
Definition resowner.h:54
#define RELEASE_PRIO_BUFFER_PINS
Definition resowner.h:63
void perform_spin_delay(SpinDelayStatus *status)
Definition s_lock.c:126
void finish_spin_delay(SpinDelayStatus *status)
Definition s_lock.c:186
#define init_local_spin_delay(status)
Definition s_lock.h:749
BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum)
Definition smgr.c:819
void smgrstartreadv(PgAioHandle *ioh, SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void **buffers, BlockNumber nblocks)
Definition smgr.c:753
void smgrwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks)
Definition smgr.c:805
SMgrRelation smgropen(RelFileLocator rlocator, ProcNumber backend)
Definition smgr.c:240
void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
Definition smgr.c:481
BlockNumber smgrnblocks_cached(SMgrRelation reln, ForkNumber forknum)
Definition smgr.c:847
uint32 smgrmaxcombine(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
Definition smgr.c:697
void smgrzeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks, bool skipFsync)
Definition smgr.c:649
void smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync)
Definition smgr.c:620
bool smgrexists(SMgrRelation reln, ForkNumber forknum)
Definition smgr.c:462
bool smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks)
Definition smgr.c:678
static void smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync)
Definition smgr.h:131
#define free(a)
void ProcSendSignal(ProcNumber procNumber)
Definition proc.c:2003
PGPROC * MyProc
Definition proc.c:68
int GetStartupBufferPinWaitBufId(void)
Definition proc.c:759
int DeadlockTimeout
Definition proc.c:59
void SetStartupBufferPinWaitBufId(int bufid)
Definition proc.c:747
void ProcWaitForSignal(uint32 wait_event_info)
Definition proc.c:1991
void ResolveRecoveryConflictWithBufferPin(void)
Definition standby.c:794
bool log_recovery_conflict_waits
Definition standby.c:43
void LogRecoveryConflict(RecoveryConflictReason reason, TimestampTz wait_start, TimestampTz now, VirtualTransactionId *wait_list, bool still_waiting)
Definition standby.c:275
@ RECOVERY_CONFLICT_BUFFERPIN
Definition standby.h:46
bool RelFileLocatorSkippingWAL(RelFileLocator rlocator)
Definition storage.c:573
SMgrRelation RelationCreateStorage(RelFileLocator rlocator, char relpersistence, bool register_delete)
Definition storage.c:122
void log_smgrcreate(const RelFileLocator *rlocator, ForkNumber forkNum)
Definition storage.c:187
BufferTag tag
pg_atomic_uint64 state
int64 shared_blks_dirtied
Definition instrument.h:28
int64 local_blks_hit
Definition instrument.h:30
int64 shared_blks_read
Definition instrument.h:27
int64 shared_blks_written
Definition instrument.h:29
int64 local_blks_read
Definition instrument.h:31
int64 shared_blks_hit
Definition instrument.h:26
int ckpt_bufs_written
Definition xlog.h:178
ForkNumber forkNum
RelFileNumber relNumber
BlockNumber blockNum
float8 progress_slice
Definition bufmgr.c:164
int num_scanned
Definition bufmgr.c:169
float8 progress
Definition bufmgr.c:163
int num_to_scan
Definition bufmgr.c:167
struct ErrorContextCallback * previous
Definition elog.h:297
void(* callback)(void *arg)
Definition elog.h:298
Definition proc.h:176
uint8 lwWaitMode
Definition proc.h:281
PGSemaphore sem
Definition proc.h:255
uint8 lwWaiting
Definition proc.h:280
PgAioHandleCallbackStage stage
Definition aio.h:219
uint32 status
Definition aio_types.h:108
uint32 error_data
Definition aio_types.h:111
uint32 id
Definition aio_types.h:105
PgAioResult result
Definition aio_types.h:132
PgStat_Counter buf_written_clean
Definition pgstat.h:245
PgStat_Counter maxwritten_clean
Definition pgstat.h:246
PgStat_Counter buf_alloc
Definition pgstat.h:247
PgStat_Counter buffers_written
Definition pgstat.h:269
Buffer recent_buffer
Definition bufmgr.h:61
BufferLockMode lockmode
Definition bufmgr.c:112
PrivateRefCountData data
Definition bufmgr.c:130
ForkNumber forknum
Definition bufmgr.h:137
PgAioWaitRef io_wref
Definition bufmgr.h:150
SMgrRelation smgr
Definition bufmgr.h:135
BufferAccessStrategy strategy
Definition bufmgr.h:138
BlockNumber blocknum
Definition bufmgr.h:146
PgAioReturn io_return
Definition bufmgr.h:151
RelFileLocator locator
RelFileNumber relNumber
char str[REL_PATH_STR_MAXLEN+1]
Definition relpath.h:123
RelFileLocator rd_locator
Definition rel.h:57
Form_pg_class rd_rel
Definition rel.h:111
const char * name
Definition resowner.h:93
RelFileLocatorBackend smgr_rlocator
Definition smgr.h:38
SMgrRelation srel
Definition bufmgr.c:185
RelFileLocator rlocator
Definition bufmgr.c:184
BlockNumber blockNum
RelFileNumber relNumber
ForkNumber forkNum
static uint64 table_relation_size(Relation rel, ForkNumber forkNumber)
Definition tableam.h:1858
BlockNumber blockNum
Definition aio_types.h:66
RelFileLocator rlocator
Definition aio_types.h:65
struct PgAioTargetData::@128 smgr
BlockNumber nblocks
Definition aio_types.h:67
ForkNumber forkNum
Definition aio_types.h:68
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition wait_event.h:69
static void pgstat_report_wait_end(void)
Definition wait_event.h:85
static volatile sig_atomic_t waiting
static TimestampTz wakeup[NUM_WALRCV_WAKEUPS]
bool RecoveryInProgress(void)
Definition xlog.c:6444
bool XLogNeedsFlush(XLogRecPtr record)
Definition xlog.c:3129
CheckpointStatsData CheckpointStats
Definition xlog.c:213
void XLogFlush(XLogRecPtr record)
Definition xlog.c:2767
#define CHECKPOINT_FLUSH_UNLOGGED
Definition xlog.h:154
#define CHECKPOINT_END_OF_RECOVERY
Definition xlog.h:151
#define CHECKPOINT_IS_SHUTDOWN
Definition xlog.h:150
#define XLogIsNeeded()
Definition xlog.h:111
#define XLogHintBitIsNeeded()
Definition xlog.h:122
#define XLogRecPtrIsValid(r)
Definition xlogdefs.h:29
uint64 XLogRecPtr
Definition xlogdefs.h:21
#define InvalidXLogRecPtr
Definition xlogdefs.h:28
XLogRecPtr XLogSaveBufferForHint(Buffer buffer, bool buffer_std)
XLogRecPtr log_newpage_buffer(Buffer buffer, bool page_std)
#define InHotStandby
Definition xlogutils.h:60