bufmgr.c
1/*-------------------------------------------------------------------------
2 *
3 * bufmgr.c
4 * buffer manager interface routines
5 *
6 * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 *
10 * IDENTIFICATION
11 * src/backend/storage/buffer/bufmgr.c
12 *
13 *-------------------------------------------------------------------------
14 */
15/*
16 * Principal entry points:
17 *
18 * ReadBuffer() -- find or create a buffer holding the requested page,
19 * and pin it so that no one can destroy it while this process
20 * is using it.
21 *
22 * StartReadBuffer() -- as above, with separate wait step
23 * StartReadBuffers() -- multiple block version
24 * WaitReadBuffers() -- second step of above
25 *
26 * ReleaseBuffer() -- unpin a buffer
27 *
28 * MarkBufferDirty() -- mark a pinned buffer's contents as "dirty".
29 * The disk write is delayed until buffer replacement or checkpoint.
30 *
31 * See also these files:
32 * freelist.c -- chooses victim for buffer replacement
33 * buf_table.c -- manages the buffer lookup table
34 */
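
/*
 * Typical caller-side usage of these entry points, as a minimal sketch only
 * ("rel" and "blkno" stand for an already-opened relation and an existing
 * block number):
 *
 *     Buffer buf = ReadBuffer(rel, blkno);
 *
 *     LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
 *     ... modify the page returned by BufferGetPage(buf), WAL-log it ...
 *     MarkBufferDirty(buf);
 *     UnlockReleaseBuffer(buf);
 */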
35#include "postgres.h"
36
37#include <sys/file.h>
38#include <unistd.h>
39
40#include "access/tableam.h"
41#include "access/xloginsert.h"
42#include "access/xlogutils.h"
43#ifdef USE_ASSERT_CHECKING
44#include "catalog/pg_tablespace_d.h"
45#endif
46#include "catalog/storage.h"
48#include "executor/instrument.h"
49#include "lib/binaryheap.h"
50#include "miscadmin.h"
51#include "pg_trace.h"
52#include "pgstat.h"
53#include "postmaster/bgwriter.h"
54#include "storage/aio.h"
55#include "storage/buf_internals.h"
56#include "storage/bufmgr.h"
57#include "storage/fd.h"
58#include "storage/ipc.h"
59#include "storage/lmgr.h"
60#include "storage/proc.h"
61#include "storage/proclist.h"
62#include "storage/read_stream.h"
63#include "storage/smgr.h"
64#include "storage/standby.h"
65#include "utils/memdebug.h"
66#include "utils/ps_status.h"
67#include "utils/rel.h"
68#include "utils/resowner.h"
69#include "utils/timestamp.h"
70
71
72/* Note: these two macros only work on shared buffers, not local ones! */
73#define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
74#define BufferGetLSN(bufHdr) (PageGetLSN(BufHdrGetBlock(bufHdr)))
75
76/* Note: this macro only works on local buffers, not shared ones! */
77#define LocalBufHdrGetBlock(bufHdr) \
78 LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
79
80/* Bits in SyncOneBuffer's return value */
81#define BUF_WRITTEN 0x01
82#define BUF_REUSABLE 0x02
83
84#define RELS_BSEARCH_THRESHOLD 20
85
86/*
87 * This is the size (in number of blocks) above which we scan the entire
88 * buffer pool to remove the buffers for all the pages of the relation being
89 * dropped. For relations smaller than this threshold, we find the buffers
90 * by doing lookups in the BufMapping table.
91 */
92#define BUF_DROP_FULL_SCAN_THRESHOLD (uint64) (NBuffers / 32)
93
94/*
95 * This is separated out from PrivateRefCountEntry to allow for copying all
96 * the data members via struct assignment.
97 */
98typedef struct PrivateRefCountData
99{
100 /*
101 * How many times has the buffer been pinned by this backend.
102 */
103 int32 refcount;
104
105 /*
106 * Is the buffer locked by this backend? BUFFER_LOCK_UNLOCK indicates that
107 * the buffer is not locked.
108 */
109 int lockmode;
110} PrivateRefCountData;
111
112typedef struct PrivateRefCountEntry
113{
114 /*
115 * Note that this needs to be the same as the entry's corresponding
116 * PrivateRefCountArrayKeys[i], if the entry is stored in the array. We
117 * store it in both places as this is used for the hashtable key and
118 * because it is more convenient (passing around a PrivateRefCountEntry
119 * suffices to identify the buffer) and faster (checking the keys array is
120 * faster when checking many entries, checking the entry is faster if just
121 * checking a single entry).
122 */
123 Buffer buffer;
124
125 PrivateRefCountData data;
126} PrivateRefCountEntry;
127
128/* 64 bytes, about the size of a cache line on common systems */
129#define REFCOUNT_ARRAY_ENTRIES 8
130
131/*
132 * Status of buffers to checkpoint for a particular tablespace, used
133 * internally in BufferSync.
134 */
135typedef struct CkptTsStatus
136{
137 /* oid of the tablespace */
138 Oid tsId;
139
140 /*
141 * Checkpoint progress for this tablespace. To make progress comparable
142 * between tablespaces the progress is, for each tablespace, measured as a
143 * number between 0 and the total number of to-be-checkpointed pages. Each
144 * page checkpointed in this tablespace increments this space's progress
145 * by progress_slice.
146 */
147 float8 progress;
148 float8 progress_slice;
149
150 /* number of to-be-checkpointed pages in this tablespace */
151 int num_to_scan;
152 /* already processed pages in this tablespace */
153 int num_scanned;
154
155 /* current offset in CkptBufferIds for this tablespace */
156 int index;
157} CkptTsStatus;
158
159/*
160 * Type for array used to sort SMgrRelations
161 *
162 * FlushRelationsAllBuffers shares the same comparator function with
163 * DropRelationsAllBuffers. Pointer to this struct and RelFileLocator must be
164 * compatible.
165 */
166typedef struct SMgrSortArray
167{
168 RelFileLocator rlocator; /* This must be the first member */
169 SMgrRelation srel;
170} SMgrSortArray;
171
172/* GUC variables */
173bool zero_damaged_pages = false;
174int bgwriter_lru_maxpages = 100;
175double bgwriter_lru_multiplier = 2.0;
176bool track_io_timing = false;
177
178/*
179 * How many buffers PrefetchBuffer callers should try to stay ahead of their
180 * ReadBuffer calls by. Zero means "never prefetch". This value is only used
181 * for buffers not belonging to tablespaces that have their
182 * effective_io_concurrency parameter set.
183 */
184int effective_io_concurrency = DEFAULT_EFFECTIVE_IO_CONCURRENCY;
185
186/*
187 * Like effective_io_concurrency, but used by maintenance code paths that might
188 * benefit from a higher setting because they work on behalf of many sessions.
189 * Overridden by the tablespace setting of the same name.
190 */
191int maintenance_io_concurrency = DEFAULT_MAINTENANCE_IO_CONCURRENCY;
192
193/*
194 * Limit on how many blocks should be handled in single I/O operations.
195 * StartReadBuffers() callers should respect it, as should other operations
196 * that call smgr APIs directly. It is computed as the minimum of underlying
197 * GUCs io_combine_limit_guc and io_max_combine_limit.
198 */
199int io_combine_limit = DEFAULT_IO_COMBINE_LIMIT;
200int io_combine_limit_guc = DEFAULT_IO_COMBINE_LIMIT;
201int io_max_combine_limit = DEFAULT_IO_COMBINE_LIMIT;
202
203/*
204 * GUC variables about triggering kernel writeback for buffers written; OS
205 * dependent defaults are set via the GUC mechanism.
206 */
207int checkpoint_flush_after = DEFAULT_CHECKPOINT_FLUSH_AFTER;
208int bgwriter_flush_after = DEFAULT_BGWRITER_FLUSH_AFTER;
209int backend_flush_after = DEFAULT_BACKEND_FLUSH_AFTER;
210
211/* local state for LockBufferForCleanup */
212static BufferDesc *PinCountWaitBuf = NULL;
213
214/*
215 * Backend-Private refcount management:
216 *
217 * Each buffer also has a private refcount that keeps track of the number of
218 * times the buffer is pinned in the current process. This is so that the
219 * shared refcount needs to be modified only once if a buffer is pinned more
220 * than once by an individual backend. It's also used to check that no
221 * buffers are still pinned at the end of transactions and when exiting. We
222 * also use this mechanism to track whether this backend has a buffer locked,
223 * and, if so, in what mode.
224 *
225 *
226 * To avoid - as we used to - requiring an array with NBuffers entries to keep
227 * track of local buffers, we use a small sequentially searched array
228 * (PrivateRefCountArrayKeys, with the corresponding data stored in
229 * PrivateRefCountArray) and an overflow hash table (PrivateRefCountHash) to
230 * keep track of backend local pins.
231 *
232 * As long as no more than REFCOUNT_ARRAY_ENTRIES buffers are pinned at once,
233 * all refcounts are kept in the array; after that, new array entries
234 * displace old ones into the hash table. That way a frequently used entry
235 * can't get "stuck" in the hashtable while infrequent ones clog the array.
236 *
237 * Note that in most scenarios the number of pinned buffers will not exceed
238 * REFCOUNT_ARRAY_ENTRIES.
239 *
240 *
241 * To enter a buffer into the refcount tracking mechanism first reserve a free
242 * entry using ReservePrivateRefCountEntry() and then later, if necessary,
243 * fill it with NewPrivateRefCountEntry(). That split lets us avoid doing
244 * memory allocations in NewPrivateRefCountEntry() which can be important
245 * because in some scenarios it's called with a spinlock held...
246 */
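
/*
 * A minimal sketch of that reserve-then-fill pattern, roughly as the pinning
 * code below uses it (locking and shared-state manipulation elided):
 *
 *     ReservePrivateRefCountEntry();
 *     ... lock the buffer header, adjust the shared refcount ...
 *     ref = NewPrivateRefCountEntry(BufferDescriptorGetBuffer(buf));
 *     ref->data.refcount++;
 */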
252static int ReservedRefCountSlot = -1;
254
256
257static void ReservePrivateRefCountEntry(void);
262
263/* ResourceOwner callbacks to hold in-progress I/Os and buffer pins */
264static void ResOwnerReleaseBufferIO(Datum res);
265static char *ResOwnerPrintBufferIO(Datum res);
266static void ResOwnerReleaseBuffer(Datum res);
267static char *ResOwnerPrintBuffer(Datum res);
268
270{
271 .name = "buffer io",
272 .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
273 .release_priority = RELEASE_PRIO_BUFFER_IOS,
274 .ReleaseResource = ResOwnerReleaseBufferIO,
275 .DebugPrint = ResOwnerPrintBufferIO
276};
277
279{
280 .name = "buffer",
281 .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
282 .release_priority = RELEASE_PRIO_BUFFER_PINS,
283 .ReleaseResource = ResOwnerReleaseBuffer,
284 .DebugPrint = ResOwnerPrintBuffer
285};
286
287/*
288 * Ensure that the PrivateRefCountArray has sufficient space to store one more
289 * entry. This has to be called before using NewPrivateRefCountEntry() to fill
290 * a new entry - but it's perfectly fine to not use a reserved entry.
291 */
292static void
294{
295 /* Already reserved (or freed), nothing to do */
296 if (ReservedRefCountSlot != -1)
297 return;
298
299 /*
300 * First search for a free entry in the array; that'll be sufficient in the
301 * majority of cases.
302 */
303 {
304 int i;
305
306 for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
307 {
309 {
311
312 /*
313 * We could return immediately, but iterating till the end of
314 * the array allows compiler-autovectorization.
315 */
316 }
317 }
318
319 if (ReservedRefCountSlot != -1)
320 return;
321 }
322
323 /*
324 * No luck. All array entries are full. Move one array entry into the hash
325 * table.
326 */
327 {
328 /*
329 * Move entry from the current clock position in the array into the
330 * hashtable. Use that slot.
331 */
332 int victim_slot;
335 bool found;
336
337 /* select victim slot */
341
342 /* Better be used, otherwise we shouldn't get here. */
346
347 /* enter victim array entry into hashtable */
351 &found);
352 Assert(!found);
353 /* move data from the entry in the array to the hash entry */
354 hashent->data = victim_entry->data;
355
356 /* clear the now free array slot */
358 victim_entry->buffer = InvalidBuffer;
359
360 /* clear the whole data member, just for future proofing */
361 memset(&victim_entry->data, 0, sizeof(victim_entry->data));
362 victim_entry->data.refcount = 0;
363 victim_entry->data.lockmode = BUFFER_LOCK_UNLOCK;
364
366 }
367}
368
369/*
370 * Fill a previously reserved refcount entry.
371 */
374{
376
377 /* only allowed to be called when a reservation has been made */
379
380 /* use up the reserved entry */
382
383 /* and fill it */
385 res->buffer = buffer;
386 res->data.refcount = 0;
388
389 /* update cache for the next lookup */
391
393
394 return res;
395}
396
397/*
398 * Slow-path for GetPrivateRefCountEntry(). This is big enough to not be worth
399 * inlining. This particularly seems to be true if the compiler is capable of
400 * auto-vectorizing the code, as that imposes additional stack-alignment
401 * requirements etc.
402 */
405{
407 int match = -1;
408 int i;
409
410 /*
411 * First search for references in the array; that'll be sufficient in the
412 * majority of cases.
413 */
414 for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
415 {
417 {
418 match = i;
419 /* see ReservePrivateRefCountEntry() for why we don't return */
420 }
421 }
422
423 if (likely(match != -1))
424 {
425 /* update cache for the next lookup */
427
428 return &PrivateRefCountArray[match];
429 }
430
431 /*
432 * By here we know that the buffer, if already pinned, isn't residing in
433 * the array.
434 *
435 * Only look up the buffer in the hashtable if we've previously overflowed
436 * into it.
437 */
439 return NULL;
440
442
443 if (res == NULL)
444 return NULL;
445 else if (!do_move)
446 {
447 /* caller doesn't want us to move the hash entry into the array */
448 return res;
449 }
450 else
451 {
452 /* move buffer from hashtable into the free array slot */
453 bool found;
455
456 /* Ensure there's a free array slot */
458
459 /* Use up the reserved slot */
463 Assert(free->buffer == InvalidBuffer);
464
465 /* and fill it */
466 free->buffer = buffer;
467 free->data = res->data;
469 /* update cache for the next lookup */
471
473
474
475 /* delete from hashtable */
477 Assert(found);
480
481 return free;
482 }
483}
484
485/*
486 * Return the PrivateRefCount entry for the passed buffer.
487 *
488 * Returns NULL if the buffer doesn't have a refcount entry. Otherwise, if
489 * do_move is true and the entry resides in the hashtable, the entry is
490 * moved to the array to optimize it for frequent access.
491 */
492static inline PrivateRefCountEntry *
494{
497
498 /*
499 * It's very common to look up the same buffer repeatedly. To make that
500 * fast, we have a one-entry cache.
501 *
502 * In contrast to the loop in GetPrivateRefCountEntrySlow(), here it is
503 * faster to check PrivateRefCountArray[].buffer, as in the case of a hit
504 * fewer addresses are computed and fewer cachelines are accessed. Whereas
505 * in GetPrivateRefCountEntrySlow()'s case, checking
506 * PrivateRefCountArrayKeys saves a lot of memory accesses.
507 */
508 if (likely(PrivateRefCountEntryLast != -1) &&
510 {
512 }
513
514 /*
515 * The code for the cached lookup is small enough to be worth inlining
516 * into the caller. In the miss case however, that empirically doesn't
517 * seem worth it.
518 */
520}
521
522/*
523 * Returns how many times the passed buffer is pinned by this backend.
524 *
525 * Only works for shared memory buffers!
526 */
527static inline int32
529{
531
534
535 /*
536 * Not moving the entry - that's ok for the current users, but we might
537 * want to change this one day.
538 */
540
541 if (ref == NULL)
542 return 0;
543 return ref->data.refcount;
544}
545
546/*
547 * Release resources used to track the reference count of a buffer which we no
548 * longer have pinned and don't want to pin again immediately.
549 */
550static void
552{
553 Assert(ref->data.refcount == 0);
554 Assert(ref->data.lockmode == BUFFER_LOCK_UNLOCK);
555
556 if (ref >= &PrivateRefCountArray[0] &&
558 {
559 ref->buffer = InvalidBuffer;
561
562
563 /*
564 * Mark the just used entry as reserved - in many scenarios that
565 * allows us to avoid ever having to search the array/hash for free
566 * entries.
567 */
569 }
570 else
571 {
572 bool found;
574
576 Assert(found);
579 }
580}
581
582/*
583 * BufferIsPinned
584 * True iff the buffer is pinned (also checks for valid buffer number).
585 *
586 * NOTE: what we check here is that *this* backend holds a pin on
587 * the buffer. We do not care whether some other backend does.
588 */
589#define BufferIsPinned(bufnum) \
590( \
591 !BufferIsValid(bufnum) ? \
592 false \
593 : \
594 BufferIsLocal(bufnum) ? \
595 (LocalRefCount[-(bufnum) - 1] > 0) \
596 : \
597 (GetPrivateRefCount(bufnum) > 0) \
598)
599
600
603 ForkNumber forkNum, BlockNumber blockNum,
607 BufferAccessStrategy strategy,
608 uint32 flags,
611 Buffer *buffers,
615 BufferAccessStrategy strategy,
616 uint32 flags,
619 Buffer *buffers,
621static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy,
622 bool skip_if_not_valid);
623static void PinBuffer_Locked(BufferDesc *buf);
624static void UnpinBuffer(BufferDesc *buf);
625static void UnpinBufferNoOwner(BufferDesc *buf);
626static void BufferSync(int flags);
627static int SyncOneBuffer(int buf_id, bool skip_recently_used,
629static void WaitIO(BufferDesc *buf);
630static void AbortBufferIO(Buffer buffer);
631static void shared_buffer_write_error_callback(void *arg);
632static void local_buffer_write_error_callback(void *arg);
633static inline BufferDesc *BufferAlloc(SMgrRelation smgr,
634 char relpersistence,
635 ForkNumber forkNum,
636 BlockNumber blockNum,
637 BufferAccessStrategy strategy,
639static bool AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress);
640static void CheckReadBuffersOperation(ReadBuffersOperation *operation, bool is_complete);
646static void FindAndDropRelationBuffers(RelFileLocator rlocator,
647 ForkNumber forkNum,
652 ForkNumber forkNum, bool permanent);
653static void AtProcExit_Buffers(int code, Datum arg);
654static void CheckForBufferLeaks(void);
655#ifdef USE_ASSERT_CHECKING
657#endif
658static int rlocator_comparator(const void *p1, const void *p2);
659static inline int buffertag_comparator(const BufferTag *ba, const BufferTag *bb);
660static inline int ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b);
661static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg);
662
668static inline void BufferLockDisown(Buffer buffer, BufferDesc *buf_hdr);
673static void BufferLockWakeup(BufferDesc *buf_hdr, bool unlocked);
676
677
678/*
679 * Implementation of PrefetchBuffer() for shared buffers.
680 */
683 ForkNumber forkNum,
684 BlockNumber blockNum)
685{
686 PrefetchBufferResult result = {InvalidBuffer, false};
687 BufferTag newTag; /* identity of requested block */
688 uint32 newHash; /* hash value for newTag */
689 LWLock *newPartitionLock; /* buffer partition lock for it */
690 int buf_id;
691
692 Assert(BlockNumberIsValid(blockNum));
693
694 /* create a tag so we can lookup the buffer */
695 InitBufferTag(&newTag, &smgr_reln->smgr_rlocator.locator,
696 forkNum, blockNum);
697
698 /* determine its hash code and partition lock ID */
701
702 /* see if the block is in the buffer pool already */
704 buf_id = BufTableLookup(&newTag, newHash);
706
707 /* If not in buffers, initiate prefetch */
708 if (buf_id < 0)
709 {
710#ifdef USE_PREFETCH
711 /*
712 * Try to initiate an asynchronous read. This returns false in
713 * recovery if the relation file doesn't exist.
714 */
715 if ((io_direct_flags & IO_DIRECT_DATA) == 0 &&
716 smgrprefetch(smgr_reln, forkNum, blockNum, 1))
717 {
718 result.initiated_io = true;
719 }
720#endif /* USE_PREFETCH */
721 }
722 else
723 {
724 /*
725 * Report the buffer it was in at that time. The caller may be able
726 * to avoid a buffer table lookup, but it's not pinned and it must be
727 * rechecked!
728 */
729 result.recent_buffer = buf_id + 1;
730 }
731
732 /*
733 * If the block *is* in buffers, we do nothing. This is not really ideal:
734 * the block might be just about to be evicted, which would be stupid
735 * since we know we are going to need it soon. But the only easy answer
736 * is to bump the usage_count, which does not seem like a great solution:
737 * when the caller does ultimately touch the block, usage_count would get
738 * bumped again, resulting in too much favoritism for blocks that are
739 * involved in a prefetch sequence. A real fix would involve some
740 * additional per-buffer state, and it's not clear that there's enough of
741 * a problem to justify that.
742 */
743
744 return result;
745}
746
747/*
748 * PrefetchBuffer -- initiate asynchronous read of a block of a relation
749 *
750 * This is named by analogy to ReadBuffer but doesn't actually allocate a
751 * buffer. Instead it tries to ensure that a future ReadBuffer for the given
752 * block will not be delayed by the I/O. Prefetching is optional.
753 *
754 * There are three possible outcomes:
755 *
756 * 1. If the block is already cached, the result includes a valid buffer that
757 * could be used by the caller to avoid the need for a later buffer lookup, but
758 * it's not pinned, so the caller must recheck it.
759 *
760 * 2. If the kernel has been asked to initiate I/O, the initiated_io member is
761 * true. Currently there is no way to know if the data was already cached by
762 * the kernel and therefore didn't really initiate I/O, and no way to know when
763 * the I/O completes other than using synchronous ReadBuffer().
764 *
765 * 3. Otherwise, the buffer wasn't already cached by PostgreSQL, and either
766 * USE_PREFETCH is not defined (this build doesn't support prefetching due to
767 * lack of a kernel facility), direct I/O is enabled, or the underlying
768 * relation file wasn't found and we are in recovery. (If the relation file
769 * wasn't found and we are not in recovery, an error is raised).
770 */
773{
775 Assert(BlockNumberIsValid(blockNum));
776
778 {
779 /* see comments in ReadBufferExtended */
783 errmsg("cannot access temporary tables of other sessions")));
784
785 /* pass it off to localbuf.c */
786 return PrefetchLocalBuffer(RelationGetSmgr(reln), forkNum, blockNum);
787 }
788 else
789 {
790 /* pass it to the shared buffer version */
791 return PrefetchSharedBuffer(RelationGetSmgr(reln), forkNum, blockNum);
792 }
793}
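
/*
 * Sketch of the intended calling pattern ("rel" and "blkno" are assumed to be
 * a valid relation and block number): issue the prefetch some distance ahead
 * of actually reading the block.
 *
 *     PrefetchBufferResult pf = PrefetchBuffer(rel, MAIN_FORKNUM, blkno + 10);
 *
 *     ... process earlier blocks ...
 *
 *     buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno + 10, RBM_NORMAL, NULL);
 *
 * If pf.recent_buffer is valid, ReadRecentBuffer() below can be used instead
 * of the second call to skip the buffer mapping lookup.
 */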
794
795/*
796 * ReadRecentBuffer -- try to pin a block in a recently observed buffer
797 *
798 * Compared to ReadBuffer(), this avoids a buffer mapping lookup when it's
799 * successful. Return true if the buffer is valid and still has the expected
800 * tag. In that case, the buffer is pinned and the usage count is bumped.
801 */
802bool
804 Buffer recent_buffer)
805{
807 BufferTag tag;
809
810 Assert(BufferIsValid(recent_buffer));
811
814 InitBufferTag(&tag, &rlocator, forkNum, blockNum);
815
816 if (BufferIsLocal(recent_buffer))
817 {
818 int b = -recent_buffer - 1;
819
822
823 /* Is it still valid and holding the right tag? */
824 if ((buf_state & BM_VALID) && BufferTagsEqual(&tag, &bufHdr->tag))
825 {
826 PinLocalBuffer(bufHdr, true);
827
829
830 return true;
831 }
832 }
833 else
834 {
835 bufHdr = GetBufferDescriptor(recent_buffer - 1);
836
837 /*
838 * Is it still valid and holding the right tag? We do an unlocked tag
839 * comparison first, to make it unlikely that we'll increment the
840 * usage counter of the wrong buffer, if someone calls us with a very
841 * out of date recent_buffer. Then we'll check it again if we get the
842 * pin.
843 */
844 if (BufferTagsEqual(&tag, &bufHdr->tag) &&
845 PinBuffer(bufHdr, NULL, true))
846 {
847 if (BufferTagsEqual(&tag, &bufHdr->tag))
848 {
850 return true;
851 }
853 }
854 }
855
856 return false;
857}
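
/*
 * Sketch of how a caller might use a remembered buffer number (for instance
 * the recent_buffer returned by PrefetchBuffer() above); all names here are
 * illustrative:
 *
 *     if (recent_buffer != InvalidBuffer &&
 *         ReadRecentBuffer(rel->rd_locator, MAIN_FORKNUM, blkno, recent_buffer))
 *         buf = recent_buffer;
 *     else
 *         buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, NULL);
 *
 * On the fast path the buffer comes back pinned with its usage count bumped,
 * just as if ReadBuffer() had been used.
 */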
858
859/*
860 * ReadBuffer -- a shorthand for ReadBufferExtended, for reading from main
861 * fork with RBM_NORMAL mode and default strategy.
862 */
863Buffer
868
869/*
870 * ReadBufferExtended -- returns a buffer containing the requested
871 * block of the requested relation. If the blknum
872 * requested is P_NEW, extend the relation file and
873 * allocate a new block. (Caller is responsible for
874 * ensuring that only one backend tries to extend a
875 * relation at the same time!)
876 *
877 * Returns: the buffer number for the buffer containing
878 * the block read. The returned buffer has been pinned.
879 * Does not return on error --- elog's instead.
880 *
881 * Assume when this function is called, that reln has been opened already.
882 *
883 * In RBM_NORMAL mode, the page is read from disk, and the page header is
884 * validated. An error is thrown if the page header is not valid. (But
885 * note that an all-zero page is considered "valid"; see
886 * PageIsVerified().)
887 *
888 * RBM_ZERO_ON_ERROR is like the normal mode, but if the page header is not
889 * valid, the page is zeroed instead of throwing an error. This is intended
890 * for non-critical data, where the caller is prepared to repair errors.
891 *
892 * In RBM_ZERO_AND_LOCK mode, if the page isn't in buffer cache already, it's
893 * filled with zeros instead of reading it from disk. Useful when the caller
894 * is going to fill the page from scratch, since this saves I/O and avoids
895 * unnecessary failure if the page-on-disk has corrupt page headers.
896 * The page is returned locked to ensure that the caller has a chance to
897 * initialize the page before it's made visible to others.
898 * Caution: do not use this mode to read a page that is beyond the relation's
899 * current physical EOF; that is likely to cause problems in md.c when
900 * the page is modified and written out. P_NEW is OK, though.
901 *
902 * RBM_ZERO_AND_CLEANUP_LOCK is the same as RBM_ZERO_AND_LOCK, but acquires
903 * a cleanup-strength lock on the page.
904 *
905 * RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here.
906 *
907 * If strategy is not NULL, a nondefault buffer access strategy is used.
908 * See buffer/README for details.
909 */
910inline Buffer
913{
914 Buffer buf;
915
916 /*
917 * Reject attempts to read non-local temporary relations; we would be
918 * likely to get wrong data since we have no visibility into the owning
919 * session's local buffers.
920 */
924 errmsg("cannot access temporary tables of other sessions")));
925
926 /*
927 * Read the buffer, and update pgstat counters to reflect a cache hit or
928 * miss.
929 */
931 forkNum, blockNum, mode, strategy);
932
933 return buf;
934}
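
/*
 * For example, a large scan that should not flood shared buffers can pass a
 * ring strategy (a sketch; error handling and the actual page access are
 * elided):
 *
 *     BufferAccessStrategy strategy = GetAccessStrategy(BAS_BULKREAD);
 *     Buffer buf;
 *
 *     buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, strategy);
 *     ...
 *     ReleaseBuffer(buf);
 *     FreeAccessStrategy(strategy);
 */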
935
936
937/*
938 * ReadBufferWithoutRelcache -- like ReadBufferExtended, but doesn't require
939 * a relcache entry for the relation.
940 *
941 * Pass permanent = true for a RELPERSISTENCE_PERMANENT relation, and
942 * permanent = false for a RELPERSISTENCE_UNLOGGED relation. This function
943 * cannot be used for temporary relations (and making that work might be
944 * difficult, unless we only want to read temporary relations for our own
945 * ProcNumber).
946 */
947Buffer
950 BufferAccessStrategy strategy, bool permanent)
951{
952 SMgrRelation smgr = smgropen(rlocator, INVALID_PROC_NUMBER);
953
954 return ReadBuffer_common(NULL, smgr,
956 forkNum, blockNum,
957 mode, strategy);
958}
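
/*
 * Sketch of a typical use during recovery, where no relcache entry is
 * available and "rlocator"/"blkno" come from the WAL record being replayed
 * (permanent = true assumes a RELPERSISTENCE_PERMANENT relation):
 *
 *     Buffer buf;
 *
 *     buf = ReadBufferWithoutRelcache(rlocator, MAIN_FORKNUM, blkno,
 *                                     RBM_NORMAL, NULL, true);
 */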
959
960/*
961 * Convenience wrapper around ExtendBufferedRelBy() extending by one block.
962 */
963Buffer
965 ForkNumber forkNum,
966 BufferAccessStrategy strategy,
967 uint32 flags)
968{
969 Buffer buf;
970 uint32 extend_by = 1;
971
972 ExtendBufferedRelBy(bmr, forkNum, strategy, flags, extend_by,
973 &buf, &extend_by);
974
975 return buf;
976}
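
/*
 * Sketch of typical use: get one new, zero-filled, exclusively locked block
 * at the end of the relation and initialize it ("rel" is assumed to be a
 * valid, opened relation):
 *
 *     Buffer buf = ExtendBufferedRel(BMR_REL(rel), MAIN_FORKNUM, NULL,
 *                                    EB_LOCK_FIRST);
 *
 *     PageInit(BufferGetPage(buf), BufferGetPageSize(buf), 0);
 *     MarkBufferDirty(buf);
 *     ... WAL-log the new page if needed, then UnlockReleaseBuffer(buf) ...
 */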
977
978/*
979 * Extend relation by multiple blocks.
980 *
981 * Tries to extend the relation by extend_by blocks. Depending on the
982 * availability of resources the relation may end up being extended by a
983 * smaller number of pages (unless an error is thrown, always by at least one
984 * page). *extended_by is updated to the number of pages the relation has been
985 * extended by.
986 *
987 * buffers needs to be an array that is at least extend_by long. Upon
988 * completion, the first extend_by array elements will point to a pinned
989 * buffer.
990 *
991 * If EB_LOCK_FIRST is part of flags, the first returned buffer is
992 * locked. This is useful for callers that want a buffer that is guaranteed to
993 * be empty.
994 */
998 BufferAccessStrategy strategy,
999 uint32 flags,
1001 Buffer *buffers,
1003{
1004 Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
1005 Assert(bmr.smgr == NULL || bmr.relpersistence != '\0');
1006 Assert(extend_by > 0);
1007
1008 if (bmr.relpersistence == '\0')
1009 bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
1010
1011 return ExtendBufferedRelCommon(bmr, fork, strategy, flags,
1013 buffers, extended_by);
1014}
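
/*
 * Sketch of bulk extension ("rel" is assumed; the caller must cope with the
 * relation being extended by fewer blocks than requested):
 *
 *     Buffer buffers[8];
 *     uint32 extended_by = 0;
 *     BlockNumber first_block;
 *
 *     first_block = ExtendBufferedRelBy(BMR_REL(rel), MAIN_FORKNUM, NULL, 0,
 *                                       lengthof(buffers),
 *                                       buffers, &extended_by);
 *
 * On return, buffers[0 .. extended_by - 1] are pinned; with EB_LOCK_FIRST in
 * flags, the first one would also be exclusively locked.
 */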
1015
1016/*
1017 * Extend the relation so it is at least extend_to blocks large, return buffer
1018 * (extend_to - 1).
1019 *
1020 * This is useful for callers that want to write a specific page, regardless
1021 * of the current size of the relation (e.g. useful for visibilitymap and for
1022 * crash recovery).
1023 */
1024Buffer
1027 BufferAccessStrategy strategy,
1028 uint32 flags,
1031{
1033 uint32 extended_by = 0;
1035 Buffer buffers[64];
1036
1037 Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
1038 Assert(bmr.smgr == NULL || bmr.relpersistence != '\0');
1040
1041 if (bmr.relpersistence == '\0')
1042 bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
1043
1044 /*
1045 * If desired, create the file if it doesn't exist. If
1046 * smgr_cached_nblocks[fork] is positive then it must exist, no need for
1047 * an smgrexists call.
1048 */
1049 if ((flags & EB_CREATE_FORK_IF_NEEDED) &&
1050 (BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] == 0 ||
1051 BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] == InvalidBlockNumber) &&
1053 {
1055
1056 /* recheck, fork might have been created concurrently */
1059
1061 }
1062
1063 /*
1064 * If requested, invalidate size cache, so that smgrnblocks asks the
1065 * kernel.
1066 */
1067 if (flags & EB_CLEAR_SIZE_CACHE)
1068 BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] = InvalidBlockNumber;
1069
1070 /*
1071 * Estimate how many pages we'll need to extend by. This avoids acquiring
1072 * unnecessarily many victim buffers.
1073 */
1075
1076 /*
1077 * Since no-one else can be looking at the page contents yet, there is no
1078 * difference between an exclusive lock and a cleanup-strength lock. Note
1079 * that we pass the original mode to ReadBuffer_common() below, when
1080 * falling back to reading the buffer due to a concurrent relation extension.
1081 */
1083 flags |= EB_LOCK_TARGET;
1084
1085 while (current_size < extend_to)
1086 {
1087 uint32 num_pages = lengthof(buffers);
1089
1090 if ((uint64) current_size + num_pages > extend_to)
1091 num_pages = extend_to - current_size;
1092
1093 first_block = ExtendBufferedRelCommon(bmr, fork, strategy, flags,
1094 num_pages, extend_to,
1095 buffers, &extended_by);
1096
1098 Assert(num_pages != 0 || current_size >= extend_to);
1099
1100 for (uint32 i = 0; i < extended_by; i++)
1101 {
1102 if (first_block + i != extend_to - 1)
1103 ReleaseBuffer(buffers[i]);
1104 else
1105 buffer = buffers[i];
1106 }
1107 }
1108
1109 /*
1110 * It's possible that another backend concurrently extended the relation.
1111 * In that case read the buffer.
1112 *
1113 * XXX: Should we control this via a flag?
1114 */
1115 if (buffer == InvalidBuffer)
1116 {
1117 Assert(extended_by == 0);
1118 buffer = ReadBuffer_common(bmr.rel, BMR_GET_SMGR(bmr), bmr.relpersistence,
1119 fork, extend_to - 1, mode, strategy);
1120 }
1121
1122 return buffer;
1123}
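
/*
 * Sketch loosely modeled on the visibility map's use of this function: make
 * sure block "vm_block" exists, creating the fork first if necessary, and get
 * a buffer for it ("rel" and "vm_block" are assumed):
 *
 *     Buffer buf;
 *
 *     buf = ExtendBufferedRelTo(BMR_REL(rel), VISIBILITYMAP_FORKNUM, NULL,
 *                               EB_CREATE_FORK_IF_NEEDED | EB_CLEAR_SIZE_CACHE,
 *                               vm_block + 1, RBM_ZERO_ON_ERROR);
 */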
1124
1125/*
1126 * Lock and optionally zero a buffer, as part of the implementation of
1127 * RBM_ZERO_AND_LOCK or RBM_ZERO_AND_CLEANUP_LOCK. The buffer must be already
1128 * pinned. If the buffer is not already valid, it is zeroed and made valid.
1129 */
1130static void
1132{
1134 bool need_to_zero;
1136
1138
1139 if (already_valid)
1140 {
1141 /*
1142 * If the caller already knew the buffer was valid, we can skip some
1143 * header interaction. The caller just wants to lock the buffer.
1144 */
1145 need_to_zero = false;
1146 }
1147 else if (isLocalBuf)
1148 {
1149 /* Simple case for non-shared buffers. */
1151 need_to_zero = StartLocalBufferIO(bufHdr, true, false);
1152 }
1153 else
1154 {
1155 /*
1156 * Take BM_IO_IN_PROGRESS, or discover that BM_VALID has been set
1157 * concurrently. Even though we aren't doing I/O, that ensures that
1158 * we don't zero a page that someone else has pinned. An exclusive
1159 * content lock wouldn't be enough, because readers are allowed to
1160 * drop the content lock after determining that a tuple is visible
1161 * (see buffer access rules in README).
1162 */
1164 need_to_zero = StartBufferIO(bufHdr, true, false);
1165 }
1166
1167 if (need_to_zero)
1168 {
1170
1171 /*
1172 * Grab the buffer content lock before marking the page as valid, to
1173 * make sure that no other backend sees the zeroed page before the
1174 * caller has had a chance to initialize it.
1175 *
1176 * Since no-one else can be looking at the page contents yet, there is
1177 * no difference between an exclusive lock and a cleanup-strength
1178 * lock. (Note that we cannot use LockBuffer() or
1179 * LockBufferForCleanup() here, because they assert that the buffer is
1180 * already valid.)
1181 */
1182 if (!isLocalBuf)
1184
1185 /* Set BM_VALID, terminate IO, and wake up any waiters */
1186 if (isLocalBuf)
1187 TerminateLocalBufferIO(bufHdr, false, BM_VALID, false);
1188 else
1189 TerminateBufferIO(bufHdr, false, BM_VALID, true, false);
1190 }
1191 else if (!isLocalBuf)
1192 {
1193 /*
1194 * The buffer is valid, so we can't zero it. The caller still expects
1195 * the page to be locked on return.
1196 */
1197 if (mode == RBM_ZERO_AND_LOCK)
1199 else
1201 }
1202}
1203
1204/*
1205 * Pin a buffer for a given block. *foundPtr is set to true if the block was
1206 * already present, or false if more work is required to either read it in or
1207 * zero it.
1208 */
1211 SMgrRelation smgr,
1212 char persistence,
1213 ForkNumber forkNum,
1214 BlockNumber blockNum,
1215 BufferAccessStrategy strategy,
1216 bool *foundPtr)
1217{
1221
1222 Assert(blockNum != P_NEW);
1223
1224 /* Persistence should be set before */
1225 Assert((persistence == RELPERSISTENCE_TEMP ||
1226 persistence == RELPERSISTENCE_PERMANENT ||
1227 persistence == RELPERSISTENCE_UNLOGGED));
1228
1229 if (persistence == RELPERSISTENCE_TEMP)
1230 {
1233 }
1234 else
1235 {
1236 io_context = IOContextForStrategy(strategy);
1238 }
1239
1240 TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
1244 smgr->smgr_rlocator.backend);
1245
1246 if (persistence == RELPERSISTENCE_TEMP)
1247 {
1248 bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, foundPtr);
1249 if (*foundPtr)
1251 }
1252 else
1253 {
1254 bufHdr = BufferAlloc(smgr, persistence, forkNum, blockNum,
1255 strategy, foundPtr, io_context);
1256 if (*foundPtr)
1258 }
1259 if (rel)
1260 {
1261 /*
1262 * While pgBufferUsage's "read" counter isn't bumped unless we reach
1263 * WaitReadBuffers() (so, not for hits, and not for buffers that are
1264 * zeroed instead), the per-relation stats always count them.
1265 */
1267 if (*foundPtr)
1269 }
1270 if (*foundPtr)
1271 {
1273 if (VacuumCostActive)
1275
1276 TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
1280 smgr->smgr_rlocator.backend,
1281 true);
1282 }
1283
1285}
1286
1287/*
1288 * ReadBuffer_common -- common logic for all ReadBuffer variants
1289 *
1290 * smgr is required, rel is optional unless using P_NEW.
1291 */
1294 ForkNumber forkNum,
1296 BufferAccessStrategy strategy)
1297{
1298 ReadBuffersOperation operation;
1299 Buffer buffer;
1300 int flags;
1301 char persistence;
1302
1303 /*
1304 * Backward compatibility path, most code should use ExtendBufferedRel()
1305 * instead, as acquiring the extension lock inside ExtendBufferedRel()
1306 * scales a lot better.
1307 */
1308 if (unlikely(blockNum == P_NEW))
1309 {
1311
1312 /*
1313 * Since no-one else can be looking at the page contents yet, there is
1314 * no difference between an exclusive lock and a cleanup-strength
1315 * lock.
1316 */
1318 flags |= EB_LOCK_FIRST;
1319
1320 return ExtendBufferedRel(BMR_REL(rel), forkNum, strategy, flags);
1321 }
1322
1323 if (rel)
1324 persistence = rel->rd_rel->relpersistence;
1325 else
1326 persistence = smgr_persistence;
1327
1330 {
1331 bool found;
1332
1333 buffer = PinBufferForBlock(rel, smgr, persistence,
1334 forkNum, blockNum, strategy, &found);
1335 ZeroAndLockBuffer(buffer, mode, found);
1336 return buffer;
1337 }
1338
1339 /*
1340 * Signal that we are going to immediately wait. If we're immediately
1341 * waiting, there is no benefit in actually executing the IO
1342 * asynchronously, it would just add dispatch overhead.
1343 */
1345 if (mode == RBM_ZERO_ON_ERROR)
1347 operation.smgr = smgr;
1348 operation.rel = rel;
1349 operation.persistence = persistence;
1350 operation.forknum = forkNum;
1351 operation.strategy = strategy;
1352 if (StartReadBuffer(&operation,
1353 &buffer,
1354 blockNum,
1355 flags))
1356 WaitReadBuffers(&operation);
1357
1358 return buffer;
1359}
1360
1363 Buffer *buffers,
1364 BlockNumber blockNum,
1365 int *nblocks,
1366 int flags,
1367 bool allow_forwarding)
1368{
1369 int actual_nblocks = *nblocks;
1370 int maxcombine = 0;
1371 bool did_start_io;
1372
1373 Assert(*nblocks == 1 || allow_forwarding);
1374 Assert(*nblocks > 0);
1375 Assert(*nblocks <= MAX_IO_COMBINE_LIMIT);
1376
1377 for (int i = 0; i < actual_nblocks; ++i)
1378 {
1379 bool found;
1380
1381 if (allow_forwarding && buffers[i] != InvalidBuffer)
1382 {
1384
1385 /*
1386 * This is a buffer that was pinned by an earlier call to
1387 * StartReadBuffers(), but couldn't be handled in one operation at
1388 * that time. The operation was split, and the caller has passed
1389 * an already pinned buffer back to us to handle the rest of the
1390 * operation. It must continue at the expected block number.
1391 */
1392 Assert(BufferGetBlockNumber(buffers[i]) == blockNum + i);
1393
1394 /*
1395 * It might be an already valid buffer (a hit) that followed the
1396 * final contiguous block of an earlier I/O (a miss) marking the
1397 * end of it, or a buffer that some other backend has since made
1398 * valid by performing the I/O for us, in which case we can handle
1399 * it as a hit now. It is safe to check for a BM_VALID flag with
1400 * a relaxed load, because we got a fresh view of it while pinning
1401 * it in the previous call.
1402 *
1403 * On the other hand if we don't see BM_VALID yet, it must be an
1404 * I/O that was split by the previous call and we need to try to
1405 * start a new I/O from this block. We're also racing against any
1406 * other backend that might start the I/O or even manage to mark
1407 * it BM_VALID after this check, but StartBufferIO() will handle
1408 * those cases.
1409 */
1410 if (BufferIsLocal(buffers[i]))
1411 bufHdr = GetLocalBufferDescriptor(-buffers[i] - 1);
1412 else
1413 bufHdr = GetBufferDescriptor(buffers[i] - 1);
1415 found = pg_atomic_read_u64(&bufHdr->state) & BM_VALID;
1416 }
1417 else
1418 {
1419 buffers[i] = PinBufferForBlock(operation->rel,
1420 operation->smgr,
1421 operation->persistence,
1422 operation->forknum,
1423 blockNum + i,
1424 operation->strategy,
1425 &found);
1426 }
1427
1428 if (found)
1429 {
1430 /*
1431 * We have a hit. If it's the first block in the requested range,
1432 * we can return it immediately and report that WaitReadBuffers()
1433 * does not need to be called. If the initial value of *nblocks
1434 * was larger, the caller will have to call again for the rest.
1435 */
1436 if (i == 0)
1437 {
1438 *nblocks = 1;
1439
1440#ifdef USE_ASSERT_CHECKING
1441
1442 /*
1443 * Initialize enough of ReadBuffersOperation to make
1444 * CheckReadBuffersOperation() work. Outside of assertions
1445 * that's not necessary when no IO is issued.
1446 */
1447 operation->buffers = buffers;
1448 operation->blocknum = blockNum;
1449 operation->nblocks = 1;
1450 operation->nblocks_done = 1;
1451 CheckReadBuffersOperation(operation, true);
1452#endif
1453 return false;
1454 }
1455
1456 /*
1457 * Otherwise we already have an I/O to perform, but this block
1458 * can't be included as it is already valid. Split the I/O here.
1459 * There may or may not be more blocks requiring I/O after this
1460 * one, we haven't checked, but they can't be contiguous with this
1461 * one in the way. We'll leave this buffer pinned, forwarding it
1462 * to the next call, avoiding the need to unpin it here and re-pin
1463 * it in the next call.
1464 */
1465 actual_nblocks = i;
1466 break;
1467 }
1468 else
1469 {
1470 /*
1471 * Check how many blocks we can cover with the same IO. The smgr
1472 * implementation might e.g. be limited due to a segment boundary.
1473 */
1474 if (i == 0 && actual_nblocks > 1)
1475 {
1476 maxcombine = smgrmaxcombine(operation->smgr,
1477 operation->forknum,
1478 blockNum);
1480 {
1481 elog(DEBUG2, "limiting nblocks at %u from %u to %u",
1482 blockNum, actual_nblocks, maxcombine);
1484 }
1485 }
1486 }
1487 }
1488 *nblocks = actual_nblocks;
1489
1490 /* Populate information needed for I/O. */
1491 operation->buffers = buffers;
1492 operation->blocknum = blockNum;
1493 operation->flags = flags;
1494 operation->nblocks = actual_nblocks;
1495 operation->nblocks_done = 0;
1496 pgaio_wref_clear(&operation->io_wref);
1497
1498 /*
1499 * When using AIO, start the IO in the background. If not, issue prefetch
1500 * requests if desired by the caller.
1501 *
1502 * The reason we have a dedicated path for IOMETHOD_SYNC here is to
1503 * de-risk the introduction of AIO somewhat. It's a large architectural
1504 * change, with lots of chances for unanticipated performance effects.
1505 *
1506 * Use of IOMETHOD_SYNC already leads to not actually performing IO
1507 * asynchronously, but without the check here we'd execute IO earlier than
1508 * we used to. Eventually this IOMETHOD_SYNC specific path should go away.
1509 */
1510 if (io_method != IOMETHOD_SYNC)
1511 {
1512 /*
1513 * Try to start IO asynchronously. It's possible that no IO needs to
1514 * be started, if another backend already performed the IO.
1515 *
1516 * Note that if an IO is started, it might not cover the entire
1517 * requested range, e.g. because an intermediary block has been read
1518 * in by another backend. In that case any "trailing" buffers we
1519 * already pinned above will be "forwarded" by read_stream.c to the
1520 * next call to StartReadBuffers().
1521 *
1522 * This is signalled to the caller by decrementing *nblocks *and*
1523 * reducing operation->nblocks. The latter is done here, but not below
1524 * WaitReadBuffers(), as in WaitReadBuffers() we can't "shorten" the
1525 * overall read size anymore, we need to retry until done in its
1526 * entirety or until failed.
1527 */
1528 did_start_io = AsyncReadBuffers(operation, nblocks);
1529
1530 operation->nblocks = *nblocks;
1531 }
1532 else
1533 {
1534 operation->flags |= READ_BUFFERS_SYNCHRONOUSLY;
1535
1536 if (flags & READ_BUFFERS_ISSUE_ADVICE)
1537 {
1538 /*
1539 * In theory we should only do this if PinBufferForBlock() had to
1540 * allocate new buffers above. That way, if two calls to
1541 * StartReadBuffers() were made for the same blocks before
1542 * WaitReadBuffers(), only the first would issue the advice.
1543 * That'd be a better simulation of true asynchronous I/O, which
1544 * would only start the I/O once, but isn't done here for
1545 * simplicity.
1546 */
1547 smgrprefetch(operation->smgr,
1548 operation->forknum,
1549 blockNum,
1551 }
1552
1553 /*
1554 * Indicate that WaitReadBuffers() should be called. WaitReadBuffers()
1555 * will initiate the necessary IO.
1556 */
1557 did_start_io = true;
1558 }
1559
1561
1562 return did_start_io;
1563}
1564
1565/*
1566 * Begin reading a range of blocks beginning at blockNum and extending for
1567 * *nblocks. *nblocks and the buffers array are in/out parameters. On entry,
1568 * the buffers elements covered by *nblocks must hold either InvalidBuffer or
1569 * buffers forwarded by an earlier call to StartReadBuffers() that was split
1570 * and is now being continued. On return, *nblocks holds the number of blocks
1571 * accepted by this operation. If it is less than the original number then
1572 * this operation has been split, but buffer elements up to the original
1573 * requested size may hold forwarded buffers to be used for a continuing
1574 * operation. The caller must either start a new I/O beginning at the block
1575 * immediately following the blocks accepted by this call and pass those
1576 * buffers back in, or release them if it chooses not to. It shouldn't make
1577 * any other use of or assumptions about forwarded buffers.
1578 *
1579 * If false is returned, no I/O is necessary and the buffers covered by
1580 * *nblocks on exit are valid and ready to be accessed. If true is returned,
1581 * an I/O has been started, and WaitReadBuffers() must be called with the same
1582 * operation object before the buffers covered by *nblocks on exit can be
1583 * accessed. Along with the operation object, the caller-supplied array of
1584 * buffers must remain valid until WaitReadBuffers() is called, and any
1585 * forwarded buffers must also be preserved for a continuing call unless
1586 * they are explicitly released.
1587 */
1588bool
1590 Buffer *buffers,
1591 BlockNumber blockNum,
1592 int *nblocks,
1593 int flags)
1594{
1595 return StartReadBuffersImpl(operation, buffers, blockNum, nblocks, flags,
1596 true /* expect forwarded buffers */ );
1597}
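
/*
 * Sketch of the two-step calling pattern (most new code should use
 * read_stream.c rather than calling this directly; all names here are
 * illustrative):
 *
 *     ReadBuffersOperation op = {0};
 *     Buffer buffers[MAX_IO_COMBINE_LIMIT];
 *     int nblocks = 4;
 *
 *     op.smgr = RelationGetSmgr(rel);
 *     op.rel = rel;
 *     op.persistence = rel->rd_rel->relpersistence;
 *     op.forknum = MAIN_FORKNUM;
 *     op.strategy = NULL;
 *
 *     for (int i = 0; i < nblocks; i++)
 *         buffers[i] = InvalidBuffer;
 *
 *     if (StartReadBuffers(&op, buffers, blkno, &nblocks, 0))
 *         WaitReadBuffers(&op);
 *
 * Afterwards buffers[0 .. nblocks - 1] are pinned and valid; if nblocks came
 * back smaller than requested, the remaining array elements may hold
 * forwarded buffers for a continuing call (see above).
 */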
1598
1599/*
1600 * Single block version of the StartReadBuffers(). This might save a few
1601 * instructions when called from another translation unit, because it is
1602 * specialized for nblocks == 1.
1603 *
1604 * This version does not support "forwarded" buffers: they cannot be created
1605 * by reading only one block and *buffer is ignored on entry.
1606 */
1607bool
1609 Buffer *buffer,
1610 BlockNumber blocknum,
1611 int flags)
1612{
1613 int nblocks = 1;
1614 bool result;
1615
1616 result = StartReadBuffersImpl(operation, buffer, blocknum, &nblocks, flags,
1617 false /* single block, no forwarding */ );
1618 Assert(nblocks == 1); /* single block can't be short */
1619
1620 return result;
1621}
1622
1623/*
1624 * Perform sanity checks on the ReadBuffersOperation.
1625 */
1626static void
1628{
1629#ifdef USE_ASSERT_CHECKING
1630 Assert(operation->nblocks_done <= operation->nblocks);
1631 Assert(!is_complete || operation->nblocks == operation->nblocks_done);
1632
1633 for (int i = 0; i < operation->nblocks; i++)
1634 {
1635 Buffer buffer = operation->buffers[i];
1639
1640 Assert(BufferGetBlockNumber(buffer) == operation->blocknum + i);
1642
1643 if (i < operation->nblocks_done)
1645 }
1646#endif
1647}
1648
1649/* helper for ReadBuffersCanStartIO(), to avoid repetition */
1650static inline bool
1652{
1653 if (BufferIsLocal(buffer))
1655 true, nowait);
1656 else
1657 return StartBufferIO(GetBufferDescriptor(buffer - 1), true, nowait);
1658}
1659
1660/*
1661 * Helper for AsyncReadBuffers that tries to get the buffer ready for IO.
1662 */
1663static inline bool
1665{
1666 /*
1667 * If this backend currently has staged IO, we need to submit the pending
1668 * IO before waiting for the right to issue IO, to avoid the potential for
1669 * deadlocks (and, more commonly, unnecessary delays for other backends).
1670 */
1671 if (!nowait && pgaio_have_staged())
1672 {
1674 return true;
1675
1676 /*
1677 * Unfortunately StartBufferIO() returning false doesn't allow us to
1678 * distinguish between the buffer already being valid and IO already
1679 * being in progress. Since IO already being in progress is quite
1680 * rare, this approach seems fine.
1681 */
1683 }
1684
1685 return ReadBuffersCanStartIOOnce(buffer, nowait);
1686}
1687
1688/*
1689 * Helper for WaitReadBuffers() that processes the results of a readv
1690 * operation, raising an error if necessary.
1691 */
1692static void
1694{
1695 PgAioReturn *aio_ret = &operation->io_return;
1697 int newly_read_blocks = 0;
1698
1699 Assert(pgaio_wref_valid(&operation->io_wref));
1700 Assert(aio_ret->result.status != PGAIO_RS_UNKNOWN);
1701
1702 /*
1703 * SMGR reports the number of blocks successfully read as the result of
1704 * the IO operation. Thus we can simply add that to ->nblocks_done.
1705 */
1706
1707 if (likely(rs != PGAIO_RS_ERROR))
1708 newly_read_blocks = aio_ret->result.result;
1709
1710 if (rs == PGAIO_RS_ERROR || rs == PGAIO_RS_WARNING)
1711 pgaio_result_report(aio_ret->result, &aio_ret->target_data,
1712 rs == PGAIO_RS_ERROR ? ERROR : WARNING);
1713 else if (aio_ret->result.status == PGAIO_RS_PARTIAL)
1714 {
1715 /*
1716 * We'll retry, so we just emit a debug message to the server log (or
1717 * not even that in prod scenarios).
1718 */
1719 pgaio_result_report(aio_ret->result, &aio_ret->target_data, DEBUG1);
1720 elog(DEBUG3, "partial read, will retry");
1721 }
1722
1725
1726 operation->nblocks_done += newly_read_blocks;
1727
1728 Assert(operation->nblocks_done <= operation->nblocks);
1729}
1730
1731void
1733{
1734 PgAioReturn *aio_ret = &operation->io_return;
1737
1738 if (operation->persistence == RELPERSISTENCE_TEMP)
1739 {
1742 }
1743 else
1744 {
1747 }
1748
1749 /*
1750 * If we get here without an IO operation having been issued, the
1751 * io_method == IOMETHOD_SYNC path must have been used. Otherwise the
1752 * caller should not have called WaitReadBuffers().
1753 *
1754 * In the case of IOMETHOD_SYNC, we start - as we used to before the
1755 * introduction of AIO - the IO in WaitReadBuffers(). This is done as part
1756 * of the retry logic below; no extra code is required.
1757 *
1758 * This path is expected to eventually go away.
1759 */
1760 if (!pgaio_wref_valid(&operation->io_wref) && io_method != IOMETHOD_SYNC)
1761 elog(ERROR, "waiting for read operation that didn't read");
1762
1763 /*
1764 * To handle partial reads, and IOMETHOD_SYNC, we re-issue IO until we're
1765 * done. We may need multiple retries, not just because we could get
1766 * multiple partial reads, but also because some of the remaining
1767 * to-be-read buffers may have been read in by other backends, limiting
1768 * the IO size.
1769 */
1770 while (true)
1771 {
1773
1774 CheckReadBuffersOperation(operation, false);
1775
1776 /*
1777 * If there is an IO associated with the operation, we may need to
1778 * wait for it.
1779 */
1780 if (pgaio_wref_valid(&operation->io_wref))
1781 {
1782 /*
1783 * Track the time spent waiting for the IO to complete. As
1784 * tracking a wait even if we don't actually need to wait
1785 *
1786 * a) is not cheap, due to the timestamping overhead
1787 *
1788 * b) reports some time as waiting, even if we never waited
1789 *
1790 * we first check if we already know the IO is complete.
1791 */
1792 if (aio_ret->result.status == PGAIO_RS_UNKNOWN &&
1793 !pgaio_wref_check_done(&operation->io_wref))
1794 {
1796
1797 pgaio_wref_wait(&operation->io_wref);
1798
1799 /*
1800 * The IO operation itself was already counted earlier, in
1801 * AsyncReadBuffers(), this just accounts for the wait time.
1802 */
1804 io_start, 0, 0);
1805 }
1806 else
1807 {
1808 Assert(pgaio_wref_check_done(&operation->io_wref));
1809 }
1810
1811 /*
1812 * We now are sure the IO completed. Check the results. This
1813 * includes reporting on errors if there were any.
1814 */
1815 ProcessReadBuffersResult(operation);
1816 }
1817
1818 /*
1819 * Most of the time, the one IO we already started will read in
1820 * everything. But we need to deal with partial reads and buffers not
1821 * needing IO anymore.
1822 */
1823 if (operation->nblocks_done == operation->nblocks)
1824 break;
1825
1827
1828 /*
1829 * This may only complete the IO partially, either because some
1830 * buffers were already valid, or because of a partial read.
1831 *
1832 * NB: In contrast to after the AsyncReadBuffers() call in
1833 * StartReadBuffers(), we do *not* reduce
1834 * ReadBuffersOperation->nblocks here, callers expect the full
1835 * operation to be completed at this point (as more operations may
1836 * have been queued).
1837 */
1839 }
1840
1841 CheckReadBuffersOperation(operation, true);
1842
1843 /* NB: READ_DONE tracepoint was already executed in completion callback */
1844}
1845
1846/*
1847 * Initiate IO for the ReadBuffersOperation
1848 *
1849 * This function only starts a single IO at a time. The size of the IO may be
1850 * limited to fewer blocks than requested, if one of the buffers has
1851 * concurrently been read in. If the first to-be-read buffer is already valid,
1852 * no IO will be issued.
1853 *
1854 * To support retries after partial reads, the first operation->nblocks_done
1855 * buffers are skipped.
1856 *
1857 * On return *nblocks_progress is updated to reflect the number of buffers
1858 * affected by the call. If the first buffer is valid, *nblocks_progress is
1859 * set to 1 and operation->nblocks_done is incremented.
1860 *
1861 * Returns true if IO was initiated, false if no IO was necessary.
1862 */
1863static bool
1865{
1866 Buffer *buffers = &operation->buffers[0];
1867 int flags = operation->flags;
1868 BlockNumber blocknum = operation->blocknum;
1869 ForkNumber forknum = operation->forknum;
1870 char persistence = operation->persistence;
1871 int16 nblocks_done = operation->nblocks_done;
1872 Buffer *io_buffers = &operation->buffers[nblocks_done];
1873 int io_buffers_len = 0;
1875 uint32 ioh_flags = 0;
1879 bool did_start_io;
1880
1881 /*
1882 * When this IO is executed synchronously, either because the caller will
1883 * immediately block waiting for the IO or because IOMETHOD_SYNC is used,
1884 * the AIO subsystem needs to know.
1885 */
1886 if (flags & READ_BUFFERS_SYNCHRONOUSLY)
1888
1889 if (persistence == RELPERSISTENCE_TEMP)
1890 {
1894 }
1895 else
1896 {
1899 }
1900
1901 /*
1902 * If zero_damaged_pages is enabled, add the READ_BUFFERS_ZERO_ON_ERROR
1903 * flag. The reason for that is that, hopefully, zero_damaged_pages isn't
1904 * set globally, but on a per-session basis. The completion callback,
1905 * which may be run in other processes, e.g. in IO workers, may have a
1906 * different value of the zero_damaged_pages GUC.
1907 *
1908 * XXX: We probably should eventually use a different flag for
1909 * zero_damaged_pages, so we can report different log levels / error codes
1910 * for zero_damaged_pages and ZERO_ON_ERROR.
1911 */
1914
1915 /*
1916 * For the same reason as with zero_damaged_pages we need to use this
1917 * backend's ignore_checksum_failure value.
1918 */
1921
1922
1923 /*
1924 * To be allowed to report stats in the local completion callback we need
1925 * to prepare to report stats now. This ensures we can safely report the
1926 * checksum failure even in a critical section.
1927 */
1929
1930 /*
1931 * Get IO handle before ReadBuffersCanStartIO(), as pgaio_io_acquire()
1932 * might block, which we don't want after setting IO_IN_PROGRESS.
1933 *
1934 * If we need to wait for IO before we can get a handle, submit
1935 * already-staged IO first, so that other backends don't need to wait.
1936 * There wouldn't be a deadlock risk, as pgaio_io_acquire() just needs to
1937 * wait for already submitted IO, which doesn't require additional locks,
1938 * but it could still cause undesirable waits.
1939 *
1940 * A secondary benefit is that this would allow us to measure the time in
1941 * pgaio_io_acquire() without causing undue timer overhead in the common,
1942 * non-blocking, case. However, currently the pgstats infrastructure
1943 * doesn't really allow that, as it a) asserts that an operation can't
1944 * have time without operations b) doesn't have an API to report
1945 * "accumulated" time.
1946 */
1948 if (unlikely(!ioh))
1949 {
1951
1953 }
1954
1955 /*
1956 * Check if we can start IO on the first to-be-read buffer.
1957 *
1958 * If an I/O is already in progress in another backend, we want to wait
1959 * for the outcome: either done, or something went wrong and we will
1960 * retry.
1961 */
1962 if (!ReadBuffersCanStartIO(buffers[nblocks_done], false))
1963 {
1964 /*
1965 * Someone else has already completed this block, we're done.
1966 *
1967 * When IO is necessary, ->nblocks_done is updated in
1968 * ProcessReadBuffersResult(), but that is not called if no IO is
1969 * necessary. Thus update here.
1970 */
1971 operation->nblocks_done += 1;
1972 *nblocks_progress = 1;
1973
1975 pgaio_wref_clear(&operation->io_wref);
1976 did_start_io = false;
1977
1978 /*
1979 * Report and track this as a 'hit' for this backend, even though it
1980 * must have started out as a miss in PinBufferForBlock(). The other
1981 * backend will track this as a 'read'.
1982 */
1983 TRACE_POSTGRESQL_BUFFER_READ_DONE(forknum, blocknum + operation->nblocks_done,
1984 operation->smgr->smgr_rlocator.locator.spcOid,
1985 operation->smgr->smgr_rlocator.locator.dbOid,
1986 operation->smgr->smgr_rlocator.locator.relNumber,
1987 operation->smgr->smgr_rlocator.backend,
1988 true);
1989
1990 if (persistence == RELPERSISTENCE_TEMP)
1992 else
1994
1995 if (operation->rel)
1996 pgstat_count_buffer_hit(operation->rel);
1997
1999
2000 if (VacuumCostActive)
2002 }
2003 else
2004 {
2006
2007 /* We found a buffer that we need to read in. */
2008 Assert(io_buffers[0] == buffers[nblocks_done]);
2009 io_pages[0] = BufferGetBlock(buffers[nblocks_done]);
2010 io_buffers_len = 1;
2011
2012 /*
2013 * How many neighboring-on-disk blocks can we scatter-read into other
2014 * buffers at the same time? In this case we don't wait if we see an
2015 * I/O already in progress. We already set BM_IO_IN_PROGRESS for the
2016 * head block, so we should get on with that I/O as soon as possible.
2017 */
2018 for (int i = nblocks_done + 1; i < operation->nblocks; i++)
2019 {
2020 if (!ReadBuffersCanStartIO(buffers[i], true))
2021 break;
2022 /* Must be consecutive block numbers. */
2023 Assert(BufferGetBlockNumber(buffers[i - 1]) ==
2024 BufferGetBlockNumber(buffers[i]) - 1);
2025 Assert(io_buffers[io_buffers_len] == buffers[i]);
2026
2027 io_pages[io_buffers_len++] = BufferGetBlock(buffers[i]);
2028 }
2029
2030 /* get a reference to wait for in WaitReadBuffers() */
2031 pgaio_io_get_wref(ioh, &operation->io_wref);
2032
2033 /* provide the list of buffers to the completion callbacks */
2035
2037 persistence == RELPERSISTENCE_TEMP ?
2040 flags);
2041
2043
2044 /* ---
2045 * Even though we're trying to issue IO asynchronously, track the time
2046 * in smgrstartreadv():
2047 * - if io_method == IOMETHOD_SYNC, we will always perform the IO
2048 * immediately
2049 * - the io method might not support the IO (e.g. worker IO for a temp
2050 * table)
2051 * ---
2052 */
2054 smgrstartreadv(ioh, operation->smgr, forknum,
2055 blocknum + nblocks_done,
2059
2060 if (persistence == RELPERSISTENCE_TEMP)
2062 else
2064
2065 /*
2066 * Track vacuum cost when issuing IO, not after waiting for it.
2067 * Otherwise we could end up issuing a lot of IO in a short timespan,
2068 * despite a low cost limit.
2069 */
2070 if (VacuumCostActive)
2072
2074 did_start_io = true;
2075 }
2076
2077 return did_start_io;
2078}
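/*
 * Illustrative aside (not part of bufmgr.c): the machinery above is what a
 * plain ReadBuffer()/ReadBufferExtended() call ends up driving.  Below is a
 * hedged sketch of the caller-side view, written as if it lived in a separate
 * extension file; only long-standing bufmgr.h entry points are used and the
 * function name is invented for illustration.
 */
#include "postgres.h"

#include "storage/bufmgr.h"
#include "utils/rel.h"

static void
sketch_read_one_block(Relation rel, BlockNumber blkno)
{
	Buffer		buf;
	Page		page;

	/* Find or read the block; the buffer comes back pinned. */
	buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, NULL);

	/* Take a share lock before inspecting the page contents. */
	LockBuffer(buf, BUFFER_LOCK_SHARE);
	page = BufferGetPage(buf);
	(void) page;				/* ... examine the page here ... */

	/* Drop the content lock and the pin. */
	UnlockReleaseBuffer(buf);
}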
2079
2080/*
2081 * BufferAlloc -- subroutine for PinBufferForBlock. Handles lookup of a shared
2082 * buffer. If no buffer exists already, selects a replacement victim and
2083 * evicts the old page, but does NOT read in new page.
2084 *
2085 * "strategy" can be a buffer replacement strategy object, or NULL for
2086 * the default strategy. The selected buffer's usage_count is advanced when
2087 * using the default strategy, but otherwise possibly not (see PinBuffer).
2088 *
2089 * The returned buffer is pinned and is already marked as holding the
2090 * desired page. If it already did have the desired page, *foundPtr is
2091 * set true. Otherwise, *foundPtr is set false.
2092 *
2093 * io_context is passed as an output parameter to avoid calling
2094 * IOContextForStrategy() when there is a shared buffers hit and no IO
2095 * statistics need be captured.
2096 *
2097 * No locks are held either at entry or exit.
2098 */
2100BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
2101 BlockNumber blockNum,
2102 BufferAccessStrategy strategy,
2104{
2105 BufferTag newTag; /* identity of requested block */
2106 uint32 newHash; /* hash value for newTag */
2107 LWLock *newPartitionLock; /* buffer partition lock for it */
2108 int existing_buf_id;
2112 uint64 set_bits = 0;
2113
2114 /* Make sure we will have room to remember the buffer pin */
2117
2118 /* create a tag so we can lookup the buffer */
2119 InitBufferTag(&newTag, &smgr->smgr_rlocator.locator, forkNum, blockNum);
2120
2121 /* determine its hash code and partition lock ID */
2124
2125 /* see if the block is in the buffer pool already */
2128 if (existing_buf_id >= 0)
2129 {
2130 BufferDesc *buf;
2131 bool valid;
2132
2133 /*
2134 * Found it. Now, pin the buffer so no one can steal it from the
2135 * buffer pool, and check to see if the correct data has been loaded
2136 * into the buffer.
2137 */
2139
2140 valid = PinBuffer(buf, strategy, false);
2141
2142 /* Can release the mapping lock as soon as we've pinned it */
2144
2145 *foundPtr = true;
2146
2147 if (!valid)
2148 {
2149 /*
2150 * We can only get here if (a) someone else is still reading in
2151 * the page, (b) a previous read attempt failed, or (c) someone
2152 * called StartReadBuffers() but not yet WaitReadBuffers().
2153 */
2154 *foundPtr = false;
2155 }
2156
2157 return buf;
2158 }
2159
2160 /*
2161 * Didn't find it in the buffer pool. We'll have to initialize a new
2162 * buffer. Remember to unlock the mapping lock while doing the work.
2163 */
2165
2166 /*
2167 * Acquire a victim buffer. Somebody else might try to do the same, we
2168 * don't hold any conflicting locks. If so we'll have to undo our work
2169 * later.
2170 */
2173
2174 /*
2175 * Try to make a hashtable entry for the buffer under its new tag. If
2176 * somebody else inserted another buffer for the tag, we'll release the
2177 * victim buffer we acquired and use the already inserted one.
2178 */
2181 if (existing_buf_id >= 0)
2182 {
2184 bool valid;
2185
2186 /*
2187 * Got a collision. Someone has already done what we were about to do.
2188 * We'll just handle this as if it were found in the buffer pool in
2189 * the first place. First, give up the buffer we were planning to
2190 * use.
2191 *
2192 * We could do this after releasing the partition lock, but then we'd
2193 * have to call ResourceOwnerEnlarge() & ReservePrivateRefCountEntry()
2194 * before acquiring the lock, for the rare case of such a collision.
2195 */
2197
2198 /* remaining code should match code at top of routine */
2199
2201
2202 valid = PinBuffer(existing_buf_hdr, strategy, false);
2203
2204 /* Can release the mapping lock as soon as we've pinned it */
2206
2207 *foundPtr = true;
2208
2209 if (!valid)
2210 {
2211 /*
2212 * We can only get here if (a) someone else is still reading in
2213 * the page, (b) a previous read attempt failed, or (c) someone
2214 * called StartReadBuffers() but not yet WaitReadBuffers().
2215 */
2216 *foundPtr = false;
2217 }
2218
2219 return existing_buf_hdr;
2220 }
2221
2222 /*
2223 * Need to lock the buffer header too in order to change its tag.
2224 */
2226
2227 /* some sanity checks while we hold the buffer header lock */
2230
2231 victim_buf_hdr->tag = newTag;
2232
2233 /*
2234 * Make sure BM_PERMANENT is set for buffers that must be written at every
2235 * checkpoint. Unlogged buffers only need to be written at shutdown
2236 * checkpoints, except for their "init" forks, which need to be treated
2237 * just like permanent relations.
2238 */
2240 if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM)
2242
2244 set_bits, 0, 0);
2245
2247
2248 /*
2249 * Buffer contents are currently invalid.
2250 */
2251 *foundPtr = false;
2252
2253 return victim_buf_hdr;
2254}
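/*
 * Illustrative aside (not part of bufmgr.c): a hedged sketch of the
 * tag -> hash -> partition-lock -> lookup dance that BufferAlloc() performs,
 * reduced to a read-only probe.  It relies only on buf_internals.h facilities
 * used above (InitBufferTag, BufTableHashCode, BufMappingPartitionLock,
 * BufTableLookup); the function name is invented, and note that the answer is
 * immediately stale because no pin is taken.
 */
#include "postgres.h"

#include "storage/buf_internals.h"
#include "storage/smgr.h"

static bool
sketch_block_is_cached(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum)
{
	BufferTag	tag;
	uint32		hash;
	LWLock	   *partitionLock;
	int			buf_id;

	/* Build the lookup key exactly as BufferAlloc() does. */
	InitBufferTag(&tag, &smgr->smgr_rlocator.locator, forkNum, blockNum);
	hash = BufTableHashCode(&tag);
	partitionLock = BufMappingPartitionLock(hash);

	/* A shared partition lock suffices for a pure lookup. */
	LWLockAcquire(partitionLock, LW_SHARED);
	buf_id = BufTableLookup(&tag, hash);
	LWLockRelease(partitionLock);

	/* >= 0 means some shared buffer held this block at lookup time. */
	return buf_id >= 0;
}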
2255
2256/*
2257 * InvalidateBuffer -- mark a shared buffer invalid.
2258 *
2259 * The buffer header spinlock must be held at entry. We drop it before
2260 * returning. (This is sane because the caller must have locked the
2261 * buffer in order to be sure it should be dropped.)
2262 *
2263 * This is used only in contexts such as dropping a relation. We assume
2264 * that no other backend could possibly be interested in using the page,
2265 * so the only reason the buffer might be pinned is if someone else is
2266 * trying to write it out. We have to let them finish before we can
2267 * reclaim the buffer.
2268 *
2269 * The buffer could get reclaimed by someone else while we are waiting
2270 * to acquire the necessary locks; if so, don't mess it up.
2271 */
2272static void
2274{
2276 uint32 oldHash; /* hash value for oldTag */
2277 LWLock *oldPartitionLock; /* buffer partition lock for it */
2280
2281 /* Save the original buffer tag before dropping the spinlock */
2282 oldTag = buf->tag;
2283
2285
2286 /*
2287 * Need to compute the old tag's hashcode and partition lock ID. XXX is it
2288 * worth storing the hashcode in BufferDesc so we need not recompute it
2289 * here? Probably not.
2290 */
2293
2294retry:
2295
2296 /*
2297 * Acquire exclusive mapping lock in preparation for changing the buffer's
2298 * association.
2299 */
2301
2302 /* Re-lock the buffer header */
2304
2305 /* If it's changed while we were waiting for lock, do nothing */
2306 if (!BufferTagsEqual(&buf->tag, &oldTag))
2307 {
2310 return;
2311 }
2312
2313 /*
2314 * We assume the reason for it to be pinned is that either we were
2315 * asynchronously reading the page in before erroring out or someone else
2316 * is flushing the page out. Wait for the IO to finish. (This could be
2317 * an infinite loop if the refcount is messed up... it would be nice to
2318 * time out after a while, but there seems no way to be sure how many loops
2319 * may be needed. Note that if the other guy has pinned the buffer but
2320 * not yet done StartBufferIO, WaitIO will fall through and we'll
2321 * effectively be busy-looping here.)
2322 */
2324 {
2327 /* safety check: should definitely not be our *own* pin */
2329 elog(ERROR, "buffer is pinned in InvalidateBuffer");
2330 WaitIO(buf);
2331 goto retry;
2332 }
2333
2334 /*
2335 * An invalidated buffer should not have any backends waiting to lock the
2336 * buffer, therefore BM_LOCK_WAKE_IN_PROGRESS should not be set.
2337 */
2339
2340 /*
2341 * Clear out the buffer's tag and flags. We must do this to ensure that
2342 * linear scans of the buffer array don't think the buffer is valid.
2343 */
2345 ClearBufferTag(&buf->tag);
2346
2348 0,
2350 0);
2351
2352 /*
2353 * Remove the buffer from the lookup hashtable, if it was in there.
2354 */
2355 if (oldFlags & BM_TAG_VALID)
2357
2358 /*
2359 * Done with mapping lock.
2360 */
2362}
2363
2364/*
2365 * Helper routine for GetVictimBuffer()
2366 *
2367 * Needs to be called on a buffer with a valid tag, pinned, but without the
2368 * buffer header spinlock held.
2369 *
2370 * Returns true if the buffer can be reused, in which case the buffer is only
2371 * pinned by this backend and marked as invalid, false otherwise.
2372 */
2373static bool
2375{
2377 uint32 hash;
2379 BufferTag tag;
2380
2382
2383 /* have buffer pinned, so it's safe to read tag without lock */
2384 tag = buf_hdr->tag;
2385
2386 hash = BufTableHashCode(&tag);
2388
2390
2391 /* lock the buffer header */
2393
2394 /*
2395 * We have the buffer pinned, so nobody else should have been able to
2396 * unset this concurrently.
2397 */
2400 Assert(BufferTagsEqual(&buf_hdr->tag, &tag));
2401
2402 /*
2403 * If somebody else pinned the buffer since, or even worse, dirtied it,
2404 * give up on this buffer: It's clearly in use.
2405 */
2407 {
2409
2412
2413 return false;
2414 }
2415
2416 /*
2417 * An invalidated buffer should not have any backends waiting to lock the
2418 * buffer, therefore BM_LOCK_WAKE_IN_PROGRESS should not be set.
2419 */
2421
2422 /*
2423 * Clear out the buffer's tag and flags and usagecount. This is not
2424 * strictly required, as BM_TAG_VALID/BM_VALID needs to be checked before
2425 * doing anything with the buffer. But currently it's beneficial, as the
2426 * cheaper pre-checks used by several linear scans of shared buffers rely
2427 * on the tag (see e.g. FlushDatabaseBuffers()).
2428 */
2429 ClearBufferTag(&buf_hdr->tag);
2431 0,
2433 0);
2434
2436
2437 /* finally delete buffer from the buffer mapping table */
2438 BufTableDelete(&tag, hash);
2439
2441
2446
2447 return true;
2448}
2449
2450static Buffer
2452{
2454 Buffer buf;
2456 bool from_ring;
2457
2458 /*
2459 * Ensure, before we pin a victim buffer, that there's a free refcount
2460 * entry and resource owner slot for the pin.
2461 */
2464
2465 /* we return here if a prospective victim buffer gets used concurrently */
2466again:
2467
2468 /*
2469 * Select a victim buffer. The buffer is returned pinned and owned by
2470 * this backend.
2471 */
2474
2475 /*
2476 * We shouldn't have any other pins for this buffer.
2477 */
2479
2480 /*
2481 * If the buffer was dirty, try to write it out. There is a race
2482 * condition here, in that someone might dirty it after we released the
2483 * buffer header lock above, or even while we are writing it out (since
2484 * our share-lock won't prevent hint-bit updates). We will recheck the
2485 * dirty bit after re-locking the buffer header.
2486 */
2487 if (buf_state & BM_DIRTY)
2488 {
2491
2492 /*
2493 * We need a share-lock on the buffer contents to write it out (else
2494 * we might write invalid data, eg because someone else is compacting
2495 * the page contents while we write). We must use a conditional lock
2496 * acquisition here to avoid deadlock. Even though the buffer was not
2497 * pinned (and therefore surely not locked) when StrategyGetBuffer
2498 * returned it, someone else could have pinned and exclusive-locked it
2499 * by the time we get here. If we try to get the lock unconditionally,
2500 * we'd block waiting for them; if they later block waiting for us,
2501 * deadlock ensues. (This has been observed to happen when two
2502 * backends are both trying to split btree index pages, and the second
2503 * one just happens to be trying to split the page the first one got
2504 * from StrategyGetBuffer.)
2505 */
2507 {
2508 /*
2509 * Someone else has locked the buffer, so give it up and loop back
2510 * to get another one.
2511 */
2513 goto again;
2514 }
2515
2516 /*
2517 * If using a nondefault strategy, and writing the buffer would
2518 * require a WAL flush, let the strategy decide whether to go ahead
2519 * and write/reuse the buffer or to choose another victim. We need a
2520 * lock to inspect the page LSN, so this can't be done inside
2521 * StrategyGetBuffer.
2522 */
2523 if (strategy != NULL)
2524 {
2525 XLogRecPtr lsn;
2526
2527 /* Read the LSN while holding buffer header lock */
2529 lsn = BufferGetLSN(buf_hdr);
2531
2532 if (XLogNeedsFlush(lsn)
2533 && StrategyRejectBuffer(strategy, buf_hdr, from_ring))
2534 {
2537 goto again;
2538 }
2539 }
2540
2541 /* OK, do the I/O */
2544
2546 &buf_hdr->tag);
2547 }
2548
2549
2550 if (buf_state & BM_VALID)
2551 {
2552 /*
2553 * When a BufferAccessStrategy is in use, blocks evicted from shared
2554 * buffers are counted as IOOP_EVICT in the corresponding context
2555 * (e.g. IOCONTEXT_BULKWRITE). Shared buffers are evicted by a
2556 * strategy in two cases: 1) while initially claiming buffers for the
2557 * strategy ring 2) to replace an existing strategy ring buffer
2558 * because it is pinned or in use and cannot be reused.
2559 *
2560 * Blocks evicted from buffers already in the strategy ring are
2561 * counted as IOOP_REUSE in the corresponding strategy context.
2562 *
2563 * At this point, we can accurately count evictions and reuses,
2564 * because we have successfully claimed the valid buffer. Previously,
2565 * we may have been forced to release the buffer due to concurrent
2566 * pinners or erroring out.
2567 */
2569 from_ring ? IOOP_REUSE : IOOP_EVICT, 1, 0);
2570 }
2571
2572 /*
2573 * If the buffer has an entry in the buffer mapping table, delete it. This
2574 * can fail because another backend could have pinned or dirtied the
2575 * buffer.
2576 */
2578 {
2580 goto again;
2581 }
2582
2583 /* a final set of sanity checks */
2584#ifdef USE_ASSERT_CHECKING
2586
2589
2591#endif
2592
2593 return buf;
2594}
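/*
 * Illustrative aside (not part of bufmgr.c): the "take the lock only if we
 * don't have to wait, otherwise give up" pattern used above to avoid
 * deadlocks is available to ordinary callers as ConditionalLockBuffer().
 * A hedged sketch with an invented function name; the caller is assumed to
 * already hold a pin on the buffer.
 */
#include "postgres.h"

#include "storage/bufmgr.h"

static bool
sketch_try_exclusive_work(Buffer buf)
{
	if (!ConditionalLockBuffer(buf))
	{
		/* Somebody else holds a conflicting content lock; don't wait. */
		return false;
	}

	/* ... do the work that needs the exclusive content lock here ... */

	LockBuffer(buf, BUFFER_LOCK_UNLOCK);
	return true;
}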
2595
2596/*
2597 * Return the maximum number of buffers that a backend should try to pin at once,
2598 * to avoid exceeding its fair share. This is the highest value that
2599 * GetAdditionalPinLimit() could ever return. Note that it may be zero on a
2600 * system with a very small buffer pool relative to max_connections.
2601 */
2602uint32
2604{
2605 return MaxProportionalPins;
2606}
2607
2608/*
2609 * Return the maximum number of additional buffers that this backend should
2610 * pin if it wants to stay under the per-backend limit, considering the number
2611 * of buffers it has already pinned. Unlike LimitAdditionalPins(), the limit
2612 * returned by this function can be zero.
2613 */
2614uint32
2616{
2618
2619 /*
2620 * We get the number of "overflowed" pins for free, but don't know the
2621 * number of pins in PrivateRefCountArray. The cost of calculating that
2622 * exactly doesn't seem worth it, so just assume the max.
2623 */
2625
2626 /* Is this backend already holding more than its fair share? */
2628 return 0;
2629
2631}
2632
2633/*
2634 * Limit the number of pins a batch operation may additionally acquire, to
2635 * avoid running out of pinnable buffers.
2636 *
2637 * One additional pin is always allowed, on the assumption that the operation
2638 * requires at least one to make progress.
2639 */
2640void
2642{
2643 uint32 limit;
2644
2645 if (*additional_pins <= 1)
2646 return;
2647
2648 limit = GetAdditionalPinLimit();
2649 limit = Max(limit, 1);
2650 if (limit < *additional_pins)
2651 *additional_pins = limit;
2652}
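/*
 * Illustrative aside (not part of bufmgr.c): how a batch operation might use
 * the pin-limit API above before pinning a run of buffers.  A hedged sketch;
 * the function and variable names are invented.
 */
#include "postgres.h"

#include "storage/bufmgr.h"

static uint32
sketch_clamp_batch_size(uint32 wanted)
{
	uint32		pins = wanted;

	/*
	 * Never ask for more than our fair share; LimitAdditionalPins() always
	 * leaves at least one pin so the operation can make progress, whereas
	 * GetAdditionalPinLimit() may legitimately report zero.
	 */
	LimitAdditionalPins(&pins);

	/* The caller would now pin at most 'pins' buffers in this batch. */
	return pins;
}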
2653
2654/*
2655 * Logic shared between ExtendBufferedRelBy(), ExtendBufferedRelTo(). Just to
2656 * avoid duplicating the tracing and relpersistence related logic.
2657 */
2658static BlockNumber
2661 BufferAccessStrategy strategy,
2662 uint32 flags,
2665 Buffer *buffers,
2667{
2669
2671 BMR_GET_SMGR(bmr)->smgr_rlocator.locator.spcOid,
2672 BMR_GET_SMGR(bmr)->smgr_rlocator.locator.dbOid,
2673 BMR_GET_SMGR(bmr)->smgr_rlocator.locator.relNumber,
2674 BMR_GET_SMGR(bmr)->smgr_rlocator.backend,
2675 extend_by);
2676
2677 if (bmr.relpersistence == RELPERSISTENCE_TEMP)
2680 buffers, &extend_by);
2681 else
2682 first_block = ExtendBufferedRelShared(bmr, fork, strategy, flags,
2684 buffers, &extend_by);
2686
2688 BMR_GET_SMGR(bmr)->smgr_rlocator.locator.spcOid,
2689 BMR_GET_SMGR(bmr)->smgr_rlocator.locator.dbOid,
2690 BMR_GET_SMGR(bmr)->smgr_rlocator.locator.relNumber,
2691 BMR_GET_SMGR(bmr)->smgr_rlocator.backend,
2692 *extended_by,
2693 first_block);
2694
2695 return first_block;
2696}
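/*
 * Illustrative aside (not part of bufmgr.c): a hedged sketch of calling the
 * public ExtendBufferedRelBy() wrapper that funnels into
 * ExtendBufferedRelCommon() above, assuming the signature declared in
 * bufmgr.h.  Names other than the bufmgr.h entry points are invented.
 */
#include "postgres.h"

#include "storage/bufmgr.h"
#include "utils/rel.h"

static BlockNumber
sketch_extend_by_two(Relation rel)
{
	Buffer		buffers[2];
	uint32		extended_by = 0;
	BlockNumber first_block;

	/*
	 * Add two zeroed blocks to the main fork; with EB_LOCK_FIRST the first
	 * new buffer is returned exclusively locked.
	 */
	first_block = ExtendBufferedRelBy(BMR_REL(rel), MAIN_FORKNUM,
									  NULL, EB_LOCK_FIRST,
									  2, buffers, &extended_by);

	/* Release what we got back; only the first buffer is also locked. */
	for (uint32 i = 0; i < extended_by; i++)
	{
		if (i == 0)
			UnlockReleaseBuffer(buffers[i]);
		else
			ReleaseBuffer(buffers[i]);
	}

	return first_block;
}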
2697
2698/*
2699 * Implementation of ExtendBufferedRelBy() and ExtendBufferedRelTo() for
2700 * shared buffers.
2701 */
2702static BlockNumber
2705 BufferAccessStrategy strategy,
2706 uint32 flags,
2709 Buffer *buffers,
2711{
2715
2717
2718 /*
2719 * Acquire victim buffers for extension without holding extension lock.
2720 * Writing out victim buffers is the most expensive part of extending the
2721 * relation, particularly when doing so requires WAL flushes. Zeroing out
2722 * the buffers is also quite expensive, so do that before holding the
2723 * extension lock as well.
2724 *
2725 * These pages are pinned by us and not valid. While we hold the pin they
2726 * can't be acquired as victim buffers by another backend.
2727 */
2728 for (uint32 i = 0; i < extend_by; i++)
2729 {
2731
2732 buffers[i] = GetVictimBuffer(strategy, io_context);
2734
2735 /* new buffers are zero-filled */
2736 MemSet(buf_block, 0, BLCKSZ);
2737 }
2738
2739 /*
2740 * Lock relation against concurrent extensions, unless requested not to.
2741 *
2742 * We use the same extension lock for all forks. That's unnecessarily
2743 * restrictive, but currently extensions for forks don't happen often
2744 * enough to make it worth locking more granularly.
2745 *
2746 * Note that another backend might have extended the relation by the time
2747 * we get the lock.
2748 */
2749 if (!(flags & EB_SKIP_EXTENSION_LOCK))
2751
2752 /*
2753 * If requested, invalidate size cache, so that smgrnblocks asks the
2754 * kernel.
2755 */
2756 if (flags & EB_CLEAR_SIZE_CACHE)
2757 BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] = InvalidBlockNumber;
2758
2760
2761 /*
2762 * Now that we have the accurate relation size, check if the caller wants
2763 * us to extend to only up to a specific size. If there were concurrent
2764 * extensions, we might have acquired too many buffers and need to release
2765 * them.
2766 */
2768 {
2770
2772 extend_by = 0;
2773 else if ((uint64) first_block + extend_by > extend_upto)
2775
2776 for (uint32 i = extend_by; i < orig_extend_by; i++)
2777 {
2778 BufferDesc *buf_hdr = GetBufferDescriptor(buffers[i] - 1);
2779
2781 }
2782
2783 if (extend_by == 0)
2784 {
2785 if (!(flags & EB_SKIP_EXTENSION_LOCK))
2788 return first_block;
2789 }
2790 }
2791
2792 /* Fail if relation is already at maximum possible length */
2794 ereport(ERROR,
2796 errmsg("cannot extend relation %s beyond %u blocks",
2797 relpath(BMR_GET_SMGR(bmr)->smgr_rlocator, fork).str,
2798 MaxBlockNumber)));
2799
2800 /*
2801 * Insert buffers into buffer table, mark as IO_IN_PROGRESS.
2802 *
2803 * This needs to happen before we extend the relation, because as soon as
2804 * we do, other backends can start to read in those pages.
2805 */
2806 for (uint32 i = 0; i < extend_by; i++)
2807 {
2808 Buffer victim_buf = buffers[i];
2810 BufferTag tag;
2811 uint32 hash;
2813 int existing_id;
2814
2815 /* in case we need to pin an existing buffer below */
2818
2819 InitBufferTag(&tag, &BMR_GET_SMGR(bmr)->smgr_rlocator.locator, fork,
2820 first_block + i);
2821 hash = BufTableHashCode(&tag);
2823
2825
2827
2828 /*
2829 * We get here only in the corner case where we are trying to extend
2830 * the relation but we found a pre-existing buffer. This can happen
2831 * because a prior attempt at extending the relation failed, and
2832 * because mdread doesn't complain about reads beyond EOF (when
2833 * zero_damaged_pages is ON) and so a previous attempt to read a block
2834 * beyond EOF could have left a "valid" zero-filled buffer.
2835 *
2836 * This has also been observed when the relation was overwritten by an
2837 * external process. Since the legitimate cases should always have
2838 * left a zero-filled buffer, complain if not PageIsNew.
2839 */
2840 if (existing_id >= 0)
2841 {
2844 bool valid;
2845
2846 /*
2847 * Pin the existing buffer before releasing the partition lock,
2848 * preventing it from being evicted.
2849 */
2850 valid = PinBuffer(existing_hdr, strategy, false);
2851
2854
2857
2858 if (valid && !PageIsNew((Page) buf_block))
2859 ereport(ERROR,
2860 (errmsg("unexpected data beyond EOF in block %u of relation \"%s\"",
2861 existing_hdr->tag.blockNum,
2862 relpath(BMR_GET_SMGR(bmr)->smgr_rlocator, fork).str)));
2863
2864 /*
2865 * We *must* do smgr[zero]extend before succeeding, else the page
2866 * will not be reserved by the kernel, and the next P_NEW call
2867 * will decide to return the same page. Clear the BM_VALID bit,
2868 * do StartBufferIO() and proceed.
2869 *
2870 * Loop to handle the very small possibility that someone re-sets
2871 * BM_VALID between our clearing it and StartBufferIO inspecting
2872 * it.
2873 */
2874 do
2875 {
2877 } while (!StartBufferIO(existing_hdr, true, false));
2878 }
2879 else
2880 {
2882 uint64 set_bits = 0;
2883
2885
2886 /* some sanity checks while we hold the buffer header lock */
2889
2890 victim_buf_hdr->tag = tag;
2891
2893 if (bmr.relpersistence == RELPERSISTENCE_PERMANENT || fork == INIT_FORKNUM)
2895
2897 set_bits, 0,
2898 0);
2899
2901
2902 /* XXX: could combine the locked operations in it with the above */
2903 StartBufferIO(victim_buf_hdr, true, false);
2904 }
2905 }
2906
2908
2909 /*
2910 * Note: if smgrzeroextend fails, we will end up with buffers that are
2911 * allocated but not marked BM_VALID. The next relation extension will
2912 * still select the same block number (because the relation didn't get any
2913 * longer on disk) and so future attempts to extend the relation will find
2914 * the same buffers (if they have not been recycled) but come right back
2915 * here to try smgrzeroextend again.
2916 *
2917 * We don't need to set checksum for all-zero pages.
2918 */
2920
2921 /*
2922 * Release the file-extension lock; it's now OK for someone else to extend
2923 * the relation some more.
2924 *
2925 * We remove IO_IN_PROGRESS after this, as waking up waiting backends can
2926 * take noticeable time.
2927 */
2928 if (!(flags & EB_SKIP_EXTENSION_LOCK))
2930
2932 io_start, 1, extend_by * BLCKSZ);
2933
2934 /* Set BM_VALID, terminate IO, and wake up any waiters */
2935 for (uint32 i = 0; i < extend_by; i++)
2936 {
2937 Buffer buf = buffers[i];
2939 bool lock = false;
2940
2941 if (flags & EB_LOCK_FIRST && i == 0)
2942 lock = true;
2943 else if (flags & EB_LOCK_TARGET)
2944 {
2946 if (first_block + i + 1 == extend_upto)
2947 lock = true;
2948 }
2949
2950 if (lock)
2952
2953 TerminateBufferIO(buf_hdr, false, BM_VALID, true, false);
2954 }
2955
2957
2959
2960 return first_block;
2961}
2962
2963/*
2964 * BufferIsLockedByMe
2965 *
2966 * Checks if this backend has the buffer locked in any mode.
2967 *
2968 * Buffer must be pinned.
2969 */
2970bool
2972{
2974
2976
2977 if (BufferIsLocal(buffer))
2978 {
2979 /* Content locks are not maintained for local buffers. */
2980 return true;
2981 }
2982 else
2983 {
2985 return BufferLockHeldByMe(bufHdr);
2986 }
2987}
2988
2989/*
2990 * BufferIsLockedByMeInMode
2991 *
2992 * Checks if this backend has the buffer locked in the specified mode.
2993 *
2994 * Buffer must be pinned.
2995 */
2996bool
2998{
3000
3002
3003 if (BufferIsLocal(buffer))
3004 {
3005 /* Content locks are not maintained for local buffers. */
3006 return true;
3007 }
3008 else
3009 {
3012 }
3013}
3014
3015/*
3016 * BufferIsDirty
3017 *
3018 * Checks if buffer is already dirty.
3019 *
3020 * Buffer must be pinned and exclusive-locked. (Without an exclusive lock,
3021 * the result may be stale before it's returned.)
3022 */
3023bool
3025{
3027
3029
3030 if (BufferIsLocal(buffer))
3031 {
3032 int bufid = -buffer - 1;
3033
3035 /* Content locks are not maintained for local buffers. */
3036 }
3037 else
3038 {
3041 }
3042
3043 return pg_atomic_read_u64(&bufHdr->state) & BM_DIRTY;
3044}
3045
3046/*
3047 * MarkBufferDirty
3048 *
3049 * Marks buffer contents as dirty (actual write happens later).
3050 *
3051 * Buffer must be pinned and exclusive-locked. (If caller does not hold
3052 * exclusive lock, then somebody could be in process of writing the buffer,
3053 * leading to risk of bad data written to disk.)
3054 */
3055void
3057{
3061
3062 if (!BufferIsValid(buffer))
3063 elog(ERROR, "bad buffer ID: %d", buffer);
3064
3065 if (BufferIsLocal(buffer))
3066 {
3068 return;
3069 }
3070
3072
3075
3076 /*
3077 * NB: We have to wait for the buffer header spinlock to be not held, as
3078 * TerminateBufferIO() relies on the spinlock.
3079 */
3081 for (;;)
3082 {
3085
3087
3090
3092 buf_state))
3093 break;
3094 }
3095
3096 /*
3097 * If the buffer was not dirty already, do vacuum accounting.
3098 */
3099 if (!(old_buf_state & BM_DIRTY))
3100 {
3102 if (VacuumCostActive)
3104 }
3105}
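/*
 * Illustrative aside (not part of bufmgr.c): the canonical pin -> exclusive
 * lock -> modify -> MarkBufferDirty() sequence that the comment above
 * assumes, written as a hedged sketch with invented names.  WAL logging is
 * elided, so this shape by itself is only suitable for changes that may be
 * lost (hint-bit style updates would use MarkBufferDirtyHint() instead).
 */
#include "postgres.h"

#include "storage/bufmgr.h"
#include "utils/rel.h"

static void
sketch_touch_page(Relation rel, BlockNumber blkno)
{
	Buffer		buf;
	Page		page;

	buf = ReadBuffer(rel, blkno);
	LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

	page = BufferGetPage(buf);
	(void) page;				/* ... modify the page while holding the lock ... */

	/* Must hold the exclusive content lock when dirtying the buffer. */
	MarkBufferDirty(buf);

	UnlockReleaseBuffer(buf);
}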
3106
3107/*
3108 * ReleaseAndReadBuffer -- combine ReleaseBuffer() and ReadBuffer()
3109 *
3110 * Formerly, this saved one cycle of acquiring/releasing the BufMgrLock
3111 * compared to calling the two routines separately. Now it's mainly just
3112 * a convenience function. However, if the passed buffer is valid and
3113 * already contains the desired block, we just return it as-is; and that
3114 * does save considerable work compared to a full release and reacquire.
3115 *
3116 * Note: it is OK to pass buffer == InvalidBuffer, indicating that no old
3117 * buffer actually needs to be released. This case is the same as ReadBuffer,
3118 * but can save some tests in the caller.
3119 */
3120Buffer
3122 Relation relation,
3123 BlockNumber blockNum)
3124{
3125 ForkNumber forkNum = MAIN_FORKNUM;
3127
3128 if (BufferIsValid(buffer))
3129 {
3131 if (BufferIsLocal(buffer))
3132 {
3134 if (bufHdr->tag.blockNum == blockNum &&
3135 BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
3136 BufTagGetForkNum(&bufHdr->tag) == forkNum)
3137 return buffer;
3139 }
3140 else
3141 {
3143 /* we have pin, so it's ok to examine tag without spinlock */
3144 if (bufHdr->tag.blockNum == blockNum &&
3145 BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
3146 BufTagGetForkNum(&bufHdr->tag) == forkNum)
3147 return buffer;
3149 }
3150 }
3151
3152 return ReadBuffer(relation, blockNum);
3153}
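/*
 * Illustrative aside (not part of bufmgr.c): a typical ReleaseAndReadBuffer()
 * loop, where a single Buffer variable walks a sequence of blocks and a
 * repeat read of the same block costs nothing.  A hedged sketch; the function
 * name is invented.
 */
#include "postgres.h"

#include "storage/bufmgr.h"
#include "utils/rel.h"

static void
sketch_walk_blocks(Relation rel, BlockNumber from, BlockNumber to)
{
	Buffer		buf = InvalidBuffer;

	for (BlockNumber blkno = from; blkno <= to; blkno++)
	{
		/* Releases the previous pin (if any) unless it already holds blkno. */
		buf = ReleaseAndReadBuffer(buf, rel, blkno);

		LockBuffer(buf, BUFFER_LOCK_SHARE);
		/* ... inspect BufferGetPage(buf) here ... */
		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
	}

	if (BufferIsValid(buf))
		ReleaseBuffer(buf);
}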
3154
3155/*
3156 * PinBuffer -- make buffer unavailable for replacement.
3157 *
3158 * For the default access strategy, the buffer's usage_count is incremented
3159 * when we first pin it; for other strategies we just make sure the usage_count
3160 * isn't zero. (The idea of the latter is that we don't want synchronized
3161 * heap scans to inflate the count, but we need it to not be zero to discourage
3162 * other backends from stealing buffers from our ring. As long as we cycle
3163 * through the ring faster than the global clock-sweep cycles, buffers in
3164 * our ring won't be chosen as victims for replacement by other backends.)
3165 *
3166 * This should be applied only to shared buffers, never local ones.
3167 *
3168 * Since buffers are pinned/unpinned very frequently, pin buffers without
3169 * taking the buffer header lock; instead update the state variable in a
3170 * loop of CAS operations. Hopefully it's just a single CAS.
3171 *
3172 * Note that ResourceOwnerEnlarge() and ReservePrivateRefCountEntry()
3173 * must have been done already.
3174 *
3175 * Returns true if buffer is BM_VALID, else false. This provision allows
3176 * some callers to avoid an extra spinlock cycle. If skip_if_not_valid is
3177 * true, then a false return value also indicates that the buffer was
3178 * (recently) invalid and has not been pinned.
3179 */
3180static bool
3182 bool skip_if_not_valid)
3183{
3185 bool result;
3187
3190
3191 ref = GetPrivateRefCountEntry(b, true);
3192
3193 if (ref == NULL)
3194 {
3197
3199 for (;;)
3200 {
3202 return false;
3203
3204 /*
3205 * We're not allowed to increase the refcount while the buffer
3206 * header spinlock is held. Wait for the lock to be released.
3207 */
3210
3212
3213 /* increase refcount */
3215
3216 if (strategy == NULL)
3217 {
3218 /* Default case: increase usagecount unless already max. */
3221 }
3222 else
3223 {
3224 /*
3225 * Ring buffers shouldn't evict others from pool. Thus we
3226 * don't make usagecount more than 1.
3227 */
3230 }
3231
3233 buf_state))
3234 {
3235 result = (buf_state & BM_VALID) != 0;
3236
3238 break;
3239 }
3240 }
3241 }
3242 else
3243 {
3244 /*
3245 * If we previously pinned the buffer, it is likely to be valid, but
3246 * it may not be if StartReadBuffers() was called and
3247 * WaitReadBuffers() hasn't been called yet. We'll check by loading
3248 * the flags without locking. This is racy, but it's OK to return
3249 * false spuriously: when WaitReadBuffers() calls StartBufferIO(),
3250 * it'll see that it's now valid.
3251 *
3252 * Note: We deliberately avoid a Valgrind client request here.
3253 * Individual access methods can optionally superimpose buffer page
3254 * client requests on top of our client requests to enforce that
3255 * buffers are only accessed while locked (and pinned). It's possible
3256 * that the buffer page is legitimately non-accessible here. We
3257 * cannot meddle with that.
3258 */
3259 result = (pg_atomic_read_u64(&buf->state) & BM_VALID) != 0;
3260
3261 Assert(ref->data.refcount > 0);
3262 ref->data.refcount++;
3264 }
3265
3266 return result;
3267}
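/*
 * Illustrative aside (not part of bufmgr.c): the shape of the lock-free CAS
 * loop used above, reduced to a toy refcount packed into a pg_atomic_uint64.
 * This is a hedged, generic sketch; the real buffer state word also packs
 * flags and a usage count, and must additionally respect the buffer header
 * spinlock, none of which is modelled here.
 */
#include "postgres.h"

#include "port/atomics.h"

static void
sketch_cas_increment(pg_atomic_uint64 *state)
{
	uint64		old_state = pg_atomic_read_u64(state);

	for (;;)
	{
		uint64		new_state = old_state + 1;	/* "add one pin" */

		/*
		 * On success we are done; on failure old_state has been refreshed
		 * with the current value and we simply retry.
		 */
		if (pg_atomic_compare_exchange_u64(state, &old_state, new_state))
			break;
	}
}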
3268
3269/*
3270 * PinBuffer_Locked -- as above, but caller already locked the buffer header.
3271 * The spinlock is released before return.
3272 *
3273 * As this function is called with the spinlock held, the caller has to
3274 * previously call ReservePrivateRefCountEntry() and
3275 * ResourceOwnerEnlarge(CurrentResourceOwner);
3276 *
3277 * Currently, no callers of this function want to modify the buffer's
3278 * usage_count at all, so there's no need for a strategy parameter.
3279 * Also we don't bother with a BM_VALID test (the caller could check that for
3280 * itself).
3281 *
3282 * Also all callers only ever use this function when it's known that the
3283 * buffer can't have a preexisting pin by this backend. That allows us to skip
3284 * searching the private refcount array & hash, which is a boon, because the
3285 * spinlock is still held.
3286 *
3287 * Note: use of this routine is frequently mandatory, not just an optimization
3288 * to save a spin lock/unlock cycle, because we need to pin a buffer before
3289 * its state can change under us.
3290 */
3291static void
3293{
3295
3296 /*
3297 * As explained, we don't expect any preexisting pins. That allows us to
3298 * manipulate the PrivateRefCount after releasing the spinlock.
3299 */
3301
3302 /*
3303 * Since we hold the buffer spinlock, we can update the buffer state and
3304 * release the lock in one operation.
3305 */
3307
3309 0, 0, 1);
3310
3312}
3313
3314/*
3315 * Support for waking up another backend that is waiting for the cleanup lock
3316 * to be released using BM_PIN_COUNT_WAITER.
3317 *
3318 * See LockBufferForCleanup().
3319 *
3320 * Expected to be called just after releasing a buffer pin (in a BufferDesc,
3321 * not just reducing the backend-local pincount for the buffer).
3322 */
3323static void
3325{
3326 /*
3327 * Acquire the buffer header lock, re-check that there's a waiter. Another
3328 * backend could have unpinned this buffer, and already woken up the
3329 * waiter.
3330 *
3331 * There's no danger of the buffer being replaced after we unpinned it
3332 * above, as it's pinned by the waiter. The waiter removes
3333 * BM_PIN_COUNT_WAITER if it stops waiting for a reason other than this
3334 * backend waking it up.
3335 */
3337
3340 {
3341 /* we just released the last pin other than the waiter's */
3342 int wait_backend_pgprocno = buf->wait_backend_pgprocno;
3343
3346 0);
3347 ProcSendSignal(wait_backend_pgprocno);
3348 }
3349 else
3351}
3352
3353/*
3354 * UnpinBuffer -- make buffer available for replacement.
3355 *
3356 * This should be applied only to shared buffers, never local ones. This
3357 * always adjusts CurrentResourceOwner.
3358 */
3359static void
3367
3368static void
3370{
3373
3375
3376 /* not moving as we're likely deleting it soon anyway */
3377 ref = GetPrivateRefCountEntry(b, false);
3378 Assert(ref != NULL);
3379 Assert(ref->data.refcount > 0);
3380 ref->data.refcount--;
3381 if (ref->data.refcount == 0)
3382 {
3384
3385 /*
3386 * Mark buffer non-accessible to Valgrind.
3387 *
3388 * Note that the buffer may have already been marked non-accessible
3389 * within access method code that enforces that buffers are only
3390 * accessed while a buffer lock is held.
3391 */
3393
3394 /*
3395 * I'd better not still hold the buffer content lock. Can't use
3396 * BufferIsLockedByMe(), as that asserts the buffer is pinned.
3397 */
3399
3400 /* decrement the shared reference count */
3402
3403 /* Support LockBufferForCleanup() */
3406
3408 }
3409}
3410
3411/*
3412 * Set up backend-local tracking of a buffer pinned the first time by this
3413 * backend.
3414 */
3415inline void
3417{
3419
3421 ref->data.refcount++;
3422
3424
3425 /*
3426 * This is the first pin for this page by this backend, mark its page as
3427 * defined to valgrind. While the page contents might not actually be
3428 * valid yet, we don't currently guarantee that such pages are marked
3429 * undefined or non-accessible.
3430 *
3431 * It's not necessarily the prettiest to do this here, but otherwise we'd
3432 * need this block of code in multiple places.
3433 */
3435 BLCKSZ);
3436}
3437
3438#define ST_SORT sort_checkpoint_bufferids
3439#define ST_ELEMENT_TYPE CkptSortItem
3440#define ST_COMPARE(a, b) ckpt_buforder_comparator(a, b)
3441#define ST_SCOPE static
3442#define ST_DEFINE
3443#include "lib/sort_template.h"
3444
3445/*
3446 * BufferSync -- Write out all dirty buffers in the pool.
3447 *
3448 * This is called at checkpoint time to write out all dirty shared buffers.
3449 * The checkpoint request flags should be passed in. If CHECKPOINT_FAST is
3450 * set, we disable delays between writes; if CHECKPOINT_IS_SHUTDOWN,
3451 * CHECKPOINT_END_OF_RECOVERY or CHECKPOINT_FLUSH_UNLOGGED is set, we write
3452 * even unlogged buffers, which are otherwise skipped. The remaining flags
3453 * currently have no effect here.
3454 */
3455static void
3456BufferSync(int flags)
3457{
3459 int buf_id;
3460 int num_to_scan;
3461 int num_spaces;
3462 int num_processed;
3463 int num_written;
3465 Oid last_tsid;
3467 int i;
3468 uint64 mask = BM_DIRTY;
3470
3471 /*
3472 * Unless this is a shutdown checkpoint or we have been explicitly told,
3473 * we write only permanent, dirty buffers. But at shutdown or end of
3474 * recovery, we write all dirty buffers.
3475 */
3478 mask |= BM_PERMANENT;
3479
3480 /*
3481 * Loop over all buffers, and mark the ones that need to be written with
3482 * BM_CHECKPOINT_NEEDED. Count them as we go (num_to_scan), so that we
3483 * can estimate how much work needs to be done.
3484 *
3485 * This allows us to write only those pages that were dirty when the
3486 * checkpoint began, and not those that get dirtied while it proceeds.
3487 * Whenever a page with BM_CHECKPOINT_NEEDED is written out, either by us
3488 * later in this function, or by normal backends or the bgwriter cleaning
3489 * scan, the flag is cleared. Any buffer dirtied after this point won't
3490 * have the flag set.
3491 *
3492 * Note that if we fail to write some buffer, we may leave buffers with
3493 * BM_CHECKPOINT_NEEDED still set. This is OK since any such buffer would
3494 * certainly need to be written for the next checkpoint attempt, too.
3495 */
3496 num_to_scan = 0;
3497 for (buf_id = 0; buf_id < NBuffers; buf_id++)
3498 {
3500 uint64 set_bits = 0;
3501
3502 /*
3503 * Header spinlock is enough to examine BM_DIRTY, see comment in
3504 * SyncOneBuffer.
3505 */
3507
3508 if ((buf_state & mask) == mask)
3509 {
3510 CkptSortItem *item;
3511
3513
3514 item = &CkptBufferIds[num_to_scan++];
3515 item->buf_id = buf_id;
3516 item->tsId = bufHdr->tag.spcOid;
3517 item->relNumber = BufTagGetRelNumber(&bufHdr->tag);
3518 item->forkNum = BufTagGetForkNum(&bufHdr->tag);
3519 item->blockNum = bufHdr->tag.blockNum;
3520 }
3521
3523 set_bits, 0,
3524 0);
3525
3526 /* Check for barrier events in case NBuffers is large. */
3529 }
3530
3531 if (num_to_scan == 0)
3532 return; /* nothing to do */
3533
3535
3537
3538 /*
3539 * Sort buffers that need to be written to reduce the likelihood of random
3540 * IO. The sorting is also important for the implementation of balancing
3541 * writes between tablespaces. Without balancing writes we'd potentially
3542 * end up writing to the tablespaces one-by-one; possibly overloading the
3543 * underlying system.
3544 */
3546
3547 num_spaces = 0;
3548
3549 /*
3550 * Allocate progress status for each tablespace with buffers that need to
3551 * be flushed. This requires the to-be-flushed array to be sorted.
3552 */
3554 for (i = 0; i < num_to_scan; i++)
3555 {
3556 CkptTsStatus *s;
3557 Oid cur_tsid;
3558
3560
3561 /*
3562 * Grow array of per-tablespace status structs, every time a new
3563 * tablespace is found.
3564 */
3566 {
3567 Size sz;
3568
3569 num_spaces++;
3570
3571 /*
3572 * Not worth adding grow-by-power-of-2 logic here - even with a
3573 * few hundred tablespaces this should be fine.
3574 */
3575 sz = sizeof(CkptTsStatus) * num_spaces;
3576
3577 if (per_ts_stat == NULL)
3579 else
3581
3582 s = &per_ts_stat[num_spaces - 1];
3583 memset(s, 0, sizeof(*s));
3584 s->tsId = cur_tsid;
3585
3586 /*
3587 * The first buffer in this tablespace. As CkptBufferIds is sorted
3588 * by tablespace all (s->num_to_scan) buffers in this tablespace
3589 * will follow afterwards.
3590 */
3591 s->index = i;
3592
3593 /*
3594 * progress_slice will be determined once we know how many buffers
3595 * are in each tablespace, i.e. after this loop.
3596 */
3597
3599 }
3600 else
3601 {
3602 s = &per_ts_stat[num_spaces - 1];
3603 }
3604
3605 s->num_to_scan++;
3606
3607 /* Check for barrier events. */
3610 }
3611
3612 Assert(num_spaces > 0);
3613
3614 /*
3615 * Build a min-heap over the write-progress in the individual tablespaces,
3616 * and compute how large a portion of the total progress a single
3617 * processed buffer is.
3618 */
3621 NULL);
3622
3623 for (i = 0; i < num_spaces; i++)
3624 {
3626
3627 ts_stat->progress_slice = (float8) num_to_scan / ts_stat->num_to_scan;
3628
3630 }
3631
3633
3634 /*
3635 * Iterate through to-be-checkpointed buffers and write the ones (still)
3636 * marked with BM_CHECKPOINT_NEEDED. The writes are balanced between
3637 * tablespaces; otherwise the sorting would lead to only one tablespace
3638 * receiving writes at a time, making inefficient use of the hardware.
3639 */
3640 num_processed = 0;
3641 num_written = 0;
3642 while (!binaryheap_empty(ts_heap))
3643 {
3647
3648 buf_id = CkptBufferIds[ts_stat->index].buf_id;
3649 Assert(buf_id != -1);
3650
3651 bufHdr = GetBufferDescriptor(buf_id);
3652
3653 num_processed++;
3654
3655 /*
3656 * We don't need to acquire the lock here, because we're only looking
3657 * at a single bit. It's possible that someone else writes the buffer
3658 * and clears the flag right after we check, but that doesn't matter
3659 * since SyncOneBuffer will then do nothing. However, there is a
3660 * further race condition: it's conceivable that between the time we
3661 * examine the bit here and the time SyncOneBuffer acquires the lock,
3662 * someone else not only wrote the buffer but replaced it with another
3663 * page and dirtied it. In that improbable case, SyncOneBuffer will
3664 * write the buffer though we didn't need to. It doesn't seem worth
3665 * guarding against this, though.
3666 */
3668 {
3669 if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
3670 {
3673 num_written++;
3674 }
3675 }
3676
3677 /*
3678 * Measure progress independently of actually having to flush the buffer;
3679 * otherwise the writes become unbalanced.
3680 */
3681 ts_stat->progress += ts_stat->progress_slice;
3682 ts_stat->num_scanned++;
3683 ts_stat->index++;
3684
3685 /* Have all the buffers from the tablespace been processed? */
3686 if (ts_stat->num_scanned == ts_stat->num_to_scan)
3687 {
3689 }
3690 else
3691 {
3692 /* update heap with the new progress */
3694 }
3695
3696 /*
3697 * Sleep to throttle our I/O rate.
3698 *
3699 * (This will check for barrier events even if it doesn't sleep.)
3700 */
3701 CheckpointWriteDelay(flags, (double) num_processed / num_to_scan);
3702 }
3703
3704 /*
3705 * Issue all pending flushes. Only checkpointer calls BufferSync(), so
3706 * IOContext will always be IOCONTEXT_NORMAL.
3707 */
3709
3711 per_ts_stat = NULL;
3713
3714 /*
3715 * Update checkpoint statistics. As noted above, this doesn't include
3716 * buffers written by other backends or by the bgwriter scan.
3717 */
3719
3721}
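/*
 * Illustrative aside (not part of bufmgr.c): the tablespace-balancing rule
 * above in miniature.  Each tablespace's progress_slice is the total number
 * of to-be-written buffers divided by that tablespace's own count, so that
 * writing one buffer advances every tablespace's "progress" by a comparable
 * share of the whole and the min-heap keeps handing out the laggard.  A
 * hedged sketch with an invented name:
 */
static double
sketch_progress_slice(int total_to_scan, int ts_num_to_scan)
{
	/*
	 * E.g. with 100 buffers overall, a tablespace holding 90 of them gets a
	 * slice of ~1.11 per written buffer while one holding 10 gets 10.0, so
	 * both reach "100" only once they are finished and the writes interleave
	 * roughly 9:1 instead of 90-then-10.
	 */
	return (double) total_to_scan / ts_num_to_scan;
}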
3722
3723/*
3724 * BgBufferSync -- Write out some dirty buffers in the pool.
3725 *
3726 * This is called periodically by the background writer process.
3727 *
3728 * Returns true if it's appropriate for the bgwriter process to go into
3729 * low-power hibernation mode. (This happens if the strategy clock-sweep
3730 * has been "lapped" and no buffer allocations have occurred recently,
3731 * or if the bgwriter has been effectively disabled by setting
3732 * bgwriter_lru_maxpages to 0.)
3733 */
3734bool
3736{
3737 /* info obtained from freelist.c */
3738 int strategy_buf_id;
3741
3742 /*
3743 * Information saved between calls so we can determine the strategy
3744 * point's advance rate and avoid scanning already-cleaned buffers.
3745 */
3746 static bool saved_info_valid = false;
3747 static int prev_strategy_buf_id;
3749 static int next_to_clean;
3750 static uint32 next_passes;
3751
3752 /* Moving averages of allocation rate and clean-buffer density */
3753 static float smoothed_alloc = 0;
3754 static float smoothed_density = 10.0;
3755
3756 /* Potentially these could be tunables, but for now, not */
3757 float smoothing_samples = 16;
3758 float scan_whole_pool_milliseconds = 120000.0;
3759
3760 /* Used to compute how far we scan ahead */
3761 long strategy_delta;
3762 int bufs_to_lap;
3763 int bufs_ahead;
3764 float scans_per_alloc;
3767 int min_scan_buffers;
3768
3769 /* Variables for the scanning loop proper */
3770 int num_to_scan;
3771 int num_written;
3772 int reusable_buffers;
3773
3774 /* Variables for final smoothed_density update */
3775 long new_strategy_delta;
3777
3778 /*
3779 * Find out where the clock-sweep currently is, and how many buffer
3780 * allocations have happened since our last call.
3781 */
3783
3784 /* Report buffer alloc counts to pgstat */
3786
3787 /*
3788 * If we're not running the LRU scan, just stop after doing the stats
3789 * stuff. We mark the saved state invalid so that we can recover sanely
3790 * if LRU scan is turned back on later.
3791 */
3792 if (bgwriter_lru_maxpages <= 0)
3793 {
3794 saved_info_valid = false;
3795 return true;
3796 }
3797
3798 /*
3799 * Compute strategy_delta = how many buffers have been scanned by the
3800 * clock-sweep since last time. If first time through, assume none. Then
3801 * see if we are still ahead of the clock-sweep, and if so, how many
3802 * buffers we could scan before we'd catch up with it and "lap" it. Note:
3803 * the weird-looking coding of the xxx_passes comparisons is to avoid bogus
3804 * behavior when the pass counts wrap around.
3805 */
3806 if (saved_info_valid)
3807 {
3809
3812
3813 Assert(strategy_delta >= 0);
3814
3815 if ((int32) (next_passes - strategy_passes) > 0)
3816 {
3817 /* we're one pass ahead of the strategy point */
3819#ifdef BGW_DEBUG
3820 elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
3824#endif
3825 }
3826 else if (next_passes == strategy_passes &&
3828 {
3829 /* on same pass, but ahead or at least not behind */
3831#ifdef BGW_DEBUG
3832 elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
3836#endif
3837 }
3838 else
3839 {
3840 /*
3841 * We're behind, so skip forward to the strategy point and start
3842 * cleaning from there.
3843 */
3844#ifdef BGW_DEBUG
3845 elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
3849#endif
3853 }
3854 }
3855 else
3856 {
3857 /*
3858 * Initializing at startup or after LRU scanning had been off. Always
3859 * start at the strategy point.
3860 */
3861#ifdef BGW_DEBUG
3862 elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
3864#endif
3865 strategy_delta = 0;
3869 }
3870
3871 /* Update saved info for next time */
3874 saved_info_valid = true;
3875
3876 /*
3877 * Compute how many buffers had to be scanned for each new allocation, ie,
3878 * 1/density of reusable buffers, and track a moving average of that.
3879 *
3880 * If the strategy point didn't move, we don't update the density estimate.
3881 */
3882 if (strategy_delta > 0 && recent_alloc > 0)
3883 {
3887 }
3888
3889 /*
3890 * Estimate how many reusable buffers there are between the current
3891 * strategy point and where we've scanned ahead to, based on the smoothed
3892 * density estimate.
3893 */
3896
3897 /*
3898 * Track a moving average of recent buffer allocations. Here, rather than
3899 * a true average we want a fast-attack, slow-decline behavior: we
3900 * immediately follow any increase.
3901 */
3902 if (smoothed_alloc <= (float) recent_alloc)
3904 else
3907
3908 /* Scale the estimate by a GUC to allow more aggressive tuning. */
3910
3911 /*
3912 * If recent_alloc remains at zero for many cycles, smoothed_alloc will
3913 * eventually underflow to zero, and the underflows produce annoying
3914 * kernel warnings on some platforms. Once upcoming_alloc_est has gone to
3915 * zero, there's no point in tracking smaller and smaller values of
3916 * smoothed_alloc, so just reset it to exactly zero to avoid this
3917 * syndrome. It will pop back up as soon as recent_alloc increases.
3918 */
3919 if (upcoming_alloc_est == 0)
3920 smoothed_alloc = 0;
3921
3922 /*
3923 * Even in cases where there's been little or no buffer allocation
3924 * activity, we want to make a small amount of progress through the buffer
3925 * cache so that as many reusable buffers as possible are clean after an
3926 * idle period.
3927 *
3928 * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many times
3929 * the BGW will be called during the scan_whole_pool time; slice the
3930 * buffer pool into that many sections.
3931 */
3933
3935 {
3936#ifdef BGW_DEBUG
3937 elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
3939#endif
3941 }
3942
3943 /*
3944 * Now write out dirty reusable buffers, working forward from the
3945 * next_to_clean point, until we have lapped the strategy scan, or cleaned
3946 * enough buffers to match our estimate of the next cycle's allocation
3947 * requirements, or hit the bgwriter_lru_maxpages limit.
3948 */
3949
3950 num_to_scan = bufs_to_lap;
3951 num_written = 0;
3953
3954 /* Execute the LRU scan */
3955 while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
3956 {
3958 wb_context);
3959
3960 if (++next_to_clean >= NBuffers)
3961 {
3962 next_to_clean = 0;
3963 next_passes++;
3964 }
3965 num_to_scan--;
3966
3967 if (sync_state & BUF_WRITTEN)
3968 {
3971 {
3973 break;
3974 }
3975 }
3976 else if (sync_state & BUF_REUSABLE)
3978 }
3979
3981
3982#ifdef BGW_DEBUG
3983 elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
3986 bufs_to_lap - num_to_scan,
3989#endif
3990
3991 /*
3992 * Consider the above scan as being like a new allocation scan.
3993 * Characterize its density and update the smoothed one based on it. This
3994 * effectively halves the moving average period in cases where both the
3995 * strategy and the background writer are doing some useful scanning,
3996 * which is helpful because a long memory isn't as desirable on the
3997 * density estimates.
3998 */
3999 new_strategy_delta = bufs_to_lap - num_to_scan;
4001 if (new_strategy_delta > 0 && new_recent_alloc > 0)
4002 {
4006
4007#ifdef BGW_DEBUG
4008 elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f",
4011#endif
4012 }
4013
4014 /* Return true if OK to hibernate */
4015 return (bufs_to_lap == 0 && recent_alloc == 0);
4016}
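/*
 * Illustrative aside (not part of bufmgr.c): the "fast attack, slow decline"
 * moving average used above for smoothed_alloc, isolated into a hedged
 * sketch.  Increases are followed immediately; decreases bleed in at a rate
 * of 1/smoothing_samples per cycle.
 */
static float
sketch_smooth_alloc(float smoothed, float recent, float smoothing_samples)
{
	if (smoothed <= recent)
		smoothed = recent;		/* jump up immediately */
	else
		smoothed += (recent - smoothed) / smoothing_samples;	/* decay slowly */

	return smoothed;
}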
4017
4018/*
4019 * SyncOneBuffer -- process a single buffer during syncing.
4020 *
4021 * If skip_recently_used is true, we don't write currently-pinned buffers, nor
4022 * buffers marked recently used, as these are not replacement candidates.
4023 *
4024 * Returns a bitmask containing the following flag bits:
4025 * BUF_WRITTEN: we wrote the buffer.
4026 * BUF_REUSABLE: buffer is available for replacement, ie, it has
4027 * pin count 0 and usage count 0.
4028 *
4029 * (BUF_WRITTEN could be set in error if FlushBuffer finds the buffer clean
4030 * after locking it, but we don't care all that much.)
4031 */
4032static int
4034{
4036 int result = 0;
4038 BufferTag tag;
4039
4040 /* Make sure we can handle the pin */
4043
4044 /*
4045 * Check whether buffer needs writing.
4046 *
4047 * We can make this check without taking the buffer content lock so long
4048 * as we mark pages dirty in access methods *before* logging changes with
4049 * XLogInsert(): if someone marks the buffer dirty just after our check, we
4050 * don't worry, because our checkpoint.redo points before the log record for
4051 * the upcoming changes and so we are not required to write such a dirty buffer.
4052 */
4054
4057 {
4058 result |= BUF_REUSABLE;
4059 }
4060 else if (skip_recently_used)
4061 {
4062 /* Caller told us not to write recently-used buffers */
4064 return result;
4065 }
4066
4067 if (!(buf_state & BM_VALID) || !(buf_state & BM_DIRTY))
4068 {
4069 /* It's clean, so nothing to do */
4071 return result;
4072 }
4073
4074 /*
4075 * Pin it, share-lock it, write it. (FlushBuffer will do nothing if the
4076 * buffer is clean by the time we've locked it.)
4077 */
4079
4081
4082 tag = bufHdr->tag;
4083
4085
4086 /*
4087 * SyncOneBuffer() is only called by checkpointer and bgwriter, so
4088 * IOContext will always be IOCONTEXT_NORMAL.
4089 */
4091
4092 return result | BUF_WRITTEN;
4093}
4094
4095/*
4096 * AtEOXact_Buffers - clean up at end of transaction.
4097 *
4098 * As of PostgreSQL 8.0, buffer pins should get released by the
4099 * ResourceOwner mechanism. This routine is just a debugging
4100 * cross-check that no pins remain.
4101 */
4102void
4111
4112/*
4113 * Initialize access to shared buffer pool
4114 *
4115 * This is called during backend startup (whether standalone or under the
4116 * postmaster). It sets up for this backend's access to the already-existing
4117 * buffer pool.
4118 */
4119void
4121{
4123
4124 /*
4125 * An advisory limit on the number of pins each backend should hold, based
4126 * on shared_buffers and the maximum number of connections possible.
4127 * That's very pessimistic, but outside toy-sized shared_buffers it should
4128 * allow plenty of pins. LimitAdditionalPins() and
4129 * GetAdditionalPinLimit() can be used to check the remaining balance.
4130 */
4132
4135
4136 hash_ctl.keysize = sizeof(Buffer);
4137 hash_ctl.entrysize = sizeof(PrivateRefCountEntry);
4138
4139 PrivateRefCountHash = hash_create("PrivateRefCount", 100, &hash_ctl,
4141
4142 /*
4143 * AtProcExit_Buffers needs LWLock access, and thereby has to be called at
4144 * the corresponding phase of backend shutdown.
4145 */
4146 Assert(MyProc != NULL);
4148}
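/*
 * Illustrative aside (not part of bufmgr.c): the dynahash setup above in
 * isolation.  A backend-private hash table keyed by Buffer is created with
 * hash_create(); HASH_BLOBS selects plain hashing of the fixed-size binary
 * key.  A hedged sketch with an invented entry type and table name.
 */
#include "postgres.h"

#include "storage/buf.h"
#include "utils/hsearch.h"

typedef struct SketchRefCountEntry
{
	Buffer		buffer;			/* hash key; must be first */
	int32		refcount;
} SketchRefCountEntry;

static HTAB *
sketch_create_refcount_hash(void)
{
	HASHCTL		ctl;

	ctl.keysize = sizeof(Buffer);
	ctl.entrysize = sizeof(SketchRefCountEntry);

	return hash_create("SketchRefCount", 100, &ctl,
					   HASH_ELEM | HASH_BLOBS);
}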
4149
4150/*
4151 * During backend exit, ensure that we released all shared-buffer locks and
4152 * assert that we have no remaining pins.
4153 */
4154static void
4156{
4157 UnlockBuffers();
4158
4160
4161 /* localbuf.c needs a chance too */
4163}
4164
4165/*
4166 * CheckForBufferLeaks - ensure this backend holds no buffer pins
4167 *
4168 * As of PostgreSQL 8.0, buffer pins should get released by the
4169 * ResourceOwner mechanism. This routine is just a debugging
4170 * cross-check that no pins remain.
4171 */
4172static void
4174{
4175#ifdef USE_ASSERT_CHECKING
4176 int RefCountErrors = 0;
4178 int i;
4179 char *s;
4180
4181 /* check the array */
4182 for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
4183 {
4185 {
4186 res = &PrivateRefCountArray[i];
4187
4189 elog(WARNING, "buffer refcount leak: %s", s);
4190 pfree(s);
4191
4193 }
4194 }
4195
4196 /* if necessary search the hash */
4198 {
4200
4202 while ((res = (PrivateRefCountEntry *) hash_seq_search(&hstat)) != NULL)
4203 {
4205 elog(WARNING, "buffer refcount leak: %s", s);
4206 pfree(s);
4208 }
4209 }
4210
4211 Assert(RefCountErrors == 0);
4212#endif
4213}
4214
4215#ifdef USE_ASSERT_CHECKING
4216/*
4217 * Check for exclusive-locked catalog buffers. This is the core of
4218 * AssertCouldGetRelation().
4219 *
4220 * A backend would self-deadlock on the content lock if the catalog scan read
4221 * the exclusive-locked buffer. The main threat is exclusive-locked buffers
4222 * of catalogs used in relcache, because a catcache search on any catalog may
4223 * build that catalog's relcache entry. We don't have an inventory of
4224 * catalogs relcache uses, so just check buffers of most catalogs.
4225 *
4226 * It's better to minimize waits while holding an exclusive buffer lock, so it
4227 * would be nice to broaden this check not to be catalog-specific. However,
4228 * bttextcmp() accesses pg_collation, and non-core opclasses might similarly
4229 * read tables. That is deadlock-free as long as there's no loop in the
4230 * dependency graph: modifying table A may cause an opclass to read table B,
4231 * but it must not cause a read of table A.
4232 */
4233void
4235{
4237
4238 /* check the array */
4239 for (int i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
4240 {
4242 {
4243 res = &PrivateRefCountArray[i];
4244
4245 if (res->buffer == InvalidBuffer)
4246 continue;
4247
4249 }
4250 }
4251
4252 /* if necessary search the hash */
4254 {
4256
4258 while ((res = (PrivateRefCountEntry *) hash_seq_search(&hstat)) != NULL)
4259 {
4261 }
4262 }
4263}
4264
4265static void
4267{
4269 BufferTag tag;
4270 Oid relid;
4271
4273 return;
4274
4275 tag = bufHdr->tag;
4276
4277 /*
4278 * This relNumber==relid assumption holds until a catalog experiences
4279 * VACUUM FULL or similar. After a command like that, relNumber will be
4280 * in the normal (non-catalog) range, and we lose the ability to detect
4281 * hazardous access to that catalog. Calling RelidByRelfilenumber() would
4282 * close that gap, but RelidByRelfilenumber() might then deadlock with a
4283 * held lock.
4284 */
4285 relid = tag.relNumber;
4286
4287 if (IsCatalogTextUniqueIndexOid(relid)) /* see comments at the callee */
4288 return;
4289
4291}
4292#endif
4293
4294
4295/*
4296 * Helper routine to issue warnings when a buffer is unexpectedly pinned
4297 */
4298char *
4300{
4301 BufferDesc *buf;
4303 char *result;
4304 ProcNumber backend;
4306
4308 if (BufferIsLocal(buffer))
4309 {
4312 backend = MyProcNumber;
4313 }
4314 else
4315 {
4318 backend = INVALID_PROC_NUMBER;
4319 }
4320
4321 /* theoretically we should lock the bufHdr here */
4322 buf_state = pg_atomic_read_u64(&buf->state);
4323
4324 result = psprintf("[%03d] (rel=%s, blockNum=%u, flags=0x%" PRIx64 ", refcount=%u %d)",
4325 buffer,
4327 BufTagGetForkNum(&buf->tag)).str,
4328 buf->tag.blockNum, buf_state & BUF_FLAG_MASK,
4330 return result;
4331}
4332
4333/*
4334 * CheckPointBuffers
4335 *
4336 * Flush all dirty blocks in buffer pool to disk at checkpoint time.
4337 *
4338 * Note: temporary relations do not participate in checkpoints, so they don't
4339 * need to be flushed.
4340 */
4341void
4343{
4344 BufferSync(flags);
4345}
4346
4347/*
4348 * BufferGetBlockNumber
4349 * Returns the block number associated with a buffer.
4350 *
4351 * Note:
4352 * Assumes that the buffer is valid and pinned, else the
4353 * value may be obsolete immediately...
4354 */
4357{
4359
4361
4362 if (BufferIsLocal(buffer))
4364 else
4366
4367 /* pinned, so OK to read tag without spinlock */
4368 return bufHdr->tag.blockNum;
4369}
4370
4371/*
4372 * BufferGetTag
4373 * Returns the relfilelocator, fork number and block number associated with
4374 * a buffer.
4375 */
4376void
4379{
4381
4382 /* Do the same checks as BufferGetBlockNumber. */
4384
4385 if (BufferIsLocal(buffer))
4387 else
4389
4390 /* pinned, so OK to read tag without spinlock */
4391 *rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
4392 *forknum = BufTagGetForkNum(&bufHdr->tag);
4393 *blknum = bufHdr->tag.blockNum;
4394}
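/*
 * Illustrative aside (not part of bufmgr.c): reading back a pinned buffer's
 * identity with BufferGetTag(), e.g. for debugging output.  A hedged sketch;
 * the function name is invented and the caller must hold a pin, as the
 * comments above require.
 */
#include "postgres.h"

#include "storage/bufmgr.h"

static void
sketch_log_buffer_identity(Buffer buf)
{
	RelFileLocator rlocator;
	ForkNumber	forknum;
	BlockNumber blknum;

	BufferGetTag(buf, &rlocator, &forknum, &blknum);

	elog(DEBUG1, "buffer %d holds rel %u/%u/%u fork %d block %u",
		 buf, rlocator.spcOid, rlocator.dbOid, rlocator.relNumber,
		 forknum, blknum);
}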
4395
4396/*
4397 * FlushBuffer
4398 * Physically write out a shared buffer.
4399 *
4400 * NOTE: this actually just passes the buffer contents to the kernel; the
4401 * real write to disk won't happen until the kernel feels like it. This
4402 * is okay from our point of view since we can redo the changes from WAL.
4403 * However, we will need to force the changes to disk via fsync before
4404 * we can checkpoint WAL.
4405 *
4406 * The caller must hold a pin on the buffer and have share-locked the
4407 * buffer contents. (Note: a share-lock does not prevent updates of
4408 * hint bits in the buffer, so the page could change while the write
4409 * is in progress, but we assume that that will not invalidate the data
4410 * written.)
4411 *
4412 * If the caller has an smgr reference for the buffer's relation, pass it
4413 * as the second parameter. If not, pass NULL.
4414 */
4415static void
4418{
4420 ErrorContextCallback errcallback;
4423 char *bufToWrite;
4425
4426 /*
4427 * Try to start an I/O operation. If StartBufferIO returns false, then
4428 * someone else flushed the buffer before we could, so we need not do
4429 * anything.
4430 */
4431 if (!StartBufferIO(buf, false, false))
4432 return;
4433
4434 /* Setup error traceback support for ereport() */
4436 errcallback.arg = buf;
4437 errcallback.previous = error_context_stack;
4438 error_context_stack = &errcallback;
4439
4440 /* Find smgr relation for buffer */
4441 if (reln == NULL)
4443
4445 buf->tag.blockNum,
4446 reln->smgr_rlocator.locator.spcOid,
4447 reln->smgr_rlocator.locator.dbOid,
4448 reln->smgr_rlocator.locator.relNumber);
4449
4451
4452 /*
4453 * Run PageGetLSN while holding header lock, since we don't have the
4454 * buffer locked exclusively in all cases.
4455 */
4457
4458 /* To check if block content changes while flushing. - vadim 01/17/97 */
4460 0, BM_JUST_DIRTIED,
4461 0);
4462
4463 /*
4464 * Force XLOG flush up to buffer's LSN. This implements the basic WAL
4465 * rule that log updates must hit disk before any of the data-file changes
4466 * they describe do.
4467 *
4468 * However, this rule does not apply to unlogged relations, which will be
4469 * lost after a crash anyway. Most unlogged relation pages do not bear
4470 * LSNs since we never emit WAL records for them, and therefore flushing
4471 * up through the buffer LSN would be useless, but harmless. However,
4472 * GiST indexes use LSNs internally to track page-splits, and therefore
4473 * unlogged GiST pages bear "fake" LSNs generated by
4474 * GetFakeLSNForUnloggedRel. It is unlikely but possible that the fake
4475 * LSN counter could advance past the WAL insertion point; and if it did
4476 * happen, attempting to flush WAL through that location would fail, with
4477 * disastrous system-wide consequences. To make sure that can't happen,
4478 * skip the flush if the buffer isn't permanent.
4479 */
4480 if (buf_state & BM_PERMANENT)
4482
4483 /*
4484 * Now it's safe to write the buffer to disk. Note that no one else should
 4485 * have been able to write it while we were busy with log flushing,
4486 * because we got the exclusive right to perform I/O by setting the
4487 * BM_IO_IN_PROGRESS bit.
4488 */
4490
4491 /*
4492 * Update page checksum if desired. Since we have only shared lock on the
4493 * buffer, other processes might be updating hint bits in it, so we must
4494 * copy the page to private storage if we do checksumming.
4495 */
4496 bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
4497
4499
4500 /*
4501 * bufToWrite is either the shared buffer or a copy, as appropriate.
4502 */
4504 BufTagGetForkNum(&buf->tag),
4505 buf->tag.blockNum,
4506 bufToWrite,
4507 false);
4508
4509 /*
4510 * When a strategy is in use, only flushes of dirty buffers already in the
4511 * strategy ring are counted as strategy writes (IOCONTEXT
4512 * [BULKREAD|BULKWRITE|VACUUM] IOOP_WRITE) for the purpose of IO
4513 * statistics tracking.
4514 *
4515 * If a shared buffer initially added to the ring must be flushed before
4516 * being used, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE.
4517 *
 4518 * If flushing is required for a shared buffer that was added to the ring
 4519 * later, either because the current strategy buffer was pinned or in use
 4520 * or because all strategy buffers were dirty and rejected (the latter for
 4521 * BAS_BULKREAD operations only), this is counted as an IOCONTEXT_NORMAL
 4522 * IOOP_WRITE (from_ring will be false).
4523 *
4524 * When a strategy is not in use, the write can only be a "regular" write
4525 * of a dirty shared buffer (IOCONTEXT_NORMAL IOOP_WRITE).
4526 */
4529
4531
4532 /*
4533 * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
4534 * end the BM_IO_IN_PROGRESS state.
4535 */
4536 TerminateBufferIO(buf, true, 0, true, false);
4537
4539 buf->tag.blockNum,
4540 reln->smgr_rlocator.locator.spcOid,
4541 reln->smgr_rlocator.locator.dbOid,
4542 reln->smgr_rlocator.locator.relNumber);
4543
4544 /* Pop the error context stack */
4545 error_context_stack = errcallback.previous;
4546}
4547
4548/*
4549 * Convenience wrapper around FlushBuffer() that locks/unlocks the buffer
4550 * before/after calling FlushBuffer().
4551 */
4552static void
4562
4563/*
4564 * RelationGetNumberOfBlocksInFork
4565 * Determines the current number of pages in the specified relation fork.
4566 *
4567 * Note that the accuracy of the result will depend on the details of the
4568 * relation's storage. For builtin AMs it'll be accurate, but for external AMs
4569 * it might not be.
4570 */
4573{
4574 if (RELKIND_HAS_TABLE_AM(relation->rd_rel->relkind))
4575 {
4576 /*
 4577 * Not every table AM uses BLCKSZ-wide, fixed-size blocks, so the
 4578 * tableam API returns the size in bytes - but for the purpose of this
 4579 * routine we want the number of blocks. Hence, divide and round
4580 * up.
4581 */
4583
4584 szbytes = table_relation_size(relation, forkNum);
4585
4586 return (szbytes + (BLCKSZ - 1)) / BLCKSZ;
4587 }
4588 else if (RELKIND_HAS_STORAGE(relation->rd_rel->relkind))
4589 {
4590 return smgrnblocks(RelationGetSmgr(relation), forkNum);
4591 }
4592 else
4593 Assert(false);
4594
4595 return 0; /* keep compiler quiet */
4596}
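/*
 * Worked example for the rounding above (a sketch, not part of bufmgr.c):
 * with the default BLCKSZ of 8192, a table AM reporting szbytes = 8193
 * yields (8193 + 8191) / 8192 = 2, i.e. a partially filled last block still
 * counts as a whole block.  Most callers reach this routine through the
 * RelationGetNumberOfBlocks() macro, which passes MAIN_FORKNUM.
 */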
4597
4598/*
4599 * BufferIsPermanent
4600 * Determines whether a buffer will potentially still be around after
4601 * a crash. Caller must hold a buffer pin.
4602 */
4603bool
4605{
4607
4608 /* Local buffers are used only for temp relations. */
4609 if (BufferIsLocal(buffer))
4610 return false;
4611
4612 /* Make sure we've got a real buffer, and that we hold a pin on it. */
4615
4616 /*
4617 * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
4618 * need not bother with the buffer header spinlock. Even if someone else
4619 * changes the buffer header state while we're doing this, the state is
4620 * changed atomically, so we'll read the old value or the new value, but
4621 * not random garbage.
4622 */
4624 return (pg_atomic_read_u64(&bufHdr->state) & BM_PERMANENT) != 0;
4625}
4626
4627/*
4628 * BufferGetLSNAtomic
4629 * Retrieves the LSN of the buffer atomically using a buffer header lock.
4630 * This is necessary for some callers who may not have an exclusive lock
4631 * on the buffer.
4632 */
4635{
4636 char *page = BufferGetPage(buffer);
4638 XLogRecPtr lsn;
4639
4640 /*
4641 * If we don't need locking for correctness, fastpath out.
4642 */
4644 return PageGetLSN(page);
4645
4646 /* Make sure we've got a real buffer, and that we hold a pin on it. */
4649
4652 lsn = PageGetLSN(page);
4654
4655 return lsn;
4656}
4657
4658/* ---------------------------------------------------------------------
4659 * DropRelationBuffers
4660 *
4661 * This function removes from the buffer pool all the pages of the
4662 * specified relation forks that have block numbers >= firstDelBlock.
4663 * (In particular, with firstDelBlock = 0, all pages are removed.)
4664 * Dirty pages are simply dropped, without bothering to write them
4665 * out first. Therefore, this is NOT rollback-able, and so should be
4666 * used only with extreme caution!
4667 *
4668 * Currently, this is called only from smgr.c when the underlying file
4669 * is about to be deleted or truncated (firstDelBlock is needed for
4670 * the truncation case). The data in the affected pages would therefore
4671 * be deleted momentarily anyway, and there is no point in writing it.
4672 * It is the responsibility of higher-level code to ensure that the
4673 * deletion or truncation does not lose any data that could be needed
4674 * later. It is also the responsibility of higher-level code to ensure
4675 * that no other process could be trying to load more pages of the
4676 * relation into buffers.
4677 * --------------------------------------------------------------------
4678 */
4679void
4682{
4683 int i;
4684 int j;
4685 RelFileLocatorBackend rlocator;
4688
4689 rlocator = smgr_reln->smgr_rlocator;
4690
4691 /* If it's a local relation, it's localbuf.c's problem. */
4692 if (RelFileLocatorBackendIsTemp(rlocator))
4693 {
4694 if (rlocator.backend == MyProcNumber)
4695 DropRelationLocalBuffers(rlocator.locator, forkNum, nforks,
4697
4698 return;
4699 }
4700
4701 /*
 4702 * To remove all the pages of the specified relation forks from the buffer
 4703 * pool, we would normally need to scan the entire buffer pool, but we can
 4704 * optimize that by looking the buffers up in the BufMapping table, provided
 4705 * we know the exact size of each fork of the relation. The exact size is
 4706 * required to ensure that we don't leave behind any buffer for the relation
 4707 * being dropped, as otherwise the background writer or checkpointer could
 4708 * PANIC while flushing buffers corresponding to files that no longer exist.
4709 *
 4710 * To know the exact size, we rely on the size we cache for each fork
 4711 * during recovery, which limits the optimization to recovery and to
 4712 * standbys; we can easily extend it once we have a shared cache for
 4713 * relation sizes.
4714 *
4715 * In recovery, we cache the value returned by the first lseek(SEEK_END)
 4716 * and future writes keep the cached value up-to-date. See
4717 * smgrextend. It is possible that the value of the first lseek is smaller
4718 * than the actual number of existing blocks in the file due to buggy
4719 * Linux kernels that might not have accounted for the recent write. But
4720 * that should be fine because there must not be any buffers after that
4721 * file size.
4722 */
4723 for (i = 0; i < nforks; i++)
4724 {
4725 /* Get the number of blocks for a relation's fork */
4727
4729 {
4731 break;
4732 }
4733
4734 /* calculate the number of blocks to be invalidated */
4736 }
4737
4738 /*
4739 * We apply the optimization iff the total number of blocks to invalidate
4740 * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
4741 */
4744 {
4745 for (j = 0; j < nforks; j++)
4746 FindAndDropRelationBuffers(rlocator.locator, forkNum[j],
4748 return;
4749 }
4750
4751 for (i = 0; i < NBuffers; i++)
4752 {
4754
4755 /*
4756 * We can make this a tad faster by prechecking the buffer tag before
4757 * we attempt to lock the buffer; this saves a lot of lock
4758 * acquisitions in typical cases. It should be safe because the
4759 * caller must have AccessExclusiveLock on the relation, or some other
4760 * reason to be certain that no one is loading new pages of the rel
4761 * into the buffer pool. (Otherwise we might well miss such pages
4762 * entirely.) Therefore, while the tag might be changing while we
4763 * look at it, it can't be changing *to* a value we care about, only
4764 * *away* from such a value. So false negatives are impossible, and
4765 * false positives are safe because we'll recheck after getting the
4766 * buffer lock.
4767 *
4768 * We could check forkNum and blockNum as well as the rlocator, but
4769 * the incremental win from doing so seems small.
4770 */
4771 if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator))
4772 continue;
4773
4775
4776 for (j = 0; j < nforks; j++)
4777 {
4778 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator) &&
4779 BufTagGetForkNum(&bufHdr->tag) == forkNum[j] &&
4780 bufHdr->tag.blockNum >= firstDelBlock[j])
4781 {
4782 InvalidateBuffer(bufHdr); /* releases spinlock */
4783 break;
4784 }
4785 }
4786 if (j >= nforks)
4788 }
4789}
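/*
 * Decision sketch (not part of bufmgr.c) for the optimization above: the
 * targeted BufMapping lookups are used only when every fork size is known
 * from the cached sizes (in practice, during recovery) and the total number
 * of blocks to invalidate stays under the threshold.
 *
 *		if (all fork sizes cached &&
 *			nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
 *			FindAndDropRelationBuffers() for each fork;
 *		else
 *			scan all NBuffers buffer headers;
 */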
4790
4791/* ---------------------------------------------------------------------
4792 * DropRelationsAllBuffers
4793 *
4794 * This function removes from the buffer pool all the pages of all
4795 * forks of the specified relations. It's equivalent to calling
4796 * DropRelationBuffers once per fork per relation with firstDelBlock = 0.
4797 * --------------------------------------------------------------------
4798 */
4799void
4801{
4802 int i;
4803 int n = 0;
4804 SMgrRelation *rels;
4805 BlockNumber (*block)[MAX_FORKNUM + 1];
4808 bool cached = true;
4809 bool use_bsearch;
4810
4811 if (nlocators == 0)
4812 return;
4813
4814 rels = palloc_array(SMgrRelation, nlocators); /* non-local relations */
4815
4816 /* If it's a local relation, it's localbuf.c's problem. */
4817 for (i = 0; i < nlocators; i++)
4818 {
4819 if (RelFileLocatorBackendIsTemp(smgr_reln[i]->smgr_rlocator))
4820 {
4821 if (smgr_reln[i]->smgr_rlocator.backend == MyProcNumber)
4822 DropRelationAllLocalBuffers(smgr_reln[i]->smgr_rlocator.locator);
4823 }
4824 else
4825 rels[n++] = smgr_reln[i];
4826 }
4827
4828 /*
4829 * If there are no non-local relations, then we're done. Release the
4830 * memory and return.
4831 */
4832 if (n == 0)
4833 {
4834 pfree(rels);
4835 return;
4836 }
4837
4838 /*
 4839 * This is used to remember the number of blocks for all the relation
4840 * forks.
4841 */
4842 block = (BlockNumber (*)[MAX_FORKNUM + 1])
4843 palloc(sizeof(BlockNumber) * n * (MAX_FORKNUM + 1));
4844
4845 /*
4846 * We can avoid scanning the entire buffer pool if we know the exact size
4847 * of each of the given relation forks. See DropRelationBuffers.
4848 */
4849 for (i = 0; i < n && cached; i++)
4850 {
4851 for (int j = 0; j <= MAX_FORKNUM; j++)
4852 {
4853 /* Get the number of blocks for a relation's fork. */
4854 block[i][j] = smgrnblocks_cached(rels[i], j);
4855
 4856 /* We need to only consider the relation forks that exist. */
4857 if (block[i][j] == InvalidBlockNumber)
4858 {
4859 if (!smgrexists(rels[i], j))
4860 continue;
4861 cached = false;
4862 break;
4863 }
4864
4865 /* calculate the total number of blocks to be invalidated */
4866 nBlocksToInvalidate += block[i][j];
4867 }
4868 }
4869
4870 /*
4871 * We apply the optimization iff the total number of blocks to invalidate
4872 * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
4873 */
4875 {
4876 for (i = 0; i < n; i++)
4877 {
4878 for (int j = 0; j <= MAX_FORKNUM; j++)
4879 {
 4880 /* ignore relation forks that don't exist */
4881 if (!BlockNumberIsValid(block[i][j]))
4882 continue;
4883
4884 /* drop all the buffers for a particular relation fork */
4885 FindAndDropRelationBuffers(rels[i]->smgr_rlocator.locator,
4886 j, block[i][j], 0);
4887 }
4888 }
4889
4890 pfree(block);
4891 pfree(rels);
4892 return;
4893 }
4894
4895 pfree(block);
4896 locators = palloc_array(RelFileLocator, n); /* non-local relations */
4897 for (i = 0; i < n; i++)
4898 locators[i] = rels[i]->smgr_rlocator.locator;
4899
4900 /*
 4901 * For a low number of relations to drop, just use a simple walk-through to
 4902 * save the bsearch overhead. The threshold is more of a guess than an
 4903 * exactly determined value, as it depends on many factors (CPU and RAM
4904 * speeds, amount of shared buffers etc.).
4905 */
4907
4908 /* sort the list of rlocators if necessary */
4909 if (use_bsearch)
4911
4912 for (i = 0; i < NBuffers; i++)
4913 {
4914 RelFileLocator *rlocator = NULL;
4916
4917 /*
4918 * As in DropRelationBuffers, an unlocked precheck should be safe and
4919 * saves some cycles.
4920 */
4921
4922 if (!use_bsearch)
4923 {
4924 int j;
4925
4926 for (j = 0; j < n; j++)
4927 {
4929 {
4930 rlocator = &locators[j];
4931 break;
4932 }
4933 }
4934 }
4935 else
4936 {
4937 RelFileLocator locator;
4938
4939 locator = BufTagGetRelFileLocator(&bufHdr->tag);
4940 rlocator = bsearch(&locator,
4941 locators, n, sizeof(RelFileLocator),
4943 }
4944
4945 /* buffer doesn't belong to any of the given relfilelocators; skip it */
4946 if (rlocator == NULL)
4947 continue;
4948
4950 if (BufTagMatchesRelFileLocator(&bufHdr->tag, rlocator))
4951 InvalidateBuffer(bufHdr); /* releases spinlock */
4952 else
4954 }
4955
4956 pfree(locators);
4957 pfree(rels);
4958}
4959
4960/* ---------------------------------------------------------------------
4961 * FindAndDropRelationBuffers
4962 *
 4963 * This function performs lookups in the BufMapping table and removes from the
 4964 * buffer pool all the pages of the specified relation fork that have block
 4965 * numbers >= firstDelBlock. (In particular, with firstDelBlock = 0, all
4966 * pages are removed.)
4967 * --------------------------------------------------------------------
4968 */
4969static void
4973{
4974 BlockNumber curBlock;
4975
4976 for (curBlock = firstDelBlock; curBlock < nForkBlock; curBlock++)
4977 {
4978 uint32 bufHash; /* hash value for tag */
4979 BufferTag bufTag; /* identity of requested block */
4980 LWLock *bufPartitionLock; /* buffer partition lock for it */
4981 int buf_id;
4983
4984 /* create a tag so we can lookup the buffer */
4985 InitBufferTag(&bufTag, &rlocator, forkNum, curBlock);
4986
4987 /* determine its hash code and partition lock ID */
4990
4991 /* Check that it is in the buffer pool. If not, do nothing. */
4993 buf_id = BufTableLookup(&bufTag, bufHash);
4995
4996 if (buf_id < 0)
4997 continue;
4998
4999 bufHdr = GetBufferDescriptor(buf_id);
5000
5001 /*
5002 * We need to lock the buffer header and recheck if the buffer is
5003 * still associated with the same block because the buffer could be
5004 * evicted by some other backend loading blocks for a different
 5006 * relation after we release the lock on the BufMapping table.
5006 */
5008
5009 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator) &&
5010 BufTagGetForkNum(&bufHdr->tag) == forkNum &&
5011 bufHdr->tag.blockNum >= firstDelBlock)
5012 InvalidateBuffer(bufHdr); /* releases spinlock */
5013 else
5015 }
5016}
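/*
 * Lookup sketch (not part of bufmgr.c): the per-block probe used above,
 * assuming "rlocator", "forkNum" and "curBlock" identify the page being
 * searched for.
 *
 *		BufferTag	tag;
 *		uint32		hash;
 *		LWLock	   *partitionLock;
 *		int			buf_id;
 *
 *		InitBufferTag(&tag, &rlocator, forkNum, curBlock);
 *		hash = BufTableHashCode(&tag);
 *		partitionLock = BufMappingPartitionLock(hash);
 *
 *		LWLockAcquire(partitionLock, LW_SHARED);
 *		buf_id = BufTableLookup(&tag, hash);
 *		LWLockRelease(partitionLock);
 *
 *		if (buf_id >= 0)
 *			recheck GetBufferDescriptor(buf_id) under its header lock;
 */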
5017
5018/* ---------------------------------------------------------------------
5019 * DropDatabaseBuffers
5020 *
5021 * This function removes all the buffers in the buffer cache for a
5022 * particular database. Dirty pages are simply dropped, without
5023 * bothering to write them out first. This is used when we destroy a
5024 * database, to avoid trying to flush data to disk when the directory
5025 * tree no longer exists. Implementation is pretty similar to
5026 * DropRelationBuffers() which is for destroying just one relation.
5027 * --------------------------------------------------------------------
5028 */
5029void
5031{
5032 int i;
5033
5034 /*
5035 * We needn't consider local buffers, since by assumption the target
5036 * database isn't our own.
5037 */
5038
5039 for (i = 0; i < NBuffers; i++)
5040 {
5042
5043 /*
5044 * As in DropRelationBuffers, an unlocked precheck should be safe and
5045 * saves some cycles.
5046 */
5047 if (bufHdr->tag.dbOid != dbid)
5048 continue;
5049
5051 if (bufHdr->tag.dbOid == dbid)
5052 InvalidateBuffer(bufHdr); /* releases spinlock */
5053 else
5055 }
5056}
5057
5058/* ---------------------------------------------------------------------
5059 * FlushRelationBuffers
5060 *
5061 * This function writes all dirty pages of a relation out to disk
5062 * (or more accurately, out to kernel disk buffers), ensuring that the
5063 * kernel has an up-to-date view of the relation.
5064 *
5065 * Generally, the caller should be holding AccessExclusiveLock on the
5066 * target relation to ensure that no other backend is busy dirtying
5067 * more blocks of the relation; the effects can't be expected to last
5068 * after the lock is released.
5069 *
5070 * XXX currently it sequentially searches the buffer pool, should be
5071 * changed to more clever ways of searching. This routine is not
5072 * used in any performance-critical code paths, so it's not worth
5073 * adding additional overhead to normal paths to make it go faster.
5074 * --------------------------------------------------------------------
5075 */
5076void
5078{
5079 int i;
5081 SMgrRelation srel = RelationGetSmgr(rel);
5082
5083 if (RelationUsesLocalBuffers(rel))
5084 {
5085 for (i = 0; i < NLocBuffer; i++)
5086 {
5088
5090 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
5091 ((buf_state = pg_atomic_read_u64(&bufHdr->state)) &
5092 (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
5093 {
5094 ErrorContextCallback errcallback;
5095
5096 /* Setup error traceback support for ereport() */
5098 errcallback.arg = bufHdr;
5099 errcallback.previous = error_context_stack;
5100 error_context_stack = &errcallback;
5101
5102 /* Make sure we can handle the pin */
5105
5106 /*
5107 * Pin/unpin mostly to make valgrind work, but it also seems
5108 * like the right thing to do.
5109 */
5110 PinLocalBuffer(bufHdr, false);
5111
5112
5113 FlushLocalBuffer(bufHdr, srel);
5114
5116
5117 /* Pop the error context stack */
5118 error_context_stack = errcallback.previous;
5119 }
5120 }
5121
5122 return;
5123 }
5124
5125 for (i = 0; i < NBuffers; i++)
5126 {
5128
5130
5131 /*
5132 * As in DropRelationBuffers, an unlocked precheck should be safe and
5133 * saves some cycles.
5134 */
5136 continue;
5137
5138 /* Make sure we can handle the pin */
5141
5143 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
5145 {
5149 }
5150 else
5152 }
5153}
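/*
 * Usage sketch (not part of bufmgr.c): a caller that wants the kernel to
 * have an up-to-date view of "relid" before operating on the underlying
 * files.  The heavyweight lock keeps other backends from dirtying further
 * blocks while we work.
 *
 *		Relation	rel = table_open(relid, AccessExclusiveLock);
 *
 *		FlushRelationBuffers(rel);
 *		... operate on the relation's files via smgr ...
 *		table_close(rel, NoLock);
 */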
5154
5155/* ---------------------------------------------------------------------
5156 * FlushRelationsAllBuffers
5157 *
5158 * This function flushes out of the buffer pool all the pages of all
5159 * forks of the specified smgr relations. It's equivalent to calling
5160 * FlushRelationBuffers once per relation. The relations are assumed not
5161 * to use local buffers.
5162 * --------------------------------------------------------------------
5163 */
5164void
5166{
5167 int i;
5169 bool use_bsearch;
5170
5171 if (nrels == 0)
5172 return;
5173
5174 /* fill-in array for qsort */
5176
5177 for (i = 0; i < nrels; i++)
5178 {
5179 Assert(!RelFileLocatorBackendIsTemp(smgrs[i]->smgr_rlocator));
5180
5181 srels[i].rlocator = smgrs[i]->smgr_rlocator.locator;
5182 srels[i].srel = smgrs[i];
5183 }
5184
5185 /*
 5186 * Save the bsearch overhead for a low number of relations to sync. See
5187 * DropRelationsAllBuffers for details.
5188 */
5190
5191 /* sort the list of SMgrRelations if necessary */
5192 if (use_bsearch)
5193 qsort(srels, nrels, sizeof(SMgrSortArray), rlocator_comparator);
5194
5195 for (i = 0; i < NBuffers; i++)
5196 {
5200
5201 /*
5202 * As in DropRelationBuffers, an unlocked precheck should be safe and
5203 * saves some cycles.
5204 */
5205
5206 if (!use_bsearch)
5207 {
5208 int j;
5209
5210 for (j = 0; j < nrels; j++)
5211 {
5212 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srels[j].rlocator))
5213 {
5214 srelent = &srels[j];
5215 break;
5216 }
5217 }
5218 }
5219 else
5220 {
5221 RelFileLocator rlocator;
5222
5223 rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
5224 srelent = bsearch(&rlocator,
5225 srels, nrels, sizeof(SMgrSortArray),
5227 }
5228
5229 /* buffer doesn't belong to any of the given relfilelocators; skip it */
5230 if (srelent == NULL)
5231 continue;
5232
5233 /* Make sure we can handle the pin */
5236
5238 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srelent->rlocator) &&
5240 {
5244 }
5245 else
5247 }
5248
5249 pfree(srels);
5250}
5251
5252/* ---------------------------------------------------------------------
5253 * RelationCopyStorageUsingBuffer
5254 *
 5255 * Copy a fork's data using the bufmgr. Same as RelationCopyStorage, but
 5256 * instead of using smgrread and smgrextend this copies via the bufmgr APIs.
 5257 *
 5258 * Refer to the comments atop CreateAndCopyRelationData() for details about
 5259 * the 'permanent' parameter.
5260 * --------------------------------------------------------------------
5261 */
5262static void
5265 ForkNumber forkNum, bool permanent)
5266{
5267 Buffer srcBuf;
5268 Buffer dstBuf;
5269 Page srcPage;
5270 Page dstPage;
5271 bool use_wal;
5272 BlockNumber nblocks;
5273 BlockNumber blkno;
5280
5281 /*
5282 * In general, we want to write WAL whenever wal_level > 'minimal', but we
5283 * can skip it when copying any fork of an unlogged relation other than
5284 * the init fork.
5285 */
5286 use_wal = XLogIsNeeded() && (permanent || forkNum == INIT_FORKNUM);
5287
5288 /* Get number of blocks in the source relation. */
5290 forkNum);
5291
5292 /* Nothing to copy; just return. */
5293 if (nblocks == 0)
5294 return;
5295
5296 /*
 5297 * Bulk-extend the destination relation to the same size as the source
 5298 * relation before starting to copy block by block.
5299 */
5300 memset(buf.data, 0, BLCKSZ);
5301 smgrextend(smgropen(dstlocator, INVALID_PROC_NUMBER), forkNum, nblocks - 1,
5302 buf.data, true);
5303
5304 /* This is a bulk operation, so use buffer access strategies. */
5307
5308 /* Initialize streaming read */
5309 p.current_blocknum = 0;
5310 p.last_exclusive = nblocks;
5312
5313 /*
5314 * It is safe to use batchmode as block_range_read_stream_cb takes no
5315 * locks.
5316 */
5320 src_smgr,
5322 forkNum,
5324 &p,
5325 0);
5326
5327 /* Iterate over each block of the source relation file. */
5328 for (blkno = 0; blkno < nblocks; blkno++)
5329 {
5331
5332 /* Read block from source relation. */
5336
5340 permanent);
5342
5344
5345 /* Copy page data from the source to the destination. */
5348
5349 /* WAL-log the copied page. */
5350 if (use_wal)
5352
5354
5357 }
5360
5363}
5364
5365/* ---------------------------------------------------------------------
5366 * CreateAndCopyRelationData
5367 *
5368 * Create destination relation storage and copy all forks from the
5369 * source relation to the destination.
5370 *
5371 * Pass permanent as true for permanent relations and false for
5372 * unlogged relations. Currently this API is not supported for
5373 * temporary relations.
5374 * --------------------------------------------------------------------
5375 */
5376void
5378 RelFileLocator dst_rlocator, bool permanent)
5379{
5380 char relpersistence;
5383
5384 /* Set the relpersistence. */
5385 relpersistence = permanent ?
5387
5390
5391 /*
5392 * Create and copy all forks of the relation. During create database we
 5393 * have a separate cleanup mechanism which deletes the complete database
 5394 * directory. Therefore, each individual relation doesn't need to be
5395 * registered for cleanup.
5396 */
5397 RelationCreateStorage(dst_rlocator, relpersistence, false);
5398
5399 /* copy main fork. */
5401 permanent);
5402
5403 /* copy those extra forks that exist */
5404 for (ForkNumber forkNum = MAIN_FORKNUM + 1;
5405 forkNum <= MAX_FORKNUM; forkNum++)
5406 {
5407 if (smgrexists(src_rel, forkNum))
5408 {
5409 smgrcreate(dst_rel, forkNum, false);
5410
5411 /*
5412 * WAL log creation if the relation is persistent, or this is the
5413 * init fork of an unlogged relation.
5414 */
5415 if (permanent || forkNum == INIT_FORKNUM)
5416 log_smgrcreate(&dst_rlocator, forkNum);
5417
5418 /* Copy a fork's data, block by block. */
5420 permanent);
5421 }
5422 }
5423}
5424
5425/* ---------------------------------------------------------------------
5426 * FlushDatabaseBuffers
5427 *
5428 * This function writes all dirty pages of a database out to disk
5429 * (or more accurately, out to kernel disk buffers), ensuring that the
5430 * kernel has an up-to-date view of the database.
5431 *
5432 * Generally, the caller should be holding an appropriate lock to ensure
5433 * no other backend is active in the target database; otherwise more
5434 * pages could get dirtied.
5435 *
5436 * Note we don't worry about flushing any pages of temporary relations.
5437 * It's assumed these wouldn't be interesting.
5438 * --------------------------------------------------------------------
5439 */
5440void
5442{
5443 int i;
5445
5446 for (i = 0; i < NBuffers; i++)
5447 {
5449
5451
5452 /*
5453 * As in DropRelationBuffers, an unlocked precheck should be safe and
5454 * saves some cycles.
5455 */
5456 if (bufHdr->tag.dbOid != dbid)
5457 continue;
5458
5459 /* Make sure we can handle the pin */
5462
5464 if (bufHdr->tag.dbOid == dbid &&
5466 {
5470 }
5471 else
5473 }
5474}
5475
5476/*
 5477 * Flush a previously locked (in share or exclusive mode) and pinned buffer
 5478 * to the OS.
5479 */
5480void
5482{
5484
5485 /* currently not needed, but no fundamental reason not to support */
5487
5489
5491
5493
5495}
5496
5497/*
5498 * ReleaseBuffer -- release the pin on a buffer
5499 */
5500void
5502{
5503 if (!BufferIsValid(buffer))
5504 elog(ERROR, "bad buffer ID: %d", buffer);
5505
5506 if (BufferIsLocal(buffer))
5508 else
5510}
5511
5512/*
5513 * UnlockReleaseBuffer -- release the content lock and pin on a buffer
5514 *
5515 * This is just a shorthand for a common combination.
5516 */
5517void
5523
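/*
 * Usage sketch (not part of bufmgr.c): the common read-examine-release
 * pattern this shorthand supports, assuming "rel" and "blkno" are valid.
 *
 *		Buffer	buf = ReadBuffer(rel, blkno);
 *
 *		LockBuffer(buf, BUFFER_LOCK_SHARE);
 *		... examine BufferGetPage(buf) ...
 *		UnlockReleaseBuffer(buf);
 */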
5524/*
5525 * IncrBufferRefCount
5526 * Increment the pin count on a buffer that we have *already* pinned
5527 * at least once.
5528 *
5529 * This function cannot be used on a buffer we do not have pinned,
5530 * because it doesn't change the shared buffer state.
5531 */
5532void
5549
5550/*
5551 * MarkBufferDirtyHint
5552 *
5553 * Mark a buffer dirty for non-critical changes.
5554 *
5555 * This is essentially the same as MarkBufferDirty, except:
5556 *
5557 * 1. The caller does not write WAL; so if checksums are enabled, we may need
5558 * to write an XLOG_FPI_FOR_HINT WAL record to protect against torn pages.
5559 * 2. The caller might have only share-lock instead of exclusive-lock on the
5560 * buffer's content lock.
5561 * 3. This function does not guarantee that the buffer is always marked dirty
5562 * (due to a race condition), so it cannot be used for important changes.
5563 */
5564void
5566{
5568 Page page = BufferGetPage(buffer);
5569
5570 if (!BufferIsValid(buffer))
5571 elog(ERROR, "bad buffer ID: %d", buffer);
5572
5573 if (BufferIsLocal(buffer))
5574 {
5576 return;
5577 }
5578
5580
5582 /* here, either share or exclusive lock is OK */
5584
5585 /*
5586 * This routine might get called many times on the same page, if we are
5587 * making the first scan after commit of an xact that added/deleted many
5588 * tuples. So, be as quick as we can if the buffer is already dirty. We
5589 * do this by not acquiring spinlock if it looks like the status bits are
5590 * already set. Since we make this test unlocked, there's a chance we
 5591 * might fail to notice that the flags have just been cleared, and fail
5592 * to reset them, due to memory-ordering issues. But since this function
5593 * is only intended to be used in cases where failing to write out the
5594 * data would be harmless anyway, it doesn't really matter.
5595 */
5596 if ((pg_atomic_read_u64(&bufHdr->state) & (BM_DIRTY | BM_JUST_DIRTIED)) !=
5598 {
5600 bool dirtied = false;
5601 bool delayChkptFlags = false;
5603
5604 /*
5605 * If we need to protect hint bit updates from torn writes, WAL-log a
5606 * full page image of the page. This full page image is only necessary
5607 * if the hint bit update is the first change to the page since the
5608 * last checkpoint.
5609 *
5610 * We don't check full_page_writes here because that logic is included
5611 * when we call XLogInsert() since the value changes dynamically.
5612 */
5613 if (XLogHintBitIsNeeded() &&
5615 {
5616 /*
5617 * If we must not write WAL, due to a relfilelocator-specific
5618 * condition or being in recovery, don't dirty the page. We can
 5619 * still set the hint, we just don't dirty the page as a result, so
 5620 * the hint is lost when we evict the page or shut down.
5621 *
5622 * See src/backend/storage/page/README for longer discussion.
5623 */
5624 if (RecoveryInProgress() ||
5626 return;
5627
5628 /*
5629 * If the block is already dirty because we either made a change
5630 * or set a hint already, then we don't need to write a full page
5631 * image. Note that aggressive cleaning of blocks dirtied by hint
5632 * bit setting would increase the call rate. Bulk setting of hint
5633 * bits would reduce the call rate...
5634 *
5635 * We must issue the WAL record before we mark the buffer dirty.
5636 * Otherwise we might write the page before we write the WAL. That
5637 * causes a race condition, since a checkpoint might occur between
5638 * writing the WAL record and marking the buffer dirty. We solve
5639 * that with a kluge, but one that is already in use during
5640 * transaction commit to prevent race conditions. Basically, we
5641 * simply prevent the checkpoint WAL record from being written
5642 * until we have marked the buffer dirty. We don't start the
5643 * checkpoint flush until we have marked dirty, so our checkpoint
5644 * must flush the change to disk successfully or the checkpoint
 5645 * never gets written, so crash recovery will fix it.
5646 *
5647 * It's possible we may enter here without an xid, so it is
5648 * essential that CreateCheckPoint waits for virtual transactions
5649 * rather than full transactionids.
5650 */
5653 delayChkptFlags = true;
5655 }
5656
5658
5660
5661 if (!(buf_state & BM_DIRTY))
5662 {
5663 dirtied = true; /* Means "will be dirtied by this action" */
5664
5665 /*
5666 * Set the page LSN if we wrote a backup block. We aren't supposed
5667 * to set this when only holding a share lock but as long as we
5668 * serialise it somehow we're OK. We choose to set LSN while
5669 * holding the buffer header lock, which causes any reader of an
5670 * LSN who holds only a share lock to also obtain a buffer header
5671 * lock before using PageGetLSN(), which is enforced in
5672 * BufferGetLSNAtomic().
5673 *
5674 * If checksums are enabled, you might think we should reset the
5675 * checksum here. That will happen when the page is written
5676 * sometime later in this checkpoint cycle.
5677 */
5678 if (XLogRecPtrIsValid(lsn))
5679 PageSetLSN(page, lsn);
5680 }
5681
5684 0, 0);
5685
5686 if (delayChkptFlags)
5688
5689 if (dirtied)
5690 {
5692 if (VacuumCostActive)
5694 }
5695 }
5696}
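/*
 * Usage sketch (not part of bufmgr.c): how a hint-bit setter typically
 * calls this, e.g. after updating a tuple's infomask while holding a pin
 * and at least a share lock on "buffer".  The second argument indicates
 * whether the page follows the standard page layout, which allows the
 * "hole" to be omitted from any full-page image.
 *
 *		tuple->t_infomask |= HEAP_XMIN_COMMITTED;
 *		MarkBufferDirtyHint(buffer, true);
 */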
5697
5698/*
5699 * Release buffer content locks for shared buffers.
5700 *
5701 * Used to clean up after errors.
5702 *
5703 * Currently, we can expect that resource owner cleanup, via
5704 * ResOwnerReleaseBufferPin(), took care of releasing buffer content locks per
5705 * se; the only thing we need to deal with here is clearing any PIN_COUNT
5706 * request that was in progress.
5707 */
5708void
5710{
5712
5713 if (buf)
5714 {
5716 uint64 unset_bits = 0;
5717
5719
5720 /*
5721 * Don't complain if flag bit not set; it could have been reset but we
5722 * got a cancel/die interrupt before getting the signal.
5723 */
5724 if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
5725 buf->wait_backend_pgprocno == MyProcNumber)
5727
5729 0, unset_bits,
5730 0);
5731
5733 }
5734}
5735
5736/*
5737 * Acquire the buffer content lock in the specified mode
5738 *
5739 * If the lock is not available, sleep until it is.
5740 *
5741 * Side effect: cancel/die interrupts are held off until lock release.
5742 *
5743 * This uses almost the same locking approach as lwlock.c's
5744 * LWLockAcquire(). See documentation at the top of lwlock.c for a more
5745 * detailed discussion.
5746 *
5747 * The reason that this, and most of the other BufferLock* functions, get both
5748 * the Buffer and BufferDesc* as parameters, is that looking up one from the
5749 * other repeatedly shows up noticeably in profiles.
5750 *
5751 * Callers should provide a constant for mode, for more efficient code
5752 * generation.
5753 */
5754static inline void
5756{
5757 PrivateRefCountEntry *entry;
5758 int extraWaits = 0;
5759
5760 /*
 5761 * Get the reference to the refcount entry before we acquire the lock; it
 5762 * seems better to do the lookup while not yet holding the lock.
5763 */
5764 entry = GetPrivateRefCountEntry(buffer, true);
5765
5766 /*
 5767 * We'd better not already hold a lock on the buffer.
5768 */
5770
5771 /*
5772 * Lock out cancel/die interrupts until we exit the code section protected
5773 * by the content lock. This ensures that interrupts will not interfere
5774 * with manipulations of data structures in shared memory.
5775 */
5777
5778 for (;;)
5779 {
5780 uint32 wait_event = 0; /* initialized to avoid compiler warning */
5781 bool mustwait;
5782
5783 /*
5784 * Try to grab the lock the first time, we're not in the waitqueue
5785 * yet/anymore.
5786 */
5788
5789 if (likely(!mustwait))
5790 {
5791 break;
5792 }
5793
5794 /*
5795 * Ok, at this point we couldn't grab the lock on the first try. We
5796 * cannot simply queue ourselves to the end of the list and wait to be
5797 * woken up because by now the lock could long have been released.
5798 * Instead add us to the queue and try to grab the lock again. If we
5799 * succeed we need to revert the queuing and be happy, otherwise we
5800 * recheck the lock. If we still couldn't grab it, we know that the
5801 * other locker will see our queue entries when releasing since they
5802 * existed before we checked for the lock.
5803 */
5804
5805 /* add to the queue */
5807
5808 /* we're now guaranteed to be woken up if necessary */
5810
5811 /* ok, grabbed the lock the second time round, need to undo queueing */
5812 if (!mustwait)
5813 {
5815 break;
5816 }
5817
5818 switch (mode)
5819 {
5822 break;
5825 break;
5826 case BUFFER_LOCK_SHARE:
5828 break;
5829 case BUFFER_LOCK_UNLOCK:
5831
5832 }
5834
5835 /*
5836 * Wait until awakened.
5837 *
5838 * It is possible that we get awakened for a reason other than being
5839 * signaled by BufferLockWakeup(). If so, loop back and wait again.
5840 * Once we've gotten the lock, re-increment the sema by the number of
5841 * additional signals received.
5842 */
5843 for (;;)
5844 {
5847 break;
5848 extraWaits++;
5849 }
5850
5852
5853 /* Retrying, allow BufferLockRelease to release waiters again. */
5855 }
5856
5857 /* Remember that we now hold this lock */
5858 entry->data.lockmode = mode;
5859
5860 /*
5861 * Fix the process wait semaphore's count for any absorbed wakeups.
5862 */
5863 while (unlikely(extraWaits-- > 0))
5865}
5866
5867/*
5868 * Release a previously acquired buffer content lock.
5869 */
5870static void
5872{
5875 uint64 sub;
5876
5878
5879 /*
5880 * Release my hold on lock, after that it can immediately be acquired by
5881 * others, even if we still have to wakeup other waiters.
5882 */
5884
5886
5888
5889 /*
5890 * Now okay to allow cancel/die interrupts.
5891 */
5893}
5894
5895
5896/*
5897 * Acquire the content lock for the buffer, but only if we don't have to wait.
5898 */
5899static bool
5901{
5903 bool mustwait;
5904
5905 /*
 5906 * We'd better not already hold a lock on the buffer.
5907 */
5909
5910 /*
5911 * Lock out cancel/die interrupts until we exit the code section protected
5912 * by the content lock. This ensures that interrupts will not interfere
5913 * with manipulations of data structures in shared memory.
5914 */
5916
5917 /* Check for the lock */
5919
5920 if (mustwait)
5921 {
5922 /* Failed to get lock, so release interrupt holdoff */
5924 }
5925 else
5926 {
5927 entry->data.lockmode = mode;
5928 }
5929
5930 return !mustwait;
5931}
5932
5933/*
5934 * Internal function that tries to atomically acquire the content lock in the
5935 * passed in mode.
5936 *
5937 * This function will not block waiting for a lock to become free - that's the
5938 * caller's job.
5939 *
5940 * Similar to LWLockAttemptLock().
5941 */
5942static inline bool
5944{
5946
5947 /*
5948 * Read once outside the loop, later iterations will get the newer value
5949 * via compare & exchange.
5950 */
5952
5953 /* loop until we've determined whether we could acquire the lock or not */
5954 while (true)
5955 {
5957 bool lock_free;
5958
5960
5962 {
5963 lock_free = (old_state & BM_LOCK_MASK) == 0;
5964 if (lock_free)
5966 }
5968 {
5970 if (lock_free)
5972 }
5973 else
5974 {
5976 if (lock_free)
5978 }
5979
5980 /*
 5981 * Attempt to swap in the state we are expecting. If we didn't see the
 5982 * lock as free, that's just the old value. If we saw it as free,
5983 * we'll attempt to mark it acquired. The reason that we always swap
5984 * in the value is that this doubles as a memory barrier. We could try
5985 * to be smarter and only swap in values if we saw the lock as free,
 5986 * but benchmarks haven't shown that to be beneficial so far.
5987 *
5988 * Retry if the value changed since we last looked at it.
5989 */
5992 {
5993 if (lock_free)
5994 {
5995 /* Great! Got the lock. */
5996 return false;
5997 }
5998 else
5999 return true; /* somebody else has the lock */
6000 }
6001 }
6002
6004}
6005
6006/*
6007 * Add ourselves to the end of the content lock's wait queue.
6008 */
6009static void
6011{
6012 /*
6013 * If we don't have a PGPROC structure, there's no way to wait. This
6014 * should never occur, since MyProc should only be null during shared
6015 * memory initialization.
6016 */
6017 if (MyProc == NULL)
6018 elog(PANIC, "cannot wait without a PGPROC structure");
6019
6021 elog(PANIC, "queueing for lock while waiting on another one");
6022
6024
6025 /* setting the flag is protected by the spinlock */
6027
6028 /*
6029 * These are currently used both for lwlocks and buffer content locks,
6030 * which is acceptable, although not pretty, because a backend can't wait
6031 * for both types of locks at the same time.
6032 */
6035
6036 proclist_push_tail(&buf_hdr->lock_waiters, MyProcNumber, lwWaitLink);
6037
6038 /* Can release the mutex now */
6040}
6041
6042/*
6043 * Remove ourselves from the waitlist.
6044 *
6045 * This is used if we queued ourselves because we thought we needed to sleep
6046 * but, after further checking, we discovered that we don't actually need to
6047 * do so.
6048 */
6049static void
6051{
6052 bool on_waitlist;
6053
6055
6057 if (on_waitlist)
6058 proclist_delete(&buf_hdr->lock_waiters, MyProcNumber, lwWaitLink);
6059
6060 if (proclist_is_empty(&buf_hdr->lock_waiters) &&
6062 {
6064 }
6065
6066 /* XXX: combine with fetch_and above? */
6068
6069 /* clear waiting state again, nice for debugging */
6070 if (on_waitlist)
6072 else
6073 {
6074 int extraWaits = 0;
6075
6076
6077 /*
 6078 * Somebody else dequeued us and has woken us up, or will do so. Deal
 6079 * with the superfluous absorption of a wakeup.
6080 */
6081
6082 /*
6083 * Clear BM_LOCK_WAKE_IN_PROGRESS if somebody woke us before we
6084 * removed ourselves - they'll have set it.
6085 */
6087
6088 /*
6089 * Now wait for the scheduled wakeup, otherwise our ->lwWaiting would
6090 * get reset at some inconvenient point later. Most of the time this
6091 * will immediately return.
6092 */
6093 for (;;)
6094 {
6097 break;
6098 extraWaits++;
6099 }
6100
6101 /*
6102 * Fix the process wait semaphore's count for any absorbed wakeups.
6103 */
6104 while (extraWaits-- > 0)
6106 }
6107}
6108
6109/*
6110 * Stop treating lock as held by current backend.
6111 *
 6112 * After calling this function it's the caller's responsibility to ensure that
 6113 * the lock gets released, even in case of an error. This is only desirable if
 6114 * the lock is going to be released by a different process than the process
6115 * that acquired it.
6116 */
6117static inline void
6123
6124/*
6125 * Stop treating lock as held by current backend.
6126 *
6127 * This is the code that can be shared between actually releasing a lock
6128 * (BufferLockUnlock()) and just not tracking ownership of the lock anymore
6129 * without releasing the lock (BufferLockDisown()).
6130 */
6131static inline int
6133{
6136
6138 if (ref == NULL)
6139 elog(ERROR, "lock %d is not held", buffer);
6140 mode = ref->data.lockmode;
6141 ref->data.lockmode = BUFFER_LOCK_UNLOCK;
6142
6143 return mode;
6144}
6145
6146/*
6147 * Wakeup all the lockers that currently have a chance to acquire the lock.
6148 *
6149 * wake_exclusive indicates whether exclusive lock waiters should be woken up.
6150 */
6151static void
6153{
6154 bool new_wake_in_progress = false;
6155 bool wake_share_exclusive = true;
6158
6160
6161 /* lock wait list while collecting backends to wake up */
6163
6164 proclist_foreach_modify(iter, &buf_hdr->lock_waiters, lwWaitLink)
6165 {
6166 PGPROC *waiter = GetPGProcByNumber(iter.cur);
6167
6168 /*
6169 * Already woke up a conflicting lock, so skip over this wait list
6170 * entry.
6171 */
6173 continue;
6175 continue;
6176
6177 proclist_delete(&buf_hdr->lock_waiters, iter.cur, lwWaitLink);
6178 proclist_push_tail(&wakeup, iter.cur, lwWaitLink);
6179
6180 /*
6181 * Prevent additional wakeups until retryer gets to run. Backends that
6182 * are just waiting for the lock to become free don't retry
6183 * automatically.
6184 */
6185 new_wake_in_progress = true;
6186
6187 /*
6188 * Signal that the process isn't on the wait list anymore. This allows
6189 * BufferLockDequeueSelf() to remove itself from the waitlist with a
6190 * proclist_delete(), rather than having to check if it has been
6191 * removed from the list.
6192 */
6193 Assert(waiter->lwWaiting == LW_WS_WAITING);
6195
6196 /*
6197 * Don't wakeup further waiters after waking a conflicting waiter.
6198 */
6199 if (waiter->lwWaitMode == BUFFER_LOCK_SHARE)
6200 {
6201 /*
6202 * Share locks conflict with exclusive locks.
6203 */
6204 wake_exclusive = false;
6205 }
6206 else if (waiter->lwWaitMode == BUFFER_LOCK_SHARE_EXCLUSIVE)
6207 {
6208 /*
6209 * Share-exclusive locks conflict with share-exclusive and
6210 * exclusive locks.
6211 */
6212 wake_exclusive = false;
6213 wake_share_exclusive = false;
6214 }
6215 else if (waiter->lwWaitMode == BUFFER_LOCK_EXCLUSIVE)
6216 {
6217 /*
6218 * Exclusive locks conflict with all other locks, there's no point
6219 * in waking up anybody else.
6220 */
6221 break;
6222 }
6223 }
6224
6226
6227 /* unset required flags, and release lock, in one fell swoop */
6228 {
6231
6233 while (true)
6234 {
6236
6237 /* compute desired flags */
6238
6241 else
6243
6244 if (proclist_is_empty(&buf_hdr->lock_waiters))
6246
6247 desired_state &= ~BM_LOCKED; /* release lock */
6248
6251 break;
6252 }
6253 }
6254
6255 /* Awaken any waiters I removed from the queue. */
6256 proclist_foreach_modify(iter, &wakeup, lwWaitLink)
6257 {
6258 PGPROC *waiter = GetPGProcByNumber(iter.cur);
6259
6260 proclist_delete(&wakeup, iter.cur, lwWaitLink);
6261
6262 /*
6263 * Guarantee that lwWaiting being unset only becomes visible once the
 6264 * unlink from the list has completed. Otherwise the target backend
 6265 * could be woken up for some other reason and enqueue for a new lock - if
6266 * that happens before the list unlink happens, the list would end up
6267 * being corrupted.
6268 *
6269 * The barrier pairs with the LockBufHdr() when enqueuing for another
6270 * lock.
6271 */
6273 waiter->lwWaiting = LW_WS_NOT_WAITING;
6274 PGSemaphoreUnlock(waiter->sem);
6275 }
6276}
6277
6278/*
6279 * Compute subtraction from buffer state for a release of a held lock in
6280 * `mode`.
6281 *
6282 * This is separated from BufferLockUnlock() as we want to combine the lock
6283 * release with other atomic operations when possible, leading to the lock
6284 * release being done in multiple places, each needing to compute what to
6285 * subtract from the lock state.
6286 */
6287static inline uint64
6289{
6290 /*
6291 * Turns out that a switch() leads gcc to generate sufficiently worse code
6292 * for this to show up in profiles...
6293 */
6295 return BM_LOCK_VAL_EXCLUSIVE;
6298 else
6299 {
6301 return BM_LOCK_VAL_SHARED;
6302 }
6303
6304 return 0; /* keep compiler quiet */
6305}
6306
6307/*
6308 * Handle work that needs to be done after releasing a lock that was held in
6309 * `mode`, where `lockstate` is the result of the atomic operation modifying
6310 * the state variable.
6311 *
6312 * This is separated from BufferLockUnlock() as we want to combine the lock
6313 * release with other atomic operations when possible, leading to the lock
6314 * release being done in multiple places.
6315 */
6316static void
6318{
6319 bool check_waiters = false;
6320 bool wake_exclusive = false;
6321
6322 /* nobody else can have that kind of lock */
6324
6325 /*
6326 * If we're still waiting for backends to get scheduled, don't wake them
6327 * up again. Otherwise check if we need to look through the waitqueue to
6328 * wake other backends.
6329 */
6332 {
6333 if ((lockstate & BM_LOCK_MASK) == 0)
6334 {
6335 /*
6336 * We released a lock and the lock was, in that moment, free. We
6337 * therefore can wake waiters for any kind of lock.
6338 */
6339 check_waiters = true;
6340 wake_exclusive = true;
6341 }
6343 {
6344 /*
6345 * We released the lock, but another backend still holds a lock.
6346 * We can't have released an exclusive lock, as there couldn't
6347 * have been other lock holders. If we released a share lock, no
6348 * waiters need to be woken up, as there must be other share
6349 * lockers. However, if we held a share-exclusive lock, another
6350 * backend now could acquire a share-exclusive lock.
6351 */
6352 check_waiters = true;
6353 wake_exclusive = false;
6354 }
6355 }
6356
6357 /*
6358 * As waking up waiters requires the spinlock to be acquired, only do so
6359 * if necessary.
6360 */
6361 if (check_waiters)
6363}
6364
6365/*
6366 * BufferLockHeldByMeInMode - test whether my process holds the content lock
6367 * in the specified mode
6368 *
6369 * This is meant as debug support only.
6370 */
6371static bool
6373{
6374 PrivateRefCountEntry *entry =
6376
6377 if (!entry)
6378 return false;
6379 else
6380 return entry->data.lockmode == mode;
6381}
6382
6383/*
6384 * BufferLockHeldByMe - test whether my process holds the content lock in any
6385 * mode
6386 *
6387 * This is meant as debug support only.
6388 */
6389static bool
6391{
6392 PrivateRefCountEntry *entry =
6394
6395 if (!entry)
6396 return false;
6397 else
6398 return entry->data.lockmode != BUFFER_LOCK_UNLOCK;
6399}
6400
6401/*
6402 * Release the content lock for the buffer.
6403 */
6404void
6406{
6408
6410 if (BufferIsLocal(buffer))
6411 return; /* local buffers need no lock */
6412
6415}
6416
6417/*
6418 * Acquire the content_lock for the buffer.
6419 */
6420void
6422{
6424
6425 /*
6426 * We can't wait if we haven't got a PGPROC. This should only occur
6427 * during bootstrap or shared memory initialization. Put an Assert here
6428 * to catch unsafe coding practices.
6429 */
6431
6432 /* handled in LockBuffer() wrapper */
6434
6436 if (BufferIsLocal(buffer))
6437 return; /* local buffers need no lock */
6438
6440
6441 /*
6442 * Test the most frequent lock modes first. While a switch (mode) would be
6443 * nice, at least gcc generates considerably worse code for it.
6444 *
6445 * Call BufferLockAcquire() with a constant argument for mode, to generate
6446 * more efficient code for the different lock modes.
6447 */
6448 if (mode == BUFFER_LOCK_SHARE)
6450 else if (mode == BUFFER_LOCK_EXCLUSIVE)
6454 else
6455 elog(ERROR, "unrecognized buffer lock mode: %d", mode);
6456}
6457
6458/*
6459 * Acquire the content_lock for the buffer, but only if we don't have to wait.
6460 *
6461 * This assumes the caller wants BUFFER_LOCK_EXCLUSIVE mode.
6462 */
6463bool
6465{
6466 BufferDesc *buf;
6467
6469 if (BufferIsLocal(buffer))
6470 return true; /* act as though we got it */
6471
6473
6475}
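/*
 * Usage sketch (not part of bufmgr.c): opportunistic callers try for the
 * exclusive lock and simply skip the page when it is contended, rather
 * than waiting.
 *
 *		if (ConditionalLockBuffer(buf))
 *		{
 *			... do exclusive-lock work on the page ...
 *			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
 *		}
 *		else
 *			... skip this page for now ...
 */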
6476
6477/*
6478 * Verify that this backend is pinning the buffer exactly once.
6479 *
6480 * NOTE: Like in BufferIsPinned(), what we check here is that *this* backend
6481 * holds a pin on the buffer. We do not care whether some other backend does.
6482 */
6483void
6485{
6486 if (BufferIsLocal(buffer))
6487 {
6488 if (LocalRefCount[-buffer - 1] != 1)
6489 elog(ERROR, "incorrect local pin count: %d",
6490 LocalRefCount[-buffer - 1]);
6491 }
6492 else
6493 {
6494 if (GetPrivateRefCount(buffer) != 1)
6495 elog(ERROR, "incorrect local pin count: %d",
6497 }
6498}
6499
6500/*
6501 * LockBufferForCleanup - lock a buffer in preparation for deleting items
6502 *
6503 * Items may be deleted from a disk page only when the caller (a) holds an
6504 * exclusive lock on the buffer and (b) has observed that no other backend
6505 * holds a pin on the buffer. If there is a pin, then the other backend
6506 * might have a pointer into the buffer (for example, a heapscan reference
6507 * to an item --- see README for more details). It's OK if a pin is added
6508 * after the cleanup starts, however; the newly-arrived backend will be
6509 * unable to look at the page until we release the exclusive lock.
6510 *
6511 * To implement this protocol, a would-be deleter must pin the buffer and
6512 * then call LockBufferForCleanup(). LockBufferForCleanup() is similar to
6513 * LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE), except that it loops until
6514 * it has successfully observed pin count = 1.
6515 */
6516void
6518{
6520 TimestampTz waitStart = 0;
6521 bool waiting = false;
6522 bool logged_recovery_conflict = false;
6523
6526
6528
6529 /*
 6530 * We do not yet need to worry about in-progress AIOs holding a pin,
6531 * as we, so far, only support doing reads via AIO and this function can
6532 * only be called once the buffer is valid (i.e. no read can be in
6533 * flight).
6534 */
6535
6536 /* Nobody else to wait for */
6537 if (BufferIsLocal(buffer))
6538 return;
6539
6541
6542 for (;;)
6543 {
6545 uint64 unset_bits = 0;
6546
6547 /* Try to acquire lock */
6550
6553 {
6554 /* Successfully acquired exclusive lock with pincount 1 */
6556
6557 /*
6558 * Emit the log message if recovery conflict on buffer pin was
6559 * resolved but the startup process waited longer than
6560 * deadlock_timeout for it.
6561 */
6564 waitStart, GetCurrentTimestamp(),
6565 NULL, false);
6566
6567 if (waiting)
6568 {
6569 /* reset ps display to remove the suffix if we added one */
6571 waiting = false;
6572 }
6573 return;
6574 }
6575 /* Failed, so mark myself as waiting for pincount 1 */
6577 {
6580 elog(ERROR, "multiple backends attempting to wait for pincount 1");
6581 }
6582 bufHdr->wait_backend_pgprocno = MyProcNumber;
6586 0);
6588
6589 /* Wait to be signaled by UnpinBuffer() */
6590 if (InHotStandby)
6591 {
6592 if (!waiting)
6593 {
6594 /* adjust the process title to indicate that it's waiting */
6595 set_ps_display_suffix("waiting");
6596 waiting = true;
6597 }
6598
6599 /*
6600 * Emit the log message if the startup process is waiting longer
6601 * than deadlock_timeout for recovery conflict on buffer pin.
6602 *
 6603 * Skip this the first time through, because the startup process has
 6604 * not started waiting yet at that point. So, the wait start
6605 * timestamp is set after this logic.
6606 */
6607 if (waitStart != 0 && !logged_recovery_conflict)
6608 {
6610
6611 if (TimestampDifferenceExceeds(waitStart, now,
6613 {
6615 waitStart, now, NULL, true);
6617 }
6618 }
6619
6620 /*
6621 * Set the wait start timestamp if logging is enabled and first
6622 * time through.
6623 */
6624 if (log_recovery_conflict_waits && waitStart == 0)
6625 waitStart = GetCurrentTimestamp();
6626
6627 /* Publish the bufid that Startup process waits on */
6629 /* Set alarm and then wait to be signaled by UnpinBuffer() */
6631 /* Reset the published bufid */
6633 }
6634 else
6636
6637 /*
6638 * Remove flag marking us as waiter. Normally this will not be set
6639 * anymore, but ProcWaitForSignal() can return for other signals as
6640 * well. We take care to only reset the flag if we're the waiter, as
6641 * theoretically another backend could have started waiting. That's
6642 * impossible with the current usages due to table level locking, but
6643 * better be safe.
6644 */
6646 if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
6647 bufHdr->wait_backend_pgprocno == MyProcNumber)
6649
6651 0, unset_bits,
6652 0);
6653
6655 /* Loop back and try again */
6656 }
6657}
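/*
 * Usage sketch (not part of bufmgr.c): the would-be deleter protocol
 * described above, assuming "rel", "blkno" and "bstrategy" are supplied by
 * the caller.
 *
 *		Buffer	buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
 *										 RBM_NORMAL, bstrategy);
 *
 *		LockBufferForCleanup(buf);
 *		... prune or delete items; no other backend holds a pin ...
 *		UnlockReleaseBuffer(buf);
 */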
6658
6659/*
6660 * Check called from ProcessRecoveryConflictInterrupts() when Startup process
6661 * requests cancellation of all pin holders that are blocking it.
6662 */
6663bool
6665{
6667
6668 /*
6669 * If we get woken slowly then it's possible that the Startup process was
6670 * already woken by other backends before we got here. Also possible that
6671 * we get here by multiple interrupts or interrupts at inappropriate
6672 * times, so make sure we do nothing if the bufid is not set.
6673 */
6674 if (bufid < 0)
6675 return false;
6676
6677 if (GetPrivateRefCount(bufid + 1) > 0)
6678 return true;
6679
6680 return false;
6681}
6682
6683/*
6684 * ConditionalLockBufferForCleanup - as above, but don't wait to get the lock
6685 *
6686 * We won't loop, but just check once to see if the pin count is OK. If
6687 * not, return false with no lock held.
6688 */
6689bool
6691{
6694 refcount;
6695
6697
6698 /* see AIO related comment in LockBufferForCleanup() */
6699
6700 if (BufferIsLocal(buffer))
6701 {
6702 refcount = LocalRefCount[-buffer - 1];
6703 /* There should be exactly one pin */
6704 Assert(refcount > 0);
6705 if (refcount != 1)
6706 return false;
6707 /* Nobody else to wait for */
6708 return true;
6709 }
6710
6711 /* There should be exactly one local pin */
6712 refcount = GetPrivateRefCount(buffer);
6713 Assert(refcount);
6714 if (refcount != 1)
6715 return false;
6716
6717 /* Try to acquire lock */
6719 return false;
6720
6724
6725 Assert(refcount > 0);
6726 if (refcount == 1)
6727 {
6728 /* Successfully acquired exclusive lock with pincount 1 */
6730 return true;
6731 }
6732
6733 /* Failed, so release the lock */
6736 return false;
6737}
6738
6739/*
6740 * IsBufferCleanupOK - as above, but we already have the lock
6741 *
6742 * Check whether it's OK to perform cleanup on a buffer we've already
6743 * locked. If we observe that the pin count is 1, our exclusive lock
6744 * happens to be a cleanup lock, and we can proceed with anything that
6745 * would have been allowable had we sought a cleanup lock originally.
6746 */
6747bool
6748IsBufferCleanupOK(Buffer buffer)
6749{
6750 BufferDesc *bufHdr;
6751 uint64 buf_state;
6752
6753 Assert(BufferIsValid(buffer));
6754
6755 /* see AIO related comment in LockBufferForCleanup() */
6756
6757 if (BufferIsLocal(buffer))
6758 {
6759 /* There should be exactly one pin */
6760 if (LocalRefCount[-buffer - 1] != 1)
6761 return false;
6762 /* Nobody else to wait for */
6763 return true;
6764 }
6765
6766 /* There should be exactly one local pin */
6767 if (GetPrivateRefCount(buffer) != 1)
6768 return false;
6769
6770 bufHdr = GetBufferDescriptor(buffer - 1);
6771
6772 /* caller must hold exclusive lock on buffer */
6773 Assert(BufferIsLockedByMeInMode(buffer, BUFFER_LOCK_EXCLUSIVE));
6774
6775 buf_state = LockBufHdr(bufHdr);
6776
6777 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
6778 if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
6779 {
6780 /* pincount is OK. */
6781 UnlockBufHdr(bufHdr);
6782 return true;
6783 }
6784
6785 UnlockBufHdr(bufHdr);
6786 return false;
6787}
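/*
 * Illustrative sketch (editor's addition, not part of bufmgr.c): a caller
 * that already holds the exclusive content lock for an ordinary modification
 * can ask IsBufferCleanupOK() whether that lock happens to qualify as a
 * cleanup lock. do_regular_update() and do_cleanup_work() are hypothetical
 * placeholders.
 */
static void
modify_and_maybe_cleanup(Buffer buf)
{
	LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

	do_regular_update(buf);

	/* if nobody else has the page pinned, cleanup-only work is also safe */
	if (IsBufferCleanupOK(buf))
		do_cleanup_work(buf);

	LockBuffer(buf, BUFFER_LOCK_UNLOCK);
}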
6788
6789
6790/*
6791 * Functions for buffer I/O handling
6792 *
6793 * Also note that these are used only for shared buffers, not local ones.
6794 */
6795
6796/*
6797 * WaitIO -- Block until the IO_IN_PROGRESS flag on 'buf' is cleared.
6798 */
6799static void
6800WaitIO(BufferDesc *buf)
6801{
6802 ConditionVariable *cv = BufferDescriptorGetIOCV(buf);
6803
6804 ConditionVariablePrepareToSleep(cv);
6805 for (;;)
6806 {
6807 uint64 buf_state;
6808 PgAioWaitRef iow;
6809
6810 /*
6811 * It may not be necessary to acquire the spinlock to check the flag
6812 * here, but since this test is essential for correctness, we'd better
6813 * play it safe.
6814 */
6816
6817 /*
6818 * Copy the wait reference while holding the spinlock. This protects
6819 * against a concurrent TerminateBufferIO() in another backend from
6820 * clearing the wref while it's being read.
6821 */
6822 iow = buf->io_wref;
6823 UnlockBufHdr(buf);
6824
6825 /* no IO in progress, we don't need to wait */
6826 if (!(buf_state & BM_IO_IN_PROGRESS))
6827 break;
6828
6829 /*
6830 * The buffer has asynchronous IO in progress, wait for it to
6831 * complete.
6832 */
6833 if (pgaio_wref_valid(&iow))
6834 {
6835 pgaio_wref_wait(&iow);
6836
6837 /*
6838 * The AIO subsystem internally uses condition variables and thus
6839 * might remove this backend from the BufferDesc's CV. While that
6840 * wouldn't cause a correctness issue (the first CV sleep just
6841 * immediately returns if not already registered), it seems worth
6842 * avoiding unnecessary loop iterations, given that we take care
6843 * to do so at the start of the function.
6844 */
6845 ConditionVariablePrepareToSleep(cv);
6846 continue;
6847 }
6848
6849 /* wait on BufferDesc->cv, e.g. for concurrent synchronous IO */
6850 ConditionVariableSleep(cv, WAIT_EVENT_BUFFER_IO);
6851 }
6852 ConditionVariableCancelSleep();
6853}
6854
6855/*
6856 * StartBufferIO: begin I/O on this buffer
6857 * (Assumptions)
6858 * My process is executing no IO on this buffer
6859 * The buffer is Pinned
6860 *
6861 * In some scenarios multiple backends could attempt the same I/O operation
6862 * concurrently. If someone else has already started I/O on this buffer then
6863 * we will wait for completion of the IO using WaitIO().
6864 *
6865 * Input operations are only attempted on buffers that are not BM_VALID,
6866 * and output operations only on buffers that are BM_VALID and BM_DIRTY,
6867 * so we can always tell if the work is already done.
6868 *
6869 * Returns true if we successfully marked the buffer as I/O busy,
6870 * false if someone else already did the work.
6871 *
6872 * If nowait is true, then we don't wait for an I/O to be finished by another
6873 * backend. In that case, false indicates either that the I/O was already
6874 * finished, or is still in progress. This is useful for callers that want to
6875 * find out if they can perform the I/O as part of a larger operation, without
6876 * waiting for the answer or distinguishing the reasons why not.
6877 */
6878bool
6879StartBufferIO(BufferDesc *buf, bool forInput, bool nowait)
6880{
6881 uint64 buf_state;
6882
6883 ResourceOwnerEnlarge(CurrentResourceOwner);
6884
6885 for (;;)
6886 {
6887 buf_state = LockBufHdr(buf);
6888
6889 if (!(buf_state & BM_IO_IN_PROGRESS))
6890 break;
6891 UnlockBufHdr(buf);
6892 if (nowait)
6893 return false;
6894 WaitIO(buf);
6895 }
6896
6897 /* Once we get here, there is definitely no I/O active on this buffer */
6898
6899 /* Check if someone else already did the I/O */
6900 if (forInput ? (buf_state & BM_VALID) : !(buf_state & BM_DIRTY))
6901 {
6902 UnlockBufHdr(buf);
6903 return false;
6904 }
6905
6906 UnlockBufHdrExt(buf, buf_state,
6907 BM_IO_IN_PROGRESS, 0,
6908 0);
6909
6910 ResourceOwnerRememberBufferIO(CurrentResourceOwner,
6911 BufferDescriptorGetBuffer(buf));
6912
6913 return true;
6914}
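/*
 * Illustrative sketch (editor's addition, not part of bufmgr.c): the
 * StartBufferIO()/TerminateBufferIO() handshake around a synchronous read of
 * a single pinned shared buffer. The real read path lives in
 * WaitReadBuffers() and the AIO completion callbacks; this condensed form
 * only shows how BM_IO_IN_PROGRESS brackets the actual I/O.
 */
static void
read_one_block_sync(BufferDesc *buf_hdr)
{
	/* false means somebody else already read the page in; nothing to do */
	if (!StartBufferIO(buf_hdr, true, false))
		return;

	/* ... issue the smgr read into BufHdrGetBlock(buf_hdr) here ... */

	/* mark the buffer valid and wake anyone sleeping in WaitIO() */
	TerminateBufferIO(buf_hdr, false, BM_VALID, true, false);
}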
6915
6916/*
6917 * TerminateBufferIO: release a buffer we were doing I/O on
6918 * (Assumptions)
6919 * My process is executing IO for the buffer
6920 * BM_IO_IN_PROGRESS bit is set for the buffer
6921 * The buffer is Pinned
6922 *
6923 * If clear_dirty is true and BM_JUST_DIRTIED is not set, we clear the
6924 * buffer's BM_DIRTY flag. This is appropriate when terminating a
6925 * successful write. The check on BM_JUST_DIRTIED is necessary to avoid
6926 * marking the buffer clean if it was re-dirtied while we were writing.
6927 *
6928 * set_flag_bits gets ORed into the buffer's flags. It must include
6929 * BM_IO_ERROR in a failure case. For successful completion it could
6930 * be 0, or BM_VALID if we just finished reading in the page.
6931 *
6932 * If forget_owner is true, we release the buffer I/O from the current
6933 * resource owner. (forget_owner=false is used when the resource owner itself
6934 * is being released)
6935 */
6936void
6937TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint64 set_flag_bits,
6938 bool forget_owner, bool release_aio)
6939{
6942 int refcount_change = 0;
6943
6945
6948
6949 /* Clear earlier errors, if this IO failed, it'll be marked again */
6951
6954
6955 if (release_aio)
6956 {
6957 /* release ownership by the AIO subsystem */
6959 refcount_change = -1;
6960 pgaio_wref_clear(&buf->io_wref);
6961 }
6962
6966
6967 if (forget_owner)
6968 ResourceOwnerForgetBufferIO(CurrentResourceOwner,
6969 BufferDescriptorGetBuffer(buf));
6970
6971 ConditionVariableBroadcast(BufferDescriptorGetIOCV(buf));
6972
6973 /*
6974 * Support LockBufferForCleanup()
6975 *
6976 * We may have just released the last pin other than the waiter's. In most
6977 * cases, this backend holds another pin on the buffer. But, if, for
6978 * example, this backend is completing an IO issued by another backend, it
6979 * may be time to wake the waiter.
6980 */
6981 if (buf_state & BM_PIN_COUNT_WAITER)
6982 WakePinCountWaiter(buf);
6983}
6984
6985/*
6986 * AbortBufferIO: Clean up active buffer I/O after an error.
6987 *
6988 * All LWLocks & content locks we might have held have been released, but we
6989 * haven't yet released buffer pins, so the buffer is still pinned.
6990 *
6991 * If I/O was in progress, we always set BM_IO_ERROR, even though it's
6992 * possible the error condition wasn't related to the I/O.
6993 *
6994 * Note: this does not remove the buffer I/O from the resource owner.
6995 * That's correct when we're releasing the whole resource owner, but
6996 * beware if you use this in other contexts.
6997 */
6998static void
6999AbortBufferIO(Buffer buffer)
7000{
7001 BufferDesc *buf_hdr = GetBufferDescriptor(buffer - 1);
7002 uint64 buf_state;
7003
7004 buf_state = LockBufHdr(buf_hdr);
7005 Assert(buf_state & BM_IO_IN_PROGRESS);
7006
7007 if (!(buf_state & BM_VALID))
7008 {
7009 Assert(!(buf_state & BM_DIRTY));
7010 UnlockBufHdr(buf_hdr);
7011 }
7012 else
7013 {
7014 Assert(buf_state & BM_DIRTY);
7015 UnlockBufHdr(buf_hdr);
7016
7017 /* Issue notice if this is not the first failure... */
7018 if (buf_state & BM_IO_ERROR)
7019 {
7020 /* Buffer is pinned, so we can read tag without spinlock */
7021 ereport(WARNING,
7022 (errcode(ERRCODE_IO_ERROR),
7023 errmsg("could not write block %u of %s",
7024 buf_hdr->tag.blockNum,
7025 relpathperm(BufTagGetRelFileLocator(&buf_hdr->tag),
7026 BufTagGetForkNum(&buf_hdr->tag)).str),
7027 errdetail("Multiple failures --- write error might be permanent.")));
7028 }
7029 }
7030
7031 TerminateBufferIO(buf_hdr, false, BM_IO_ERROR, false, false);
7032}
7033
7034/*
7035 * Error context callback for errors occurring during shared buffer writes.
7036 */
7037static void
7038shared_buffer_write_error_callback(void *arg)
7039{
7040 BufferDesc *bufHdr = (BufferDesc *) arg;
7041
7042 /* Buffer is pinned, so we can read the tag without locking the spinlock */
7043 if (bufHdr != NULL)
7044 errcontext("writing block %u of relation \"%s\"",
7045 bufHdr->tag.blockNum,
7046 relpathperm(BufTagGetRelFileLocator(&bufHdr->tag),
7047 BufTagGetForkNum(&bufHdr->tag)).str);
7048}
7049
7050/*
7051 * Error context callback for errors occurring during local buffer writes.
7052 */
7053static void
7054local_buffer_write_error_callback(void *arg)
7055{
7056 BufferDesc *bufHdr = (BufferDesc *) arg;
7057
7058 if (bufHdr != NULL)
7059 errcontext("writing block %u of relation \"%s\"",
7060 bufHdr->tag.blockNum,
7061 relpathbackend(BufTagGetRelFileLocator(&bufHdr->tag),
7062 MyProcNumber,
7063 BufTagGetForkNum(&bufHdr->tag)).str);
7064}
7065
7066/*
7067 * RelFileLocator qsort/bsearch comparator; see RelFileLocatorEquals.
7068 */
7069static int
7070rlocator_comparator(const void *p1, const void *p2)
7071{
7072 RelFileLocator n1 = *(const RelFileLocator *) p1;
7073 RelFileLocator n2 = *(const RelFileLocator *) p2;
7074
7075 if (n1.relNumber < n2.relNumber)
7076 return -1;
7077 else if (n1.relNumber > n2.relNumber)
7078 return 1;
7079
7080 if (n1.dbOid < n2.dbOid)
7081 return -1;
7082 else if (n1.dbOid > n2.dbOid)
7083 return 1;
7084
7085 if (n1.spcOid < n2.spcOid)
7086 return -1;
7087 else if (n1.spcOid > n2.spcOid)
7088 return 1;
7089 else
7090 return 0;
7091}
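/*
 * Illustrative sketch (editor's addition, not part of bufmgr.c): the
 * comparator above is meant for qsort()/bsearch() over RelFileLocator
 * arrays, as done when dropping buffers for many relations at once.
 */
static bool
locator_in_sorted_set(RelFileLocator *locators, int nlocators,
					  const RelFileLocator *key)
{
	/* sort once ... */
	qsort(locators, nlocators, sizeof(RelFileLocator), rlocator_comparator);

	/* ... then binary-search for the key */
	return bsearch(key, locators, nlocators, sizeof(RelFileLocator),
				   rlocator_comparator) != NULL;
}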
7092
7093/*
7094 * Lock buffer header - set BM_LOCKED in buffer state.
7095 */
7096uint64
7097LockBufHdr(BufferDesc *desc)
7098{
7099 uint64 old_buf_state;
7100
7101 Assert(!BufferIsLocal(BufferDescriptorGetBuffer(desc)));
7102
7103 while (true)
7104 {
7105 /*
7106 * Always try once to acquire the lock directly, without setting up
7107 * the spin-delay infrastructure. The work necessary for that shows up
7108 * in profiles and is rarely necessary.
7109 */
7110 old_buf_state = pg_atomic_fetch_or_u64(&desc->state, BM_LOCKED);
7111 if (likely(!(old_buf_state & BM_LOCKED)))
7112 break; /* got lock */
7113
7114 /* and then spin without atomic operations until lock is released */
7115 {
7116 SpinDelayStatus delayStatus;
7117
7118 init_local_spin_delay(&delayStatus);
7119
7120 while (old_buf_state & BM_LOCKED)
7121 {
7122 perform_spin_delay(&delayStatus);
7123 old_buf_state = pg_atomic_read_u64(&desc->state);
7124 }
7125 finish_spin_delay(&delayStatus);
7126 }
7127
7128 /*
7129 * Retry. The lock might obviously already be re-acquired by the time
7130 * we're attempting to get it again.
7131 */
7132 }
7133
7134 return old_buf_state | BM_LOCKED;
7135}
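/*
 * Illustrative sketch (editor's addition, not part of bufmgr.c): the usual
 * pattern around LockBufHdr() is to take the header spinlock, inspect or
 * adjust the state word, and drop the lock again quickly; only trivial work
 * may be done while BM_LOCKED is held.
 */
static bool
buffer_is_valid_and_unpinned(BufferDesc *desc)
{
	uint64		buf_state;
	bool		result;

	buf_state = LockBufHdr(desc);
	result = (buf_state & BM_VALID) != 0 &&
		BUF_STATE_GET_REFCOUNT(buf_state) == 0;
	UnlockBufHdr(desc);

	return result;
}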
7136
7137/*
7138 * Wait until the BM_LOCKED flag isn't set anymore and return the buffer's
7139 * state at that point.
7140 *
7141 * Obviously the buffer could be locked by the time the value is returned, so
7142 * this is primarily useful in CAS style loops.
7143 */
7144pg_noinline uint64
7145WaitBufHdrUnlocked(BufferDesc *buf)
7146{
7147 SpinDelayStatus delayStatus;
7148 uint64 buf_state;
7149
7150 init_local_spin_delay(&delayStatus);
7151
7152 buf_state = pg_atomic_read_u64(&buf->state);
7153
7154 while (buf_state & BM_LOCKED)
7155 {
7156 perform_spin_delay(&delayStatus);
7157 buf_state = pg_atomic_read_u64(&buf->state);
7158 }
7159
7160 finish_spin_delay(&delayStatus);
7161
7162 return buf_state;
7163}
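/*
 * Illustrative sketch (editor's addition, not part of bufmgr.c): the
 * CAS-style loop mentioned above. Instead of taking the header spinlock the
 * caller re-reads the state whenever it is locked and retries the
 * compare-exchange until it succeeds; this mirrors how PinBuffer() bumps the
 * usage count without LockBufHdr().
 */
static void
bump_usage_count(BufferDesc *desc)
{
	uint64		old_buf_state = pg_atomic_read_u64(&desc->state);

	for (;;)
	{
		uint64		buf_state = old_buf_state;

		if (buf_state & BM_LOCKED)
			buf_state = old_buf_state = WaitBufHdrUnlocked(desc);

		if (BUF_STATE_GET_USAGECOUNT(buf_state) >= BM_MAX_USAGE_COUNT)
			break;				/* already at the maximum */

		buf_state += BUF_USAGECOUNT_ONE;

		if (pg_atomic_compare_exchange_u64(&desc->state, &old_buf_state,
										   buf_state))
			break;				/* success */
		/* CAS failed: old_buf_state now holds the current value, retry */
	}
}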
7164
7165/*
7166 * BufferTag comparator.
7167 */
7168static inline int
7169buffertag_comparator(const BufferTag *ba, const BufferTag *bb)
7170{
7171 int ret;
7172 RelFileLocator rlocatora;
7173 RelFileLocator rlocatorb;
7174
7175 rlocatora = BufTagGetRelFileLocator(ba);
7176 rlocatorb = BufTagGetRelFileLocator(bb);
7177
7178 ret = rlocator_comparator(&rlocatora, &rlocatorb);
7179
7180 if (ret != 0)
7181 return ret;
7182
7183 if (BufTagGetForkNum(ba) < BufTagGetForkNum(bb))
7184 return -1;
7185 if (BufTagGetForkNum(ba) > BufTagGetForkNum(bb))
7186 return 1;
7187
7188 if (ba->blockNum < bb->blockNum)
7189 return -1;
7190 if (ba->blockNum > bb->blockNum)
7191 return 1;
7192
7193 return 0;
7194}
7195
7196/*
7197 * Comparator determining the writeout order in a checkpoint.
7198 *
7199 * It is important that tablespaces are compared first, the logic balancing
7200 * writes between tablespaces relies on it.
7201 */
7202static inline int
7203ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b)
7204{
7205 /* compare tablespace */
7206 if (a->tsId < b->tsId)
7207 return -1;
7208 else if (a->tsId > b->tsId)
7209 return 1;
7210 /* compare relation */
7211 if (a->relNumber < b->relNumber)
7212 return -1;
7213 else if (a->relNumber > b->relNumber)
7214 return 1;
7215 /* compare fork */
7216 else if (a->forkNum < b->forkNum)
7217 return -1;
7218 else if (a->forkNum > b->forkNum)
7219 return 1;
7220 /* compare block number */
7221 else if (a->blockNum < b->blockNum)
7222 return -1;
7223 else if (a->blockNum > b->blockNum)
7224 return 1;
7225 /* equal page IDs are unlikely, but not impossible */
7226 return 0;
7227}
7228
7229/*
7230 * Comparator for a Min-Heap over the per-tablespace checkpoint completion
7231 * progress.
7232 */
7233static int
7234ts_ckpt_progress_comparator(Datum a, Datum b, void *arg)
7235{
7236 CkptTsStatus *sa = (CkptTsStatus *) DatumGetPointer(a);
7237 CkptTsStatus *sb = (CkptTsStatus *) DatumGetPointer(b);
7238
7239 /* we want a min-heap, so return 1 when a < b */
7240 if (sa->progress < sb->progress)
7241 return 1;
7242 else if (sa->progress == sb->progress)
7243 return 0;
7244 else
7245 return -1;
7246}
7247
7248/*
7249 * Initialize a writeback context, discarding potential previous state.
7250 *
7251 * *max_pending is a pointer instead of an immediate value, so the coalesce
7252 * limits can easily be changed by the GUC mechanism, and so calling code does
7253 * not have to check the current configuration. A value of 0 means that no
7254 * writeback control will be performed.
7255 */
7256void
7257WritebackContextInit(WritebackContext *context, int *max_pending)
7258{
7259 Assert(*max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
7260
7261 context->max_pending = max_pending;
7262 context->nr_pending = 0;
7263}
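/*
 * Illustrative sketch (editor's addition, not part of bufmgr.c): because
 * WritebackContextInit() stores a pointer to the limit rather than its
 * value, pointing it at a GUC variable such as backend_flush_after means a
 * later SET of that GUC takes effect on the next scheduling call, without
 * re-initializing the context.
 */
static WritebackContext my_wb_context;

static void
init_my_writeback_context(void)
{
	WritebackContextInit(&my_wb_context, &backend_flush_after);
}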
7264
7265/*
7266 * Add buffer to list of pending writeback requests.
7267 */
7268void
7269ScheduleBufferTagForWriteback(WritebackContext *wb_context, IOContext io_context,
7270 BufferTag *tag)
7271{
7272 PendingWriteback *pending;
7273
7274 /*
7275 * As pg_flush_data() doesn't do anything with fsync disabled, there's no
7276 * point in tracking in that case.
7277 */
7278 if (io_direct_flags & IO_DIRECT_DATA ||
7279 !enableFsync)
7280 return;
7281
7282 /*
7283 * Add buffer to the pending writeback array, unless writeback control is
7284 * disabled.
7285 */
7286 if (*wb_context->max_pending > 0)
7287 {
7288 Assert(*wb_context->max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
7289
7290 pending = &wb_context->pending_writebacks[wb_context->nr_pending++];
7291
7292 pending->tag = *tag;
7293 }
7294
7295 /*
7296 * Perform pending flushes if the writeback limit is exceeded. This
7297 * includes the case where previously an item has been added, but control
7298 * is now disabled.
7299 */
7300 if (wb_context->nr_pending >= *wb_context->max_pending)
7301 IssuePendingWritebacks(wb_context, io_context);
7302}
7303
7304#define ST_SORT sort_pending_writebacks
7305#define ST_ELEMENT_TYPE PendingWriteback
7306#define ST_COMPARE(a, b) buffertag_comparator(&a->tag, &b->tag)
7307#define ST_SCOPE static
7308#define ST_DEFINE
7309#include "lib/sort_template.h"
7310
7311/*
7312 * Issue all pending writeback requests, previously scheduled with
7313 * ScheduleBufferTagForWriteback, to the OS.
7314 *
7315 * Because this is only used to improve the OS's IO scheduling we try to never
7316 * error out - it's just a hint.
7317 */
7318void
7319IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context)
7320{
7321 instr_time io_start;
7322 int i;
7323
7324 if (wb_context->nr_pending == 0)
7325 return;
7326
7327 /*
7328 * Executing the writes in-order can make them a lot faster, and allows
7329 * merging writeback requests for consecutive blocks into larger writebacks.
7330 */
7331 sort_pending_writebacks(wb_context->pending_writebacks,
7332 wb_context->nr_pending);
7333
7334 io_start = pgstat_prepare_io_time(track_io_timing);
7335
7336 /*
7337 * Coalesce neighbouring writes, but nothing else. For that we iterate
7338 * through the now-sorted array of pending flushes, and look ahead to
7339 * find all neighbouring (or identical) writes.
7340 */
7341 for (i = 0; i < wb_context->nr_pending; i++)
7342 {
7343 PendingWriteback *cur;
7344 PendingWriteback *next;
7345 SMgrRelation reln;
7346 int ahead;
7347 BufferTag tag;
7348 RelFileLocator currlocator;
7349 Size nblocks = 1;
7350
7351 cur = &wb_context->pending_writebacks[i];
7352 tag = cur->tag;
7353 currlocator = BufTagGetRelFileLocator(&tag);
7354
7355 /*
7356 * Peek ahead, into following writeback requests, to see if they can
7357 * be combined with the current one.
7358 */
7359 for (ahead = 0; i + ahead + 1 < wb_context->nr_pending; ahead++)
7360 {
7361
7362 next = &wb_context->pending_writebacks[i + ahead + 1];
7363
7364 /* different file, stop */
7365 if (!RelFileLocatorEquals(currlocator,
7366 BufTagGetRelFileLocator(&next->tag)) ||
7367 BufTagGetForkNum(&cur->tag) != BufTagGetForkNum(&next->tag))
7368 break;
7369
7370 /* ok, block queued twice, skip */
7371 if (cur->tag.blockNum == next->tag.blockNum)
7372 continue;
7373
7374 /* only merge consecutive writes */
7375 if (cur->tag.blockNum + 1 != next->tag.blockNum)
7376 break;
7377
7378 nblocks++;
7379 cur = next;
7380 }
7381
7382 i += ahead;
7383
7384 /* and finally tell the kernel to write the data to storage */
7385 reln = smgropen(currlocator, INVALID_PROC_NUMBER);
7386 smgrwriteback(reln, BufTagGetForkNum(&tag), tag.blockNum, nblocks);
7387 }
7388
7389 /*
7390 * Assume that writeback requests are only issued for buffers containing
7391 * blocks of permanent relations.
7392 */
7393 pgstat_count_io_op_time(IOOBJECT_RELATION, io_context,
7394 IOOP_WRITEBACK, io_start, wb_context->nr_pending, 0);
7395
7396 wb_context->nr_pending = 0;
7397}
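/*
 * Illustrative sketch (editor's addition, not part of bufmgr.c): a writer
 * that has flushed a batch of buffers schedules their tags and finally
 * drains whatever is still pending. Exceeding *max_pending inside
 * ScheduleBufferTagForWriteback() already triggers an implicit
 * IssuePendingWritebacks(), so the explicit call below only covers the
 * remainder.
 */
static void
writeback_tags(WritebackContext *wb_context, BufferTag *tags, int ntags)
{
	for (int i = 0; i < ntags; i++)
		ScheduleBufferTagForWriteback(wb_context, IOCONTEXT_NORMAL, &tags[i]);

	/* hint the kernel about anything still queued */
	IssuePendingWritebacks(wb_context, IOCONTEXT_NORMAL);
}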
7398
7399/* ResourceOwner callbacks */
7400
7401static void
7402ResOwnerReleaseBufferIO(Datum res)
7403{
7404 Buffer buffer = DatumGetInt32(res);
7405
7406 AbortBufferIO(buffer);
7407}
7408
7409static char *
7410ResOwnerPrintBufferIO(Datum res)
7411{
7412 Buffer buffer = DatumGetInt32(res);
7413
7414 return psprintf("lost track of buffer IO on buffer %d", buffer);
7415}
7416
7417/*
7418 * Release buffer as part of resource owner cleanup. This will only be called
7419 * if the buffer is pinned. If this backend held the content lock at the time
7420 * of the error we also need to release that (note that it is not possible to
7421 * hold a content lock without a pin).
7422 */
7423static void
7424ResOwnerReleaseBuffer(Datum res)
7425{
7426 Buffer buffer = DatumGetInt32(res);
7427
7428 /* Like ReleaseBuffer, but don't call ResourceOwnerForgetBuffer */
7429 if (!BufferIsValid(buffer))
7430 elog(ERROR, "bad buffer ID: %d", buffer);
7431
7432 if (BufferIsLocal(buffer))
7434 else
7435 {
7437
7439
7440 /* not having a private refcount would imply resowner corruption */
7441 Assert(ref != NULL);
7442
7443 /*
7444 * If the buffer was locked at the time of the resowner release,
7445 * release the lock now. This should only happen after errors.
7446 */
7447 if (ref->data.lockmode != BUFFER_LOCK_UNLOCK)
7448 {
7450
7451 HOLD_INTERRUPTS(); /* match the upcoming RESUME_INTERRUPTS */
7453 }
7454
7456 }
7457}
7458
7459static char *
7460ResOwnerPrintBuffer(Datum res)
7461{
7462 return DebugPrintBufferRefcount(DatumGetInt32(res));
7463}
7464
7465/*
7466 * Helper function to evict unpinned buffer whose buffer header lock is
7467 * already acquired.
7468 */
7469static bool
7470EvictUnpinnedBufferInternal(BufferDesc *desc, bool *buffer_flushed)
7471{
7472 uint64 buf_state;
7473 bool result;
7474
7475 *buffer_flushed = false;
7476
7477 buf_state = pg_atomic_read_u64(&desc->state);
7478 Assert(buf_state & BM_LOCKED);
7479
7480 if ((buf_state & BM_VALID) == 0)
7481 {
7482 UnlockBufHdr(desc);
7483 return false;
7484 }
7485
7486 /* Check that it's not pinned already. */
7487 if (BUF_STATE_GET_REFCOUNT(buf_state) > 0)
7488 {
7489 UnlockBufHdr(desc);
7490 return false;
7491 }
7492
7493 PinBuffer_Locked(desc); /* releases spinlock */
7494
7495 /* If it was dirty, try to clean it once. */
7496 if (buf_state & BM_DIRTY)
7497 {
7498 FlushUnlockedBuffer(desc, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
7499 *buffer_flushed = true;
7500 }
7501
7502 /* This will return false if it becomes dirty or someone else pins it. */
7503 result = InvalidateVictimBuffer(desc);
7504
7505 UnpinBuffer(desc);
7506
7507 return result;
7508}
7509
7510/*
7511 * Try to evict the current block in a shared buffer.
7512 *
7513 * This function is intended for testing/development use only!
7514 *
7515 * To succeed, the buffer must not be pinned on entry, so if the caller had a
7516 * particular block in mind, it might already have been replaced by some other
7517 * block by the time this function runs. It's also unpinned on return, so the
7518 * buffer might be occupied again by the time control is returned, potentially
7519 * even by the same block. This inherent raciness without other interlocking
7520 * makes the function unsuitable for non-testing usage.
7521 *
7522 * *buffer_flushed is set to true if the buffer was dirty and has been
7523 * flushed, false otherwise. However, *buffer_flushed=true does not
7524 * necessarily mean that we flushed the buffer; it could have been flushed by
7525 * someone else.
7526 *
7527 * Returns true if the buffer was valid and it has now been made invalid.
7528 * Returns false if it wasn't valid, if it couldn't be evicted due to a pin,
7529 * or if the buffer becomes dirty again while we're trying to write it out.
7530 */
7531bool
7532EvictUnpinnedBuffer(Buffer buf, bool *buffer_flushed)
7533{
7534 BufferDesc *desc;
7535
7536 Assert(BufferIsValid(buf) && !BufferIsLocal(buf));
7537
7538 /* Make sure we can pin the buffer. */
7539 ResourceOwnerEnlarge(CurrentResourceOwner);
7540 ReservePrivateRefCountEntry();
7541
7542 desc = GetBufferDescriptor(buf - 1);
7543 LockBufHdr(desc);
7544
7545 return EvictUnpinnedBufferInternal(desc, buffer_flushed);
7546}
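/*
 * Illustrative sketch (editor's addition, not part of bufmgr.c): how a
 * testing/development caller (pg_buffercache's eviction functions are the
 * in-tree users of these helpers) might drive EvictUnpinnedBuffer(). The
 * result only describes whatever block happened to occupy the buffer at that
 * instant; given the documented raciness it must not be relied on outside of
 * testing.
 */
static void
try_evict_buffer(Buffer buf, bool *evicted, bool *flushed)
{
	if (BufferIsLocal(buf) || buf < 1 || buf > NBuffers)
		elog(ERROR, "bad shared buffer ID: %d", buf);

	*evicted = EvictUnpinnedBuffer(buf, flushed);
}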
7547
7548/*
7549 * Try to evict all the shared buffers.
7550 *
7551 * This function is intended for testing/development use only! See
7552 * EvictUnpinnedBuffer().
7553 *
7554 * The buffers_* parameters are mandatory and indicate the total count of
7555 * buffers that:
7556 * - buffers_evicted - were evicted
7557 * - buffers_flushed - were flushed
7558 * - buffers_skipped - could not be evicted
7559 */
7560void
7561EvictAllUnpinnedBuffers(int32 *buffers_evicted, int32 *buffers_flushed,
7562 int32 *buffers_skipped)
7563{
7564 *buffers_evicted = 0;
7565 *buffers_skipped = 0;
7566 *buffers_flushed = 0;
7567
7568 for (int buf = 1; buf <= NBuffers; buf++)
7569 {
7570 BufferDesc *desc = GetBufferDescriptor(buf - 1);
7572 bool buffer_flushed;
7573
7575
7577 if (!(buf_state & BM_VALID))
7578 continue;
7579
7580 ResourceOwnerEnlarge(CurrentResourceOwner);
7581 ReservePrivateRefCountEntry();
7582
7583 LockBufHdr(desc);
7584
7585 if (EvictUnpinnedBufferInternal(desc, &buffer_flushed))
7586 (*buffers_evicted)++;
7587 else
7588 (*buffers_skipped)++;
7589
7590 if (buffer_flushed)
7591 (*buffers_flushed)++;
7592 }
7593}
7594
7595/*
7596 * Try to evict all the shared buffers containing provided relation's pages.
7597 *
7598 * This function is intended for testing/development use only! See
7599 * EvictUnpinnedBuffer().
7600 *
7601 * The caller must hold at least AccessShareLock on the relation to prevent
7602 * the relation from being dropped.
7603 *
7604 * The buffers_* parameters are mandatory and indicate the total count of
7605 * buffers that:
7606 * - buffers_evicted - were evicted
7607 * - buffers_flushed - were flushed
7608 * - buffers_skipped - could not be evicted
7609 */
7610void
7611EvictRelUnpinnedBuffers(Relation rel, int32 *buffers_evicted,
7612 int32 *buffers_flushed, int32 *buffers_skipped)
7613{
7614 Assert(!RelationUsesLocalBuffers(rel));
7615
7616 *buffers_skipped = 0;
7617 *buffers_evicted = 0;
7618 *buffers_flushed = 0;
7619
7620 for (int buf = 1; buf <= NBuffers; buf++)
7621 {
7622 BufferDesc *desc = GetBufferDescriptor(buf - 1);
7624 bool buffer_flushed;
7625
7627
7628 /* An unlocked precheck should be safe and saves some cycles. */
7629 if ((buf_state & BM_VALID) == 0 ||
7630 !BufTagMatchesRelFileLocator(&desc->tag, &rel->rd_locator))
7631 continue;
7632
7633 /* Make sure we can pin the buffer. */
7634 ResourceOwnerEnlarge(CurrentResourceOwner);
7635 ReservePrivateRefCountEntry();
7636
7637 buf_state = LockBufHdr(desc);
7638
7639 /* recheck, could have changed without the lock */
7640 if ((buf_state & BM_VALID) == 0 ||
7641 !BufTagMatchesRelFileLocator(&desc->tag, &rel->rd_locator))
7642 {
7643 UnlockBufHdr(desc);
7644 continue;
7645 }
7646
7647 if (EvictUnpinnedBufferInternal(desc, &buffer_flushed))
7648 (*buffers_evicted)++;
7649 else
7650 (*buffers_skipped)++;
7651
7652 if (buffer_flushed)
7653 (*buffers_flushed)++;
7654 }
7655}
7656
7657/*
7658 * Helper function to mark unpinned buffer dirty whose buffer header lock is
7659 * already acquired.
7660 */
7661static bool
7662MarkDirtyUnpinnedBufferInternal(Buffer buf, BufferDesc *desc,
7663 bool *buffer_already_dirty)
7664{
7666 bool result = false;
7667
7668 *buffer_already_dirty = false;
7669
7672
7673 if ((buf_state & BM_VALID) == 0)
7674 {
7675 UnlockBufHdr(desc);
7676 return false;
7677 }
7678
7679 /* Check that it's not pinned already. */
7680 if (BUF_STATE_GET_REFCOUNT(buf_state) > 0)
7681 {
7682 UnlockBufHdr(desc);
7683 return false;
7684 }
7685
7686 /* Pin the buffer and then release the buffer spinlock */
7687 PinBuffer_Locked(desc);
7688
7689 /* If it was not already dirty, mark it as dirty. */
7690 if (!(buf_state & BM_DIRTY))
7691 {
7694 result = true;
7695 BufferLockUnlock(buf, desc);
7696 }
7697 else
7698 *buffer_already_dirty = true;
7699
7700 UnpinBuffer(desc);
7701
7702 return result;
7703}
7704
7705/*
7706 * Try to mark the provided shared buffer as dirty.
7707 *
7708 * This function is intended for testing/development use only!
7709 *
7710 * Same as EvictUnpinnedBuffer() but with MarkBufferDirty() call inside.
7711 *
7712 * The buffer_already_dirty parameter is mandatory and indicates whether the
7713 * buffer could not be dirtied because it is already dirty.
7714 *
7715 * Returns true if the buffer has successfully been marked as dirty.
7716 */
7717bool
7718MarkDirtyUnpinnedBuffer(Buffer buf, bool *buffer_already_dirty)
7719{
7720 BufferDesc *desc;
7721 bool buffer_dirtied = false;
7722
7723 Assert(BufferIsValid(buf) && !BufferIsLocal(buf));
7724
7725 /* Make sure we can pin the buffer. */
7726 ResourceOwnerEnlarge(CurrentResourceOwner);
7727 ReservePrivateRefCountEntry();
7728
7729 desc = GetBufferDescriptor(buf - 1);
7730 LockBufHdr(desc);
7731
7732 buffer_dirtied = MarkDirtyUnpinnedBufferInternal(buf, desc, buffer_already_dirty);
7733 /* Both cannot be true at the same time */
7735
7736 return buffer_dirtied;
7737}
7738
7739/*
7740 * Try to mark all the shared buffers containing provided relation's pages as
7741 * dirty.
7742 *
7743 * This function is intended for testing/development use only! See
7744 * MarkDirtyUnpinnedBuffer().
7745 *
7746 * The buffers_* parameters are mandatory and indicate the total count of
7747 * buffers that:
7748 * - buffers_dirtied - were dirtied
7749 * - buffers_already_dirty - were already dirty
7750 * - buffers_skipped - could not be dirtied for a reason other than the
7751 * buffer already being dirty.
7752 */
7753void
7754MarkDirtyRelUnpinnedBuffers(Relation rel,
7755 int32 *buffers_dirtied,
7756 int32 *buffers_already_dirty,
7757 int32 *buffers_skipped)
7758{
7760
7761 *buffers_dirtied = 0;
7762 *buffers_already_dirty = 0;
7763 *buffers_skipped = 0;
7764
7765 for (int buf = 1; buf <= NBuffers; buf++)
7766 {
7767 BufferDesc *desc = GetBufferDescriptor(buf - 1);
7770
7772
7773 /* An unlocked precheck should be safe and saves some cycles. */
7774 if ((buf_state & BM_VALID) == 0 ||
7776 continue;
7777
7778 /* Make sure we can pin the buffer. */
7781
7782 buf_state = LockBufHdr(desc);
7783
7784 /* recheck, could have changed without the lock */
7785 if ((buf_state & BM_VALID) == 0 ||
7787 {
7788 UnlockBufHdr(desc);
7789 continue;
7790 }
7791
7792 if (MarkDirtyUnpinnedBufferInternal(buf, desc, &buffer_already_dirty))
7793 (*buffers_dirtied)++;
7794 else if (buffer_already_dirty)
7795 (*buffers_already_dirty)++;
7796 else
7797 (*buffers_skipped)++;
7798 }
7799}
7800
7801/*
7802 * Try to mark all the shared buffers as dirty.
7803 *
7804 * This function is intended for testing/development use only! See
7805 * MarkDirtyUnpinnedBuffer().
7806 *
7807 * See MarkDirtyRelUnpinnedBuffers() above for details about the buffers_*
7808 * parameters.
7809 */
7810void
7811MarkDirtyAllUnpinnedBuffers(int32 *buffers_dirtied,
7812 int32 *buffers_already_dirty,
7813 int32 *buffers_skipped)
7814{
7815 *buffers_dirtied = 0;
7816 *buffers_already_dirty = 0;
7817 *buffers_skipped = 0;
7818
7819 for (int buf = 1; buf <= NBuffers; buf++)
7820 {
7821 BufferDesc *desc = GetBufferDescriptor(buf - 1);
7824
7826
7828 if (!(buf_state & BM_VALID))
7829 continue;
7830
7833
7834 LockBufHdr(desc);
7835
7836 if (MarkDirtyUnpinnedBufferInternal(buf, desc, &buffer_already_dirty))
7837 (*buffers_dirtied)++;
7838 else if (buffer_already_dirty)
7839 (*buffers_already_dirty)++;
7840 else
7841 (*buffers_skipped)++;
7842 }
7843}
7844
7845/*
7846 * Generic implementation of the AIO handle staging callback for readv/writev
7847 * on local/shared buffers.
7848 *
7849 * Each readv/writev can target multiple buffers. The buffers have already
7850 * been registered with the IO handle.
7851 *
7852 * To make the IO ready for execution ("staging"), we need to ensure that the
7853 * targeted buffers are in an appropriate state while the IO is ongoing. For
7854 * that the AIO subsystem needs to have its own buffer pin, otherwise an error
7855 * in this backend could lead to this backend's buffer pin being released as
7856 * part of error handling, which in turn could lead to the buffer being
7857 * replaced while IO is ongoing.
7858 */
7859static pg_attribute_always_inline void
7860buffer_stage_common(PgAioHandle *ioh, bool is_write, bool is_temp)
7861{
7862 uint64 *io_data;
7863 uint8 handle_data_len;
7866
7867 io_data = pgaio_io_get_handle_data(ioh, &handle_data_len);
7868
7870
7871 /* iterate over all buffers affected by the vectored readv/writev */
7872 for (int i = 0; i < handle_data_len; i++)
7873 {
7875 BufferDesc *buf_hdr = is_temp ?
7879
7880 /*
7881 * Check that all the buffers are actually ones that could conceivably
7882 * be done in one IO, i.e. are sequential. This is the last
7883 * buffer-aware code before IO is actually executed and confusion
7884 * about which buffers are targeted by IO can be hard to debug, making
7885 * it worth doing extra-paranoid checks.
7886 */
7887 if (i == 0)
7888 first = buf_hdr->tag;
7889 else
7890 {
7891 Assert(buf_hdr->tag.relNumber == first.relNumber);
7892 Assert(buf_hdr->tag.blockNum == first.blockNum + i);
7893 }
7894
7895 if (is_temp)
7897 else
7899
7900 /* verify the buffer is in the expected state */
7902 if (is_write)
7903 {
7906 }
7907 else
7908 {
7911 }
7912
7913 /* temp buffers don't use BM_IO_IN_PROGRESS */
7914 if (!is_temp)
7916
7918
7919 /*
7920 * Reflect that the buffer is now owned by the AIO subsystem.
7921 *
7922 * For local buffers: This can't be done just via LocalRefCount, as
7923 * one might initially think, as this backend could error out while
7924 * AIO is still in progress, releasing all the pins by the backend
7925 * itself.
7926 *
7927 * This pin is released again in TerminateBufferIO().
7928 */
7929 buf_hdr->io_wref = io_ref;
7930
7931 if (is_temp)
7932 {
7935 }
7936 else
7938
7939 /*
7940 * Ensure the content lock that prevents buffer modifications while
7941 * the buffer is being written out is not released early due to an
7942 * error.
7943 */
7944 if (is_write && !is_temp)
7945 {
7947
7948 /*
7949 * Lock is now owned by AIO subsystem.
7950 */
7952 }
7953
7954 /*
7955 * Stop tracking this buffer via the resowner - the AIO system now
7956 * keeps track.
7957 */
7958 if (!is_temp)
7960 }
7961}
7962
7963/*
7964 * Decode readv errors as encoded by buffer_readv_encode_error().
7965 */
7966static inline void
7967buffer_readv_decode_error(PgAioResult result,
7968 bool *zeroed_any,
7969 bool *ignored_any,
7970 uint8 *zeroed_or_error_count,
7971 uint8 *checkfail_count,
7972 uint8 *first_off)
7973{
7974 uint32 rem_error = result.error_data;
7975
7976 /* see static asserts in buffer_readv_encode_error */
7977#define READV_COUNT_BITS 7
7978#define READV_COUNT_MASK ((1 << READV_COUNT_BITS) - 1)
7979
7980 *zeroed_any = rem_error & 1;
7981 rem_error >>= 1;
7982
7983 *ignored_any = rem_error & 1;
7984 rem_error >>= 1;
7985
7988
7991
7994}
7995
7996/*
7997 * Helper to encode errors for buffer_readv_complete()
7998 *
7999 * Errors are encoded as follows:
8000 * - bit 0 indicates whether any page was zeroed (1) or not (0)
8001 * - bit 1 indicates whether any checksum failure was ignored (1) or not (0)
8002 * - next READV_COUNT_BITS bits indicate the number of errored or zeroed pages
8003 * - next READV_COUNT_BITS bits indicate the number of checksum failures
8004 * - next READV_COUNT_BITS bits indicate the first offset of the first page
8005 * that was errored or zeroed or, if no errors/zeroes, the first ignored
8006 * checksum
8007 */
8008static inline void
8009buffer_readv_encode_error(PgAioResult *result,
8010 bool is_temp,
8011 bool zeroed_any,
8012 bool ignored_any,
8013 uint8 error_count,
8014 uint8 zeroed_count,
8015 uint8 checkfail_count,
8016 uint8 first_error_off,
8017 uint8 first_zeroed_off,
8018 uint8 first_ignored_off)
8019{
8020
8021 uint8 shift = 0;
8025
8027 "PG_IOV_MAX is bigger than reserved space for error data");
8029 "PGAIO_RESULT_ERROR_BITS is insufficient for buffer_readv");
8030
8031 /*
8032 * We only have space to encode one offset - but luckily that's good
8033 * enough. If there is an error, the error is the interesting offset, same
8034 * with a zeroed buffer vs an ignored buffer.
8035 */
8036 if (error_count > 0)
8038 else if (zeroed_count > 0)
8040 else
8042
8043 Assert(!zeroed_any || error_count == 0);
8044
8045 result->error_data = 0;
8046
8047 result->error_data |= zeroed_any << shift;
8048 shift += 1;
8049
8050 result->error_data |= ignored_any << shift;
8051 shift += 1;
8052
8053 result->error_data |= ((uint32) zeroed_or_error_count) << shift;
8054 shift += READV_COUNT_BITS;
8055
8056 result->error_data |= ((uint32) checkfail_count) << shift;
8057 shift += READV_COUNT_BITS;
8058
8059 result->error_data |= ((uint32) first_off) << shift;
8060 shift += READV_COUNT_BITS;
8061
8062 result->id = is_temp ? PGAIO_HCB_LOCAL_BUFFER_READV :
8064
8065 if (error_count > 0)
8066 result->status = PGAIO_RS_ERROR;
8067 else
8068 result->status = PGAIO_RS_WARNING;
8069
8070 /*
8071 * The encoding is complicated enough to warrant cross-checking it against
8072 * the decode function.
8073 */
8074#ifdef USE_ASSERT_CHECKING
8075 {
8076 bool zeroed_any_2,
8081
8086 &first_off_2);
8092 }
8093#endif
8094
8095#undef READV_COUNT_BITS
8096#undef READV_COUNT_MASK
8097}
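/*
 * Worked example (editor's addition, not part of bufmgr.c): for a readv in
 * which the pages at offsets 5 and 6 were zeroed, one of them because of a
 * checksum failure (zeroed_any = 1, ignored_any = 0, zeroed_or_error_count =
 * 2, checkfail_count = 1, first_off = 5), the layout above with
 * READV_COUNT_BITS = 7 packs error_data as
 *
 *     1          (bit 0:       some page was zeroed)
 *   | 0 << 1     (bit 1:       no checksum failure was merely ignored)
 *   | 2 << 2     (bits 2..8:   number of zeroed/errored pages)
 *   | 1 << 9     (bits 9..15:  number of checksum failures)
 *   | 5 << 16    (bits 16..22: offset of the first zeroed page)
 *   = 0x50209
 *
 * and buffer_readv_decode_error() simply reverses these shifts and masks.
 */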
8098
8099/*
8100 * Helper for AIO readv completion callbacks, supporting both shared and temp
8101 * buffers. Gets called once for each buffer in a multi-page read.
8102 */
8103static pg_attribute_always_inline void
8104buffer_readv_complete_one(PgAioTargetData *td, uint8 buf_off, Buffer buffer,
8105 uint8 flags, bool failed, bool is_temp,
8106 bool *buffer_invalid,
8107 bool *failed_checksum,
8108 bool *ignored_checksum,
8109 bool *zeroed_buffer)
8110{
8111 BufferDesc *buf_hdr = is_temp ?
8114 BufferTag tag = buf_hdr->tag;
8115 char *bufdata = BufferGetBlock(buffer);
8117 int piv_flags;
8118
8119 /* check that the buffer is in the expected state for a read */
8120#ifdef USE_ASSERT_CHECKING
8121 {
8123
8126 /* temp buffers don't use BM_IO_IN_PROGRESS */
8127 if (!is_temp)
8130 }
8131#endif
8132
8133 *buffer_invalid = false;
8134 *failed_checksum = false;
8135 *ignored_checksum = false;
8136 *zeroed_buffer = false;
8137
8138 /*
8139 * We ask PageIsVerified() to only log the message about checksum errors,
8140 * as the completion might be run in any backend (or IO workers). We will
8141 * report checksum errors in buffer_readv_report().
8142 */
8144
8145 /* the local zero_damaged_pages may differ from the definer's */
8148
8149 /* Check for garbage data. */
8150 if (!failed)
8151 {
8152 /*
8153 * If the buffer is not currently pinned by this backend, e.g. because
8154 * we're completing this IO after an error, the buffer data will have
8155 * been marked as inaccessible when the buffer was unpinned. The AIO
8156 * subsystem holds a pin, but that doesn't prevent the buffer from
8157 * having been marked as inaccessible. The completion might also be
8158 * executed in a different process.
8159 */
8160#ifdef USE_VALGRIND
8161 if (!BufferIsPinned(buffer))
8163#endif
8164
8165 if (!PageIsVerified((Page) bufdata, tag.blockNum, piv_flags,
8167 {
8168 if (flags & READ_BUFFERS_ZERO_ON_ERROR)
8169 {
8170 memset(bufdata, 0, BLCKSZ);
8171 *zeroed_buffer = true;
8172 }
8173 else
8174 {
8175 *buffer_invalid = true;
8176 /* mark buffer as having failed */
8177 failed = true;
8178 }
8179 }
8180 else if (*failed_checksum)
8181 *ignored_checksum = true;
8182
8183 /* undo what we did above */
8184#ifdef USE_VALGRIND
8185 if (!BufferIsPinned(buffer))
8187#endif
8188
8189 /*
8190 * Immediately log a message about the invalid page, but only to the
8191 * server log. The reason to do so immediately is that this may be
8192 * executed in a different backend than the one that originated the
8193 * request. The reason to do so at all is that the originator
8194 * might not process the query result immediately (because it is busy
8195 * doing another part of query processing) or at all (e.g. if it was
8196 * cancelled or errored out due to another IO also failing). The
8197 * definer of the IO will emit an ERROR or WARNING when processing the
8198 * IO's results.
8199 *
8200 * To avoid duplicating the code to emit these log messages, we reuse
8201 * buffer_readv_report().
8202 */
8204 {
8205 PgAioResult result_one = {0};
8206
8211 *zeroed_buffer ? 1 : 0,
8212 *failed_checksum ? 1 : 0,
8215 }
8216 }
8217
8218 /* Terminate I/O and set BM_VALID. */
8219 set_flag_bits = failed ? BM_IO_ERROR : BM_VALID;
8220 if (is_temp)
8222 else
8223 TerminateBufferIO(buf_hdr, false, set_flag_bits, false, true);
8224
8225 /*
8226 * Call the BUFFER_READ_DONE tracepoint in the callback, even though the
8227 * callback may not be executed in the same backend that called
8228 * BUFFER_READ_START. The alternative would be to defer calling the
8229 * tracepoint to a later point (e.g. the local completion callback for
8230 * shared buffer reads), which seems even less helpful.
8231 */
8233 tag.blockNum,
8234 tag.spcOid,
8235 tag.dbOid,
8236 tag.relNumber,
8238 false);
8239}
8240
8241/*
8242 * Perform completion handling of a single AIO read. This read may cover
8243 * multiple blocks / buffers.
8244 *
8245 * Shared between shared and local buffers, to reduce code duplication.
8246 */
8247static pg_attribute_always_inline PgAioResult
8248buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result,
8249 uint8 cb_data, bool is_temp)
8250{
8251 PgAioResult result = prior_result;
8256 uint8 error_count = 0;
8257 uint8 zeroed_count = 0;
8258 uint8 ignored_count = 0;
8260 uint64 *io_data;
8261 uint8 handle_data_len;
8262
8263 if (is_temp)
8264 {
8265 Assert(td->smgr.is_temp);
8267 }
8268 else
8269 Assert(!td->smgr.is_temp);
8270
8271 /*
8272 * Iterate over all the buffers affected by this IO and call the
8273 * per-buffer completion function for each buffer.
8274 */
8275 io_data = pgaio_io_get_handle_data(ioh, &handle_data_len);
8276 for (uint8 buf_off = 0; buf_off < handle_data_len; buf_off++)
8277 {
8279 bool failed;
8280 bool failed_verification = false;
8281 bool failed_checksum = false;
8282 bool zeroed_buffer = false;
8283 bool ignored_checksum = false;
8284
8286
8287 /*
8288 * If the entire I/O failed on a lower-level, each buffer needs to be
8289 * marked as failed. In case of a partial read, the first few buffers
8290 * may be ok.
8291 */
8292 failed =
8294 || prior_result.result <= buf_off;
8295
8296 buffer_readv_complete_one(td, buf_off, buf, cb_data, failed, is_temp,
8300 &zeroed_buffer);
8301
8302 /*
8303 * Track information about the number of different kinds of error
8304 * conditions across all pages, as there can be multiple pages failing
8305 * verification as part of one IO.
8306 */
8309 if (zeroed_buffer && zeroed_count++ == 0)
8311 if (ignored_checksum && ignored_count++ == 0)
8313 if (failed_checksum)
8315 }
8316
8317 /*
8318 * If the smgr read succeeded [partially] and page verification failed for
8319 * some of the pages, adjust the IO's result state appropriately.
8320 */
8321 if (prior_result.status != PGAIO_RS_ERROR &&
8322 (error_count > 0 || ignored_count > 0 || zeroed_count > 0))
8323 {
8324 buffer_readv_encode_error(&result, is_temp,
8325 zeroed_count > 0, ignored_count > 0,
8329 pgaio_result_report(result, td, DEBUG1);
8330 }
8331
8332 /*
8333 * For shared relations this reporting is done in
8334 * shared_buffer_readv_complete_local().
8335 */
8336 if (is_temp && checkfail_count > 0)
8339
8340 return result;
8341}
8342
8343/*
8344 * AIO error reporting callback for aio_shared_buffer_readv_cb and
8345 * aio_local_buffer_readv_cb.
8346 *
8347 * The error is encoded / decoded in buffer_readv_encode_error() /
8348 * buffer_readv_decode_error().
8349 */
8350static void
8351buffer_readv_report(PgAioResult result, const PgAioTargetData *td,
8352 int elevel)
8353{
8354 int nblocks = td->smgr.nblocks;
8355 BlockNumber first = td->smgr.blockNum;
8356 BlockNumber last = first + nblocks - 1;
8359 RelPathStr rpath =
8361 bool zeroed_any,
8365 first_off;
8367 const char *msg_one,
8368 *msg_mult,
8369 *det_mult,
8370 *hint_mult;
8371
8375 &first_off);
8376
8377 /*
8378 * Treat a read that had both zeroed buffers *and* ignored checksums as a
8379 * special case, it's too irregular to be emitted the same way as the
8380 * other cases.
8381 */
8382 if (zeroed_any && ignored_any)
8383 {
8385 Assert(nblocks > 1); /* same block can't be both zeroed and ignored */
8386 Assert(result.status != PGAIO_RS_ERROR);
8388
8389 ereport(elevel,
8391 errmsg("zeroing %u page(s) and ignoring %u checksum failure(s) among blocks %u..%u of relation \"%s\"",
8392 affected_count, checkfail_count, first, last, rpath.str),
8393 affected_count > 1 ?
8394 errdetail("Block %u held the first zeroed page.",
8395 first + first_off) : 0,
8396 errhint_plural("See server log for details about the other %d invalid block.",
8397 "See server log for details about the other %d invalid blocks.",
8400 return;
8401 }
8402
8403 /*
8404 * The other messages are highly repetitive. To avoid duplicating a long
8405 * and complicated ereport(), gather the translated format strings
8406 * separately and then do one common ereport.
8407 */
8408 if (result.status == PGAIO_RS_ERROR)
8409 {
8410 Assert(!zeroed_any); /* can't have invalid pages when zeroing them */
8412 msg_one = _("invalid page in block %u of relation \"%s\"");
8413 msg_mult = _("%u invalid pages among blocks %u..%u of relation \"%s\"");
8414 det_mult = _("Block %u held the first invalid page.");
8415 hint_mult = _("See server log for the other %u invalid block(s).");
8416 }
8417 else if (zeroed_any && !ignored_any)
8418 {
8420 msg_one = _("invalid page in block %u of relation \"%s\"; zeroing out page");
8421 msg_mult = _("zeroing out %u invalid pages among blocks %u..%u of relation \"%s\"");
8422 det_mult = _("Block %u held the first zeroed page.");
8423 hint_mult = _("See server log for the other %u zeroed block(s).");
8424 }
8425 else if (!zeroed_any && ignored_any)
8426 {
8428 msg_one = _("ignoring checksum failure in block %u of relation \"%s\"");
8429 msg_mult = _("ignoring %u checksum failures among blocks %u..%u of relation \"%s\"");
8430 det_mult = _("Block %u held the first ignored page.");
8431 hint_mult = _("See server log for the other %u ignored block(s).");
8432 }
8433 else
8435
8436 ereport(elevel,
8438 affected_count == 1 ?
8439 errmsg_internal(msg_one, first + first_off, rpath.str) :
8440 errmsg_internal(msg_mult, affected_count, first, last, rpath.str),
8443}
8444
8445static void
8450
8451static PgAioResult
8457
8458/*
8459 * We need a backend-local completion callback for shared buffers, to be able
8460 * to report checksum errors correctly. Unfortunately that can only safely
8461 * happen if the reporting backend has previously called
8462 * pgstat_prepare_report_checksum_failure(), which we can only guarantee in
8463 * the backend that started the IO. Hence this callback.
8464 */
8465static PgAioResult
8495
8496static void
8501
8502static PgAioResult
8508
8509/* readv callback is passed READ_BUFFERS_* flags as callback data */
8510const PgAioHandleCallbacks aio_shared_buffer_readv_cb = {
8511 .stage = shared_buffer_readv_stage,
8512 .complete_shared = shared_buffer_readv_complete,
8513 /* need a local callback to report checksum failures */
8514 .complete_local = shared_buffer_readv_complete_local,
8515 .report = buffer_readv_report,
8516};
8517
8518/* readv callback is passed READ_BUFFERS_* flags as callback data */
8519const PgAioHandleCallbacks aio_local_buffer_readv_cb = {
8520 .stage = local_buffer_readv_stage,
8521
8522 /*
8523 * Note that this, in contrast to the shared_buffers case, uses
8524 * complete_local, as only the issuing backend has access to the required
8525 * datastructures. This is important in case the IO completion may be
8526 * consumed incidentally by another backend.
8527 */
8528 .complete_local = local_buffer_readv_complete,
8529 .report = buffer_readv_report,
8530};
int io_method
Definition aio.c:74
bool pgaio_wref_valid(PgAioWaitRef *iow)
Definition aio.c:971
PgAioHandle * pgaio_io_acquire(struct ResourceOwnerData *resowner, PgAioReturn *ret)
Definition aio.c:162
void pgaio_wref_clear(PgAioWaitRef *iow)
Definition aio.c:964
void pgaio_io_get_wref(PgAioHandle *ioh, PgAioWaitRef *iow)
Definition aio.c:366
void pgaio_io_set_flag(PgAioHandle *ioh, PgAioHandleFlags flag)
Definition aio.c:330
bool pgaio_have_staged(void)
Definition aio.c:1107
bool pgaio_wref_check_done(PgAioWaitRef *iow)
Definition aio.c:1005
ProcNumber pgaio_io_get_owner(PgAioHandle *ioh)
Definition aio.c:355
void pgaio_submit_staged(void)
Definition aio.c:1123
void pgaio_wref_wait(PgAioWaitRef *iow)
Definition aio.c:991
void pgaio_io_release(PgAioHandle *ioh)
Definition aio.c:240
PgAioHandle * pgaio_io_acquire_nb(struct ResourceOwnerData *resowner, PgAioReturn *ret)
Definition aio.c:188
@ PGAIO_HCB_LOCAL_BUFFER_READV
Definition aio.h:200
@ PGAIO_HCB_SHARED_BUFFER_READV
Definition aio.h:198
@ IOMETHOD_SYNC
Definition aio.h:34
@ PGAIO_HF_SYNCHRONOUS
Definition aio.h:70
@ PGAIO_HF_REFERENCES_LOCAL
Definition aio.h:60
void pgaio_io_set_handle_data_32(PgAioHandle *ioh, uint32 *data, uint8 len)
void pgaio_io_register_callbacks(PgAioHandle *ioh, PgAioHandleCallbackID cb_id, uint8 cb_data)
uint64 * pgaio_io_get_handle_data(PgAioHandle *ioh, uint8 *len)
void pgaio_result_report(PgAioResult result, const PgAioTargetData *target_data, int elevel)
PgAioTargetData * pgaio_io_get_target_data(PgAioHandle *ioh)
Definition aio_target.c:73
#define PGAIO_RESULT_ERROR_BITS
Definition aio_types.h:98
PgAioResultStatus
Definition aio_types.h:79
@ PGAIO_RS_OK
Definition aio_types.h:81
@ PGAIO_RS_UNKNOWN
Definition aio_types.h:80
@ PGAIO_RS_PARTIAL
Definition aio_types.h:82
@ PGAIO_RS_ERROR
Definition aio_types.h:84
@ PGAIO_RS_WARNING
Definition aio_types.h:83
static bool pg_atomic_compare_exchange_u64(volatile pg_atomic_uint64 *ptr, uint64 *expected, uint64 newval)
Definition atomics.h:522
#define pg_write_barrier()
Definition atomics.h:155
static void pg_atomic_unlocked_write_u64(volatile pg_atomic_uint64 *ptr, uint64 val)
Definition atomics.h:494
static uint64 pg_atomic_sub_fetch_u64(volatile pg_atomic_uint64 *ptr, int64 sub_)
Definition atomics.h:578
static uint64 pg_atomic_fetch_and_u64(volatile pg_atomic_uint64 *ptr, uint64 and_)
Definition atomics.h:551
static uint64 pg_atomic_fetch_or_u64(volatile pg_atomic_uint64 *ptr, uint64 or_)
Definition atomics.h:560
static uint64 pg_atomic_read_u64(volatile pg_atomic_uint64 *ptr)
Definition atomics.h:467
static uint64 pg_atomic_fetch_sub_u64(volatile pg_atomic_uint64 *ptr, int64 sub_)
Definition atomics.h:541
bool TimestampDifferenceExceeds(TimestampTz start_time, TimestampTz stop_time, int msec)
Definition timestamp.c:1781
TimestampTz GetCurrentTimestamp(void)
Definition timestamp.c:1645
Datum now(PG_FUNCTION_ARGS)
Definition timestamp.c:1609
int BgWriterDelay
Definition bgwriter.c:58
void binaryheap_build(binaryheap *heap)
Definition binaryheap.c:136
void binaryheap_replace_first(binaryheap *heap, bh_node_type d)
Definition binaryheap.c:253
bh_node_type binaryheap_first(binaryheap *heap)
Definition binaryheap.c:175
bh_node_type binaryheap_remove_first(binaryheap *heap)
Definition binaryheap.c:190
void binaryheap_free(binaryheap *heap)
Definition binaryheap.c:73
void binaryheap_add_unordered(binaryheap *heap, bh_node_type d)
Definition binaryheap.c:114
binaryheap * binaryheap_allocate(int capacity, binaryheap_comparator compare, void *arg)
Definition binaryheap.c:37
#define binaryheap_empty(h)
Definition binaryheap.h:65
uint32 BlockNumber
Definition block.h:31
#define InvalidBlockNumber
Definition block.h:33
static bool BlockNumberIsValid(BlockNumber blockNumber)
Definition block.h:71
#define MaxBlockNumber
Definition block.h:35
static int32 next
Definition blutils.c:225
int Buffer
Definition buf.h:23
#define InvalidBuffer
Definition buf.h:25
#define BufferIsLocal(buffer)
Definition buf.h:37
CkptSortItem * CkptBufferIds
Definition buf_init.c:26
WritebackContext BackendWritebackContext
Definition buf_init.c:25
#define BM_MAX_USAGE_COUNT
static void InitBufferTag(BufferTag *tag, const RelFileLocator *rlocator, ForkNumber forkNum, BlockNumber blockNum)
#define BM_TAG_VALID
#define BM_PERMANENT
#define BUF_USAGECOUNT_MASK
static ForkNumber BufTagGetForkNum(const BufferTag *tag)
#define BM_LOCK_VAL_SHARED
#define BUF_REFCOUNT_ONE
static ConditionVariable * BufferDescriptorGetIOCV(const BufferDesc *bdesc)
static uint64 UnlockBufHdrExt(BufferDesc *desc, uint64 old_buf_state, uint64 set_bits, uint64 unset_bits, int refcount_change)
static bool BufferTagsEqual(const BufferTag *tag1, const BufferTag *tag2)
static RelFileNumber BufTagGetRelNumber(const BufferTag *tag)
static void UnlockBufHdr(BufferDesc *desc)
#define BM_LOCK_VAL_EXCLUSIVE
static bool BufTagMatchesRelFileLocator(const BufferTag *tag, const RelFileLocator *rlocator)
#define BUF_FLAG_MASK
#define BM_PIN_COUNT_WAITER
#define BM_DIRTY
#define BM_LOCK_WAKE_IN_PROGRESS
static void ResourceOwnerRememberBufferIO(ResourceOwner owner, Buffer buffer)
#define BM_LOCKED
#define BM_JUST_DIRTIED
#define BUF_STATE_GET_USAGECOUNT(state)
#define BM_LOCK_MASK
static void ResourceOwnerForgetBufferIO(ResourceOwner owner, Buffer buffer)
#define BM_IO_IN_PROGRESS
static void ClearBufferTag(BufferTag *tag)
static void ResourceOwnerRememberBuffer(ResourceOwner owner, Buffer buffer)
static void ResourceOwnerForgetBuffer(ResourceOwner owner, Buffer buffer)
#define BUF_USAGECOUNT_ONE
#define BUF_STATE_GET_REFCOUNT(state)
static LWLock * BufMappingPartitionLock(uint32 hashcode)
static RelFileLocator BufTagGetRelFileLocator(const BufferTag *tag)
#define BM_VALID
#define BM_LOCK_HAS_WAITERS
#define BM_IO_ERROR
static BufferDesc * GetLocalBufferDescriptor(uint32 id)
static BufferDesc * GetBufferDescriptor(uint32 id)
#define BM_LOCK_VAL_SHARE_EXCLUSIVE
static Buffer BufferDescriptorGetBuffer(const BufferDesc *bdesc)
#define BM_CHECKPOINT_NEEDED
void BufTableDelete(BufferTag *tagPtr, uint32 hashcode)
Definition buf_table.c:148
int BufTableLookup(BufferTag *tagPtr, uint32 hashcode)
Definition buf_table.c:90
uint32 BufTableHashCode(BufferTag *tagPtr)
Definition buf_table.c:78
int BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id)
Definition buf_table.c:118
bool track_io_timing
Definition bufmgr.c:176
static void ResOwnerReleaseBuffer(Datum res)
Definition bufmgr.c:7424
void CheckBufferIsPinnedOnce(Buffer buffer)
Definition bufmgr.c:6484
void FlushRelationsAllBuffers(SMgrRelation *smgrs, int nrels)
Definition bufmgr.c:5165
void IncrBufferRefCount(Buffer buffer)
Definition bufmgr.c:5533
void DropDatabaseBuffers(Oid dbid)
Definition bufmgr.c:5030
static int ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b)
Definition bufmgr.c:7203
static pg_attribute_always_inline PgAioResult buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data, bool is_temp)
Definition bufmgr.c:8248
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition bufmgr.c:4356
static PrivateRefCountEntry * NewPrivateRefCountEntry(Buffer buffer)
Definition bufmgr.c:373
static Buffer PrivateRefCountArrayKeys[REFCOUNT_ARRAY_ENTRIES]
Definition bufmgr.c:247
static bool ReadBuffersCanStartIO(Buffer buffer, bool nowait)
Definition bufmgr.c:1664
void DropRelationBuffers(SMgrRelation smgr_reln, ForkNumber *forkNum, int nforks, BlockNumber *firstDelBlock)
Definition bufmgr.c:4680
Buffer ReleaseAndReadBuffer(Buffer buffer, Relation relation, BlockNumber blockNum)
Definition bufmgr.c:3121
static int ReservedRefCountSlot
Definition bufmgr.c:252
static PgAioResult shared_buffer_readv_complete_local(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
Definition bufmgr.c:8466
static pg_attribute_always_inline bool StartReadBuffersImpl(ReadBuffersOperation *operation, Buffer *buffers, BlockNumber blockNum, int *nblocks, int flags, bool allow_forwarding)
Definition bufmgr.c:1362
static void CheckReadBuffersOperation(ReadBuffersOperation *operation, bool is_complete)
Definition bufmgr.c:1627
PrefetchBufferResult PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
Definition bufmgr.c:772
static uint32 PrivateRefCountClock
Definition bufmgr.c:251
static void FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object, IOContext io_context)
Definition bufmgr.c:4416
static void ResOwnerReleaseBufferIO(Datum res)
Definition bufmgr.c:7402
static PgAioResult local_buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
Definition bufmgr.c:8503
bool StartReadBuffers(ReadBuffersOperation *operation, Buffer *buffers, BlockNumber blockNum, int *nblocks, int flags)
Definition bufmgr.c:1589
void EvictAllUnpinnedBuffers(int32 *buffers_evicted, int32 *buffers_flushed, int32 *buffers_skipped)
Definition bufmgr.c:7561
int io_max_combine_limit
Definition bufmgr.c:201
static void FlushUnlockedBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object, IOContext io_context)
Definition bufmgr.c:4553
const ResourceOwnerDesc buffer_io_resowner_desc
Definition bufmgr.c:269
bool zero_damaged_pages
Definition bufmgr.c:173
#define BUF_DROP_FULL_SCAN_THRESHOLD
Definition bufmgr.c:92
static void PinBuffer_Locked(BufferDesc *buf)
Definition bufmgr.c:3292
void EvictRelUnpinnedBuffers(Relation rel, int32 *buffers_evicted, int32 *buffers_flushed, int32 *buffers_skipped)
Definition bufmgr.c:7611
static pg_attribute_always_inline void buffer_readv_complete_one(PgAioTargetData *td, uint8 buf_off, Buffer buffer, uint8 flags, bool failed, bool is_temp, bool *buffer_invalid, bool *failed_checksum, bool *ignored_checksum, bool *zeroed_buffer)
Definition bufmgr.c:8104
static char * ResOwnerPrintBuffer(Datum res)
Definition bufmgr.c:7460
static void BufferLockAcquire(Buffer buffer, BufferDesc *buf_hdr, BufferLockMode mode)
Definition bufmgr.c:5755
static bool BufferLockAttempt(BufferDesc *buf_hdr, BufferLockMode mode)
Definition bufmgr.c:5943
static int buffertag_comparator(const BufferTag *ba, const BufferTag *bb)
Definition bufmgr.c:7169
bool IsBufferCleanupOK(Buffer buffer)
Definition bufmgr.c:6748
#define BufferGetLSN(bufHdr)
Definition bufmgr.c:74
static char * ResOwnerPrintBufferIO(Datum res)
Definition bufmgr.c:7410
bool BufferIsLockedByMeInMode(Buffer buffer, BufferLockMode mode)
Definition bufmgr.c:2997
static void BufferLockDisown(Buffer buffer, BufferDesc *buf_hdr)
Definition bufmgr.c:6118
Buffer ExtendBufferedRel(BufferManagerRelation bmr, ForkNumber forkNum, BufferAccessStrategy strategy, uint32 flags)
Definition bufmgr.c:964
void AtEOXact_Buffers(bool isCommit)
Definition bufmgr.c:4103
static void AbortBufferIO(Buffer buffer)
Definition bufmgr.c:6999
const PgAioHandleCallbacks aio_shared_buffer_readv_cb
Definition bufmgr.c:8510
static void BufferLockUnlock(Buffer buffer, BufferDesc *buf_hdr)
Definition bufmgr.c:5871
BlockNumber ExtendBufferedRelBy(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, Buffer *buffers, uint32 *extended_by)
Definition bufmgr.c:996
static Buffer ReadBuffer_common(Relation rel, SMgrRelation smgr, char smgr_persistence, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
Definition bufmgr.c:1293
static void BufferLockWakeup(BufferDesc *buf_hdr, bool unlocked)
Definition bufmgr.c:6152
static void ProcessReadBuffersResult(ReadBuffersOperation *operation)
Definition bufmgr.c:1693
pg_noinline uint64 WaitBufHdrUnlocked(BufferDesc *buf)
Definition bufmgr.c:7145
static void ZeroAndLockBuffer(Buffer buffer, ReadBufferMode mode, bool already_valid)
Definition bufmgr.c:1131
static BufferDesc * BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, bool *foundPtr, IOContext io_context)
Definition bufmgr.c:2100
static void CheckForBufferLeaks(void)
Definition bufmgr.c:4173
static bool ReadBuffersCanStartIOOnce(Buffer buffer, bool nowait)
Definition bufmgr.c:1651
void CreateAndCopyRelationData(RelFileLocator src_rlocator, RelFileLocator dst_rlocator, bool permanent)
Definition bufmgr.c:5377
void DropRelationsAllBuffers(SMgrRelation *smgr_reln, int nlocators)
Definition bufmgr.c:4800
static void BufferLockDequeueSelf(BufferDesc *buf_hdr)
Definition bufmgr.c:6050
static int rlocator_comparator(const void *p1, const void *p2)
Definition bufmgr.c:7070
static bool BufferLockHeldByMeInMode(BufferDesc *buf_hdr, BufferLockMode mode)
Definition bufmgr.c:6372
Buffer ExtendBufferedRelTo(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, BlockNumber extend_to, ReadBufferMode mode)
Definition bufmgr.c:1025
const PgAioHandleCallbacks aio_local_buffer_readv_cb
Definition bufmgr.c:8519
static bool InvalidateVictimBuffer(BufferDesc *buf_hdr)
Definition bufmgr.c:2374
static void AtProcExit_Buffers(int code, Datum arg)
Definition bufmgr.c:4155
int io_combine_limit_guc
Definition bufmgr.c:200
static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg)
Definition bufmgr.c:7234
void BufferGetTag(Buffer buffer, RelFileLocator *rlocator, ForkNumber *forknum, BlockNumber *blknum)
Definition bufmgr.c:4377
#define BufHdrGetBlock(bufHdr)
Definition bufmgr.c:73
static bool BufferLockConditional(Buffer buffer, BufferDesc *buf_hdr, BufferLockMode mode)
Definition bufmgr.c:5900
const ResourceOwnerDesc buffer_resowner_desc
Definition bufmgr.c:278
static pg_attribute_always_inline void buffer_stage_common(PgAioHandle *ioh, bool is_write, bool is_temp)
Definition bufmgr.c:7860
void UnlockBuffer(Buffer buffer)
Definition bufmgr.c:6405
#define BUF_REUSABLE
Definition bufmgr.c:82
static void local_buffer_write_error_callback(void *arg)
Definition bufmgr.c:7054
static void BufferSync(int flags)
Definition bufmgr.c:3456
static bool AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress)
Definition bufmgr.c:1864
static void local_buffer_readv_stage(PgAioHandle *ioh, uint8 cb_data)
Definition bufmgr.c:8497
char * DebugPrintBufferRefcount(Buffer buffer)
Definition bufmgr.c:4299
void CheckPointBuffers(int flags)
Definition bufmgr.c:4342
bool BufferIsDirty(Buffer buffer)
Definition bufmgr.c:3024
static uint32 MaxProportionalPins
Definition bufmgr.c:255
static void BufferLockQueueSelf(BufferDesc *buf_hdr, BufferLockMode mode)
Definition bufmgr.c:6010
static BlockNumber ExtendBufferedRelShared(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
Definition bufmgr.c:2703
static int BufferLockDisownInternal(Buffer buffer, BufferDesc *buf_hdr)
Definition bufmgr.c:6132
bool BgBufferSync(WritebackContext *wb_context)
Definition bufmgr.c:3735
uint64 LockBufHdr(BufferDesc *desc)
Definition bufmgr.c:7097
static void WakePinCountWaiter(BufferDesc *buf)
Definition bufmgr.c:3324
bool BufferIsPermanent(Buffer buffer)
Definition bufmgr.c:4604
void MarkDirtyAllUnpinnedBuffers(int32 *buffers_dirtied, int32 *buffers_already_dirty, int32 *buffers_skipped)
Definition bufmgr.c:7811
#define REFCOUNT_ARRAY_ENTRIES
Definition bufmgr.c:129
static void shared_buffer_readv_stage(PgAioHandle *ioh, uint8 cb_data)
Definition bufmgr.c:8446
static void BufferLockProcessRelease(BufferDesc *buf_hdr, BufferLockMode mode, uint64 lockstate)
Definition bufmgr.c:6317
void UnlockBuffers(void)
Definition bufmgr.c:5709
PrefetchBufferResult PrefetchSharedBuffer(SMgrRelation smgr_reln, ForkNumber forkNum, BlockNumber blockNum)
Definition bufmgr.c:682
static PgAioResult shared_buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
Definition bufmgr.c:8452
static Buffer GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
Definition bufmgr.c:2451
bool ConditionalLockBuffer(Buffer buffer)
Definition bufmgr.c:6464
BlockNumber RelationGetNumberOfBlocksInFork(Relation relation, ForkNumber forkNum)
Definition bufmgr.c:4572
int bgwriter_flush_after
Definition bufmgr.c:208
void ReleaseBuffer(Buffer buffer)
Definition bufmgr.c:5501
bool BufferIsLockedByMe(Buffer buffer)
Definition bufmgr.c:2971
static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy, bool skip_if_not_valid)
Definition bufmgr.c:3181
static void FindAndDropRelationBuffers(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber nForkBlock, BlockNumber firstDelBlock)
Definition bufmgr.c:4970
XLogRecPtr BufferGetLSNAtomic(Buffer buffer)
Definition bufmgr.c:4634
void LockBufferInternal(Buffer buffer, BufferLockMode mode)
Definition bufmgr.c:6421
bool HoldingBufferPinThatDelaysRecovery(void)
Definition bufmgr.c:6664
bool MarkDirtyUnpinnedBuffer(Buffer buf, bool *buffer_already_dirty)
Definition bufmgr.c:7718
int checkpoint_flush_after
Definition bufmgr.c:207
void UnlockReleaseBuffer(Buffer buffer)
Definition bufmgr.c:5518
static pg_attribute_always_inline Buffer PinBufferForBlock(Relation rel, SMgrRelation smgr, char persistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, bool *foundPtr)
Definition bufmgr.c:1210
static void UnpinBufferNoOwner(BufferDesc *buf)
Definition bufmgr.c:3369
static void shared_buffer_write_error_callback(void *arg)
Definition bufmgr.c:7038
void ScheduleBufferTagForWriteback(WritebackContext *wb_context, IOContext io_context, BufferTag *tag)
Definition bufmgr.c:7269
void WaitReadBuffers(ReadBuffersOperation *operation)
Definition bufmgr.c:1732
void WritebackContextInit(WritebackContext *context, int *max_pending)
Definition bufmgr.c:7257
void MarkBufferDirty(Buffer buffer)
Definition bufmgr.c:3056
#define BufferIsPinned(bufnum)
Definition bufmgr.c:589
double bgwriter_lru_multiplier
Definition bufmgr.c:175
static bool EvictUnpinnedBufferInternal(BufferDesc *desc, bool *buffer_flushed)
Definition bufmgr.c:7470
int backend_flush_after
Definition bufmgr.c:209
void LimitAdditionalPins(uint32 *additional_pins)
Definition bufmgr.c:2641
static void buffer_readv_report(PgAioResult result, const PgAioTargetData *td, int elevel)
Definition bufmgr.c:8351
static void ReservePrivateRefCountEntry(void)
Definition bufmgr.c:293
static BufferDesc * PinCountWaitBuf
Definition bufmgr.c:212
static pg_noinline PrivateRefCountEntry * GetPrivateRefCountEntrySlow(Buffer buffer, bool do_move)
Definition bufmgr.c:404
static int32 GetPrivateRefCount(Buffer buffer)
Definition bufmgr.c:528
static BlockNumber ExtendBufferedRelCommon(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
Definition bufmgr.c:2659
void LockBufferForCleanup(Buffer buffer)
Definition bufmgr.c:6517
void MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
Definition bufmgr.c:5565
void FlushRelationBuffers(Relation rel)
Definition bufmgr.c:5077
#define READV_COUNT_BITS
static uint64 BufferLockReleaseSub(BufferLockMode mode)
Definition bufmgr.c:6288
void IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context)
Definition bufmgr.c:7319
static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref)
Definition bufmgr.c:551
bool EvictUnpinnedBuffer(Buffer buf, bool *buffer_flushed)
Definition bufmgr.c:7532
Buffer ReadBufferWithoutRelcache(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy, bool permanent)
Definition bufmgr.c:948
bool ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum, Buffer recent_buffer)
Definition bufmgr.c:803
#define RELS_BSEARCH_THRESHOLD
Definition bufmgr.c:84
int maintenance_io_concurrency
Definition bufmgr.c:191
static void UnpinBuffer(BufferDesc *buf)
Definition bufmgr.c:3360
void FlushDatabaseBuffers(Oid dbid)
Definition bufmgr.c:5441
static void InvalidateBuffer(BufferDesc *buf)
Definition bufmgr.c:2273
static void RelationCopyStorageUsingBuffer(RelFileLocator srclocator, RelFileLocator dstlocator, ForkNumber forkNum, bool permanent)
Definition bufmgr.c:5263
int effective_io_concurrency
Definition bufmgr.c:184
static PrivateRefCountEntry * GetPrivateRefCountEntry(Buffer buffer, bool do_move)
Definition bufmgr.c:493
static bool BufferLockHeldByMe(BufferDesc *buf_hdr)
Definition bufmgr.c:6390
void TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint64 set_flag_bits, bool forget_owner, bool release_aio)
Definition bufmgr.c:6937
bool StartBufferIO(BufferDesc *buf, bool forInput, bool nowait)
Definition bufmgr.c:6879
void MarkDirtyRelUnpinnedBuffers(Relation rel, int32 *buffers_dirtied, int32 *buffers_already_dirty, int32 *buffers_skipped)
Definition bufmgr.c:7754
bool StartReadBuffer(ReadBuffersOperation *operation, Buffer *buffer, BlockNumber blocknum, int flags)
Definition bufmgr.c:1608
Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
Definition bufmgr.c:911
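
A sketch of a bulk sequential read using ReadBufferExtended() with a ring-buffer strategy; the wrapper function is hypothetical, and nblocks is assumed to come from the caller (for example via RelationGetNumberOfBlocksInFork()).

/* Illustrative only: bulk read with a BAS_BULKREAD ring strategy. */
static void
scan_relation_sketch(Relation rel, BlockNumber nblocks)
{
    BufferAccessStrategy strategy = GetAccessStrategy(BAS_BULKREAD);
    BlockNumber blkno;

    for (blkno = 0; blkno < nblocks; blkno++)
    {
        Buffer  buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
                                         RBM_NORMAL, strategy);

        LockBuffer(buf, BUFFER_LOCK_SHARE);
        /* ... inspect BufferGetPage(buf) ... */
        UnlockReleaseBuffer(buf);
    }
    FreeAccessStrategy(strategy);
}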
static bool MarkDirtyUnpinnedBufferInternal(Buffer buf, BufferDesc *desc, bool *buffer_already_dirty)
Definition bufmgr.c:7662
static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES]
Definition bufmgr.c:248
static void buffer_readv_decode_error(PgAioResult result, bool *zeroed_any, bool *ignored_any, uint8 *zeroed_or_error_count, uint8 *checkfail_count, uint8 *first_off)
Definition bufmgr.c:7967
#define READV_COUNT_MASK
static int PrivateRefCountEntryLast
Definition bufmgr.c:253
int io_combine_limit
Definition bufmgr.c:199
void InitBufferManagerAccess(void)
Definition bufmgr.c:4120
static void buffer_readv_encode_error(PgAioResult *result, bool is_temp, bool zeroed_any, bool ignored_any, uint8 error_count, uint8 zeroed_count, uint8 checkfail_count, uint8 first_error_off, uint8 first_zeroed_off, uint8 first_ignored_off)
Definition bufmgr.c:8009
static int SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
Definition bufmgr.c:4033
uint32 GetAdditionalPinLimit(void)
Definition bufmgr.c:2615
Buffer ReadBuffer(Relation reln, BlockNumber blockNum)
Definition bufmgr.c:864
void TrackNewBufferPin(Buffer buf)
Definition bufmgr.c:3416
static HTAB * PrivateRefCountHash
Definition bufmgr.c:249
static int32 PrivateRefCountOverflowed
Definition bufmgr.c:250
bool ConditionalLockBufferForCleanup(Buffer buffer)
Definition bufmgr.c:6690
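
A sketch of the conditional cleanup-lock pattern used by vacuum-like code paths: if the super-exclusive lock cannot be had immediately, the page is simply skipped. The wrapper function and vac_strategy are assumptions.

/* Illustrative only: skip the page if a cleanup lock can't be had now. */
static void
cleanup_block_sketch(Relation rel, BlockNumber blkno,
                     BufferAccessStrategy vac_strategy)
{
    Buffer  buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
                                     RBM_NORMAL, vac_strategy);

    if (!ConditionalLockBufferForCleanup(buf))
    {
        ReleaseBuffer(buf);         /* someone else holds a pin; try later */
        return;
    }
    /* ... work that needs a cleanup lock, e.g. page defragmentation ... */
    UnlockReleaseBuffer(buf);
}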
int bgwriter_lru_maxpages
Definition bufmgr.c:174
uint32 GetPinLimit(void)
Definition bufmgr.c:2603
static void WaitIO(BufferDesc *buf)
Definition bufmgr.c:6800
#define BUF_WRITTEN
Definition bufmgr.c:81
void FlushOneBuffer(Buffer buffer)
Definition bufmgr.c:5481
@ BAS_BULKREAD
Definition bufmgr.h:37
@ BAS_BULKWRITE
Definition bufmgr.h:39
#define P_NEW
Definition bufmgr.h:198
#define READ_BUFFERS_ZERO_ON_ERROR
Definition bufmgr.h:122
static Page BufferGetPage(Buffer buffer)
Definition bufmgr.h:466
#define DEFAULT_IO_COMBINE_LIMIT
Definition bufmgr.h:174
static Block BufferGetBlock(Buffer buffer)
Definition bufmgr.h:433
#define READ_BUFFERS_ISSUE_ADVICE
Definition bufmgr.h:124
BufferLockMode
Definition bufmgr.h:204
@ BUFFER_LOCK_SHARE_EXCLUSIVE
Definition bufmgr.h:215
@ BUFFER_LOCK_SHARE
Definition bufmgr.h:210
@ BUFFER_LOCK_EXCLUSIVE
Definition bufmgr.h:220
@ BUFFER_LOCK_UNLOCK
Definition bufmgr.h:205
#define MAX_IO_COMBINE_LIMIT
Definition bufmgr.h:173
#define DEFAULT_EFFECTIVE_IO_CONCURRENCY
Definition bufmgr.h:168
#define READ_BUFFERS_IGNORE_CHECKSUM_FAILURES
Definition bufmgr.h:126
#define DEFAULT_MAINTENANCE_IO_CONCURRENCY
Definition bufmgr.h:169
void * Block
Definition bufmgr.h:26
static void LockBuffer(Buffer buffer, BufferLockMode mode)
Definition bufmgr.h:328
#define BMR_GET_SMGR(bmr)
Definition bufmgr.h:118
@ EB_LOCK_TARGET
Definition bufmgr.h:93
@ EB_CLEAR_SIZE_CACHE
Definition bufmgr.h:90
@ EB_PERFORMING_RECOVERY
Definition bufmgr.h:78
@ EB_CREATE_FORK_IF_NEEDED
Definition bufmgr.h:84
@ EB_SKIP_EXTENSION_LOCK
Definition bufmgr.h:75
@ EB_LOCK_FIRST
Definition bufmgr.h:87
#define READ_BUFFERS_SYNCHRONOUSLY
Definition bufmgr.h:128
ReadBufferMode
Definition bufmgr.h:45
@ RBM_ZERO_ON_ERROR
Definition bufmgr.h:51
@ RBM_ZERO_AND_CLEANUP_LOCK
Definition bufmgr.h:49
@ RBM_ZERO_AND_LOCK
Definition bufmgr.h:47
@ RBM_NORMAL
Definition bufmgr.h:46
#define BMR_REL(p_rel)
Definition bufmgr.h:114
static bool BufferIsValid(Buffer bufnum)
Definition bufmgr.h:417
bool ignore_checksum_failure
Definition bufpage.c:27
char * PageSetChecksumCopy(Page page, BlockNumber blkno)
Definition bufpage.c:1509
bool PageIsVerified(PageData *page, BlockNumber blkno, int flags, bool *checksum_failure_p)
Definition bufpage.c:94
#define PIV_LOG_LOG
Definition bufpage.h:468
static bool PageIsNew(const PageData *page)
Definition bufpage.h:233
static void PageSetLSN(Page page, XLogRecPtr lsn)
Definition bufpage.h:390
PageData * Page
Definition bufpage.h:81
static XLogRecPtr PageGetLSN(const PageData *page)
Definition bufpage.h:385
#define PIV_IGNORE_CHECKSUM_FAILURE
Definition bufpage.h:469
#define pg_noinline
Definition c.h:305
#define likely(x)
Definition c.h:421
uint8_t uint8
Definition c.h:554
#define PG_USED_FOR_ASSERTS_ONLY
Definition c.h:223
#define Max(x, y)
Definition c.h:1001
#define Assert(condition)
Definition c.h:883
double float8
Definition c.h:654
#define pg_attribute_always_inline
Definition c.h:289
int16_t int16
Definition c.h:551
int32_t int32
Definition c.h:552
uint64_t uint64
Definition c.h:557
#define pg_unreachable()
Definition c.h:351
#define unlikely(x)
Definition c.h:422
uint32_t uint32
Definition c.h:556
#define lengthof(array)
Definition c.h:813
#define MemSet(start, val, len)
Definition c.h:1023
#define StaticAssertDecl(condition, errmessage)
Definition c.h:952
size_t Size
Definition c.h:629
bool IsCatalogRelationOid(Oid relid)
Definition catalog.c:121
bool IsCatalogTextUniqueIndexOid(Oid relid)
Definition catalog.c:156
void CheckpointWriteDelay(int flags, double progress)
bool ConditionVariableCancelSleep(void)
void ConditionVariableBroadcast(ConditionVariable *cv)
void ConditionVariablePrepareToSleep(ConditionVariable *cv)
void ConditionVariableSleep(ConditionVariable *cv, uint32 wait_event_info)
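
The condition-variable entries above are normally combined in the prepare/sleep/cancel protocol sketched below; the wrapper function, the done flag, and the wait-event value are placeholders supplied by the caller.

/* Illustrative only: canonical ConditionVariable wait loop. */
static void
wait_for_flag_sketch(ConditionVariable *cv, volatile bool *done,
                     uint32 wait_event_info)
{
    ConditionVariablePrepareToSleep(cv);
    while (!*done)                      /* always re-check after waking */
        ConditionVariableSleep(cv, wait_event_info);
    ConditionVariableCancelSleep();
}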
int64 TimestampTz
Definition timestamp.h:39
void * hash_search(HTAB *hashp, const void *keyPtr, HASHACTION action, bool *foundPtr)
Definition dynahash.c:952
HTAB * hash_create(const char *tabname, int64 nelem, const HASHCTL *info, int flags)
Definition dynahash.c:358
void * hash_seq_search(HASH_SEQ_STATUS *status)
Definition dynahash.c:1415
void hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
Definition dynahash.c:1380

int errmsg_internal(const char *fmt,...)
Definition elog.c:1170
int errdetail_internal(const char *fmt,...)
Definition elog.c:1243
int errdetail(const char *fmt,...)
Definition elog.c:1216
ErrorContextCallback * error_context_stack
Definition elog.c:95
int errhint_internal(const char *fmt,...)
Definition elog.c:1352
int errcode(int sqlerrcode)
Definition elog.c:863
int errmsg(const char *fmt,...)
Definition elog.c:1080
int errhint_plural(const char *fmt_singular, const char *fmt_plural, unsigned long n,...)
Definition elog.c:1373
#define _(x)
Definition elog.c:91
#define errcontext
Definition elog.h:198
#define DEBUG3
Definition elog.h:28
#define LOG_SERVER_ONLY
Definition elog.h:32
#define WARNING
Definition elog.h:36
#define DEBUG2
Definition elog.h:29
#define PANIC
Definition elog.h:42
#define DEBUG1
Definition elog.h:30
#define ERROR
Definition elog.h:39
#define elog(elevel,...)
Definition elog.h:226
#define ereport(elevel,...)
Definition elog.h:150
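
The elog/ereport entries are typically combined as sketched here; the wrapper function, the message text, and its arguments are assumptions for illustration, not the exact messages used by bufmgr.c.

/* Illustrative only: reporting a corrupted page via ereport(). */
static void
report_bad_page_sketch(BlockNumber blockNum, const char *path)
{
    ereport(ERROR,
            errcode(ERRCODE_DATA_CORRUPTED),
            errmsg("invalid page in block %u of relation %s",
                   blockNum, path));
}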
int io_direct_flags
Definition fd.c:168
#define IO_DIRECT_DATA
Definition fd.h:54
#define palloc_array(type, count)
Definition fe_memutils.h:76
int StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
Definition freelist.c:321
BufferAccessStrategy GetAccessStrategy(BufferAccessStrategyType btype)
Definition freelist.c:461
BufferDesc * StrategyGetBuffer(BufferAccessStrategy strategy, uint64 *buf_state, bool *from_ring)
Definition freelist.c:174
void FreeAccessStrategy(BufferAccessStrategy strategy)
Definition freelist.c:643
IOContext IOContextForStrategy(BufferAccessStrategy strategy)
Definition freelist.c:747
bool StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_ring)
Definition freelist.c:787
volatile sig_atomic_t ProcSignalBarrierPending
Definition globals.c:40
int NBuffers
Definition globals.c:142
bool enableFsync
Definition globals.c:129
ProcNumber MyProcNumber
Definition globals.c:90
int VacuumCostPageMiss
Definition globals.c:152
bool VacuumCostActive
Definition globals.c:158
bool IsUnderPostmaster
Definition globals.c:120
int VacuumCostBalance
Definition globals.c:157
int MaxBackends
Definition globals.c:146
int VacuumCostPageDirty
Definition globals.c:153
int VacuumCostPageHit
Definition globals.c:151
@ HASH_FIND
Definition hsearch.h:113
@ HASH_REMOVE
Definition hsearch.h:115
@ HASH_ENTER
Definition hsearch.h:114
#define HASH_ELEM
Definition hsearch.h:95
#define HASH_BLOBS
Definition hsearch.h:97
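
A sketch of the dynahash calls indexed above, in the spirit of a backend-private lookup table keyed by an int32 id; the entry struct, table name, and helper functions are hypothetical.

/* Illustrative only: build and probe a backend-private hash table. */
typedef struct SketchEntry
{
    int32   key;                        /* hash key; must come first */
    int32   value;
} SketchEntry;

static HTAB *
build_sketch_table(void)
{
    HASHCTL hash_ctl;

    hash_ctl.keysize = sizeof(int32);
    hash_ctl.entrysize = sizeof(SketchEntry);

    return hash_create("sketch table", 64, &hash_ctl,
                       HASH_ELEM | HASH_BLOBS);
}

static SketchEntry *
lookup_or_add_sketch(HTAB *tab, int32 key)
{
    bool    found;

    return (SketchEntry *) hash_search(tab, &key, HASH_ENTER, &found);
}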
BufferUsage pgBufferUsage
Definition instrument.c:20
void on_shmem_exit(pg_on_exit_callback function, Datum arg)
Definition ipc.c:372
void LockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition lmgr.c:424
void UnlockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition lmgr.c:474
int32 * LocalRefCount
Definition localbuf.c:49
void FlushLocalBuffer(BufferDesc *bufHdr, SMgrRelation reln)
Definition localbuf.c:183
void UnpinLocalBuffer(Buffer buffer)
Definition localbuf.c:841
bool StartLocalBufferIO(BufferDesc *bufHdr, bool forInput, bool nowait)
Definition localbuf.c:523
void AtEOXact_LocalBuffers(bool isCommit)
Definition localbuf.c:1003
void AtProcExit_LocalBuffers(void)
Definition localbuf.c:1014
bool PinLocalBuffer(BufferDesc *buf_hdr, bool adjust_usagecount)
Definition localbuf.c:805
void MarkLocalBufferDirty(Buffer buffer)
Definition localbuf.c:491
void DropRelationAllLocalBuffers(RelFileLocator rlocator)
Definition localbuf.c:702
void TerminateLocalBufferIO(BufferDesc *bufHdr, bool clear_dirty, uint64 set_flag_bits, bool release_aio)
Definition localbuf.c:562
int NLocBuffer
Definition localbuf.c:45
PrefetchBufferResult PrefetchLocalBuffer(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum)
Definition localbuf.c:72
BlockNumber ExtendBufferedRelLocal(BufferManagerRelation bmr, ForkNumber fork, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
Definition localbuf.c:346
void UnpinLocalBufferNoOwner(Buffer buffer)
Definition localbuf.c:848
void DropRelationLocalBuffers(RelFileLocator rlocator, ForkNumber *forkNum, int nforks, BlockNumber *firstDelBlock)
Definition localbuf.c:665
BufferDesc * LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, bool *foundPtr)
Definition localbuf.c:119
#define ExclusiveLock
Definition lockdefs.h:42
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition lwlock.c:1176
void LWLockRelease(LWLock *lock)
Definition lwlock.c:1793
@ LW_WS_NOT_WAITING
Definition lwlock.h:30
@ LW_WS_WAITING
Definition lwlock.h:31
@ LW_WS_PENDING_WAKEUP
Definition lwlock.h:32
@ LW_SHARED
Definition lwlock.h:113
@ LW_EXCLUSIVE
Definition lwlock.h:112
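
A sketch of the LWLock entries above; the lock pointer is a placeholder for whichever LWLock the caller holds a reference to (for example, a buffer-mapping partition lock).

/* Illustrative only: bracketing shared-memory access with an LWLock. */
static void
read_shared_state_sketch(LWLock *lock)
{
    LWLockAcquire(lock, LW_SHARED);     /* LW_EXCLUSIVE for writers */
    /* ... read the protected shared-memory structure ... */
    LWLockRelease(lock);
}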
void * repalloc(void *pointer, Size size)
Definition mcxt.c:1632
void pfree(void *pointer)
Definition mcxt.c:1616
void * palloc(Size size)
Definition mcxt.c:1387
#define VALGRIND_MAKE_MEM_DEFINED(addr, size)
Definition memdebug.h:26
#define VALGRIND_MAKE_MEM_NOACCESS(addr, size)
Definition memdebug.h:27
#define RESUME_INTERRUPTS()
Definition miscadmin.h:136
#define START_CRIT_SECTION()
Definition miscadmin.h:150
#define CHECK_FOR_INTERRUPTS()
Definition miscadmin.h:123
#define HOLD_INTERRUPTS()
Definition miscadmin.h:134
#define END_CRIT_SECTION()
Definition miscadmin.h:152
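
A sketch of how the critical-section macros combine with MarkBufferDirty() and log_newpage_buffer() when a freshly built page is made durable; it assumes the buffer is already pinned, exclusively locked, fully initialized, and belongs to a WAL-logged relation.

/* Illustrative only: persist a newly built, exclusively locked page. */
static void
finish_new_page_sketch(Buffer buf)
{
    START_CRIT_SECTION();
    MarkBufferDirty(buf);
    (void) log_newpage_buffer(buf, true);   /* page_std: standard layout */
    END_CRIT_SECTION();
}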
void * arg
#define ERRCODE_DATA_CORRUPTED
#define WRITEBACK_MAX_PENDING_FLUSHES
#define DEFAULT_BACKEND_FLUSH_AFTER
#define DEFAULT_CHECKPOINT_FLUSH_AFTER
#define DEFAULT_BGWRITER_FLUSH_AFTER
#define PG_IOV_MAX
Definition pg_iovec.h:47
IOObject
Definition pgstat.h:276
@ IOOBJECT_RELATION
Definition pgstat.h:277
@ IOOBJECT_TEMP_RELATION
Definition pgstat.h:278
#define pgstat_count_buffer_read(rel)
Definition pgstat.h:715
IOContext
Definition pgstat.h:285
@ IOCONTEXT_NORMAL
Definition pgstat.h:289
@ IOOP_EXTEND
Definition pgstat.h:314
@ IOOP_READ
Definition pgstat.h:315
@ IOOP_WRITEBACK
Definition pgstat.h:311
@ IOOP_HIT
Definition pgstat.h:309
@ IOOP_EVICT
Definition pgstat.h:307
@ IOOP_REUSE
Definition pgstat.h:310
@ IOOP_WRITE
Definition pgstat.h:316
#define pgstat_count_buffer_hit(rel)
Definition pgstat.h:720
PgStat_BgWriterStats PendingBgWriterStats
PgStat_CheckpointerStats PendingCheckpointerStats
void pgstat_prepare_report_checksum_failure(Oid dboid)
void pgstat_report_checksum_failures_in_db(Oid dboid, int failurecount)
instr_time pgstat_prepare_io_time(bool track_io_guc)
Definition pgstat_io.c:91
void pgstat_count_io_op(IOObject io_object, IOContext io_context, IOOp io_op, uint32 cnt, uint64 bytes)
Definition pgstat_io.c:68
void pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op, instr_time start_time, uint32 cnt, uint64 bytes)
Definition pgstat_io.c:122
#define qsort(a, b, c, d)
Definition port.h:495
void PGSemaphoreUnlock(PGSemaphore sema)
Definition posix_sema.c:335
void PGSemaphoreLock(PGSemaphore sema)
Definition posix_sema.c:315
static Datum PointerGetDatum(const void *X)
Definition postgres.h:352
uint64_t Datum
Definition postgres.h:70
static Pointer DatumGetPointer(Datum X)
Definition postgres.h:342
static int32 DatumGetInt32(Datum X)
Definition postgres.h:212
#define InvalidOid
unsigned int Oid
#define NUM_AUXILIARY_PROCS
Definition proc.h:469
#define GetPGProcByNumber(n)
Definition proc.h:446
#define DELAY_CHKPT_START
Definition proc.h:135
#define proclist_delete(list, procno, link_member)
Definition proclist.h:187
static void proclist_init(proclist_head *list)
Definition proclist.h:29
#define proclist_push_tail(list, procno, link_member)
Definition proclist.h:191
#define proclist_foreach_modify(iter, lhead, link_member)
Definition proclist.h:206
static bool proclist_is_empty(const proclist_head *list)
Definition proclist.h:38
#define INVALID_PROC_NUMBER
Definition procnumber.h:26
int ProcNumber
Definition procnumber.h:24
void ProcessProcSignalBarrier(void)
Definition procsignal.c:499
@ PROCSIG_RECOVERY_CONFLICT_BUFFERPIN
Definition procsignal.h:47
void set_ps_display_remove_suffix(void)
Definition ps_status.c:439
void set_ps_display_suffix(const char *suffix)
Definition ps_status.c:387
char * psprintf(const char *fmt,...)
Definition psprintf.c:43
ReadStream * read_stream_begin_smgr_relation(int flags, BufferAccessStrategy strategy, SMgrRelation smgr, char smgr_persistence, ForkNumber forknum, ReadStreamBlockNumberCB callback, void *callback_private_data, size_t per_buffer_data_size)
Buffer read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
void read_stream_end(ReadStream *stream)
BlockNumber block_range_read_stream_cb(ReadStream *stream, void *callback_private_data, void *per_buffer_data)
#define READ_STREAM_USE_BATCHING
Definition read_stream.h:64
#define READ_STREAM_FULL
Definition read_stream.h:43
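
A hedged sketch of the read-stream entries above, loosely following the smgr-level streaming used elsewhere in this file; the callback, its private state, and the persistence argument are assumptions introduced only for the example.

/* Illustrative only: stream every block of a fork, unpinning each buffer. */
typedef struct SketchStreamState
{
    BlockNumber next;
    BlockNumber nblocks;
} SketchStreamState;

static BlockNumber
sketch_stream_cb(ReadStream *stream, void *callback_private_data,
                 void *per_buffer_data)
{
    SketchStreamState *s = (SketchStreamState *) callback_private_data;

    if (s->next >= s->nblocks)
        return InvalidBlockNumber;      /* end of stream */
    return s->next++;
}

static void
stream_fork_sketch(SMgrRelation smgr, char persistence, BlockNumber nblocks)
{
    SketchStreamState state = {0, nblocks};
    ReadStream *stream;
    Buffer      buf;

    stream = read_stream_begin_smgr_relation(READ_STREAM_FULL,
                                             NULL,  /* default strategy */
                                             smgr, persistence,
                                             MAIN_FORKNUM,
                                             sketch_stream_cb,
                                             &state, 0);
    while ((buf = read_stream_next_buffer(stream, NULL)) != InvalidBuffer)
        ReleaseBuffer(buf);             /* consume and unpin each block */
    read_stream_end(stream);
}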
static SMgrRelation RelationGetSmgr(Relation rel)
Definition rel.h:576
#define RelationUsesLocalBuffers(relation)
Definition rel.h:646
#define RELATION_IS_OTHER_TEMP(relation)
Definition rel.h:667
#define RelationIsValid(relation)
Definition rel.h:489
#define RelFileLocatorBackendIsTemp(rlocator)
#define RelFileLocatorEquals(locator1, locator2)
ForkNumber
Definition relpath.h:56
@ MAIN_FORKNUM
Definition relpath.h:58
@ INIT_FORKNUM
Definition relpath.h:61
#define MAX_FORKNUM
Definition relpath.h:70
#define relpath(rlocator, forknum)
Definition relpath.h:150
#define relpathbackend(rlocator, backend, forknum)
Definition relpath.h:141
#define relpathperm(rlocator, forknum)
Definition relpath.h:146
ResourceOwner CurrentResourceOwner
Definition resowner.c:173
void ResourceOwnerEnlarge(ResourceOwner owner)
Definition resowner.c:449
#define RELEASE_PRIO_BUFFER_IOS
Definition resowner.h:62
@ RESOURCE_RELEASE_BEFORE_LOCKS
Definition resowner.h:54
#define RELEASE_PRIO_BUFFER_PINS
Definition resowner.h:63
void perform_spin_delay(SpinDelayStatus *status)
Definition s_lock.c:126
void finish_spin_delay(SpinDelayStatus *status)
Definition s_lock.c:186
#define init_local_spin_delay(status)
Definition s_lock.h:753
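
A sketch of the spin-delay helpers above, in the shape used when busy-waiting on a header flag; the wrapper function and the flag variable are hypothetical.

/* Illustrative only: bounded busy-wait with progressive backoff. */
static void
spin_until_clear_sketch(volatile uint32 *flag)
{
    SpinDelayStatus delayStatus;

    init_local_spin_delay(&delayStatus);
    while (*flag != 0)                  /* caller-defined busy condition */
        perform_spin_delay(&delayStatus);
    finish_spin_delay(&delayStatus);
}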
BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum)
Definition smgr.c:819
void smgrstartreadv(PgAioHandle *ioh, SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void **buffers, BlockNumber nblocks)
Definition smgr.c:753
void smgrwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks)
Definition smgr.c:805
SMgrRelation smgropen(RelFileLocator rlocator, ProcNumber backend)
Definition smgr.c:240
void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
Definition smgr.c:481
BlockNumber smgrnblocks_cached(SMgrRelation reln, ForkNumber forknum)
Definition smgr.c:847
uint32 smgrmaxcombine(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
Definition smgr.c:697
void smgrzeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks, bool skipFsync)
Definition smgr.c:649
void smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync)
Definition smgr.c:620
bool smgrexists(SMgrRelation reln, ForkNumber forknum)
Definition smgr.c:462
bool smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks)
Definition smgr.c:678
static void smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync)
Definition smgr.h:131
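
A sketch of the storage-manager entries above; the locator is assumed to describe a non-temporary relation, so INVALID_PROC_NUMBER is passed as the backend.

/* Illustrative only: open a relation at smgr level and get a fork's length. */
static BlockNumber
fork_size_sketch(RelFileLocator rlocator, ForkNumber forknum)
{
    SMgrRelation reln = smgropen(rlocator, INVALID_PROC_NUMBER);

    if (!smgrexists(reln, forknum))
        return 0;                       /* fork has not been created */
    return smgrnblocks(reln, forknum);
}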
#define free(a)
void ProcSendSignal(ProcNumber procNumber)
Definition proc.c:1992
PGPROC * MyProc
Definition proc.c:67
int GetStartupBufferPinWaitBufId(void)
Definition proc.c:771
int DeadlockTimeout
Definition proc.c:58
void SetStartupBufferPinWaitBufId(int bufid)
Definition proc.c:759
void ProcWaitForSignal(uint32 wait_event_info)
Definition proc.c:1980
void ResolveRecoveryConflictWithBufferPin(void)
Definition standby.c:793
bool log_recovery_conflict_waits
Definition standby.c:42
void LogRecoveryConflict(ProcSignalReason reason, TimestampTz wait_start, TimestampTz now, VirtualTransactionId *wait_list, bool still_waiting)
Definition standby.c:274
bool RelFileLocatorSkippingWAL(RelFileLocator rlocator)
Definition storage.c:573
SMgrRelation RelationCreateStorage(RelFileLocator rlocator, char relpersistence, bool register_delete)
Definition storage.c:122
void log_smgrcreate(const RelFileLocator *rlocator, ForkNumber forkNum)
Definition storage.c:187
BufferTag tag
pg_atomic_uint64 state
int64 shared_blks_dirtied
Definition instrument.h:28
int64 local_blks_hit
Definition instrument.h:30
int64 shared_blks_read
Definition instrument.h:27
int64 shared_blks_written
Definition instrument.h:29
int64 local_blks_read
Definition instrument.h:31
int64 shared_blks_hit
Definition instrument.h:26
int ckpt_bufs_written
Definition xlog.h:178
ForkNumber forkNum
RelFileNumber relNumber
BlockNumber blockNum
float8 progress_slice
Definition bufmgr.c:148
int num_scanned
Definition bufmgr.c:153
float8 progress
Definition bufmgr.c:147
int num_to_scan
Definition bufmgr.c:151
struct ErrorContextCallback * previous
Definition elog.h:297
void(* callback)(void *arg)
Definition elog.h:298
PGPROC
Definition proc.h:179
uint8 lwWaitMode
Definition proc.h:247
int delayChkptFlags
Definition proc.h:263
PGSemaphore sem
Definition proc.h:183
uint8 lwWaiting
Definition proc.h:246
PgAioHandleCallbackStage stage
Definition aio.h:219
uint32 status
Definition aio_types.h:108
uint32 error_data
Definition aio_types.h:111
uint32 id
Definition aio_types.h:105
PgAioResult result
Definition aio_types.h:132
PgStat_Counter buf_written_clean
Definition pgstat.h:242
PgStat_Counter maxwritten_clean
Definition pgstat.h:243
PgStat_Counter buf_alloc
Definition pgstat.h:244
PgStat_Counter buffers_written
Definition pgstat.h:266
Buffer recent_buffer
Definition bufmgr.h:61
BufferLockMode lockmode
Definition bufmgr.c:109
PrivateRefCountData data
Definition bufmgr.c:125
ForkNumber forknum
Definition bufmgr.h:137
PgAioWaitRef io_wref
Definition bufmgr.h:150
SMgrRelation smgr
Definition bufmgr.h:135
BufferAccessStrategy strategy
Definition bufmgr.h:138
BlockNumber blocknum
Definition bufmgr.h:146
PgAioReturn io_return
Definition bufmgr.h:151
RelFileLocator locator
RelFileNumber relNumber
char str[REL_PATH_STR_MAXLEN+1]
Definition relpath.h:123
RelFileLocator rd_locator
Definition rel.h:57
Form_pg_class rd_rel
Definition rel.h:111
const char * name
Definition resowner.h:93
RelFileLocatorBackend smgr_rlocator
Definition smgr.h:38
SMgrRelation srel
Definition bufmgr.c:169
RelFileLocator rlocator
Definition bufmgr.c:168
BlockNumber blockNum
RelFileNumber relNumber
ForkNumber forkNum
static uint64 table_relation_size(Relation rel, ForkNumber forkNumber)
Definition tableam.h:1847
BlockNumber blockNum
Definition aio_types.h:66
RelFileLocator rlocator
Definition aio_types.h:65
struct PgAioTargetData::@126 smgr
BlockNumber nblocks
Definition aio_types.h:67
ForkNumber forkNum
Definition aio_types.h:68
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition wait_event.h:69
static void pgstat_report_wait_end(void)
Definition wait_event.h:85
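
The wait-event helpers above bracket blocking operations roughly as follows; the wrapper function and the wait-event value are placeholders supplied by the caller.

/* Illustrative only: attribute a blocking wait to a wait event. */
static void
wait_with_reporting_sketch(PGSemaphore sem, uint32 wait_event_info)
{
    pgstat_report_wait_start(wait_event_info);
    PGSemaphoreLock(sem);               /* blocks until unlocked */
    pgstat_report_wait_end();
}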
bool RecoveryInProgress(void)
Definition xlog.c:6461
bool XLogNeedsFlush(XLogRecPtr record)
Definition xlog.c:3146
CheckpointStatsData CheckpointStats
Definition xlog.c:212
void XLogFlush(XLogRecPtr record)
Definition xlog.c:2784
#define CHECKPOINT_FLUSH_UNLOGGED
Definition xlog.h:154
#define CHECKPOINT_END_OF_RECOVERY
Definition xlog.h:151
#define CHECKPOINT_IS_SHUTDOWN
Definition xlog.h:150
#define XLogIsNeeded()
Definition xlog.h:111
#define XLogHintBitIsNeeded()
Definition xlog.h:122
#define XLogRecPtrIsValid(r)
Definition xlogdefs.h:29
uint64 XLogRecPtr
Definition xlogdefs.h:21
#define InvalidXLogRecPtr
Definition xlogdefs.h:28
XLogRecPtr XLogSaveBufferForHint(Buffer buffer, bool buffer_std)
XLogRecPtr log_newpage_buffer(Buffer buffer, bool page_std)
#define InHotStandby
Definition xlogutils.h:60