bufmgr.c
1/*-------------------------------------------------------------------------
2 *
3 * bufmgr.c
4 * buffer manager interface routines
5 *
6 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 *
10 * IDENTIFICATION
11 * src/backend/storage/buffer/bufmgr.c
12 *
13 *-------------------------------------------------------------------------
14 */
15/*
16 * Principal entry points:
17 *
18 * ReadBuffer() -- find or create a buffer holding the requested page,
19 * and pin it so that no one can destroy it while this process
20 * is using it.
21 *
22 * StartReadBuffer() -- as above, with separate wait step
23 * StartReadBuffers() -- multiple block version
24 * WaitReadBuffers() -- second step of above
25 *
26 * ReleaseBuffer() -- unpin a buffer
27 *
28 * MarkBufferDirty() -- mark a pinned buffer's contents as "dirty".
29 * The disk write is delayed until buffer replacement or checkpoint.
30 *
31 * See also these files:
32 * freelist.c -- chooses victim for buffer replacement
33 * buf_table.c -- manages the buffer lookup table
34 */
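/*
 * Illustrative sketch (not part of bufmgr.c): a typical caller-side sequence
 * built from the entry points above.  "rel" and "blkno" stand for an
 * already-opened relation and an existing block number; error handling and
 * WAL logging are omitted.
 *
 *		Buffer		buf;
 *		Page		page;
 *
 *		buf = ReadBuffer(rel, blkno);				(find or create, and pin)
 *		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
 *		page = BufferGetPage(buf);
 *		... modify the page ...
 *		MarkBufferDirty(buf);						(disk write is deferred)
 *		UnlockReleaseBuffer(buf);					(drop content lock and pin)
 */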
35#include "postgres.h"
36
37#include <sys/file.h>
38#include <unistd.h>
39
40#include "access/tableam.h"
41#include "access/xloginsert.h"
42#include "access/xlogutils.h"
43#ifdef USE_ASSERT_CHECKING
44#include "catalog/pg_tablespace_d.h"
45#endif
46#include "catalog/storage.h"
48#include "executor/instrument.h"
49#include "lib/binaryheap.h"
50#include "miscadmin.h"
51#include "pg_trace.h"
52#include "pgstat.h"
53#include "postmaster/bgwriter.h"
54#include "storage/aio.h"
56#include "storage/bufmgr.h"
57#include "storage/fd.h"
58#include "storage/ipc.h"
59#include "storage/lmgr.h"
60#include "storage/proc.h"
61#include "storage/read_stream.h"
62#include "storage/smgr.h"
63#include "storage/standby.h"
64#include "utils/memdebug.h"
65#include "utils/ps_status.h"
66#include "utils/rel.h"
67#include "utils/resowner.h"
68#include "utils/timestamp.h"
69
70
71/* Note: these two macros only work on shared buffers, not local ones! */
72#define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
73#define BufferGetLSN(bufHdr) (PageGetLSN(BufHdrGetBlock(bufHdr)))
74
75/* Note: this macro only works on local buffers, not shared ones! */
76#define LocalBufHdrGetBlock(bufHdr) \
77 LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
78
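/*
 * Illustrative note (not part of bufmgr.c): BufHdrGetBlock() is plain pointer
 * arithmetic into the shared BufferBlocks array.  For example, with the
 * default BLCKSZ of 8192, the descriptor with buf_id 10 maps to the block
 * starting at byte offset 10 * 8192 = 81920 within BufferBlocks.
 */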
79/* Bits in SyncOneBuffer's return value */
80#define BUF_WRITTEN 0x01
81#define BUF_REUSABLE 0x02
82
83#define RELS_BSEARCH_THRESHOLD 20
84
85/*
 86 * This is the size (in number of blocks) above which we scan the entire
 87 * buffer pool to remove the buffers for all the pages of the relation being
 88 * dropped. For relations smaller than this threshold, we find the buffers
 89 * by doing lookups in the BufMapping table.
90 */
91#define BUF_DROP_FULL_SCAN_THRESHOLD (uint64) (NBuffers / 32)
92
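/*
 * Illustrative note (not part of bufmgr.c): with the default shared_buffers
 * of 16384 buffers (128 MB at BLCKSZ 8192), BUF_DROP_FULL_SCAN_THRESHOLD is
 * 16384 / 32 = 512 blocks, so forks larger than 4 MB are handled by scanning
 * the whole buffer pool rather than by per-block BufMapping lookups.
 */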
 93typedef struct PrivateRefCountEntry
 94{
 95 Buffer buffer;
 96 int32 refcount;
 97} PrivateRefCountEntry;
 98
99/* 64 bytes, about the size of a cache line on common systems */
100#define REFCOUNT_ARRAY_ENTRIES 8
101
102/*
103 * Status of buffers to checkpoint for a particular tablespace, used
104 * internally in BufferSync.
105 */
106typedef struct CkptTsStatus
107{
108 /* oid of the tablespace */
 109 Oid tsId;
 110
111 /*
112 * Checkpoint progress for this tablespace. To make progress comparable
113 * between tablespaces the progress is, for each tablespace, measured as a
114 * number between 0 and the total number of to-be-checkpointed pages. Each
115 * page checkpointed in this tablespace increments this space's progress
116 * by progress_slice.
117 */
 118 float8 progress;
 119 float8 progress_slice;
 120
121 /* number of to-be checkpointed pages in this tablespace */
 122 int num_to_scan;
 123 /* already processed pages in this tablespace */
 124 int num_scanned;
 125
126 /* current offset in CkptBufferIds for this tablespace */
127 int index;
 128} CkptTsStatus;
 129
130/*
131 * Type for array used to sort SMgrRelations
132 *
133 * FlushRelationsAllBuffers shares the same comparator function with
134 * DropRelationsAllBuffers. Pointer to this struct and RelFileLocator must be
135 * compatible.
136 */
137typedef struct SMgrSortArray
138{
139 RelFileLocator rlocator; /* This must be the first member */
 140 SMgrRelation srel;
 141} SMgrSortArray;
 142
143/* GUC variables */
 144bool zero_damaged_pages = false;
 145int bgwriter_lru_maxpages = 100;
 146double bgwriter_lru_multiplier = 2.0;
 147bool track_io_timing = false;
148
149/*
150 * How many buffers PrefetchBuffer callers should try to stay ahead of their
151 * ReadBuffer calls by. Zero means "never prefetch". This value is only used
152 * for buffers not belonging to tablespaces that have their
153 * effective_io_concurrency parameter set.
154 */
 155int effective_io_concurrency = DEFAULT_EFFECTIVE_IO_CONCURRENCY;
 156
157/*
158 * Like effective_io_concurrency, but used by maintenance code paths that might
159 * benefit from a higher setting because they work on behalf of many sessions.
160 * Overridden by the tablespace setting of the same name.
161 */
 162int maintenance_io_concurrency = DEFAULT_MAINTENANCE_IO_CONCURRENCY;
 163
164/*
165 * Limit on how many blocks should be handled in single I/O operations.
166 * StartReadBuffers() callers should respect it, as should other operations
167 * that call smgr APIs directly. It is computed as the minimum of underlying
168 * GUCs io_combine_limit_guc and io_max_combine_limit.
169 */
173
174/*
175 * GUC variables about triggering kernel writeback for buffers written; OS
176 * dependent defaults are set via the GUC mechanism.
177 */
 178int checkpoint_flush_after = DEFAULT_CHECKPOINT_FLUSH_AFTER;
 179int bgwriter_flush_after = DEFAULT_BGWRITER_FLUSH_AFTER;
 180int backend_flush_after = DEFAULT_BACKEND_FLUSH_AFTER;
 181
182/* local state for LockBufferForCleanup */
 183static BufferDesc *PinCountWaitBuf = NULL;
 184
185/*
186 * Backend-Private refcount management:
187 *
188 * Each buffer also has a private refcount that keeps track of the number of
189 * times the buffer is pinned in the current process. This is so that the
190 * shared refcount needs to be modified only once if a buffer is pinned more
191 * than once by an individual backend. It's also used to check that no buffers
192 * are still pinned at the end of transactions and when exiting.
193 *
194 *
195 * To avoid - as we used to - requiring an array with NBuffers entries to keep
196 * track of local buffers, we use a small sequentially searched array
197 * (PrivateRefCountArray) and an overflow hash table (PrivateRefCountHash) to
198 * keep track of backend local pins.
199 *
 200 * As long as no more than REFCOUNT_ARRAY_ENTRIES buffers are pinned at once,
 201 * all refcounts are kept in the array; after that, new array entries
202 * displace old ones into the hash table. That way a frequently used entry
203 * can't get "stuck" in the hashtable while infrequent ones clog the array.
204 *
205 * Note that in most scenarios the number of pinned buffers will not exceed
206 * REFCOUNT_ARRAY_ENTRIES.
207 *
208 *
209 * To enter a buffer into the refcount tracking mechanism first reserve a free
210 * entry using ReservePrivateRefCountEntry() and then later, if necessary,
211 * fill it with NewPrivateRefCountEntry(). That split lets us avoid doing
212 * memory allocations in NewPrivateRefCountEntry() which can be important
213 * because in some scenarios it's called with a spinlock held...
214 */
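/*
 * Illustrative sketch (not part of bufmgr.c): the reserve-then-fill protocol
 * described above.  The entry is reserved up front, while it is still safe to
 * allocate memory, so that filling it in later never has to allocate while a
 * buffer header spinlock is held.  Roughly:
 *
 *		ReservePrivateRefCountEntry();
 *		... acquire/lock the buffer header, decide to pin it ...
 *		ref = NewPrivateRefCountEntry(BufferDescriptorGetBuffer(buf));
 *		ref->refcount++;
 */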
 215static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES];
 216static HTAB *PrivateRefCountHash = NULL;
 217static int32 PrivateRefCountOverflowed = 0;
 218static uint32 PrivateRefCountClock = 0;
 219static PrivateRefCountEntry *ReservedRefCountEntry = NULL;
 220
222
223static void ReservePrivateRefCountEntry(void);
 224static PrivateRefCountEntry *NewPrivateRefCountEntry(Buffer buffer);
 225static PrivateRefCountEntry *GetPrivateRefCountEntry(Buffer buffer, bool do_move);
 226static inline int32 GetPrivateRefCount(Buffer buffer);
 227static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref);
 228
229/* ResourceOwner callbacks to hold in-progress I/Os and buffer pins */
230static void ResOwnerReleaseBufferIO(Datum res);
231static char *ResOwnerPrintBufferIO(Datum res);
232static void ResOwnerReleaseBufferPin(Datum res);
233static char *ResOwnerPrintBufferPin(Datum res);
234
 235static const ResourceOwnerDesc buffer_io_resowner_desc =
 236{
237 .name = "buffer io",
238 .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
239 .release_priority = RELEASE_PRIO_BUFFER_IOS,
240 .ReleaseResource = ResOwnerReleaseBufferIO,
241 .DebugPrint = ResOwnerPrintBufferIO
242};
243
 244static const ResourceOwnerDesc buffer_pin_resowner_desc =
 245{
246 .name = "buffer pin",
247 .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
248 .release_priority = RELEASE_PRIO_BUFFER_PINS,
249 .ReleaseResource = ResOwnerReleaseBufferPin,
250 .DebugPrint = ResOwnerPrintBufferPin
251};
252
253/*
254 * Ensure that the PrivateRefCountArray has sufficient space to store one more
255 * entry. This has to be called before using NewPrivateRefCountEntry() to fill
256 * a new entry - but it's perfectly fine to not use a reserved entry.
257 */
258static void
 259ReservePrivateRefCountEntry(void)
 260{
261 /* Already reserved (or freed), nothing to do */
262 if (ReservedRefCountEntry != NULL)
263 return;
264
265 /*
 266 * First search for a free entry in the array; that'll be sufficient in the
267 * majority of cases.
268 */
269 {
270 int i;
271
272 for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
273 {
275
276 res = &PrivateRefCountArray[i];
277
278 if (res->buffer == InvalidBuffer)
279 {
281 return;
282 }
283 }
284 }
285
286 /*
287 * No luck. All array entries are full. Move one array entry into the hash
288 * table.
289 */
290 {
291 /*
292 * Move entry from the current clock position in the array into the
293 * hashtable. Use that slot.
294 */
295 PrivateRefCountEntry *hashent;
296 bool found;
297
298 /* select victim slot */
301
302 /* Better be used, otherwise we shouldn't get here. */
304
305 /* enter victim array entry into hashtable */
309 &found);
310 Assert(!found);
312
313 /* clear the now free array slot */
316
318 }
319}
320
321/*
322 * Fill a previously reserved refcount entry.
323 */
 324static PrivateRefCountEntry *
 325NewPrivateRefCountEntry(Buffer buffer)
 326{
328
329 /* only allowed to be called when a reservation has been made */
331
332 /* use up the reserved entry */
335
336 /* and fill it */
337 res->buffer = buffer;
338 res->refcount = 0;
339
340 return res;
341}
342
343/*
344 * Return the PrivateRefCount entry for the passed buffer.
345 *
346 * Returns NULL if a buffer doesn't have a refcount entry. Otherwise, if
 347 * do_move is true and the entry resides in the hashtable, the entry is moved
 348 * to the array to optimize it for frequent access.
349 */
 350static PrivateRefCountEntry *
 351GetPrivateRefCountEntry(Buffer buffer, bool do_move)
 352{
354 int i;
355
358
359 /*
360 * First search for references in the array, that'll be sufficient in the
361 * majority of cases.
362 */
363 for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
364 {
365 res = &PrivateRefCountArray[i];
366
367 if (res->buffer == buffer)
368 return res;
369 }
370
371 /*
372 * By here we know that the buffer, if already pinned, isn't residing in
373 * the array.
374 *
375 * Only look up the buffer in the hashtable if we've previously overflowed
376 * into it.
377 */
379 return NULL;
380
382
383 if (res == NULL)
384 return NULL;
385 else if (!do_move)
386 {
387 /* caller doesn't want us to move the hash entry into the array */
388 return res;
389 }
390 else
391 {
392 /* move buffer from hashtable into the free array slot */
393 bool found;
395
396 /* Ensure there's a free array slot */
398
399 /* Use up the reserved slot */
403 Assert(free->buffer == InvalidBuffer);
404
405 /* and fill it */
406 free->buffer = buffer;
407 free->refcount = res->refcount;
408
409 /* delete from hashtable */
411 Assert(found);
414
415 return free;
416 }
417}
418
419/*
420 * Returns how many times the passed buffer is pinned by this backend.
421 *
422 * Only works for shared memory buffers!
423 */
424static inline int32
 425GetPrivateRefCount(Buffer buffer)
 426{
428
431
432 /*
433 * Not moving the entry - that's ok for the current users, but we might
434 * want to change this one day.
435 */
436 ref = GetPrivateRefCountEntry(buffer, false);
437
438 if (ref == NULL)
439 return 0;
440 return ref->refcount;
441}
442
443/*
444 * Release resources used to track the reference count of a buffer which we no
445 * longer have pinned and don't want to pin again immediately.
446 */
447static void
 448ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref)
 449{
450 Assert(ref->refcount == 0);
451
452 if (ref >= &PrivateRefCountArray[0] &&
454 {
455 ref->buffer = InvalidBuffer;
456
457 /*
458 * Mark the just used entry as reserved - in many scenarios that
459 * allows us to avoid ever having to search the array/hash for free
460 * entries.
461 */
463 }
464 else
465 {
466 bool found;
467 Buffer buffer = ref->buffer;
468
470 Assert(found);
473 }
474}
475
476/*
477 * BufferIsPinned
478 * True iff the buffer is pinned (also checks for valid buffer number).
479 *
480 * NOTE: what we check here is that *this* backend holds a pin on
481 * the buffer. We do not care whether some other backend does.
482 */
483#define BufferIsPinned(bufnum) \
484( \
485 !BufferIsValid(bufnum) ? \
486 false \
487 : \
488 BufferIsLocal(bufnum) ? \
489 (LocalRefCount[-(bufnum) - 1] > 0) \
490 : \
491 (GetPrivateRefCount(bufnum) > 0) \
492)
493
494
496 SMgrRelation smgr, char smgr_persistence,
497 ForkNumber forkNum, BlockNumber blockNum,
500 ForkNumber fork,
501 BufferAccessStrategy strategy,
502 uint32 flags,
503 uint32 extend_by,
504 BlockNumber extend_upto,
505 Buffer *buffers,
506 uint32 *extended_by);
508 ForkNumber fork,
509 BufferAccessStrategy strategy,
510 uint32 flags,
511 uint32 extend_by,
512 BlockNumber extend_upto,
513 Buffer *buffers,
514 uint32 *extended_by);
515static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy,
516 bool skip_if_not_valid);
517static void PinBuffer_Locked(BufferDesc *buf);
518static void UnpinBuffer(BufferDesc *buf);
519static void UnpinBufferNoOwner(BufferDesc *buf);
520static void BufferSync(int flags);
521static int SyncOneBuffer(int buf_id, bool skip_recently_used,
522 WritebackContext *wb_context);
523static void WaitIO(BufferDesc *buf);
524static void AbortBufferIO(Buffer buffer);
525static void shared_buffer_write_error_callback(void *arg);
526static void local_buffer_write_error_callback(void *arg);
527static inline BufferDesc *BufferAlloc(SMgrRelation smgr,
528 char relpersistence,
529 ForkNumber forkNum,
530 BlockNumber blockNum,
531 BufferAccessStrategy strategy,
532 bool *foundPtr, IOContext io_context);
533static bool AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress);
534static void CheckReadBuffersOperation(ReadBuffersOperation *operation, bool is_complete);
535static Buffer GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context);
537 IOObject io_object, IOContext io_context);
538static void FlushBuffer(BufferDesc *buf, SMgrRelation reln,
539 IOObject io_object, IOContext io_context);
540static void FindAndDropRelationBuffers(RelFileLocator rlocator,
541 ForkNumber forkNum,
542 BlockNumber nForkBlock,
543 BlockNumber firstDelBlock);
545 RelFileLocator dstlocator,
546 ForkNumber forkNum, bool permanent);
547static void AtProcExit_Buffers(int code, Datum arg);
548static void CheckForBufferLeaks(void);
549#ifdef USE_ASSERT_CHECKING
550static void AssertNotCatalogBufferLock(LWLock *lock, LWLockMode mode,
551 void *unused_context);
552#endif
553static int rlocator_comparator(const void *p1, const void *p2);
554static inline int buffertag_comparator(const BufferTag *ba, const BufferTag *bb);
555static inline int ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b);
556static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg);
557
558
559/*
560 * Implementation of PrefetchBuffer() for shared buffers.
561 */
 562PrefetchBufferResult
 563PrefetchSharedBuffer(SMgrRelation smgr_reln,
 564 ForkNumber forkNum,
565 BlockNumber blockNum)
566{
567 PrefetchBufferResult result = {InvalidBuffer, false};
568 BufferTag newTag; /* identity of requested block */
569 uint32 newHash; /* hash value for newTag */
570 LWLock *newPartitionLock; /* buffer partition lock for it */
571 int buf_id;
572
573 Assert(BlockNumberIsValid(blockNum));
574
575 /* create a tag so we can lookup the buffer */
576 InitBufferTag(&newTag, &smgr_reln->smgr_rlocator.locator,
577 forkNum, blockNum);
578
579 /* determine its hash code and partition lock ID */
580 newHash = BufTableHashCode(&newTag);
581 newPartitionLock = BufMappingPartitionLock(newHash);
582
583 /* see if the block is in the buffer pool already */
584 LWLockAcquire(newPartitionLock, LW_SHARED);
585 buf_id = BufTableLookup(&newTag, newHash);
586 LWLockRelease(newPartitionLock);
587
588 /* If not in buffers, initiate prefetch */
589 if (buf_id < 0)
590 {
591#ifdef USE_PREFETCH
592 /*
593 * Try to initiate an asynchronous read. This returns false in
594 * recovery if the relation file doesn't exist.
595 */
596 if ((io_direct_flags & IO_DIRECT_DATA) == 0 &&
597 smgrprefetch(smgr_reln, forkNum, blockNum, 1))
598 {
599 result.initiated_io = true;
600 }
601#endif /* USE_PREFETCH */
602 }
603 else
604 {
605 /*
606 * Report the buffer it was in at that time. The caller may be able
607 * to avoid a buffer table lookup, but it's not pinned and it must be
608 * rechecked!
609 */
610 result.recent_buffer = buf_id + 1;
611 }
612
613 /*
614 * If the block *is* in buffers, we do nothing. This is not really ideal:
615 * the block might be just about to be evicted, which would be stupid
616 * since we know we are going to need it soon. But the only easy answer
617 * is to bump the usage_count, which does not seem like a great solution:
618 * when the caller does ultimately touch the block, usage_count would get
619 * bumped again, resulting in too much favoritism for blocks that are
620 * involved in a prefetch sequence. A real fix would involve some
621 * additional per-buffer state, and it's not clear that there's enough of
622 * a problem to justify that.
623 */
624
625 return result;
626}
627
628/*
629 * PrefetchBuffer -- initiate asynchronous read of a block of a relation
630 *
631 * This is named by analogy to ReadBuffer but doesn't actually allocate a
632 * buffer. Instead it tries to ensure that a future ReadBuffer for the given
633 * block will not be delayed by the I/O. Prefetching is optional.
634 *
635 * There are three possible outcomes:
636 *
637 * 1. If the block is already cached, the result includes a valid buffer that
638 * could be used by the caller to avoid the need for a later buffer lookup, but
639 * it's not pinned, so the caller must recheck it.
640 *
641 * 2. If the kernel has been asked to initiate I/O, the initiated_io member is
642 * true. Currently there is no way to know if the data was already cached by
643 * the kernel and therefore didn't really initiate I/O, and no way to know when
644 * the I/O completes other than using synchronous ReadBuffer().
645 *
646 * 3. Otherwise, the buffer wasn't already cached by PostgreSQL, and
647 * USE_PREFETCH is not defined (this build doesn't support prefetching due to
648 * lack of a kernel facility), direct I/O is enabled, or the underlying
649 * relation file wasn't found and we are in recovery. (If the relation file
650 * wasn't found and we are not in recovery, an error is raised).
651 */
 652PrefetchBufferResult
 653PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
 654{
655 Assert(RelationIsValid(reln));
656 Assert(BlockNumberIsValid(blockNum));
657
658 if (RelationUsesLocalBuffers(reln))
659 {
660 /* see comments in ReadBufferExtended */
661 if (RELATION_IS_OTHER_TEMP(reln))
663 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
664 errmsg("cannot access temporary tables of other sessions")));
665
666 /* pass it off to localbuf.c */
667 return PrefetchLocalBuffer(RelationGetSmgr(reln), forkNum, blockNum);
668 }
669 else
670 {
671 /* pass it to the shared buffer version */
672 return PrefetchSharedBuffer(RelationGetSmgr(reln), forkNum, blockNum);
673 }
674}
675
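/*
 * Illustrative sketch (not part of bufmgr.c): a caller that expects to need
 * block "blkno" of relation "rel" soon can hint the I/O ahead of time and
 * read it normally later.  The prefetch is purely advisory; the subsequent
 * ReadBufferExtended() behaves the same whether or not any I/O was initiated.
 *
 *		(void) PrefetchBuffer(rel, MAIN_FORKNUM, blkno);
 *		... do other work ...
 *		buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, NULL);
 */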
676/*
677 * ReadRecentBuffer -- try to pin a block in a recently observed buffer
678 *
679 * Compared to ReadBuffer(), this avoids a buffer mapping lookup when it's
680 * successful. Return true if the buffer is valid and still has the expected
681 * tag. In that case, the buffer is pinned and the usage count is bumped.
682 */
683bool
 684ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum,
 685 Buffer recent_buffer)
686{
687 BufferDesc *bufHdr;
688 BufferTag tag;
689 uint32 buf_state;
690
691 Assert(BufferIsValid(recent_buffer));
692
695 InitBufferTag(&tag, &rlocator, forkNum, blockNum);
696
697 if (BufferIsLocal(recent_buffer))
698 {
699 int b = -recent_buffer - 1;
700
701 bufHdr = GetLocalBufferDescriptor(b);
702 buf_state = pg_atomic_read_u32(&bufHdr->state);
703
704 /* Is it still valid and holding the right tag? */
705 if ((buf_state & BM_VALID) && BufferTagsEqual(&tag, &bufHdr->tag))
706 {
707 PinLocalBuffer(bufHdr, true);
708
710
711 return true;
712 }
713 }
714 else
715 {
716 bufHdr = GetBufferDescriptor(recent_buffer - 1);
717
718 /*
719 * Is it still valid and holding the right tag? We do an unlocked tag
720 * comparison first, to make it unlikely that we'll increment the
721 * usage counter of the wrong buffer, if someone calls us with a very
722 * out of date recent_buffer. Then we'll check it again if we get the
723 * pin.
724 */
725 if (BufferTagsEqual(&tag, &bufHdr->tag) &&
726 PinBuffer(bufHdr, NULL, true))
727 {
728 if (BufferTagsEqual(&tag, &bufHdr->tag))
729 {
731 return true;
732 }
733 UnpinBuffer(bufHdr);
734 }
735 }
736
737 return false;
738}
739
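/*
 * Illustrative sketch (not part of bufmgr.c): combining PrefetchBuffer()'s
 * recent_buffer hint with ReadRecentBuffer() to skip the mapping-table
 * lookup, falling back to an ordinary read when the hint has gone stale.
 * "rel" and "blkno" are assumed; error handling is omitted.
 *
 *		PrefetchBufferResult pf = PrefetchBuffer(rel, MAIN_FORKNUM, blkno);
 *		Buffer		buf;
 *
 *		if (BufferIsValid(pf.recent_buffer) &&
 *			ReadRecentBuffer(rel->rd_locator, MAIN_FORKNUM, blkno,
 *							 pf.recent_buffer))
 *			buf = pf.recent_buffer;
 *		else
 *			buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, NULL);
 */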
740/*
741 * ReadBuffer -- a shorthand for ReadBufferExtended, for reading from main
742 * fork with RBM_NORMAL mode and default strategy.
743 */
744Buffer
 745ReadBuffer(Relation reln, BlockNumber blockNum)
 746{
747 return ReadBufferExtended(reln, MAIN_FORKNUM, blockNum, RBM_NORMAL, NULL);
748}
749
750/*
751 * ReadBufferExtended -- returns a buffer containing the requested
752 * block of the requested relation. If the blknum
753 * requested is P_NEW, extend the relation file and
754 * allocate a new block. (Caller is responsible for
755 * ensuring that only one backend tries to extend a
756 * relation at the same time!)
757 *
758 * Returns: the buffer number for the buffer containing
759 * the block read. The returned buffer has been pinned.
760 * Does not return on error --- elog's instead.
761 *
 762 * Assume that reln has already been opened when this function is called.
763 *
764 * In RBM_NORMAL mode, the page is read from disk, and the page header is
765 * validated. An error is thrown if the page header is not valid. (But
766 * note that an all-zero page is considered "valid"; see
767 * PageIsVerified().)
768 *
769 * RBM_ZERO_ON_ERROR is like the normal mode, but if the page header is not
770 * valid, the page is zeroed instead of throwing an error. This is intended
771 * for non-critical data, where the caller is prepared to repair errors.
772 *
773 * In RBM_ZERO_AND_LOCK mode, if the page isn't in buffer cache already, it's
774 * filled with zeros instead of reading it from disk. Useful when the caller
775 * is going to fill the page from scratch, since this saves I/O and avoids
776 * unnecessary failure if the page-on-disk has corrupt page headers.
777 * The page is returned locked to ensure that the caller has a chance to
778 * initialize the page before it's made visible to others.
779 * Caution: do not use this mode to read a page that is beyond the relation's
780 * current physical EOF; that is likely to cause problems in md.c when
781 * the page is modified and written out. P_NEW is OK, though.
782 *
783 * RBM_ZERO_AND_CLEANUP_LOCK is the same as RBM_ZERO_AND_LOCK, but acquires
784 * a cleanup-strength lock on the page.
785 *
786 * RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here.
787 *
788 * If strategy is not NULL, a nondefault buffer access strategy is used.
789 * See buffer/README for details.
790 */
791inline Buffer
 792ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum,
 793 ReadBufferMode mode, BufferAccessStrategy strategy)
 794{
795 Buffer buf;
796
797 /*
798 * Reject attempts to read non-local temporary relations; we would be
799 * likely to get wrong data since we have no visibility into the owning
800 * session's local buffers.
801 */
802 if (RELATION_IS_OTHER_TEMP(reln))
804 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
805 errmsg("cannot access temporary tables of other sessions")));
806
807 /*
808 * Read the buffer, and update pgstat counters to reflect a cache hit or
809 * miss.
810 */
811 buf = ReadBuffer_common(reln, RelationGetSmgr(reln), 0,
812 forkNum, blockNum, mode, strategy);
813
814 return buf;
815}
816
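/*
 * Illustrative sketch (not part of bufmgr.c): reading non-critical data with
 * RBM_ZERO_ON_ERROR, as described above.  A page whose header fails
 * verification comes back zeroed instead of raising an error, so the caller
 * must be prepared to see, and repair or ignore, an all-zeros page.
 *
 *		buf = ReadBufferExtended(rel, fork, blkno, RBM_ZERO_ON_ERROR, NULL);
 *		LockBuffer(buf, BUFFER_LOCK_SHARE);
 *		page = BufferGetPage(buf);
 *		if (PageIsNew(page))
 *			... treat as missing or repairable data ...
 *		UnlockReleaseBuffer(buf);
 */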
817
818/*
819 * ReadBufferWithoutRelcache -- like ReadBufferExtended, but doesn't require
820 * a relcache entry for the relation.
821 *
822 * Pass permanent = true for a RELPERSISTENCE_PERMANENT relation, and
823 * permanent = false for a RELPERSISTENCE_UNLOGGED relation. This function
824 * cannot be used for temporary relations (and making that work might be
825 * difficult, unless we only want to read temporary relations for our own
826 * ProcNumber).
827 */
828Buffer
 829ReadBufferWithoutRelcache(RelFileLocator rlocator, ForkNumber forkNum,
 830 BlockNumber blockNum, ReadBufferMode mode,
 831 BufferAccessStrategy strategy, bool permanent)
832{
833 SMgrRelation smgr = smgropen(rlocator, INVALID_PROC_NUMBER);
834
835 return ReadBuffer_common(NULL, smgr,
836 permanent ? RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED,
837 forkNum, blockNum,
838 mode, strategy);
839}
840
841/*
842 * Convenience wrapper around ExtendBufferedRelBy() extending by one block.
843 */
844Buffer
 845ExtendBufferedRel(BufferManagerRelation bmr,
 846 ForkNumber forkNum,
847 BufferAccessStrategy strategy,
848 uint32 flags)
849{
850 Buffer buf;
851 uint32 extend_by = 1;
852
853 ExtendBufferedRelBy(bmr, forkNum, strategy, flags, extend_by,
854 &buf, &extend_by);
855
856 return buf;
857}
858
859/*
860 * Extend relation by multiple blocks.
861 *
862 * Tries to extend the relation by extend_by blocks. Depending on the
863 * availability of resources the relation may end up being extended by a
864 * smaller number of pages (unless an error is thrown, always by at least one
865 * page). *extended_by is updated to the number of pages the relation has been
 866 * extended by.
867 *
868 * buffers needs to be an array that is at least extend_by long. Upon
869 * completion, the first extend_by array elements will point to a pinned
870 * buffer.
871 *
872 * If EB_LOCK_FIRST is part of flags, the first returned buffer is
873 * locked. This is useful for callers that want a buffer that is guaranteed to
874 * be empty.
875 */
 876BlockNumber
 877ExtendBufferedRelBy(BufferManagerRelation bmr,
 878 ForkNumber fork,
879 BufferAccessStrategy strategy,
880 uint32 flags,
881 uint32 extend_by,
882 Buffer *buffers,
883 uint32 *extended_by)
884{
885 Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
886 Assert(bmr.smgr == NULL || bmr.relpersistence != 0);
887 Assert(extend_by > 0);
888
889 if (bmr.smgr == NULL)
890 {
891 bmr.smgr = RelationGetSmgr(bmr.rel);
892 bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
893 }
894
895 return ExtendBufferedRelCommon(bmr, fork, strategy, flags,
896 extend_by, InvalidBlockNumber,
897 buffers, extended_by);
898}
899
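/*
 * Illustrative sketch (not part of bufmgr.c): bulk-extending a relation by up
 * to 8 blocks in one call.  Fewer blocks may be added; every returned buffer
 * is pinned, and with EB_LOCK_FIRST the first one is also locked.
 *
 *		Buffer		newbufs[8];
 *		uint32		extended_by = 0;
 *
 *		ExtendBufferedRelBy(BMR_REL(rel), MAIN_FORKNUM, NULL,
 *							EB_LOCK_FIRST, 8, newbufs, &extended_by);
 *		... use newbufs[0] (locked), release the others when done ...
 */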
900/*
901 * Extend the relation so it is at least extend_to blocks large, return buffer
902 * (extend_to - 1).
903 *
904 * This is useful for callers that want to write a specific page, regardless
905 * of the current size of the relation (e.g. useful for visibilitymap and for
906 * crash recovery).
907 */
908Buffer
 909ExtendBufferedRelTo(BufferManagerRelation bmr,
 910 ForkNumber fork,
911 BufferAccessStrategy strategy,
912 uint32 flags,
913 BlockNumber extend_to,
 914 ReadBufferMode mode)
 915{
917 uint32 extended_by = 0;
919 Buffer buffers[64];
920
921 Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
922 Assert(bmr.smgr == NULL || bmr.relpersistence != 0);
923 Assert(extend_to != InvalidBlockNumber && extend_to > 0);
924
925 if (bmr.smgr == NULL)
926 {
927 bmr.smgr = RelationGetSmgr(bmr.rel);
928 bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
929 }
930
931 /*
932 * If desired, create the file if it doesn't exist. If
933 * smgr_cached_nblocks[fork] is positive then it must exist, no need for
934 * an smgrexists call.
935 */
936 if ((flags & EB_CREATE_FORK_IF_NEEDED) &&
937 (bmr.smgr->smgr_cached_nblocks[fork] == 0 ||
939 !smgrexists(bmr.smgr, fork))
940 {
942
943 /* recheck, fork might have been created concurrently */
944 if (!smgrexists(bmr.smgr, fork))
945 smgrcreate(bmr.smgr, fork, flags & EB_PERFORMING_RECOVERY);
946
948 }
949
950 /*
951 * If requested, invalidate size cache, so that smgrnblocks asks the
952 * kernel.
953 */
954 if (flags & EB_CLEAR_SIZE_CACHE)
956
957 /*
958 * Estimate how many pages we'll need to extend by. This avoids acquiring
959 * unnecessarily many victim buffers.
960 */
961 current_size = smgrnblocks(bmr.smgr, fork);
962
963 /*
964 * Since no-one else can be looking at the page contents yet, there is no
965 * difference between an exclusive lock and a cleanup-strength lock. Note
966 * that we pass the original mode to ReadBuffer_common() below, when
 967 * falling back to reading the buffer because of a concurrent relation extension.
968 */
970 flags |= EB_LOCK_TARGET;
971
972 while (current_size < extend_to)
973 {
974 uint32 num_pages = lengthof(buffers);
975 BlockNumber first_block;
976
977 if ((uint64) current_size + num_pages > extend_to)
978 num_pages = extend_to - current_size;
979
980 first_block = ExtendBufferedRelCommon(bmr, fork, strategy, flags,
981 num_pages, extend_to,
982 buffers, &extended_by);
983
984 current_size = first_block + extended_by;
985 Assert(num_pages != 0 || current_size >= extend_to);
986
987 for (uint32 i = 0; i < extended_by; i++)
988 {
989 if (first_block + i != extend_to - 1)
990 ReleaseBuffer(buffers[i]);
991 else
992 buffer = buffers[i];
993 }
994 }
995
996 /*
997 * It's possible that another backend concurrently extended the relation.
998 * In that case read the buffer.
999 *
1000 * XXX: Should we control this via a flag?
1001 */
1002 if (buffer == InvalidBuffer)
1003 {
1004 Assert(extended_by == 0);
1006 fork, extend_to - 1, mode, strategy);
1007 }
1008
1009 return buffer;
1010}
1011
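/*
 * Illustrative sketch (not part of bufmgr.c): making sure a specific block
 * exists, in the style of the visibilitymap code mentioned above.  The fork
 * is created if necessary and the relation extended until block "blkno"
 * exists; the buffer for that block is returned pinned.
 *
 *		buf = ExtendBufferedRelTo(BMR_REL(rel), VISIBILITYMAP_FORKNUM, NULL,
 *								  EB_CREATE_FORK_IF_NEEDED | EB_CLEAR_SIZE_CACHE,
 *								  blkno + 1, RBM_ZERO_ON_ERROR);
 */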
1012/*
1013 * Lock and optionally zero a buffer, as part of the implementation of
1014 * RBM_ZERO_AND_LOCK or RBM_ZERO_AND_CLEANUP_LOCK. The buffer must be already
1015 * pinned. If the buffer is not already valid, it is zeroed and made valid.
1016 */
1017static void
 1018ZeroAndLockBuffer(Buffer buffer, ReadBufferMode mode, bool already_valid)
 1019{
1020 BufferDesc *bufHdr;
1021 bool need_to_zero;
1022 bool isLocalBuf = BufferIsLocal(buffer);
1023
1025
1026 if (already_valid)
1027 {
1028 /*
1029 * If the caller already knew the buffer was valid, we can skip some
1030 * header interaction. The caller just wants to lock the buffer.
1031 */
1032 need_to_zero = false;
1033 }
1034 else if (isLocalBuf)
1035 {
1036 /* Simple case for non-shared buffers. */
1037 bufHdr = GetLocalBufferDescriptor(-buffer - 1);
1038 need_to_zero = StartLocalBufferIO(bufHdr, true, false);
1039 }
1040 else
1041 {
1042 /*
1043 * Take BM_IO_IN_PROGRESS, or discover that BM_VALID has been set
1044 * concurrently. Even though we aren't doing I/O, that ensures that
1045 * we don't zero a page that someone else has pinned. An exclusive
1046 * content lock wouldn't be enough, because readers are allowed to
1047 * drop the content lock after determining that a tuple is visible
1048 * (see buffer access rules in README).
1049 */
1050 bufHdr = GetBufferDescriptor(buffer - 1);
1051 need_to_zero = StartBufferIO(bufHdr, true, false);
1052 }
1053
1054 if (need_to_zero)
1055 {
1056 memset(BufferGetPage(buffer), 0, BLCKSZ);
1057
1058 /*
1059 * Grab the buffer content lock before marking the page as valid, to
1060 * make sure that no other backend sees the zeroed page before the
1061 * caller has had a chance to initialize it.
1062 *
1063 * Since no-one else can be looking at the page contents yet, there is
1064 * no difference between an exclusive lock and a cleanup-strength
1065 * lock. (Note that we cannot use LockBuffer() or
1066 * LockBufferForCleanup() here, because they assert that the buffer is
1067 * already valid.)
1068 */
1069 if (!isLocalBuf)
1071
1072 /* Set BM_VALID, terminate IO, and wake up any waiters */
1073 if (isLocalBuf)
1074 TerminateLocalBufferIO(bufHdr, false, BM_VALID, false);
1075 else
1076 TerminateBufferIO(bufHdr, false, BM_VALID, true, false);
1077 }
1078 else if (!isLocalBuf)
1079 {
1080 /*
1081 * The buffer is valid, so we can't zero it. The caller still expects
1082 * the page to be locked on return.
1083 */
1084 if (mode == RBM_ZERO_AND_LOCK)
1086 else
1088 }
1089}
1090
1091/*
1092 * Pin a buffer for a given block. *foundPtr is set to true if the block was
1093 * already present, or false if more work is required to either read it in or
1094 * zero it.
1095 */
 1096static pg_attribute_always_inline Buffer
 1097PinBufferForBlock(Relation rel,
 1098 SMgrRelation smgr,
1099 char persistence,
1100 ForkNumber forkNum,
1101 BlockNumber blockNum,
1102 BufferAccessStrategy strategy,
1103 bool *foundPtr)
1104{
1105 BufferDesc *bufHdr;
1106 IOContext io_context;
1107 IOObject io_object;
1108
1109 Assert(blockNum != P_NEW);
1110
1111 /* Persistence should be set before */
1112 Assert((persistence == RELPERSISTENCE_TEMP ||
1113 persistence == RELPERSISTENCE_PERMANENT ||
1114 persistence == RELPERSISTENCE_UNLOGGED));
1115
1116 if (persistence == RELPERSISTENCE_TEMP)
1117 {
1118 io_context = IOCONTEXT_NORMAL;
1119 io_object = IOOBJECT_TEMP_RELATION;
1120 }
1121 else
1122 {
1123 io_context = IOContextForStrategy(strategy);
1124 io_object = IOOBJECT_RELATION;
1125 }
1126
1127 TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
1131 smgr->smgr_rlocator.backend);
1132
1133 if (persistence == RELPERSISTENCE_TEMP)
1134 {
1135 bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, foundPtr);
1136 if (*foundPtr)
1138 }
1139 else
1140 {
1141 bufHdr = BufferAlloc(smgr, persistence, forkNum, blockNum,
1142 strategy, foundPtr, io_context);
1143 if (*foundPtr)
1145 }
1146 if (rel)
1147 {
1148 /*
1149 * While pgBufferUsage's "read" counter isn't bumped unless we reach
1150 * WaitReadBuffers() (so, not for hits, and not for buffers that are
1151 * zeroed instead), the per-relation stats always count them.
1152 */
1154 if (*foundPtr)
1156 }
1157 if (*foundPtr)
1158 {
1159 pgstat_count_io_op(io_object, io_context, IOOP_HIT, 1, 0);
1160 if (VacuumCostActive)
1162
1163 TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
1167 smgr->smgr_rlocator.backend,
1168 true);
1169 }
1170
1171 return BufferDescriptorGetBuffer(bufHdr);
1172}
1173
1174/*
1175 * ReadBuffer_common -- common logic for all ReadBuffer variants
1176 *
1177 * smgr is required, rel is optional unless using P_NEW.
1178 */
 1179static Buffer
 1180ReadBuffer_common(Relation rel, SMgrRelation smgr, char smgr_persistence,
1181 ForkNumber forkNum,
 1182 BlockNumber blockNum, ReadBufferMode mode,
 1183 BufferAccessStrategy strategy)
1184{
1185 ReadBuffersOperation operation;
1186 Buffer buffer;
1187 int flags;
1188 char persistence;
1189
1190 /*
1191 * Backward compatibility path, most code should use ExtendBufferedRel()
1192 * instead, as acquiring the extension lock inside ExtendBufferedRel()
1193 * scales a lot better.
1194 */
1195 if (unlikely(blockNum == P_NEW))
1196 {
1198
1199 /*
1200 * Since no-one else can be looking at the page contents yet, there is
1201 * no difference between an exclusive lock and a cleanup-strength
1202 * lock.
1203 */
1205 flags |= EB_LOCK_FIRST;
1206
1207 return ExtendBufferedRel(BMR_REL(rel), forkNum, strategy, flags);
1208 }
1209
1210 if (rel)
1211 persistence = rel->rd_rel->relpersistence;
1212 else
1213 persistence = smgr_persistence;
1214
1217 {
1218 bool found;
1219
1220 buffer = PinBufferForBlock(rel, smgr, persistence,
1221 forkNum, blockNum, strategy, &found);
1222 ZeroAndLockBuffer(buffer, mode, found);
1223 return buffer;
1224 }
1225
1226 /*
1227 * Signal that we are going to immediately wait. If we're immediately
1228 * waiting, there is no benefit in actually executing the IO
 1229 * asynchronously; it would just add dispatch overhead.
1230 */
1232 if (mode == RBM_ZERO_ON_ERROR)
1234 operation.smgr = smgr;
1235 operation.rel = rel;
1236 operation.persistence = persistence;
1237 operation.forknum = forkNum;
1238 operation.strategy = strategy;
1239 if (StartReadBuffer(&operation,
1240 &buffer,
1241 blockNum,
1242 flags))
1243 WaitReadBuffers(&operation);
1244
1245 return buffer;
1246}
1247
 1248static pg_attribute_always_inline bool
 1249StartReadBuffersImpl(ReadBuffersOperation *operation,
 1250 Buffer *buffers,
1251 BlockNumber blockNum,
1252 int *nblocks,
1253 int flags,
1254 bool allow_forwarding)
1255{
1256 int actual_nblocks = *nblocks;
1257 int maxcombine = 0;
1258 bool did_start_io;
1259
1260 Assert(*nblocks == 1 || allow_forwarding);
1261 Assert(*nblocks > 0);
1262 Assert(*nblocks <= MAX_IO_COMBINE_LIMIT);
1263
1264 for (int i = 0; i < actual_nblocks; ++i)
1265 {
1266 bool found;
1267
1268 if (allow_forwarding && buffers[i] != InvalidBuffer)
1269 {
1270 BufferDesc *bufHdr;
1271
1272 /*
1273 * This is a buffer that was pinned by an earlier call to
1274 * StartReadBuffers(), but couldn't be handled in one operation at
1275 * that time. The operation was split, and the caller has passed
1276 * an already pinned buffer back to us to handle the rest of the
1277 * operation. It must continue at the expected block number.
1278 */
1279 Assert(BufferGetBlockNumber(buffers[i]) == blockNum + i);
1280
1281 /*
1282 * It might be an already valid buffer (a hit) that followed the
1283 * final contiguous block of an earlier I/O (a miss) marking the
1284 * end of it, or a buffer that some other backend has since made
1285 * valid by performing the I/O for us, in which case we can handle
1286 * it as a hit now. It is safe to check for a BM_VALID flag with
1287 * a relaxed load, because we got a fresh view of it while pinning
1288 * it in the previous call.
1289 *
1290 * On the other hand if we don't see BM_VALID yet, it must be an
1291 * I/O that was split by the previous call and we need to try to
1292 * start a new I/O from this block. We're also racing against any
1293 * other backend that might start the I/O or even manage to mark
1294 * it BM_VALID after this check, but StartBufferIO() will handle
1295 * those cases.
1296 */
1297 if (BufferIsLocal(buffers[i]))
1298 bufHdr = GetLocalBufferDescriptor(-buffers[i] - 1);
1299 else
1300 bufHdr = GetBufferDescriptor(buffers[i] - 1);
1302 found = pg_atomic_read_u32(&bufHdr->state) & BM_VALID;
1303 }
1304 else
1305 {
1306 buffers[i] = PinBufferForBlock(operation->rel,
1307 operation->smgr,
1308 operation->persistence,
1309 operation->forknum,
1310 blockNum + i,
1311 operation->strategy,
1312 &found);
1313 }
1314
1315 if (found)
1316 {
1317 /*
1318 * We have a hit. If it's the first block in the requested range,
1319 * we can return it immediately and report that WaitReadBuffers()
1320 * does not need to be called. If the initial value of *nblocks
1321 * was larger, the caller will have to call again for the rest.
1322 */
1323 if (i == 0)
1324 {
1325 *nblocks = 1;
1326
1327#ifdef USE_ASSERT_CHECKING
1328
1329 /*
1330 * Initialize enough of ReadBuffersOperation to make
1331 * CheckReadBuffersOperation() work. Outside of assertions
1332 * that's not necessary when no IO is issued.
1333 */
1334 operation->buffers = buffers;
1335 operation->blocknum = blockNum;
1336 operation->nblocks = 1;
1337 operation->nblocks_done = 1;
1338 CheckReadBuffersOperation(operation, true);
1339#endif
1340 return false;
1341 }
1342
1343 /*
1344 * Otherwise we already have an I/O to perform, but this block
1345 * can't be included as it is already valid. Split the I/O here.
1346 * There may or may not be more blocks requiring I/O after this
1347 * one, we haven't checked, but they can't be contiguous with this
1348 * one in the way. We'll leave this buffer pinned, forwarding it
1349 * to the next call, avoiding the need to unpin it here and re-pin
1350 * it in the next call.
1351 */
1352 actual_nblocks = i;
1353 break;
1354 }
1355 else
1356 {
1357 /*
1358 * Check how many blocks we can cover with the same IO. The smgr
1359 * implementation might e.g. be limited due to a segment boundary.
1360 */
1361 if (i == 0 && actual_nblocks > 1)
1362 {
1363 maxcombine = smgrmaxcombine(operation->smgr,
1364 operation->forknum,
1365 blockNum);
1366 if (unlikely(maxcombine < actual_nblocks))
1367 {
1368 elog(DEBUG2, "limiting nblocks at %u from %u to %u",
1369 blockNum, actual_nblocks, maxcombine);
1370 actual_nblocks = maxcombine;
1371 }
1372 }
1373 }
1374 }
1375 *nblocks = actual_nblocks;
1376
1377 /* Populate information needed for I/O. */
1378 operation->buffers = buffers;
1379 operation->blocknum = blockNum;
1380 operation->flags = flags;
1381 operation->nblocks = actual_nblocks;
1382 operation->nblocks_done = 0;
1383 pgaio_wref_clear(&operation->io_wref);
1384
1385 /*
1386 * When using AIO, start the IO in the background. If not, issue prefetch
1387 * requests if desired by the caller.
1388 *
1389 * The reason we have a dedicated path for IOMETHOD_SYNC here is to
1390 * de-risk the introduction of AIO somewhat. It's a large architectural
1391 * change, with lots of chances for unanticipated performance effects.
1392 *
1393 * Use of IOMETHOD_SYNC already leads to not actually performing IO
1394 * asynchronously, but without the check here we'd execute IO earlier than
1395 * we used to. Eventually this IOMETHOD_SYNC specific path should go away.
1396 */
1397 if (io_method != IOMETHOD_SYNC)
1398 {
1399 /*
1400 * Try to start IO asynchronously. It's possible that no IO needs to
1401 * be started, if another backend already performed the IO.
1402 *
1403 * Note that if an IO is started, it might not cover the entire
1404 * requested range, e.g. because an intermediary block has been read
1405 * in by another backend. In that case any "trailing" buffers we
1406 * already pinned above will be "forwarded" by read_stream.c to the
1407 * next call to StartReadBuffers().
1408 *
1409 * This is signalled to the caller by decrementing *nblocks *and*
1410 * reducing operation->nblocks. The latter is done here, but not below
1411 * WaitReadBuffers(), as in WaitReadBuffers() we can't "shorten" the
1412 * overall read size anymore, we need to retry until done in its
1413 * entirety or until failed.
1414 */
1415 did_start_io = AsyncReadBuffers(operation, nblocks);
1416
1417 operation->nblocks = *nblocks;
1418 }
1419 else
1420 {
1421 operation->flags |= READ_BUFFERS_SYNCHRONOUSLY;
1422
1423 if (flags & READ_BUFFERS_ISSUE_ADVICE)
1424 {
1425 /*
1426 * In theory we should only do this if PinBufferForBlock() had to
1427 * allocate new buffers above. That way, if two calls to
1428 * StartReadBuffers() were made for the same blocks before
1429 * WaitReadBuffers(), only the first would issue the advice.
1430 * That'd be a better simulation of true asynchronous I/O, which
1431 * would only start the I/O once, but isn't done here for
1432 * simplicity.
1433 */
1434 smgrprefetch(operation->smgr,
1435 operation->forknum,
1436 blockNum,
1437 actual_nblocks);
1438 }
1439
1440 /*
1441 * Indicate that WaitReadBuffers() should be called. WaitReadBuffers()
1442 * will initiate the necessary IO.
1443 */
1444 did_start_io = true;
1445 }
1446
1447 CheckReadBuffersOperation(operation, !did_start_io);
1448
1449 return did_start_io;
1450}
1451
1452/*
1453 * Begin reading a range of blocks beginning at blockNum and extending for
1454 * *nblocks. *nblocks and the buffers array are in/out parameters. On entry,
1455 * the buffers elements covered by *nblocks must hold either InvalidBuffer or
1456 * buffers forwarded by an earlier call to StartReadBuffers() that was split
1457 * and is now being continued. On return, *nblocks holds the number of blocks
1458 * accepted by this operation. If it is less than the original number then
1459 * this operation has been split, but buffer elements up to the original
1460 * requested size may hold forwarded buffers to be used for a continuing
1461 * operation. The caller must either start a new I/O beginning at the block
1462 * immediately following the blocks accepted by this call and pass those
1463 * buffers back in, or release them if it chooses not to. It shouldn't make
1464 * any other use of or assumptions about forwarded buffers.
1465 *
1466 * If false is returned, no I/O is necessary and the buffers covered by
1467 * *nblocks on exit are valid and ready to be accessed. If true is returned,
1468 * an I/O has been started, and WaitReadBuffers() must be called with the same
1469 * operation object before the buffers covered by *nblocks on exit can be
1470 * accessed. Along with the operation object, the caller-supplied array of
1471 * buffers must remain valid until WaitReadBuffers() is called, and any
1472 * forwarded buffers must also be preserved for a continuing call unless
1473 * they are explicitly released.
1474 */
1475bool
 1476StartReadBuffers(ReadBuffersOperation *operation,
 1477 Buffer *buffers,
1478 BlockNumber blockNum,
1479 int *nblocks,
1480 int flags)
1481{
1482 return StartReadBuffersImpl(operation, buffers, blockNum, nblocks, flags,
1483 true /* expect forwarded buffers */ );
1484}
1485
1486/*
1487 * Single block version of the StartReadBuffers(). This might save a few
1488 * instructions when called from another translation unit, because it is
1489 * specialized for nblocks == 1.
1490 *
1491 * This version does not support "forwarded" buffers: they cannot be created
1492 * by reading only one block and *buffer is ignored on entry.
1493 */
1494bool
 1495StartReadBuffer(ReadBuffersOperation *operation,
 1496 Buffer *buffer,
1497 BlockNumber blocknum,
1498 int flags)
1499{
1500 int nblocks = 1;
1501 bool result;
1502
1503 result = StartReadBuffersImpl(operation, buffer, blocknum, &nblocks, flags,
1504 false /* single block, no forwarding */ );
1505 Assert(nblocks == 1); /* single block can't be short */
1506
1507 return result;
1508}
1509
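/*
 * Illustrative sketch (not part of bufmgr.c): the split read API for a single
 * block, mirroring what ReadBuffer_common() does internally.  "rel" and
 * "blkno" are assumed.  StartReadBuffer() pins the buffer and possibly starts
 * I/O; WaitReadBuffers() is needed only when it returns true.
 *
 *		ReadBuffersOperation op;
 *		Buffer		buf;
 *
 *		op.smgr = RelationGetSmgr(rel);
 *		op.rel = rel;
 *		op.persistence = rel->rd_rel->relpersistence;
 *		op.forknum = MAIN_FORKNUM;
 *		op.strategy = NULL;
 *		if (StartReadBuffer(&op, &buf, blkno, 0))
 *			WaitReadBuffers(&op);
 *		... buf is now pinned and valid ...
 */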
1510/*
1511 * Perform sanity checks on the ReadBuffersOperation.
1512 */
1513static void
 1514CheckReadBuffersOperation(ReadBuffersOperation *operation, bool is_complete)
 1515{
1516#ifdef USE_ASSERT_CHECKING
1517 Assert(operation->nblocks_done <= operation->nblocks);
1518 Assert(!is_complete || operation->nblocks == operation->nblocks_done);
1519
1520 for (int i = 0; i < operation->nblocks; i++)
1521 {
1522 Buffer buffer = operation->buffers[i];
1523 BufferDesc *buf_hdr = BufferIsLocal(buffer) ?
1526
1527 Assert(BufferGetBlockNumber(buffer) == operation->blocknum + i);
1529
1530 if (i < operation->nblocks_done)
1532 }
1533#endif
1534}
1535
1536/* helper for ReadBuffersCanStartIO(), to avoid repetition */
1537static inline bool
 1538ReadBuffersCanStartIOOnce(Buffer buffer, bool nowait)
 1539{
1540 if (BufferIsLocal(buffer))
1542 true, nowait);
1543 else
1544 return StartBufferIO(GetBufferDescriptor(buffer - 1), true, nowait);
1545}
1546
1547/*
1548 * Helper for AsyncReadBuffers that tries to get the buffer ready for IO.
1549 */
1550static inline bool
 1551ReadBuffersCanStartIO(Buffer buffer, bool nowait)
 1552{
1553 /*
1554 * If this backend currently has staged IO, we need to submit the pending
1555 * IO before waiting for the right to issue IO, to avoid the potential for
1556 * deadlocks (and, more commonly, unnecessary delays for other backends).
1557 */
1558 if (!nowait && pgaio_have_staged())
1559 {
1561 return true;
1562
1563 /*
 1564 * Unfortunately StartBufferIO() returning false doesn't let us
1565 * distinguish between the buffer already being valid and IO already
1566 * being in progress. Since IO already being in progress is quite
1567 * rare, this approach seems fine.
1568 */
1570 }
1571
1572 return ReadBuffersCanStartIOOnce(buffer, nowait);
1573}
1574
1575/*
1576 * Helper for WaitReadBuffers() that processes the results of a readv
1577 * operation, raising an error if necessary.
1578 */
1579static void
 1580ProcessReadBuffersResult(ReadBuffersOperation *operation)
 1581{
1582 PgAioReturn *aio_ret = &operation->io_return;
1583 PgAioResultStatus rs = aio_ret->result.status;
1584 int newly_read_blocks = 0;
1585
1586 Assert(pgaio_wref_valid(&operation->io_wref));
1587 Assert(aio_ret->result.status != PGAIO_RS_UNKNOWN);
1588
1589 /*
1590 * SMGR reports the number of blocks successfully read as the result of
1591 * the IO operation. Thus we can simply add that to ->nblocks_done.
1592 */
1593
1594 if (likely(rs != PGAIO_RS_ERROR))
1595 newly_read_blocks = aio_ret->result.result;
1596
1597 if (rs == PGAIO_RS_ERROR || rs == PGAIO_RS_WARNING)
1598 pgaio_result_report(aio_ret->result, &aio_ret->target_data,
1599 rs == PGAIO_RS_ERROR ? ERROR : WARNING);
1600 else if (aio_ret->result.status == PGAIO_RS_PARTIAL)
1601 {
1602 /*
1603 * We'll retry, so we just emit a debug message to the server log (or
1604 * not even that in prod scenarios).
1605 */
1606 pgaio_result_report(aio_ret->result, &aio_ret->target_data, DEBUG1);
1607 elog(DEBUG3, "partial read, will retry");
1608 }
1609
1610 Assert(newly_read_blocks > 0);
1611 Assert(newly_read_blocks <= MAX_IO_COMBINE_LIMIT);
1612
1613 operation->nblocks_done += newly_read_blocks;
1614
1615 Assert(operation->nblocks_done <= operation->nblocks);
1616}
1617
1618void
 1619WaitReadBuffers(ReadBuffersOperation *operation)
 1620{
1621 PgAioReturn *aio_ret = &operation->io_return;
1622 IOContext io_context;
1623 IOObject io_object;
1624
1625 if (operation->persistence == RELPERSISTENCE_TEMP)
1626 {
1627 io_context = IOCONTEXT_NORMAL;
1628 io_object = IOOBJECT_TEMP_RELATION;
1629 }
1630 else
1631 {
1632 io_context = IOContextForStrategy(operation->strategy);
1633 io_object = IOOBJECT_RELATION;
1634 }
1635
1636 /*
1637 * If we get here without an IO operation having been issued, the
1638 * io_method == IOMETHOD_SYNC path must have been used. Otherwise the
1639 * caller should not have called WaitReadBuffers().
1640 *
1641 * In the case of IOMETHOD_SYNC, we start - as we used to before the
 1642 * introduction of AIO - the IO in WaitReadBuffers(). This is done as part
1643 * of the retry logic below, no extra code is required.
1644 *
1645 * This path is expected to eventually go away.
1646 */
1647 if (!pgaio_wref_valid(&operation->io_wref) && io_method != IOMETHOD_SYNC)
1648 elog(ERROR, "waiting for read operation that didn't read");
1649
1650 /*
1651 * To handle partial reads, and IOMETHOD_SYNC, we re-issue IO until we're
1652 * done. We may need multiple retries, not just because we could get
1653 * multiple partial reads, but also because some of the remaining
1654 * to-be-read buffers may have been read in by other backends, limiting
1655 * the IO size.
1656 */
1657 while (true)
1658 {
1659 int ignored_nblocks_progress;
1660
1661 CheckReadBuffersOperation(operation, false);
1662
1663 /*
1664 * If there is an IO associated with the operation, we may need to
1665 * wait for it.
1666 */
1667 if (pgaio_wref_valid(&operation->io_wref))
1668 {
1669 /*
1670 * Track the time spent waiting for the IO to complete. As
1671 * tracking a wait even if we don't actually need to wait
1672 *
1673 * a) is not cheap, due to the timestamping overhead
1674 *
1675 * b) reports some time as waiting, even if we never waited
1676 *
1677 * we first check if we already know the IO is complete.
1678 */
1679 if (aio_ret->result.status == PGAIO_RS_UNKNOWN &&
1680 !pgaio_wref_check_done(&operation->io_wref))
1681 {
1683
1684 pgaio_wref_wait(&operation->io_wref);
1685
1686 /*
1687 * The IO operation itself was already counted earlier, in
1688 * AsyncReadBuffers(), this just accounts for the wait time.
1689 */
1690 pgstat_count_io_op_time(io_object, io_context, IOOP_READ,
1691 io_start, 0, 0);
1692 }
1693 else
1694 {
1695 Assert(pgaio_wref_check_done(&operation->io_wref));
1696 }
1697
1698 /*
1699 * We now are sure the IO completed. Check the results. This
1700 * includes reporting on errors if there were any.
1701 */
1702 ProcessReadBuffersResult(operation);
1703 }
1704
1705 /*
1706 * Most of the time, the one IO we already started, will read in
1707 * everything. But we need to deal with partial reads and buffers not
1708 * needing IO anymore.
1709 */
1710 if (operation->nblocks_done == operation->nblocks)
1711 break;
1712
1714
1715 /*
1716 * This may only complete the IO partially, either because some
1717 * buffers were already valid, or because of a partial read.
1718 *
1719 * NB: In contrast to after the AsyncReadBuffers() call in
1720 * StartReadBuffers(), we do *not* reduce
1721 * ReadBuffersOperation->nblocks here, callers expect the full
1722 * operation to be completed at this point (as more operations may
1723 * have been queued).
1724 */
1725 AsyncReadBuffers(operation, &ignored_nblocks_progress);
1726 }
1727
1728 CheckReadBuffersOperation(operation, true);
1729
1730 /* NB: READ_DONE tracepoint was already executed in completion callback */
1731}
1732
1733/*
1734 * Initiate IO for the ReadBuffersOperation
1735 *
1736 * This function only starts a single IO at a time. The size of the IO may be
 1737 * limited to fewer than the to-be-read blocks, if one of the buffers has
1738 * concurrently been read in. If the first to-be-read buffer is already valid,
1739 * no IO will be issued.
1740 *
1741 * To support retries after partial reads, the first operation->nblocks_done
1742 * buffers are skipped.
1743 *
1744 * On return *nblocks_progress is updated to reflect the number of buffers
1745 * affected by the call. If the first buffer is valid, *nblocks_progress is
1746 * set to 1 and operation->nblocks_done is incremented.
1747 *
1748 * Returns true if IO was initiated, false if no IO was necessary.
1749 */
1750static bool
1751AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress)
1752{
1753 Buffer *buffers = &operation->buffers[0];
1754 int flags = operation->flags;
1755 BlockNumber blocknum = operation->blocknum;
1756 ForkNumber forknum = operation->forknum;
1757 char persistence = operation->persistence;
1758 int16 nblocks_done = operation->nblocks_done;
1759 Buffer *io_buffers = &operation->buffers[nblocks_done];
1760 int io_buffers_len = 0;
1761 PgAioHandle *ioh;
1762 uint32 ioh_flags = 0;
1763 void *io_pages[MAX_IO_COMBINE_LIMIT];
1764 IOContext io_context;
1765 IOObject io_object;
1766 bool did_start_io;
1767
1768 /*
1769 * When this IO is executed synchronously, either because the caller will
1770 * immediately block waiting for the IO or because IOMETHOD_SYNC is used,
1771 * the AIO subsystem needs to know.
1772 */
1773 if (flags & READ_BUFFERS_SYNCHRONOUSLY)
1774 ioh_flags |= PGAIO_HF_SYNCHRONOUS;
1775
1776 if (persistence == RELPERSISTENCE_TEMP)
1777 {
1778 io_context = IOCONTEXT_NORMAL;
1779 io_object = IOOBJECT_TEMP_RELATION;
1780 ioh_flags |= PGAIO_HF_REFERENCES_LOCAL;
1781 }
1782 else
1783 {
1784 io_context = IOContextForStrategy(operation->strategy);
1785 io_object = IOOBJECT_RELATION;
1786 }
1787
1788 /*
1789 * If zero_damaged_pages is enabled, add the READ_BUFFERS_ZERO_ON_ERROR
1790 * flag. The reason for that is that, hopefully, zero_damaged_pages isn't
1791 * set globally, but on a per-session basis. The completion callback,
1792 * which may be run in other processes, e.g. in IO workers, may have a
1793 * different value of the zero_damaged_pages GUC.
1794 *
1795 * XXX: We probably should eventually use a different flag for
1796 * zero_damaged_pages, so we can report different log levels / error codes
1797 * for zero_damaged_pages and ZERO_ON_ERROR.
1798 */
1801
1802 /*
1803 * For the same reason as with zero_damaged_pages we need to use this
1804 * backend's ignore_checksum_failure value.
1805 */
1808
1809
1810 /*
1811 * To be allowed to report stats in the local completion callback we need
1812 * to prepare to report stats now. This ensures we can safely report the
1813 * checksum failure even in a critical section.
1814 */
1816
1817 /*
1818 * Get IO handle before ReadBuffersCanStartIO(), as pgaio_io_acquire()
1819 * might block, which we don't want after setting IO_IN_PROGRESS.
1820 *
1821 * If we need to wait for IO before we can get a handle, submit
1822 * already-staged IO first, so that other backends don't need to wait.
1823 * There wouldn't be a deadlock risk, as pgaio_io_acquire() just needs to
1824 * wait for already submitted IO, which doesn't require additional locks,
1825 * but it could still cause undesirable waits.
1826 *
1827 * A secondary benefit is that this would allow us to measure the time in
1828 * pgaio_io_acquire() without causing undue timer overhead in the common,
1829 * non-blocking, case. However, currently the pgstats infrastructure
1830 * doesn't really allow that, as it a) asserts that an operation can't
1831 * have time without operations b) doesn't have an API to report
1832 * "accumulated" time.
1833 */
1835 if (unlikely(!ioh))
1836 {
1838
1840 }
1841
1842 /*
1843 * Check if we can start IO on the first to-be-read buffer.
1844 *
1845 * If an I/O is already in progress in another backend, we want to wait
1846 * for the outcome: either done, or something went wrong and we will
1847 * retry.
1848 */
1849 if (!ReadBuffersCanStartIO(buffers[nblocks_done], false))
1850 {
1851 /*
1852 * Someone else has already completed this block, we're done.
1853 *
1854 * When IO is necessary, ->nblocks_done is updated in
1855 * ProcessReadBuffersResult(), but that is not called if no IO is
1856 * necessary. Thus update here.
1857 */
1858 operation->nblocks_done += 1;
1859 *nblocks_progress = 1;
1860
1861 pgaio_io_release(ioh);
1862 pgaio_wref_clear(&operation->io_wref);
1863 did_start_io = false;
1864
1865 /*
1866 * Report and track this as a 'hit' for this backend, even though it
1867 * must have started out as a miss in PinBufferForBlock(). The other
1868 * backend will track this as a 'read'.
1869 */
1870 TRACE_POSTGRESQL_BUFFER_READ_DONE(forknum, blocknum + operation->nblocks_done,
1871 operation->smgr->smgr_rlocator.locator.spcOid,
1872 operation->smgr->smgr_rlocator.locator.dbOid,
1873 operation->smgr->smgr_rlocator.locator.relNumber,
1874 operation->smgr->smgr_rlocator.backend,
1875 true);
1876
1877 if (persistence == RELPERSISTENCE_TEMP)
1879 else
1881
1882 if (operation->rel)
1883 pgstat_count_buffer_hit(operation->rel);
1884
1885 pgstat_count_io_op(io_object, io_context, IOOP_HIT, 1, 0);
1886
1887 if (VacuumCostActive)
1889 }
1890 else
1891 {
1892 instr_time io_start;
1893
1894 /* We found a buffer that we need to read in. */
1895 Assert(io_buffers[0] == buffers[nblocks_done]);
1896 io_pages[0] = BufferGetBlock(buffers[nblocks_done]);
1897 io_buffers_len = 1;
1898
1899 /*
1900 * How many neighboring-on-disk blocks can we scatter-read into other
1901 * buffers at the same time? In this case we don't wait if we see an
1902 * I/O already in progress. We already set BM_IO_IN_PROGRESS for the
1903 * head block, so we should get on with that I/O as soon as possible.
1904 */
1905 for (int i = nblocks_done + 1; i < operation->nblocks; i++)
1906 {
1907 if (!ReadBuffersCanStartIO(buffers[i], true))
1908 break;
1909 /* Must be consecutive block numbers. */
1910 Assert(BufferGetBlockNumber(buffers[i - 1]) ==
1911 BufferGetBlockNumber(buffers[i]) - 1);
1912 Assert(io_buffers[io_buffers_len] == buffers[i]);
1913
1914 io_pages[io_buffers_len++] = BufferGetBlock(buffers[i]);
1915 }
1916
1917 /* get a reference to wait for in WaitReadBuffers() */
1918 pgaio_io_get_wref(ioh, &operation->io_wref);
1919
1920 /* provide the list of buffers to the completion callbacks */
1921 pgaio_io_set_handle_data_32(ioh, (uint32 *) io_buffers, io_buffers_len);
1922
1923 pgaio_io_register_callbacks(ioh,
1924 persistence == RELPERSISTENCE_TEMP ?
1925 PGAIO_HCB_LOCAL_BUFFER_READV :
1926 PGAIO_HCB_SHARED_BUFFER_READV,
1927 flags);
1928
1929 pgaio_io_set_flag(ioh, ioh_flags);
1930
1931 /* ---
1932 * Even though we're trying to issue IO asynchronously, track the time
1933 * in smgrstartreadv():
1934 * - if io_method == IOMETHOD_SYNC, we will always perform the IO
1935 * immediately
1936 * - the io method might not support the IO (e.g. worker IO for a temp
1937 * table)
1938 * ---
1939 */
1940 io_start = pgstat_prepare_io_time(track_io_timing);
1941 smgrstartreadv(ioh, operation->smgr, forknum,
1942 blocknum + nblocks_done,
1943 io_pages, io_buffers_len);
1944 pgstat_count_io_op_time(io_object, io_context, IOOP_READ,
1945 io_start, 1, io_buffers_len * BLCKSZ);
1946
1947 if (persistence == RELPERSISTENCE_TEMP)
1948 pgBufferUsage.local_blks_read += io_buffers_len;
1949 else
1950 pgBufferUsage.shared_blks_read += io_buffers_len;
1951
1952 /*
1953 * Track vacuum cost when issuing IO, not after waiting for it.
1954 * Otherwise we could end up issuing a lot of IO in a short timespan,
1955 * despite a low cost limit.
1956 */
1957 if (VacuumCostActive)
1958 VacuumCostBalance += VacuumCostPageMiss * io_buffers_len;
1959
1960 *nblocks_progress = io_buffers_len;
1961 did_start_io = true;
1962 }
1963
1964 return did_start_io;
1965}
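/*
 * Illustrative sketch (editor's addition, not part of bufmgr.c): the
 * machinery above is normally reached through the public ReadBuffer()
 * family rather than called directly.  A minimal caller that reads one
 * block with a bulk-read strategy could look roughly like this; the
 * example_* name and the NOT_USED guard are hypothetical.
 */
#ifdef NOT_USED					/* example only, not compiled */
static void
example_read_block(Relation rel, BlockNumber blkno)
{
	BufferAccessStrategy strategy = GetAccessStrategy(BAS_BULKREAD);
	Buffer		buf;

	/* Pin the block, reading it in if it isn't already cached. */
	buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, strategy);

	/* Lock the contents before looking at them. */
	LockBuffer(buf, BUFFER_LOCK_SHARE);
	/* ... inspect BufferGetPage(buf) here ... */

	/* Drop the content lock and the pin. */
	UnlockReleaseBuffer(buf);

	FreeAccessStrategy(strategy);
}
#endif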
1966
1967/*
1968 * BufferAlloc -- subroutine for PinBufferForBlock. Handles lookup of a shared
1969 * buffer. If no buffer exists already, selects a replacement victim and
1970 * evicts the old page, but does NOT read in new page.
1971 *
1972 * "strategy" can be a buffer replacement strategy object, or NULL for
1973 * the default strategy. The selected buffer's usage_count is advanced when
1974 * using the default strategy, but otherwise possibly not (see PinBuffer).
1975 *
1976 * The returned buffer is pinned and is already marked as holding the
1977 * desired page. If it already did have the desired page, *foundPtr is
1978 * set true. Otherwise, *foundPtr is set false.
1979 *
1980 * io_context is passed as an output parameter to avoid calling
1981 * IOContextForStrategy() when there is a shared buffers hit and no IO
1982 * statistics need be captured.
1983 *
1984 * No locks are held either at entry or exit.
1985 */
1986 static pg_attribute_always_inline BufferDesc *
1987 BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
1988 BlockNumber blockNum,
1989 BufferAccessStrategy strategy,
1990 bool *foundPtr, IOContext io_context)
1991{
1992 BufferTag newTag; /* identity of requested block */
1993 uint32 newHash; /* hash value for newTag */
1994 LWLock *newPartitionLock; /* buffer partition lock for it */
1995 int existing_buf_id;
1996 Buffer victim_buffer;
1997 BufferDesc *victim_buf_hdr;
1998 uint32 victim_buf_state;
1999
2000 /* Make sure we will have room to remember the buffer pin */
2001 ResourceOwnerEnlarge(CurrentResourceOwner);
2002 ReservePrivateRefCountEntry();
2003
2004 /* create a tag so we can lookup the buffer */
2005 InitBufferTag(&newTag, &smgr->smgr_rlocator.locator, forkNum, blockNum);
2006
2007 /* determine its hash code and partition lock ID */
2008 newHash = BufTableHashCode(&newTag);
2009 newPartitionLock = BufMappingPartitionLock(newHash);
2010
2011 /* see if the block is in the buffer pool already */
2012 LWLockAcquire(newPartitionLock, LW_SHARED);
2013 existing_buf_id = BufTableLookup(&newTag, newHash);
2014 if (existing_buf_id >= 0)
2015 {
2016 BufferDesc *buf;
2017 bool valid;
2018
2019 /*
2020 * Found it. Now, pin the buffer so no one can steal it from the
2021 * buffer pool, and check to see if the correct data has been loaded
2022 * into the buffer.
2023 */
2024 buf = GetBufferDescriptor(existing_buf_id);
2025
2026 valid = PinBuffer(buf, strategy, false);
2027
2028 /* Can release the mapping lock as soon as we've pinned it */
2029 LWLockRelease(newPartitionLock);
2030
2031 *foundPtr = true;
2032
2033 if (!valid)
2034 {
2035 /*
2036 * We can only get here if (a) someone else is still reading in
2037 * the page, (b) a previous read attempt failed, or (c) someone
2038 * called StartReadBuffers() but not yet WaitReadBuffers().
2039 */
2040 *foundPtr = false;
2041 }
2042
2043 return buf;
2044 }
2045
2046 /*
2047 * Didn't find it in the buffer pool. We'll have to initialize a new
2048 * buffer. Remember to unlock the mapping lock while doing the work.
2049 */
2050 LWLockRelease(newPartitionLock);
2051
2052 /*
2053 * Acquire a victim buffer. Somebody else might try to do the same, we
2054 * don't hold any conflicting locks. If so we'll have to undo our work
2055 * later.
2056 */
2057 victim_buffer = GetVictimBuffer(strategy, io_context);
2058 victim_buf_hdr = GetBufferDescriptor(victim_buffer - 1);
2059
2060 /*
2061 * Try to make a hashtable entry for the buffer under its new tag. If
2062 * somebody else inserted another buffer for the tag, we'll release the
2063 * victim buffer we acquired and use the already inserted one.
2064 */
2065 LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
2066 existing_buf_id = BufTableInsert(&newTag, newHash, victim_buf_hdr->buf_id);
2067 if (existing_buf_id >= 0)
2068 {
2069 BufferDesc *existing_buf_hdr;
2070 bool valid;
2071
2072 /*
2073 * Got a collision. Someone has already done what we were about to do.
2074 * We'll just handle this as if it were found in the buffer pool in
2075 * the first place. First, give up the buffer we were planning to
2076 * use.
2077 *
2078 * We could do this after releasing the partition lock, but then we'd
2079 * have to call ResourceOwnerEnlarge() & ReservePrivateRefCountEntry()
2080 * before acquiring the lock, for the rare case of such a collision.
2081 */
2082 UnpinBuffer(victim_buf_hdr);
2083
2084 /* remaining code should match code at top of routine */
2085
2086 existing_buf_hdr = GetBufferDescriptor(existing_buf_id);
2087
2088 valid = PinBuffer(existing_buf_hdr, strategy, false);
2089
2090 /* Can release the mapping lock as soon as we've pinned it */
2091 LWLockRelease(newPartitionLock);
2092
2093 *foundPtr = true;
2094
2095 if (!valid)
2096 {
2097 /*
2098 * We can only get here if (a) someone else is still reading in
2099 * the page, (b) a previous read attempt failed, or (c) someone
2100 * called StartReadBuffers() but not yet WaitReadBuffers().
2101 */
2102 *foundPtr = false;
2103 }
2104
2105 return existing_buf_hdr;
2106 }
2107
2108 /*
2109 * Need to lock the buffer header too in order to change its tag.
2110 */
2111 victim_buf_state = LockBufHdr(victim_buf_hdr);
2112
2113 /* some sanity checks while we hold the buffer header lock */
2114 Assert(BUF_STATE_GET_REFCOUNT(victim_buf_state) == 1);
2115 Assert(!(victim_buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY | BM_IO_IN_PROGRESS)));
2116
2117 victim_buf_hdr->tag = newTag;
2118
2119 /*
2120 * Make sure BM_PERMANENT is set for buffers that must be written at every
2121 * checkpoint. Unlogged buffers only need to be written at shutdown
2122 * checkpoints, except for their "init" forks, which need to be treated
2123 * just like permanent relations.
2124 */
2125 victim_buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
2126 if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM)
2127 victim_buf_state |= BM_PERMANENT;
2128
2129 UnlockBufHdr(victim_buf_hdr, victim_buf_state);
2130
2131 LWLockRelease(newPartitionLock);
2132
2133 /*
2134 * Buffer contents are currently invalid.
2135 */
2136 *foundPtr = false;
2137
2138 return victim_buf_hdr;
2139}
2140
2141/*
2142 * InvalidateBuffer -- mark a shared buffer invalid.
2143 *
2144 * The buffer header spinlock must be held at entry. We drop it before
2145 * returning. (This is sane because the caller must have locked the
2146 * buffer in order to be sure it should be dropped.)
2147 *
2148 * This is used only in contexts such as dropping a relation. We assume
2149 * that no other backend could possibly be interested in using the page,
2150 * so the only reason the buffer might be pinned is if someone else is
2151 * trying to write it out. We have to let them finish before we can
2152 * reclaim the buffer.
2153 *
2154 * The buffer could get reclaimed by someone else while we are waiting
2155 * to acquire the necessary locks; if so, don't mess it up.
2156 */
2157static void
2158 InvalidateBuffer(BufferDesc *buf)
2159 {
2160 BufferTag oldTag;
2161 uint32 oldHash; /* hash value for oldTag */
2162 LWLock *oldPartitionLock; /* buffer partition lock for it */
2163 uint32 oldFlags;
2164 uint32 buf_state;
2165
2166 /* Save the original buffer tag before dropping the spinlock */
2167 oldTag = buf->tag;
2168
2169 buf_state = pg_atomic_read_u32(&buf->state);
2170 Assert(buf_state & BM_LOCKED);
2171 UnlockBufHdr(buf, buf_state);
2172
2173 /*
2174 * Need to compute the old tag's hashcode and partition lock ID. XXX is it
2175 * worth storing the hashcode in BufferDesc so we need not recompute it
2176 * here? Probably not.
2177 */
2178 oldHash = BufTableHashCode(&oldTag);
2179 oldPartitionLock = BufMappingPartitionLock(oldHash);
2180
2181retry:
2182
2183 /*
2184 * Acquire exclusive mapping lock in preparation for changing the buffer's
2185 * association.
2186 */
2187 LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
2188
2189 /* Re-lock the buffer header */
2190 buf_state = LockBufHdr(buf);
2191
2192 /* If it's changed while we were waiting for lock, do nothing */
2193 if (!BufferTagsEqual(&buf->tag, &oldTag))
2194 {
2195 UnlockBufHdr(buf, buf_state);
2196 LWLockRelease(oldPartitionLock);
2197 return;
2198 }
2199
2200 /*
2201 * We assume the reason for it to be pinned is that either we were
2202 * asynchronously reading the page in before erroring out or someone else
2203 * is flushing the page out. Wait for the IO to finish. (This could be
2204 * an infinite loop if the refcount is messed up... it would be nice to
2205 * time out after awhile, but there seems no way to be sure how many loops
2206 * may be needed. Note that if the other guy has pinned the buffer but
2207 * not yet done StartBufferIO, WaitIO will fall through and we'll
2208 * effectively be busy-looping here.)
2209 */
2210 if (BUF_STATE_GET_REFCOUNT(buf_state) != 0)
2211 {
2212 UnlockBufHdr(buf, buf_state);
2213 LWLockRelease(oldPartitionLock);
2214 /* safety check: should definitely not be our *own* pin */
2215 if (GetPrivateRefCount(BufferDescriptorGetBuffer(buf)) > 0)
2216 elog(ERROR, "buffer is pinned in InvalidateBuffer");
2217 WaitIO(buf);
2218 goto retry;
2219 }
2220
2221 /*
2222 * Clear out the buffer's tag and flags. We must do this to ensure that
2223 * linear scans of the buffer array don't think the buffer is valid.
2224 */
2225 oldFlags = buf_state & BUF_FLAG_MASK;
2226 ClearBufferTag(&buf->tag);
2227 buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
2228 UnlockBufHdr(buf, buf_state);
2229
2230 /*
2231 * Remove the buffer from the lookup hashtable, if it was in there.
2232 */
2233 if (oldFlags & BM_TAG_VALID)
2234 BufTableDelete(&oldTag, oldHash);
2235
2236 /*
2237 * Done with mapping lock.
2238 */
2239 LWLockRelease(oldPartitionLock);
2240}
2241
2242/*
2243 * Helper routine for GetVictimBuffer()
2244 *
2245 * Needs to be called on a buffer with a valid tag, pinned, but without the
2246 * buffer header spinlock held.
2247 *
2248 * Returns true if the buffer can be reused, in which case the buffer is only
2249 * pinned by this backend and marked as invalid, false otherwise.
2250 */
2251static bool
2252 InvalidateVictimBuffer(BufferDesc *buf_hdr)
2253 {
2254 uint32 buf_state;
2255 uint32 hash;
2256 LWLock *partition_lock;
2257 BufferTag tag;
2258
2259 Assert(GetPrivateRefCount(BufferDescriptorGetBuffer(buf_hdr)) == 1);
2260
2261 /* have buffer pinned, so it's safe to read tag without lock */
2262 tag = buf_hdr->tag;
2263
2264 hash = BufTableHashCode(&tag);
2265 partition_lock = BufMappingPartitionLock(hash);
2266
2267 LWLockAcquire(partition_lock, LW_EXCLUSIVE);
2268
2269 /* lock the buffer header */
2270 buf_state = LockBufHdr(buf_hdr);
2271
2272 /*
2273 * We have the buffer pinned, so nobody else should have been able to
2274 * unset this concurrently.
2275 */
2276 Assert(buf_state & BM_TAG_VALID);
2277 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
2278 Assert(BufferTagsEqual(&buf_hdr->tag, &tag));
2279
2280 /*
2281 * If somebody else pinned the buffer since, or even worse, dirtied it,
2282 * give up on this buffer: It's clearly in use.
2283 */
2284 if (BUF_STATE_GET_REFCOUNT(buf_state) != 1 || (buf_state & BM_DIRTY))
2285 {
2286 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
2287
2288 UnlockBufHdr(buf_hdr, buf_state);
2289 LWLockRelease(partition_lock);
2290
2291 return false;
2292 }
2293
2294 /*
2295 * Clear out the buffer's tag and flags and usagecount. This is not
2296 * strictly required, as BM_TAG_VALID/BM_VALID needs to be checked before
2297 * doing anything with the buffer. But currently it's beneficial, as the
2298 * cheaper pre-check used by several linear scans of shared buffers relies
2299 * on the tag (see e.g. FlushDatabaseBuffers()).
2300 */
2301 ClearBufferTag(&buf_hdr->tag);
2302 buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
2303 UnlockBufHdr(buf_hdr, buf_state);
2304
2305 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
2306
2307 /* finally delete buffer from the buffer mapping table */
2308 BufTableDelete(&tag, hash);
2309
2310 LWLockRelease(partition_lock);
2311
2312 Assert(!(buf_state & (BM_DIRTY | BM_VALID | BM_TAG_VALID)));
2313 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
2315
2316 return true;
2317}
2318
2319static Buffer
2320 GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
2321 {
2322 BufferDesc *buf_hdr;
2323 Buffer buf;
2324 uint32 buf_state;
2325 bool from_ring;
2326
2327 /*
2328 * Ensure, before we pin a victim buffer, that there's a free refcount
2329 * entry and resource owner slot for the pin.
2330 */
2331 ReservePrivateRefCountEntry();
2332 ResourceOwnerEnlarge(CurrentResourceOwner);
2333
2334 /* we return here if a prospective victim buffer gets used concurrently */
2335again:
2336
2337 /*
2338 * Select a victim buffer. The buffer is returned pinned and owned by
2339 * this backend.
2340 */
2341 buf_hdr = StrategyGetBuffer(strategy, &buf_state, &from_ring);
2342 buf = BufferDescriptorGetBuffer(buf_hdr);
2343
2344 /*
2345 * We shouldn't have any other pins for this buffer.
2346 */
2347 CheckBufferIsPinnedOnce(buf);
2348
2349 /*
2350 * If the buffer was dirty, try to write it out. There is a race
2351 * condition here, in that someone might dirty it after we released the
2352 * buffer header lock above, or even while we are writing it out (since
2353 * our share-lock won't prevent hint-bit updates). We will recheck the
2354 * dirty bit after re-locking the buffer header.
2355 */
2356 if (buf_state & BM_DIRTY)
2357 {
2358 LWLock *content_lock;
2359
2360 Assert(buf_state & BM_TAG_VALID);
2361 Assert(buf_state & BM_VALID);
2362
2363 /*
2364 * We need a share-lock on the buffer contents to write it out (else
2365 * we might write invalid data, eg because someone else is compacting
2366 * the page contents while we write). We must use a conditional lock
2367 * acquisition here to avoid deadlock. Even though the buffer was not
2368 * pinned (and therefore surely not locked) when StrategyGetBuffer
2369 * returned it, someone else could have pinned and exclusive-locked it
2370 * by the time we get here. If we try to get the lock unconditionally,
2371 * we'd block waiting for them; if they later block waiting for us,
2372 * deadlock ensues. (This has been observed to happen when two
2373 * backends are both trying to split btree index pages, and the second
2374 * one just happens to be trying to split the page the first one got
2375 * from StrategyGetBuffer.)
2376 */
2377 content_lock = BufferDescriptorGetContentLock(buf_hdr);
2378 if (!LWLockConditionalAcquire(content_lock, LW_SHARED))
2379 {
2380 /*
2381 * Someone else has locked the buffer, so give it up and loop back
2382 * to get another one.
2383 */
2384 UnpinBuffer(buf_hdr);
2385 goto again;
2386 }
2387
2388 /*
2389 * If using a nondefault strategy, and writing the buffer would
2390 * require a WAL flush, let the strategy decide whether to go ahead
2391 * and write/reuse the buffer or to choose another victim. We need a
2392 * lock to inspect the page LSN, so this can't be done inside
2393 * StrategyGetBuffer.
2394 */
2395 if (strategy != NULL)
2396 {
2397 XLogRecPtr lsn;
2398
2399 /* Read the LSN while holding buffer header lock */
2400 buf_state = LockBufHdr(buf_hdr);
2401 lsn = BufferGetLSN(buf_hdr);
2402 UnlockBufHdr(buf_hdr, buf_state);
2403
2404 if (XLogNeedsFlush(lsn)
2405 && StrategyRejectBuffer(strategy, buf_hdr, from_ring))
2406 {
2407 LWLockRelease(content_lock);
2408 UnpinBuffer(buf_hdr);
2409 goto again;
2410 }
2411 }
2412
2413 /* OK, do the I/O */
2414 FlushBuffer(buf_hdr, NULL, IOOBJECT_RELATION, io_context);
2415 LWLockRelease(content_lock);
2416
2417 ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
2418 &buf_hdr->tag);
2419 }
2420
2421
2422 if (buf_state & BM_VALID)
2423 {
2424 /*
2425 * When a BufferAccessStrategy is in use, blocks evicted from shared
2426 * buffers are counted as IOOP_EVICT in the corresponding context
2427 * (e.g. IOCONTEXT_BULKWRITE). Shared buffers are evicted by a
2428 * strategy in two cases: 1) while initially claiming buffers for the
2429 * strategy ring 2) to replace an existing strategy ring buffer
2430 * because it is pinned or in use and cannot be reused.
2431 *
2432 * Blocks evicted from buffers already in the strategy ring are
2433 * counted as IOOP_REUSE in the corresponding strategy context.
2434 *
2435 * At this point, we can accurately count evictions and reuses,
2436 * because we have successfully claimed the valid buffer. Previously,
2437 * we may have been forced to release the buffer due to concurrent
2438 * pinners or erroring out.
2439 */
2440 pgstat_count_io_op(IOOBJECT_RELATION, io_context,
2441 from_ring ? IOOP_REUSE : IOOP_EVICT, 1, 0);
2442 }
2443
2444 /*
2445 * If the buffer has an entry in the buffer mapping table, delete it. This
2446 * can fail because another backend could have pinned or dirtied the
2447 * buffer.
2448 */
2449 if ((buf_state & BM_TAG_VALID) && !InvalidateVictimBuffer(buf_hdr))
2450 {
2451 UnpinBuffer(buf_hdr);
2452 goto again;
2453 }
2454
2455 /* a final set of sanity checks */
2456#ifdef USE_ASSERT_CHECKING
2457 buf_state = pg_atomic_read_u32(&buf_hdr->state);
2458
2459 Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1);
2460 Assert(!(buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY)));
2461
2462 CheckBufferIsPinnedOnce(buf);
2463 #endif
2464
2465 return buf;
2466}
2467
2468/*
2469 * Return the maximum number of buffers that a backend should try to pin once,
2470 * to avoid exceeding its fair share. This is the highest value that
2471 * GetAdditionalPinLimit() could ever return. Note that it may be zero on a
2472 * system with a very small buffer pool relative to max_connections.
2473 */
2474uint32
2475 GetPinLimit(void)
2476 {
2477 return MaxProportionalPins;
2478}
2479
2480/*
2481 * Return the maximum number of additional buffers that this backend should
2482 * pin if it wants to stay under the per-backend limit, considering the number
2483 * of buffers it has already pinned. Unlike LimitAdditionalPins(), the limit
2484 * returned by this function can be zero.
2485 */
2486uint32
2487 GetAdditionalPinLimit(void)
2488 {
2489 uint32 estimated_pins_held;
2490
2491 /*
2492 * We get the number of "overflowed" pins for free, but don't know the
2493 * number of pins in PrivateRefCountArray. The cost of calculating that
2494 * exactly doesn't seem worth it, so just assume the max.
2495 */
2496 estimated_pins_held = PrivateRefCountOverflowed + REFCOUNT_ARRAY_ENTRIES;
2497
2498 /* Is this backend already holding more than its fair share? */
2499 if (estimated_pins_held > MaxProportionalPins)
2500 return 0;
2501
2502 return MaxProportionalPins - estimated_pins_held;
2503}
2504
2505/*
2506 * Limit the number of pins a batch operation may additionally acquire, to
2507 * avoid running out of pinnable buffers.
2508 *
2509 * One additional pin is always allowed, on the assumption that the operation
2510 * requires at least one to make progress.
2511 */
2512void
2513 LimitAdditionalPins(uint32 *additional_pins)
2514 {
2515 uint32 limit;
2516
2517 if (*additional_pins <= 1)
2518 return;
2519
2520 limit = GetAdditionalPinLimit();
2521 limit = Max(limit, 1);
2522 if (limit < *additional_pins)
2523 *additional_pins = limit;
2524}
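/*
 * Illustrative sketch (editor's addition, not part of bufmgr.c): how a
 * hypothetical batch operation might use the pin-limit functions above to
 * size a batch before pinning a run of buffers.  The example_* name is
 * made up; GetAdditionalPinLimit() could be consulted instead when the
 * caller wants the raw remaining headroom.
 */
#ifdef NOT_USED					/* example only, not compiled */
static uint32
example_clamp_batch_size(uint32 want)
{
	uint32		batch = Max(want, 1);

	/*
	 * Clamp to this backend's fair share of shared buffers.
	 * LimitAdditionalPins() always leaves at least one pin allowed, so the
	 * operation can make progress even on tiny buffer pools.
	 */
	LimitAdditionalPins(&batch);

	return batch;
}
#endif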
2525
2526/*
2527 * Logic shared between ExtendBufferedRelBy(), ExtendBufferedRelTo(). Just to
2528 * avoid duplicating the tracing and relpersistence related logic.
2529 */
2530static BlockNumber
2531 ExtendBufferedRelCommon(BufferManagerRelation bmr,
2532 ForkNumber fork,
2533 BufferAccessStrategy strategy,
2534 uint32 flags,
2535 uint32 extend_by,
2536 BlockNumber extend_upto,
2537 Buffer *buffers,
2538 uint32 *extended_by)
2539{
2540 BlockNumber first_block;
2541
2542 TRACE_POSTGRESQL_BUFFER_EXTEND_START(fork,
2543 bmr.smgr->smgr_rlocator.locator.spcOid,
2544 bmr.smgr->smgr_rlocator.locator.dbOid,
2545 bmr.smgr->smgr_rlocator.locator.relNumber,
2546 bmr.smgr->smgr_rlocator.backend,
2547 extend_by);
2548
2549 if (bmr.relpersistence == RELPERSISTENCE_TEMP)
2550 first_block = ExtendBufferedRelLocal(bmr, fork, flags,
2551 extend_by, extend_upto,
2552 buffers, &extend_by);
2553 else
2554 first_block = ExtendBufferedRelShared(bmr, fork, strategy, flags,
2555 extend_by, extend_upto,
2556 buffers, &extend_by);
2557 *extended_by = extend_by;
2558
2559 TRACE_POSTGRESQL_BUFFER_EXTEND_DONE(fork,
2560 bmr.smgr->smgr_rlocator.locator.spcOid,
2561 bmr.smgr->smgr_rlocator.locator.dbOid,
2562 bmr.smgr->smgr_rlocator.locator.relNumber,
2563 bmr.smgr->smgr_rlocator.backend,
2564 *extended_by,
2565 first_block);
2566
2567 return first_block;
2568}
2569
2570/*
2571 * Implementation of ExtendBufferedRelBy() and ExtendBufferedRelTo() for
2572 * shared buffers.
2573 */
2574static BlockNumber
2575 ExtendBufferedRelShared(BufferManagerRelation bmr,
2576 ForkNumber fork,
2577 BufferAccessStrategy strategy,
2578 uint32 flags,
2579 uint32 extend_by,
2580 BlockNumber extend_upto,
2581 Buffer *buffers,
2582 uint32 *extended_by)
2583{
2584 BlockNumber first_block;
2585 IOContext io_context = IOContextForStrategy(strategy);
2586 instr_time io_start;
2587
2588 LimitAdditionalPins(&extend_by);
2589
2590 /*
2591 * Acquire victim buffers for extension without holding extension lock.
2592 * Writing out victim buffers is the most expensive part of extending the
2593 * relation, particularly when doing so requires WAL flushes. Zeroing out
2594 * the buffers is also quite expensive, so do that before holding the
2595 * extension lock as well.
2596 *
2597 * These pages are pinned by us and not valid. While we hold the pin they
2598 * can't be acquired as victim buffers by another backend.
2599 */
2600 for (uint32 i = 0; i < extend_by; i++)
2601 {
2602 Block buf_block;
2603
2604 buffers[i] = GetVictimBuffer(strategy, io_context);
2605 buf_block = BufHdrGetBlock(GetBufferDescriptor(buffers[i] - 1));
2606
2607 /* new buffers are zero-filled */
2608 MemSet(buf_block, 0, BLCKSZ);
2609 }
2610
2611 /*
2612 * Lock relation against concurrent extensions, unless requested not to.
2613 *
2614 * We use the same extension lock for all forks. That's unnecessarily
2615 * restrictive, but currently extensions for forks don't happen often
2616 * enough to make it worth locking more granularly.
2617 *
2618 * Note that another backend might have extended the relation by the time
2619 * we get the lock.
2620 */
2621 if (!(flags & EB_SKIP_EXTENSION_LOCK))
2622 LockRelationForExtension(bmr.rel, ExclusiveLock);
2623
2624 /*
2625 * If requested, invalidate size cache, so that smgrnblocks asks the
2626 * kernel.
2627 */
2628 if (flags & EB_CLEAR_SIZE_CACHE)
2629 bmr.smgr->smgr_cached_nblocks[fork] = InvalidBlockNumber;
2630
2631 first_block = smgrnblocks(bmr.smgr, fork);
2632
2633 /*
2634 * Now that we have the accurate relation size, check if the caller wants
2635 * us to extend only up to a specific size. If there were concurrent
2636 * extensions, we might have acquired too many buffers and need to release
2637 * them.
2638 */
2639 if (extend_upto != InvalidBlockNumber)
2640 {
2641 uint32 orig_extend_by = extend_by;
2642
2643 if (first_block > extend_upto)
2644 extend_by = 0;
2645 else if ((uint64) first_block + extend_by > extend_upto)
2646 extend_by = extend_upto - first_block;
2647
2648 for (uint32 i = extend_by; i < orig_extend_by; i++)
2649 {
2650 BufferDesc *buf_hdr = GetBufferDescriptor(buffers[i] - 1);
2651
2652 UnpinBuffer(buf_hdr);
2653 }
2654
2655 if (extend_by == 0)
2656 {
2657 if (!(flags & EB_SKIP_EXTENSION_LOCK))
2658 UnlockRelationForExtension(bmr.rel, ExclusiveLock);
2659 *extended_by = extend_by;
2660 return first_block;
2661 }
2662 }
2663
2664 /* Fail if relation is already at maximum possible length */
2665 if ((uint64) first_block + extend_by >= MaxBlockNumber)
2666 ereport(ERROR,
2667 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
2668 errmsg("cannot extend relation %s beyond %u blocks",
2669 relpath(bmr.smgr->smgr_rlocator, fork).str,
2670 MaxBlockNumber)));
2671
2672 /*
2673 * Insert buffers into buffer table, mark as IO_IN_PROGRESS.
2674 *
2675 * This needs to happen before we extend the relation, because as soon as
2676 * we do, other backends can start to read in those pages.
2677 */
2678 for (uint32 i = 0; i < extend_by; i++)
2679 {
2680 Buffer victim_buf = buffers[i];
2681 BufferDesc *victim_buf_hdr = GetBufferDescriptor(victim_buf - 1);
2682 BufferTag tag;
2683 uint32 hash;
2684 LWLock *partition_lock;
2685 int existing_id;
2686
2687 /* in case we need to pin an existing buffer below */
2688 ResourceOwnerEnlarge(CurrentResourceOwner);
2689 ReservePrivateRefCountEntry();
2690
2691 InitBufferTag(&tag, &bmr.smgr->smgr_rlocator.locator, fork, first_block + i);
2692 hash = BufTableHashCode(&tag);
2693 partition_lock = BufMappingPartitionLock(hash);
2694
2695 LWLockAcquire(partition_lock, LW_EXCLUSIVE);
2696
2697 existing_id = BufTableInsert(&tag, hash, victim_buf_hdr->buf_id);
2698
2699 /*
2700 * We get here only in the corner case where we are trying to extend
2701 * the relation but we found a pre-existing buffer. This can happen
2702 * because a prior attempt at extending the relation failed, and
2703 * because mdread doesn't complain about reads beyond EOF (when
2704 * zero_damaged_pages is ON) and so a previous attempt to read a block
2705 * beyond EOF could have left a "valid" zero-filled buffer.
2706 *
2707 * This has also been observed when the relation was overwritten by an
2708 * external process. Since the legitimate cases should always have
2709 * left a zero-filled buffer, complain if not PageIsNew.
2710 */
2711 if (existing_id >= 0)
2712 {
2713 BufferDesc *existing_hdr = GetBufferDescriptor(existing_id);
2714 Block buf_block;
2715 bool valid;
2716
2717 /*
2718 * Pin the existing buffer before releasing the partition lock,
2719 * preventing it from being evicted.
2720 */
2721 valid = PinBuffer(existing_hdr, strategy, false);
2722
2723 LWLockRelease(partition_lock);
2724 UnpinBuffer(victim_buf_hdr);
2725
2726 buffers[i] = BufferDescriptorGetBuffer(existing_hdr);
2727 buf_block = BufHdrGetBlock(existing_hdr);
2728
2729 if (valid && !PageIsNew((Page) buf_block))
2730 ereport(ERROR,
2731 (errmsg("unexpected data beyond EOF in block %u of relation \"%s\"",
2732 existing_hdr->tag.blockNum,
2733 relpath(bmr.smgr->smgr_rlocator, fork).str)));
2734
2735 /*
2736 * We *must* do smgr[zero]extend before succeeding, else the page
2737 * will not be reserved by the kernel, and the next P_NEW call
2738 * will decide to return the same page. Clear the BM_VALID bit,
2739 * do StartBufferIO() and proceed.
2740 *
2741 * Loop to handle the very small possibility that someone re-sets
2742 * BM_VALID between our clearing it and StartBufferIO inspecting
2743 * it.
2744 */
2745 do
2746 {
2747 uint32 buf_state = LockBufHdr(existing_hdr);
2748
2749 buf_state &= ~BM_VALID;
2750 UnlockBufHdr(existing_hdr, buf_state);
2751 } while (!StartBufferIO(existing_hdr, true, false));
2752 }
2753 else
2754 {
2755 uint32 buf_state;
2756
2757 buf_state = LockBufHdr(victim_buf_hdr);
2758
2759 /* some sanity checks while we hold the buffer header lock */
2760 Assert(!(buf_state & (BM_VALID | BM_TAG_VALID | BM_DIRTY | BM_JUST_DIRTIED)));
2761 Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1);
2762
2763 victim_buf_hdr->tag = tag;
2764
2765 buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
2766 if (bmr.relpersistence == RELPERSISTENCE_PERMANENT || fork == INIT_FORKNUM)
2767 buf_state |= BM_PERMANENT;
2768
2769 UnlockBufHdr(victim_buf_hdr, buf_state);
2770
2771 LWLockRelease(partition_lock);
2772
2773 /* XXX: could combine the locked operations in it with the above */
2774 StartBufferIO(victim_buf_hdr, true, false);
2775 }
2776 }
2777
2778 io_start = pgstat_prepare_io_time(track_io_timing);
2779
2780 /*
2781 * Note: if smgrzeroextend fails, we will end up with buffers that are
2782 * allocated but not marked BM_VALID. The next relation extension will
2783 * still select the same block number (because the relation didn't get any
2784 * longer on disk) and so future attempts to extend the relation will find
2785 * the same buffers (if they have not been recycled) but come right back
2786 * here to try smgrzeroextend again.
2787 *
2788 * We don't need to set checksum for all-zero pages.
2789 */
2790 smgrzeroextend(bmr.smgr, fork, first_block, extend_by, false);
2791
2792 /*
2793 * Release the file-extension lock; it's now OK for someone else to extend
2794 * the relation some more.
2795 *
2796 * We remove IO_IN_PROGRESS after this, as waking up waiting backends can
2797 * take noticeable time.
2798 */
2799 if (!(flags & EB_SKIP_EXTENSION_LOCK))
2800 UnlockRelationForExtension(bmr.rel, ExclusiveLock);
2801 
2802 pgstat_count_io_op_time(IOOBJECT_RELATION, io_context, IOOP_EXTEND,
2803 io_start, 1, extend_by * BLCKSZ);
2804
2805 /* Set BM_VALID, terminate IO, and wake up any waiters */
2806 for (uint32 i = 0; i < extend_by; i++)
2807 {
2808 Buffer buf = buffers[i];
2809 BufferDesc *buf_hdr = GetBufferDescriptor(buf - 1);
2810 bool lock = false;
2811
2812 if (flags & EB_LOCK_FIRST && i == 0)
2813 lock = true;
2814 else if (flags & EB_LOCK_TARGET)
2815 {
2816 Assert(extend_upto != InvalidBlockNumber);
2817 if (first_block + i + 1 == extend_upto)
2818 lock = true;
2819 }
2820
2821 if (lock)
2822 LWLockAcquire(BufferDescriptorGetContentLock(buf_hdr), LW_EXCLUSIVE);
2823
2824 TerminateBufferIO(buf_hdr, false, BM_VALID, true, false);
2825 }
2826
2827 pgBufferUsage.shared_blks_written += extend_by;
2828
2829 *extended_by = extend_by;
2830
2831 return first_block;
2832}
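/*
 * Illustrative sketch (editor's addition, not part of bufmgr.c): callers
 * normally reach the extension code above through the ExtendBufferedRel*()
 * wrappers declared in bufmgr.h.  A minimal single-block extension that
 * returns the new block's buffer already exclusive-locked might look like
 * this; the example_* name is hypothetical.
 */
#ifdef NOT_USED					/* example only, not compiled */
static Buffer
example_extend_one_block(Relation rel)
{
	/*
	 * EB_LOCK_FIRST asks for the first (here: only) new buffer to be
	 * returned exclusive-locked, so no other backend can see the page
	 * before it has been initialized.
	 */
	return ExtendBufferedRel(BMR_REL(rel), MAIN_FORKNUM, NULL, EB_LOCK_FIRST);
}
#endif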
2833
2834/*
2835 * BufferIsLockedByMe
2836 *
2837 * Checks if this backend has the buffer locked in any mode.
2838 *
2839 * Buffer must be pinned.
2840 */
2841bool
2842 BufferIsLockedByMe(Buffer buffer)
2843 {
2844 BufferDesc *bufHdr;
2845
2846 Assert(BufferIsPinned(buffer));
2847
2848 if (BufferIsLocal(buffer))
2849 {
2850 /* Content locks are not maintained for local buffers. */
2851 return true;
2852 }
2853 else
2854 {
2855 bufHdr = GetBufferDescriptor(buffer - 1);
2856 return LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr));
2857 }
2858}
2859
2860/*
2861 * BufferIsLockedByMeInMode
2862 *
2863 * Checks if this backend has the buffer locked in the specified mode.
2864 *
2865 * Buffer must be pinned.
2866 */
2867bool
2868 BufferIsLockedByMeInMode(Buffer buffer, int mode)
2869 {
2870 BufferDesc *bufHdr;
2871
2872 Assert(BufferIsPinned(buffer));
2873
2874 if (BufferIsLocal(buffer))
2875 {
2876 /* Content locks are not maintained for local buffers. */
2877 return true;
2878 }
2879 else
2880 {
2881 LWLockMode lw_mode;
2882
2883 switch (mode)
2884 {
2885 case BUFFER_LOCK_EXCLUSIVE:
2886 lw_mode = LW_EXCLUSIVE;
2887 break;
2888 case BUFFER_LOCK_SHARE:
2889 lw_mode = LW_SHARED;
2890 break;
2891 default:
2892 pg_unreachable();
2893 }
2894
2895 bufHdr = GetBufferDescriptor(buffer - 1);
2896 return LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
2897 lw_mode);
2898 }
2899}
2900
2901/*
2902 * BufferIsDirty
2903 *
2904 * Checks if buffer is already dirty.
2905 *
2906 * Buffer must be pinned and exclusive-locked. (Without an exclusive lock,
2907 * the result may be stale before it's returned.)
2908 */
2909bool
2910 BufferIsDirty(Buffer buffer)
2911 {
2912 BufferDesc *bufHdr;
2913
2914 Assert(BufferIsPinned(buffer));
2915
2916 if (BufferIsLocal(buffer))
2917 {
2918 int bufid = -buffer - 1;
2919
2920 bufHdr = GetLocalBufferDescriptor(bufid);
2921 /* Content locks are not maintained for local buffers. */
2922 }
2923 else
2924 {
2925 bufHdr = GetBufferDescriptor(buffer - 1);
2926 Assert(BufferIsLockedByMeInMode(buffer, BUFFER_LOCK_EXCLUSIVE));
2927 }
2928
2929 return pg_atomic_read_u32(&bufHdr->state) & BM_DIRTY;
2930}
2931
2932/*
2933 * MarkBufferDirty
2934 *
2935 * Marks buffer contents as dirty (actual write happens later).
2936 *
2937 * Buffer must be pinned and exclusive-locked. (If caller does not hold
2938 * exclusive lock, then somebody could be in process of writing the buffer,
2939 * leading to risk of bad data written to disk.)
2940 */
2941void
2942 MarkBufferDirty(Buffer buffer)
2943 {
2944 BufferDesc *bufHdr;
2945 uint32 buf_state;
2946 uint32 old_buf_state;
2947
2948 if (!BufferIsValid(buffer))
2949 elog(ERROR, "bad buffer ID: %d", buffer);
2950
2951 if (BufferIsLocal(buffer))
2952 {
2953 MarkLocalBufferDirty(buffer);
2954 return;
2955 }
2956
2957 bufHdr = GetBufferDescriptor(buffer - 1);
2958
2959 Assert(BufferIsPinned(buffer));
2960 Assert(BufferIsLockedByMeInMode(buffer, BUFFER_LOCK_EXCLUSIVE));
2961
2962 old_buf_state = pg_atomic_read_u32(&bufHdr->state);
2963 for (;;)
2964 {
2965 if (old_buf_state & BM_LOCKED)
2966 old_buf_state = WaitBufHdrUnlocked(bufHdr);
2967
2968 buf_state = old_buf_state;
2969
2970 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
2971 buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
2972
2973 if (pg_atomic_compare_exchange_u32(&bufHdr->state, &old_buf_state,
2974 buf_state))
2975 break;
2976 }
2977
2978 /*
2979 * If the buffer was not dirty already, do vacuum accounting.
2980 */
2981 if (!(old_buf_state & BM_DIRTY))
2982 {
2983 pgBufferUsage.shared_blks_dirtied++;
2984 if (VacuumCostActive)
2985 VacuumCostBalance += VacuumCostPageDirty;
2986 }
2987}
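/*
 * Illustrative sketch (editor's addition, not part of bufmgr.c): the usual
 * caller-side pattern around MarkBufferDirty() when modifying a page.  WAL
 * logging is elided here but would normally sit inside the same critical
 * section; "buf" is assumed to be a pinned shared buffer and the example_*
 * name is hypothetical.
 */
#ifdef NOT_USED					/* example only, not compiled */
static void
example_modify_page(Buffer buf)
{
	LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
	Assert(BufferIsLockedByMeInMode(buf, BUFFER_LOCK_EXCLUSIVE));

	START_CRIT_SECTION();

	/* ... apply the change to BufferGetPage(buf) here ... */

	MarkBufferDirty(buf);

	/* ... XLogInsert() the change and PageSetLSN() the page here ... */

	END_CRIT_SECTION();

	LockBuffer(buf, BUFFER_LOCK_UNLOCK);
}
#endif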
2988
2989/*
2990 * ReleaseAndReadBuffer -- combine ReleaseBuffer() and ReadBuffer()
2991 *
2992 * Formerly, this saved one cycle of acquiring/releasing the BufMgrLock
2993 * compared to calling the two routines separately. Now it's mainly just
2994 * a convenience function. However, if the passed buffer is valid and
2995 * already contains the desired block, we just return it as-is; and that
2996 * does save considerable work compared to a full release and reacquire.
2997 *
2998 * Note: it is OK to pass buffer == InvalidBuffer, indicating that no old
2999 * buffer actually needs to be released. This case is the same as ReadBuffer,
3000 * but can save some tests in the caller.
3001 */
3002Buffer
3003 ReleaseAndReadBuffer(Buffer buffer,
3004 Relation relation,
3005 BlockNumber blockNum)
3006{
3007 ForkNumber forkNum = MAIN_FORKNUM;
3008 BufferDesc *bufHdr;
3009
3010 if (BufferIsValid(buffer))
3011 {
3012 Assert(BufferIsPinned(buffer));
3013 if (BufferIsLocal(buffer))
3014 {
3015 bufHdr = GetLocalBufferDescriptor(-buffer - 1);
3016 if (bufHdr->tag.blockNum == blockNum &&
3017 BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
3018 BufTagGetForkNum(&bufHdr->tag) == forkNum)
3019 return buffer;
3020 UnpinLocalBuffer(buffer);
3021 }
3022 else
3023 {
3024 bufHdr = GetBufferDescriptor(buffer - 1);
3025 /* we have pin, so it's ok to examine tag without spinlock */
3026 if (bufHdr->tag.blockNum == blockNum &&
3027 BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
3028 BufTagGetForkNum(&bufHdr->tag) == forkNum)
3029 return buffer;
3030 UnpinBuffer(bufHdr);
3031 }
3032 }
3033
3034 return ReadBuffer(relation, blockNum);
3035}
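/*
 * Illustrative sketch (editor's addition, not part of bufmgr.c): a scan-style
 * loop over an arbitrary sequence of block numbers can use
 * ReleaseAndReadBuffer() so that a repeated block keeps its existing pin
 * instead of being released and looked up again; "blocks"/"nblocks" and the
 * example_* name are assumptions of the sketch.
 */
#ifdef NOT_USED					/* example only, not compiled */
static void
example_scan_blocks(Relation rel, const BlockNumber *blocks, int nblocks)
{
	Buffer		buf = InvalidBuffer;

	for (int i = 0; i < nblocks; i++)
	{
		/* Reuses the existing pin if blocks[i] is already in "buf". */
		buf = ReleaseAndReadBuffer(buf, rel, blocks[i]);

		LockBuffer(buf, BUFFER_LOCK_SHARE);
		/* ... examine BufferGetPage(buf) here ... */
		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
	}

	if (BufferIsValid(buf))
		ReleaseBuffer(buf);
}
#endif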
3036
3037/*
3038 * PinBuffer -- make buffer unavailable for replacement.
3039 *
3040 * For the default access strategy, the buffer's usage_count is incremented
3041 * when we first pin it; for other strategies we just make sure the usage_count
3042 * isn't zero. (The idea of the latter is that we don't want synchronized
3043 * heap scans to inflate the count, but we need it to not be zero to discourage
3044 * other backends from stealing buffers from our ring. As long as we cycle
3045 * through the ring faster than the global clock-sweep cycles, buffers in
3046 * our ring won't be chosen as victims for replacement by other backends.)
3047 *
3048 * This should be applied only to shared buffers, never local ones.
3049 *
3050 * Since buffers are pinned/unpinned very frequently, pin buffers without
3051 * taking the buffer header lock; instead update the state variable in loop of
3052 * CAS operations. Hopefully it's just a single CAS.
3053 *
3054 * Note that ResourceOwnerEnlarge() and ReservePrivateRefCountEntry()
3055 * must have been done already.
3056 *
3057 * Returns true if buffer is BM_VALID, else false. This provision allows
3058 * some callers to avoid an extra spinlock cycle. If skip_if_not_valid is
3059 * true, then a false return value also indicates that the buffer was
3060 * (recently) invalid and has not been pinned.
3061 */
3062static bool
3063 PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy,
3064 bool skip_if_not_valid)
3065 {
3066 Buffer b = BufferDescriptorGetBuffer(buf);
3067 bool result;
3068 PrivateRefCountEntry *ref;
3069 
3072
3073 ref = GetPrivateRefCountEntry(b, true);
3074
3075 if (ref == NULL)
3076 {
3077 uint32 buf_state;
3078 uint32 old_buf_state;
3079
3080 old_buf_state = pg_atomic_read_u32(&buf->state);
3081 for (;;)
3082 {
3083 if (unlikely(skip_if_not_valid && !(old_buf_state & BM_VALID)))
3084 return false;
3085
3086 if (old_buf_state & BM_LOCKED)
3087 old_buf_state = WaitBufHdrUnlocked(buf);
3088
3089 buf_state = old_buf_state;
3090
3091 /* increase refcount */
3092 buf_state += BUF_REFCOUNT_ONE;
3093
3094 if (strategy == NULL)
3095 {
3096 /* Default case: increase usagecount unless already max. */
3097 if (BUF_STATE_GET_USAGECOUNT(buf_state) < BM_MAX_USAGE_COUNT)
3098 buf_state += BUF_USAGECOUNT_ONE;
3099 }
3100 else
3101 {
3102 /*
3103 * Ring buffers shouldn't evict others from pool. Thus we
3104 * don't make usagecount more than 1.
3105 */
3106 if (BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
3107 buf_state += BUF_USAGECOUNT_ONE;
3108 }
3109
3110 if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
3111 buf_state))
3112 {
3113 result = (buf_state & BM_VALID) != 0;
3114
3116
3117 /*
3118 * Assume that we acquired a buffer pin for the purposes of
3119 * Valgrind buffer client checks (even in !result case) to
3120 * keep things simple. Buffers that are unsafe to access are
3121 * not generally guaranteed to be marked undefined or
3122 * non-accessible in any case.
3123 */
3124 VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(buf), BLCKSZ);
3125 break;
3126 }
3127 }
3128 }
3129 else
3130 {
3131 /*
3132 * If we previously pinned the buffer, it is likely to be valid, but
3133 * it may not be if StartReadBuffers() was called and
3134 * WaitReadBuffers() hasn't been called yet. We'll check by loading
3135 * the flags without locking. This is racy, but it's OK to return
3136 * false spuriously: when WaitReadBuffers() calls StartBufferIO(),
3137 * it'll see that it's now valid.
3138 *
3139 * Note: We deliberately avoid a Valgrind client request here.
3140 * Individual access methods can optionally superimpose buffer page
3141 * client requests on top of our client requests to enforce that
3142 * buffers are only accessed while locked (and pinned). It's possible
3143 * that the buffer page is legitimately non-accessible here. We
3144 * cannot meddle with that.
3145 */
3146 result = (pg_atomic_read_u32(&buf->state) & BM_VALID) != 0;
3147
3148 Assert(ref->refcount > 0);
3149 ref->refcount++;
3150 ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
3151 }
3152
3153 return result;
3154}
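/*
 * Illustrative sketch (editor's addition, not part of bufmgr.c): the private
 * refcount machinery above is what backs the public pin bookkeeping.  Code
 * that stores a second reference to an already-pinned buffer takes an extra
 * pin so that each reference can be released independently; the example_*
 * name is hypothetical.
 */
#ifdef NOT_USED					/* example only, not compiled */
static void
example_store_extra_reference(Buffer buf, Buffer *slot)
{
	/*
	 * Each stored reference needs its own pin; both pins belong to this
	 * backend and must eventually be dropped with ReleaseBuffer().
	 */
	IncrBufferRefCount(buf);
	*slot = buf;
}
#endif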
3155
3156/*
3157 * PinBuffer_Locked -- as above, but caller already locked the buffer header.
3158 * The spinlock is released before return.
3159 *
3160 * As this function is called with the spinlock held, the caller has to
3161 * previously call ReservePrivateRefCountEntry() and
3162 * ResourceOwnerEnlarge(CurrentResourceOwner);
3163 *
3164 * Currently, no callers of this function want to modify the buffer's
3165 * usage_count at all, so there's no need for a strategy parameter.
3166 * Also we don't bother with a BM_VALID test (the caller could check that for
3167 * itself).
3168 *
3169 * Also all callers only ever use this function when it's known that the
3170 * buffer can't have a preexisting pin by this backend. That allows us to skip
3171 * searching the private refcount array & hash, which is a boon, because the
3172 * spinlock is still held.
3173 *
3174 * Note: use of this routine is frequently mandatory, not just an optimization
3175 * to save a spin lock/unlock cycle, because we need to pin a buffer before
3176 * its state can change under us.
3177 */
3178static void
3179 PinBuffer_Locked(BufferDesc *buf)
3180 {
3181 uint32 buf_state;
3182
3183 /*
3184 * As explained, we don't expect any preexisting pins. That allows us to
3185 * manipulate the PrivateRefCount after releasing the spinlock.
3186 */
3187 Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
3188
3189 /*
3190 * Buffer can't have a preexisting pin, so mark its page as defined to
3191 * Valgrind (this is similar to the PinBuffer() case where the backend
3192 * doesn't already have a buffer pin)
3193 */
3194 VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(buf), BLCKSZ);
3195
3196 /*
3197 * Since we hold the buffer spinlock, we can update the buffer state and
3198 * release the lock in one operation.
3199 */
3200 buf_state = pg_atomic_read_u32(&buf->state);
3201 Assert(buf_state & BM_LOCKED);
3202 buf_state += BUF_REFCOUNT_ONE;
3203 UnlockBufHdr(buf, buf_state);
3204
3206}
3207
3208/*
3209 * Support for waking up another backend that is waiting for the cleanup lock
3210 * to be released using BM_PIN_COUNT_WAITER.
3211 *
3212 * See LockBufferForCleanup().
3213 *
3214 * Expected to be called just after releasing a buffer pin (in a BufferDesc,
3215 * not just reducing the backend-local pincount for the buffer).
3216 */
3217static void
3218 WakePinCountWaiter(BufferDesc *buf)
3219 {
3220 /*
3221 * Acquire the buffer header lock, re-check that there's a waiter. Another
3222 * backend could have unpinned this buffer, and already woken up the
3223 * waiter.
3224 *
3225 * There's no danger of the buffer being replaced after we unpinned it
3226 * above, as it's pinned by the waiter. The waiter removes
3227 * BM_PIN_COUNT_WAITER if it stops waiting for a reason other than this
3228 * backend waking it up.
3229 */
3230 uint32 buf_state = LockBufHdr(buf);
3231
3232 if ((buf_state & BM_PIN_COUNT_WAITER) &&
3233 BUF_STATE_GET_REFCOUNT(buf_state) == 1)
3234 {
3235 /* we just released the last pin other than the waiter's */
3236 int wait_backend_pgprocno = buf->wait_backend_pgprocno;
3237
3238 buf_state &= ~BM_PIN_COUNT_WAITER;
3239 UnlockBufHdr(buf, buf_state);
3240 ProcSendSignal(wait_backend_pgprocno);
3241 }
3242 else
3243 UnlockBufHdr(buf, buf_state);
3244}
3245
3246/*
3247 * UnpinBuffer -- make buffer available for replacement.
3248 *
3249 * This should be applied only to shared buffers, never local ones. This
3250 * always adjusts CurrentResourceOwner.
3251 */
3252static void
3253 UnpinBuffer(BufferDesc *buf)
3254 {
3255 Buffer b = BufferDescriptorGetBuffer(buf);
3256 
3257 ResourceOwnerForgetBuffer(CurrentResourceOwner, b);
3258 UnpinBufferNoOwner(buf);
3259 }
3260
3261static void
3262 UnpinBufferNoOwner(BufferDesc *buf)
3263 {
3264 PrivateRefCountEntry *ref;
3265 Buffer b = BufferDescriptorGetBuffer(buf);
3266 
3267 Assert(!BufferIsLocal(b));
3268 
3269 /* not moving as we're likely deleting it soon anyway */
3270 ref = GetPrivateRefCountEntry(b, false);
3271 Assert(ref != NULL);
3272 Assert(ref->refcount > 0);
3273 ref->refcount--;
3274 if (ref->refcount == 0)
3275 {
3276 uint32 buf_state;
3277 uint32 old_buf_state;
3278
3279 /*
3280 * Mark buffer non-accessible to Valgrind.
3281 *
3282 * Note that the buffer may have already been marked non-accessible
3283 * within access method code that enforces that buffers are only
3284 * accessed while a buffer lock is held.
3285 */
3286 VALGRIND_MAKE_MEM_NOACCESS(BufHdrGetBlock(buf), BLCKSZ);
3287
3288 /*
3289 * I'd better not still hold the buffer content lock. Can't use
3290 * BufferIsLockedByMe(), as that asserts the buffer is pinned.
3291 */
3292 Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf)));
3293
3294 /*
3295 * Decrement the shared reference count.
3296 *
3297 * Since buffer spinlock holder can update status using just write,
3298 * it's not safe to use atomic decrement here; thus use a CAS loop.
3299 */
3300 old_buf_state = pg_atomic_read_u32(&buf->state);
3301 for (;;)
3302 {
3303 if (old_buf_state & BM_LOCKED)
3304 old_buf_state = WaitBufHdrUnlocked(buf);
3305
3306 buf_state = old_buf_state;
3307
3308 buf_state -= BUF_REFCOUNT_ONE;
3309
3310 if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
3311 buf_state))
3312 break;
3313 }
3314
3315 /* Support LockBufferForCleanup() */
3316 if (buf_state & BM_PIN_COUNT_WAITER)
3317 WakePinCountWaiter(buf);
3318 
3319 ForgetPrivateRefCountEntry(ref);
3320 }
3321}
3322
3323inline void
3325{
3327
3329 ref->refcount++;
3330
3332}
3333
3334#define ST_SORT sort_checkpoint_bufferids
3335#define ST_ELEMENT_TYPE CkptSortItem
3336#define ST_COMPARE(a, b) ckpt_buforder_comparator(a, b)
3337#define ST_SCOPE static
3338#define ST_DEFINE
3339#include "lib/sort_template.h"
3340
3341/*
3342 * BufferSync -- Write out all dirty buffers in the pool.
3343 *
3344 * This is called at checkpoint time to write out all dirty shared buffers.
3345 * The checkpoint request flags should be passed in. If CHECKPOINT_FAST is
3346 * set, we disable delays between writes; if CHECKPOINT_IS_SHUTDOWN,
3347 * CHECKPOINT_END_OF_RECOVERY or CHECKPOINT_FLUSH_UNLOGGED is set, we write
3348 * even unlogged buffers, which are otherwise skipped. The remaining flags
3349 * currently have no effect here.
3350 */
3351static void
3352BufferSync(int flags)
3353{
3354 uint32 buf_state;
3355 int buf_id;
3356 int num_to_scan;
3357 int num_spaces;
3358 int num_processed;
3359 int num_written;
3360 CkptTsStatus *per_ts_stat = NULL;
3361 Oid last_tsid;
3362 binaryheap *ts_heap;
3363 int i;
3364 uint32 mask = BM_DIRTY;
3365 WritebackContext wb_context;
3366
3367 /*
3368 * Unless this is a shutdown checkpoint or we have been explicitly told,
3369 * we write only permanent, dirty buffers. But at shutdown or end of
3370 * recovery, we write all dirty buffers.
3371 */
3374 mask |= BM_PERMANENT;
3375
3376 /*
3377 * Loop over all buffers, and mark the ones that need to be written with
3378 * BM_CHECKPOINT_NEEDED. Count them as we go (num_to_scan), so that we
3379 * can estimate how much work needs to be done.
3380 *
3381 * This allows us to write only those pages that were dirty when the
3382 * checkpoint began, and not those that get dirtied while it proceeds.
3383 * Whenever a page with BM_CHECKPOINT_NEEDED is written out, either by us
3384 * later in this function, or by normal backends or the bgwriter cleaning
3385 * scan, the flag is cleared. Any buffer dirtied after this point won't
3386 * have the flag set.
3387 *
3388 * Note that if we fail to write some buffer, we may leave buffers with
3389 * BM_CHECKPOINT_NEEDED still set. This is OK since any such buffer would
3390 * certainly need to be written for the next checkpoint attempt, too.
3391 */
3392 num_to_scan = 0;
3393 for (buf_id = 0; buf_id < NBuffers; buf_id++)
3394 {
3395 BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
3396
3397 /*
3398 * Header spinlock is enough to examine BM_DIRTY, see comment in
3399 * SyncOneBuffer.
3400 */
3401 buf_state = LockBufHdr(bufHdr);
3402
3403 if ((buf_state & mask) == mask)
3404 {
3405 CkptSortItem *item;
3406
3407 buf_state |= BM_CHECKPOINT_NEEDED;
3408
3409 item = &CkptBufferIds[num_to_scan++];
3410 item->buf_id = buf_id;
3411 item->tsId = bufHdr->tag.spcOid;
3412 item->relNumber = BufTagGetRelNumber(&bufHdr->tag);
3413 item->forkNum = BufTagGetForkNum(&bufHdr->tag);
3414 item->blockNum = bufHdr->tag.blockNum;
3415 }
3416
3417 UnlockBufHdr(bufHdr, buf_state);
3418
3419 /* Check for barrier events in case NBuffers is large. */
3420 if (ProcSignalBarrierPending)
3421 ProcessProcSignalBarrier();
3422 }
3423
3424 if (num_to_scan == 0)
3425 return; /* nothing to do */
3426
3427 WritebackContextInit(&wb_context, &checkpoint_flush_after);
3428
3429 TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_scan);
3430
3431 /*
3432 * Sort buffers that need to be written to reduce the likelihood of random
3433 * IO. The sorting is also important for the implementation of balancing
3434 * writes between tablespaces. Without balancing writes we'd potentially
3435 * end up writing to the tablespaces one-by-one; possibly overloading the
3436 * underlying system.
3437 */
3438 sort_checkpoint_bufferids(CkptBufferIds, num_to_scan);
3439
3440 num_spaces = 0;
3441
3442 /*
3443 * Allocate progress status for each tablespace with buffers that need to
3444 * be flushed. This requires the to-be-flushed array to be sorted.
3445 */
3446 last_tsid = InvalidOid;
3447 for (i = 0; i < num_to_scan; i++)
3448 {
3449 CkptTsStatus *s;
3450 Oid cur_tsid;
3451
3452 cur_tsid = CkptBufferIds[i].tsId;
3453
3454 /*
3455 * Grow array of per-tablespace status structs, every time a new
3456 * tablespace is found.
3457 */
3458 if (last_tsid == InvalidOid || last_tsid != cur_tsid)
3459 {
3460 Size sz;
3461
3462 num_spaces++;
3463
3464 /*
3465 * Not worth adding grow-by-power-of-2 logic here - even with a
3466 * few hundred tablespaces this should be fine.
3467 */
3468 sz = sizeof(CkptTsStatus) * num_spaces;
3469
3470 if (per_ts_stat == NULL)
3471 per_ts_stat = (CkptTsStatus *) palloc(sz);
3472 else
3473 per_ts_stat = (CkptTsStatus *) repalloc(per_ts_stat, sz);
3474
3475 s = &per_ts_stat[num_spaces - 1];
3476 memset(s, 0, sizeof(*s));
3477 s->tsId = cur_tsid;
3478
3479 /*
3480 * The first buffer in this tablespace. As CkptBufferIds is sorted
3481 * by tablespace all (s->num_to_scan) buffers in this tablespace
3482 * will follow afterwards.
3483 */
3484 s->index = i;
3485
3486 /*
3487 * progress_slice will be determined once we know how many buffers
3488 * are in each tablespace, i.e. after this loop.
3489 */
3490
3491 last_tsid = cur_tsid;
3492 }
3493 else
3494 {
3495 s = &per_ts_stat[num_spaces - 1];
3496 }
3497
3498 s->num_to_scan++;
3499
3500 /* Check for barrier events. */
3501 if (ProcSignalBarrierPending)
3502 ProcessProcSignalBarrier();
3503 }
3504
3505 Assert(num_spaces > 0);
3506
3507 /*
3508 * Build a min-heap over the write-progress in the individual tablespaces,
3509 * and compute how large a portion of the total progress a single
3510 * processed buffer is.
3511 */
3512 ts_heap = binaryheap_allocate(num_spaces,
3513 ts_ckpt_progress_comparator,
3514 NULL);
3515
3516 for (i = 0; i < num_spaces; i++)
3517 {
3518 CkptTsStatus *ts_stat = &per_ts_stat[i];
3519
3520 ts_stat->progress_slice = (float8) num_to_scan / ts_stat->num_to_scan;
3521
3522 binaryheap_add_unordered(ts_heap, PointerGetDatum(ts_stat));
3523 }
3524
3525 binaryheap_build(ts_heap);
3526
3527 /*
3528 * Iterate through to-be-checkpointed buffers and write the ones (still)
3529 * marked with BM_CHECKPOINT_NEEDED. The writes are balanced between
3530 * tablespaces; otherwise the sorting would lead to only one tablespace
3531 * receiving writes at a time, making inefficient use of the hardware.
3532 */
3533 num_processed = 0;
3534 num_written = 0;
3535 while (!binaryheap_empty(ts_heap))
3536 {
3537 BufferDesc *bufHdr = NULL;
3538 CkptTsStatus *ts_stat = (CkptTsStatus *)
3539 DatumGetPointer(binaryheap_first(ts_heap));
3540
3541 buf_id = CkptBufferIds[ts_stat->index].buf_id;
3542 Assert(buf_id != -1);
3543
3544 bufHdr = GetBufferDescriptor(buf_id);
3545
3546 num_processed++;
3547
3548 /*
3549 * We don't need to acquire the lock here, because we're only looking
3550 * at a single bit. It's possible that someone else writes the buffer
3551 * and clears the flag right after we check, but that doesn't matter
3552 * since SyncOneBuffer will then do nothing. However, there is a
3553 * further race condition: it's conceivable that between the time we
3554 * examine the bit here and the time SyncOneBuffer acquires the lock,
3555 * someone else not only wrote the buffer but replaced it with another
3556 * page and dirtied it. In that improbable case, SyncOneBuffer will
3557 * write the buffer though we didn't need to. It doesn't seem worth
3558 * guarding against this, though.
3559 */
3560 if (pg_atomic_read_u32(&bufHdr->state) & BM_CHECKPOINT_NEEDED)
3561 {
3562 if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
3563 {
3564 TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
3565 PendingCheckpointerStats.buffers_written++;
3566 num_written++;
3567 }
3568 }
3569
3570 /*
3571 * Measure progress independently of actually having to flush the buffer
3572 * - otherwise writing becomes unbalanced.
3573 */
3574 ts_stat->progress += ts_stat->progress_slice;
3575 ts_stat->num_scanned++;
3576 ts_stat->index++;
3577
3578 /* Have all the buffers from the tablespace been processed? */
3579 if (ts_stat->num_scanned == ts_stat->num_to_scan)
3580 {
3581 binaryheap_remove_first(ts_heap);
3582 }
3583 else
3584 {
3585 /* update heap with the new progress */
3586 binaryheap_replace_first(ts_heap, PointerGetDatum(ts_stat));
3587 }
3588
3589 /*
3590 * Sleep to throttle our I/O rate.
3591 *
3592 * (This will check for barrier events even if it doesn't sleep.)
3593 */
3594 CheckpointWriteDelay(flags, (double) num_processed / num_to_scan);
3595 }
3596
3597 /*
3598 * Issue all pending flushes. Only checkpointer calls BufferSync(), so
3599 * IOContext will always be IOCONTEXT_NORMAL.
3600 */
3601 IssuePendingWritebacks(&wb_context, IOCONTEXT_NORMAL);
3602
3603 pfree(per_ts_stat);
3604 per_ts_stat = NULL;
3605 binaryheap_free(ts_heap);
3606
3607 /*
3608 * Update checkpoint statistics. As noted above, this doesn't include
3609 * buffers written by other backends or bgwriter scan.
3610 */
3611 CheckpointStats.ckpt_bufs_written += num_written;
3612
3613 TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_scan);
3614}
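/*
 * Illustrative sketch (editor's addition, not part of bufmgr.c): the
 * tablespace balancing above boils down to scaling each tablespace's
 * per-buffer progress so that every tablespace reaches the same total when
 * it finishes.  With T buffers to write overall and N of them in one
 * tablespace, each written buffer advances that tablespace by T/N, which
 * keeps "progress" comparable across tablespaces and lets the min-heap
 * always pick the one furthest behind.  The example_* name is hypothetical.
 */
#ifdef NOT_USED					/* example only, not compiled */
static float8
example_ts_progress(int total_to_scan, int ts_num_to_scan, int ts_num_scanned)
{
	float8		progress_slice = (float8) total_to_scan / ts_num_to_scan;

	return progress_slice * ts_num_scanned;
}
#endif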
3615
3616/*
3617 * BgBufferSync -- Write out some dirty buffers in the pool.
3618 *
3619 * This is called periodically by the background writer process.
3620 *
3621 * Returns true if it's appropriate for the bgwriter process to go into
3622 * low-power hibernation mode. (This happens if the strategy clock-sweep
3623 * has been "lapped" and no buffer allocations have occurred recently,
3624 * or if the bgwriter has been effectively disabled by setting
3625 * bgwriter_lru_maxpages to 0.)
3626 */
3627bool
3628 BgBufferSync(WritebackContext *wb_context)
3629 {
3630 /* info obtained from freelist.c */
3631 int strategy_buf_id;
3632 uint32 strategy_passes;
3633 uint32 recent_alloc;
3634
3635 /*
3636 * Information saved between calls so we can determine the strategy
3637 * point's advance rate and avoid scanning already-cleaned buffers.
3638 */
3639 static bool saved_info_valid = false;
3640 static int prev_strategy_buf_id;
3641 static uint32 prev_strategy_passes;
3642 static int next_to_clean;
3643 static uint32 next_passes;
3644
3645 /* Moving averages of allocation rate and clean-buffer density */
3646 static float smoothed_alloc = 0;
3647 static float smoothed_density = 10.0;
3648
3649 /* Potentially these could be tunables, but for now, not */
3650 float smoothing_samples = 16;
3651 float scan_whole_pool_milliseconds = 120000.0;
3652
3653 /* Used to compute how far we scan ahead */
3654 long strategy_delta;
3655 int bufs_to_lap;
3656 int bufs_ahead;
3657 float scans_per_alloc;
3658 int reusable_buffers_est;
3659 int upcoming_alloc_est;
3660 int min_scan_buffers;
3661
3662 /* Variables for the scanning loop proper */
3663 int num_to_scan;
3664 int num_written;
3665 int reusable_buffers;
3666
3667 /* Variables for final smoothed_density update */
3668 long new_strategy_delta;
3669 uint32 new_recent_alloc;
3670
3671 /*
3672 * Find out where the clock-sweep currently is, and how many buffer
3673 * allocations have happened since our last call.
3674 */
3675 strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc);
3676
3677 /* Report buffer alloc counts to pgstat */
3678 PendingBgWriterStats.buf_alloc += recent_alloc;
3679
3680 /*
3681 * If we're not running the LRU scan, just stop after doing the stats
3682 * stuff. We mark the saved state invalid so that we can recover sanely
3683 * if LRU scan is turned back on later.
3684 */
3685 if (bgwriter_lru_maxpages <= 0)
3686 {
3687 saved_info_valid = false;
3688 return true;
3689 }
3690
3691 /*
3692 * Compute strategy_delta = how many buffers have been scanned by the
3693 * clock-sweep since last time. If first time through, assume none. Then
3694 * see if we are still ahead of the clock-sweep, and if so, how many
3695 * buffers we could scan before we'd catch up with it and "lap" it. Note:
3696 * weird-looking coding of xxx_passes comparisons are to avoid bogus
3697 * behavior when the passes counts wrap around.
3698 */
3699 if (saved_info_valid)
3700 {
3701 int32 passes_delta = strategy_passes - prev_strategy_passes;
3702
3703 strategy_delta = strategy_buf_id - prev_strategy_buf_id;
3704 strategy_delta += (long) passes_delta * NBuffers;
3705
3706 Assert(strategy_delta >= 0);
3707
3708 if ((int32) (next_passes - strategy_passes) > 0)
3709 {
3710 /* we're one pass ahead of the strategy point */
3711 bufs_to_lap = strategy_buf_id - next_to_clean;
3712#ifdef BGW_DEBUG
3713 elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
3714 next_passes, next_to_clean,
3715 strategy_passes, strategy_buf_id,
3716 strategy_delta, bufs_to_lap);
3717#endif
3718 }
3719 else if (next_passes == strategy_passes &&
3720 next_to_clean >= strategy_buf_id)
3721 {
3722 /* on same pass, but ahead or at least not behind */
3723 bufs_to_lap = NBuffers - (next_to_clean - strategy_buf_id);
3724#ifdef BGW_DEBUG
3725 elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
3726 next_passes, next_to_clean,
3727 strategy_passes, strategy_buf_id,
3728 strategy_delta, bufs_to_lap);
3729#endif
3730 }
3731 else
3732 {
3733 /*
3734 * We're behind, so skip forward to the strategy point and start
3735 * cleaning from there.
3736 */
3737#ifdef BGW_DEBUG
3738 elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
3739 next_passes, next_to_clean,
3740 strategy_passes, strategy_buf_id,
3741 strategy_delta);
3742#endif
3743 next_to_clean = strategy_buf_id;
3744 next_passes = strategy_passes;
3745 bufs_to_lap = NBuffers;
3746 }
3747 }
3748 else
3749 {
3750 /*
3751 * Initializing at startup or after LRU scanning had been off. Always
3752 * start at the strategy point.
3753 */
3754#ifdef BGW_DEBUG
3755 elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
3756 strategy_passes, strategy_buf_id);
3757#endif
3758 strategy_delta = 0;
3759 next_to_clean = strategy_buf_id;
3760 next_passes = strategy_passes;
3761 bufs_to_lap = NBuffers;
3762 }
3763
3764 /* Update saved info for next time */
3765 prev_strategy_buf_id = strategy_buf_id;
3766 prev_strategy_passes = strategy_passes;
3767 saved_info_valid = true;
3768
3769 /*
3770 * Compute how many buffers had to be scanned for each new allocation, ie,
3771 * 1/density of reusable buffers, and track a moving average of that.
3772 *
3773 * If the strategy point didn't move, we don't update the density estimate
3774 */
3775 if (strategy_delta > 0 && recent_alloc > 0)
3776 {
3777 scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
3778 smoothed_density += (scans_per_alloc - smoothed_density) /
3779 smoothing_samples;
3780 }
3781
3782 /*
3783 * Estimate how many reusable buffers there are between the current
3784 * strategy point and where we've scanned ahead to, based on the smoothed
3785 * density estimate.
3786 */
3787 bufs_ahead = NBuffers - bufs_to_lap;
3788 reusable_buffers_est = (float) bufs_ahead / smoothed_density;
3789
3790 /*
3791 * Track a moving average of recent buffer allocations. Here, rather than
3792 * a true average we want a fast-attack, slow-decline behavior: we
3793 * immediately follow any increase.
3794 */
3795 if (smoothed_alloc <= (float) recent_alloc)
3796 smoothed_alloc = recent_alloc;
3797 else
3798 smoothed_alloc += ((float) recent_alloc - smoothed_alloc) /
3799 smoothing_samples;
3800
3801 /* Scale the estimate by a GUC to allow more aggressive tuning. */
3802 upcoming_alloc_est = (int) (smoothed_alloc * bgwriter_lru_multiplier);
3803
3804 /*
3805 * If recent_alloc remains at zero for many cycles, smoothed_alloc will
3806 * eventually underflow to zero, and the underflows produce annoying
3807 * kernel warnings on some platforms. Once upcoming_alloc_est has gone to
3808 * zero, there's no point in tracking smaller and smaller values of
3809 * smoothed_alloc, so just reset it to exactly zero to avoid this
3810 * syndrome. It will pop back up as soon as recent_alloc increases.
3811 */
3812 if (upcoming_alloc_est == 0)
3813 smoothed_alloc = 0;
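/*
 * Illustrative sketch (not part of bufmgr.c): the fast-attack, slow-decline
 * filter used for smoothed_alloc above, isolated as a helper.  Increases are
 * followed immediately, decreases are blended in like an ordinary moving
 * average, and the caller resets the value to exactly zero once the scaled
 * estimate rounds down to nothing, as done just above.
 */
#if 0							/* example only, not compiled */
static float
fast_attack_slow_decline(float smoothed, float observation, float samples)
{
	if (observation >= smoothed)
		return observation;		/* follow any increase immediately */

	/* otherwise decay slowly toward the lower observation */
	return smoothed + (observation - smoothed) / samples;
}
#endif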
3814
3815 /*
3816 * Even in cases where there's been little or no buffer allocation
3817 * activity, we want to make a small amount of progress through the buffer
3818 * cache so that as many reusable buffers as possible are clean after an
3819 * idle period.
3820 *
3821 * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many times
3822 * the BGW will be called during the scan_whole_pool time; slice the
3823 * buffer pool into that many sections.
3824 */
3825 min_scan_buffers = (int) (NBuffers / (scan_whole_pool_milliseconds / BgWriterDelay));
3826
3827 if (upcoming_alloc_est < (min_scan_buffers + reusable_buffers_est))
3828 {
3829#ifdef BGW_DEBUG
3830 elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
3831 upcoming_alloc_est, min_scan_buffers, reusable_buffers_est);
3832#endif
3833 upcoming_alloc_est = min_scan_buffers + reusable_buffers_est;
3834 }
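/*
 * Worked example (not part of bufmgr.c), assuming the default bgwriter_delay
 * of 200ms and a two-minute whole-pool scan target: the bgwriter runs about
 * 120000 / 200 = 600 times per target period, so with shared_buffers = 128MB
 * (NBuffers = 16384) min_scan_buffers is 16384 / 600, i.e. about 27 buffers
 * of guaranteed progress per cycle.  If upcoming_alloc_est were, say, 10, it
 * would be raised to 27 + reusable_buffers_est by the clamp above.
 */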
3835
3836 /*
3837 * Now write out dirty reusable buffers, working forward from the
3838 * next_to_clean point, until we have lapped the strategy scan, or cleaned
3839 * enough buffers to match our estimate of the next cycle's allocation
3840 * requirements, or hit the bgwriter_lru_maxpages limit.
3841 */
3842
3843 num_to_scan = bufs_to_lap;
3844 num_written = 0;
3845 reusable_buffers = reusable_buffers_est;
3846
3847 /* Execute the LRU scan */
3848 while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
3849 {
3850 int sync_state = SyncOneBuffer(next_to_clean, true,
3851 wb_context);
3852
3853 if (++next_to_clean >= NBuffers)
3854 {
3855 next_to_clean = 0;
3856 next_passes++;
3857 }
3858 num_to_scan--;
3859
3860 if (sync_state & BUF_WRITTEN)
3861 {
3862 reusable_buffers++;
3863 if (++num_written >= bgwriter_lru_maxpages)
3864 {
3866 break;
3867 }
3868 }
3869 else if (sync_state & BUF_REUSABLE)
3870 reusable_buffers++;
3871 }
3872
3874
3875#ifdef BGW_DEBUG
3876 elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
3877 recent_alloc, smoothed_alloc, strategy_delta, bufs_ahead,
3878 smoothed_density, reusable_buffers_est, upcoming_alloc_est,
3879 bufs_to_lap - num_to_scan,
3880 num_written,
3881 reusable_buffers - reusable_buffers_est);
3882#endif
3883
3884 /*
3885 * Consider the above scan as being like a new allocation scan.
3886 * Characterize its density and update the smoothed one based on it. This
3887 * effectively halves the moving average period in cases where both the
3888 * strategy and the background writer are doing some useful scanning,
3889 * which is helpful because a long memory isn't as desirable on the
3890 * density estimates.
3891 */
3892 new_strategy_delta = bufs_to_lap - num_to_scan;
3893 new_recent_alloc = reusable_buffers - reusable_buffers_est;
3894 if (new_strategy_delta > 0 && new_recent_alloc > 0)
3895 {
3896 scans_per_alloc = (float) new_strategy_delta / (float) new_recent_alloc;
3897 smoothed_density += (scans_per_alloc - smoothed_density) /
3898 smoothing_samples;
3899
3900#ifdef BGW_DEBUG
3901 elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f",
3902 new_recent_alloc, new_strategy_delta,
3903 scans_per_alloc, smoothed_density);
3904#endif
3905 }
3906
3907 /* Return true if OK to hibernate */
3908 return (bufs_to_lap == 0 && recent_alloc == 0);
3909}
3910
3911/*
3912 * SyncOneBuffer -- process a single buffer during syncing.
3913 *
3914 * If skip_recently_used is true, we don't write currently-pinned buffers, nor
3915 * buffers marked recently used, as these are not replacement candidates.
3916 *
3917 * Returns a bitmask containing the following flag bits:
3918 * BUF_WRITTEN: we wrote the buffer.
3919 * BUF_REUSABLE: buffer is available for replacement, ie, it has
3920 * pin count 0 and usage count 0.
3921 *
3922 * (BUF_WRITTEN could be set in error if FlushBuffer finds the buffer clean
3923 * after locking it, but we don't care all that much.)
3924 */
3925static int
3926SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
3927{
3928 BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
3929 int result = 0;
3930 uint32 buf_state;
3931 BufferTag tag;
3932
3933 /* Make sure we can handle the pin */
3936
3937 /*
3938 * Check whether buffer needs writing.
3939 *
3940 * We can make this check without taking the buffer content lock so long
3941 * as we mark pages dirty in access methods *before* logging changes with
3942 * XLogInsert(): if someone marks the buffer dirty just after our check, we
3943 * need not worry, because checkpoint.redo points before the log record for
3944 * the upcoming changes, so we are not required to write such a dirty buffer.
3945 */
3946 buf_state = LockBufHdr(bufHdr);
3947
3948 if (BUF_STATE_GET_REFCOUNT(buf_state) == 0 &&
3949 BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
3950 {
3951 result |= BUF_REUSABLE;
3952 }
3953 else if (skip_recently_used)
3954 {
3955 /* Caller told us not to write recently-used buffers */
3956 UnlockBufHdr(bufHdr, buf_state);
3957 return result;
3958 }
3959
3960 if (!(buf_state & BM_VALID) || !(buf_state & BM_DIRTY))
3961 {
3962 /* It's clean, so nothing to do */
3963 UnlockBufHdr(bufHdr, buf_state);
3964 return result;
3965 }
3966
3967 /*
3968 * Pin it, share-lock it, write it. (FlushBuffer will do nothing if the
3969 * buffer is clean by the time we've locked it.)
3970 */
3971 PinBuffer_Locked(bufHdr);
3972
3974
3975 tag = bufHdr->tag;
3976
3977 UnpinBuffer(bufHdr);
3978
3979 /*
3980 * SyncOneBuffer() is only called by checkpointer and bgwriter, so
3981 * IOContext will always be IOCONTEXT_NORMAL.
3982 */
3984
3985 return result | BUF_WRITTEN;
3986}
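/*
 * Illustrative sketch (not part of bufmgr.c): how a caller interprets
 * SyncOneBuffer's result bitmask; this mirrors the bookkeeping in the LRU
 * scan in BgBufferSync() above.
 */
#if 0							/* example only, not compiled */
{
	int			sync_state = SyncOneBuffer(next_to_clean, true, wb_context);

	if (sync_state & BUF_WRITTEN)
	{
		reusable_buffers++;		/* we wrote it, and it is now reusable */
		num_written++;
	}
	else if (sync_state & BUF_REUSABLE)
		reusable_buffers++;		/* already clean and unused: no I/O needed */
}
#endif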
3987
3988/*
3989 * AtEOXact_Buffers - clean up at end of transaction.
3990 *
3991 * As of PostgreSQL 8.0, buffer pins should get released by the
3992 * ResourceOwner mechanism. This routine is just a debugging
3993 * cross-check that no pins remain.
3994 */
3995void
3996AtEOXact_Buffers(bool isCommit)
3997{
3999
4000 AtEOXact_LocalBuffers(isCommit);
4001
4003}
4004
4005/*
4006 * Initialize access to shared buffer pool
4007 *
4008 * This is called during backend startup (whether standalone or under the
4009 * postmaster). It sets up for this backend's access to the already-existing
4010 * buffer pool.
4011 */
4012void
4014{
4015 HASHCTL hash_ctl;
4016
4017 /*
4018 * An advisory limit on the number of pins each backend should hold, based
4019 * on shared_buffers and the maximum number of connections possible.
4020 * That's very pessimistic, but outside toy-sized shared_buffers it should
4021 * allow plenty of pins. LimitAdditionalPins() and
4022 * GetAdditionalPinLimit() can be used to check the remaining balance.
4023 */
4025
4026 memset(&PrivateRefCountArray, 0, sizeof(PrivateRefCountArray));
4027
4028 hash_ctl.keysize = sizeof(int32);
4029 hash_ctl.entrysize = sizeof(PrivateRefCountEntry);
4030
4031 PrivateRefCountHash = hash_create("PrivateRefCount", 100, &hash_ctl,
4033
4034 /*
4035 * AtProcExit_Buffers needs LWLock access, and thereby has to be called at
4036 * the corresponding phase of backend shutdown.
4037 */
4038 Assert(MyProc != NULL);
4040}
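/*
 * Illustrative sketch (not part of bufmgr.c): the dynahash setup pattern used
 * above, assuming the hsearch.h API (HASHCTL, hash_create).  The flag
 * combination shown here, HASH_ELEM | HASH_BLOBS for a fixed-size binary key,
 * is an assumption, since the listing above does not show that argument.  The
 * table holds pin counts that overflow the small PrivateRefCountArray fast
 * path.
 */
#if 0							/* example only, not compiled */
{
	HASHCTL		ctl;
	HTAB	   *refcounts;

	ctl.keysize = sizeof(int32);	/* the Buffer number is the key */
	ctl.entrysize = sizeof(PrivateRefCountEntry);
	refcounts = hash_create("PrivateRefCount", 100, &ctl,
							HASH_ELEM | HASH_BLOBS);
}
#endif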
4041
4042/*
4043 * During backend exit, ensure that we released all shared-buffer locks and
4044 * assert that we have no remaining pins.
4045 */
4046static void
4048{
4049 UnlockBuffers();
4050
4052
4053 /* localbuf.c needs a chance too */
4055}
4056
4057/*
4058 * CheckForBufferLeaks - ensure this backend holds no buffer pins
4059 *
4060 * As of PostgreSQL 8.0, buffer pins should get released by the
4061 * ResourceOwner mechanism. This routine is just a debugging
4062 * cross-check that no pins remain.
4063 */
4064static void
4066{
4067#ifdef USE_ASSERT_CHECKING
4068 int RefCountErrors = 0;
4070 int i;
4071 char *s;
4072
4073 /* check the array */
4074 for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
4075 {
4076 res = &PrivateRefCountArray[i];
4077
4078 if (res->buffer != InvalidBuffer)
4079 {
4081 elog(WARNING, "buffer refcount leak: %s", s);
4082 pfree(s);
4083
4084 RefCountErrors++;
4085 }
4086 }
4087
4088 /* if necessary search the hash */
4090 {
4091 HASH_SEQ_STATUS hstat;
4092
4094 while ((res = (PrivateRefCountEntry *) hash_seq_search(&hstat)) != NULL)
4095 {
4097 elog(WARNING, "buffer refcount leak: %s", s);
4098 pfree(s);
4099 RefCountErrors++;
4100 }
4101 }
4102
4103 Assert(RefCountErrors == 0);
4104#endif
4105}
4106
4107#ifdef USE_ASSERT_CHECKING
4108/*
4109 * Check for exclusive-locked catalog buffers. This is the core of
4110 * AssertCouldGetRelation().
4111 *
4112 * A backend would self-deadlock on LWLocks if the catalog scan read the
4113 * exclusive-locked buffer. The main threat is exclusive-locked buffers of
4114 * catalogs used in relcache, because a catcache search on any catalog may
4115 * build that catalog's relcache entry. We don't have an inventory of
4116 * catalogs relcache uses, so just check buffers of most catalogs.
4117 *
4118 * It's better to minimize waits while holding an exclusive buffer lock, so it
4119 * would be nice to broaden this check not to be catalog-specific. However,
4120 * bttextcmp() accesses pg_collation, and non-core opclasses might similarly
4121 * read tables. That is deadlock-free as long as there's no loop in the
4122 * dependency graph: modifying table A may cause an opclass to read table B,
4123 * but it must not cause a read of table A.
4124 */
4125void
4126AssertBufferLocksPermitCatalogRead(void)
4127{
4128 ForEachLWLockHeldByMe(AssertNotCatalogBufferLock, NULL);
4129}
4130
4131static void
4132AssertNotCatalogBufferLock(LWLock *lock, LWLockMode mode,
4133 void *unused_context)
4134{
4135 BufferDesc *bufHdr;
4136 BufferTag tag;
4137 Oid relid;
4138
4139 if (mode != LW_EXCLUSIVE)
4140 return;
4141
4142 if (!((BufferDescPadded *) lock > BufferDescriptors &&
4144 return; /* not a buffer lock */
4145
4146 bufHdr = (BufferDesc *)
4147 ((char *) lock - offsetof(BufferDesc, content_lock));
4148 tag = bufHdr->tag;
4149
4150 /*
4151 * This relNumber==relid assumption holds until a catalog experiences
4152 * VACUUM FULL or similar. After a command like that, relNumber will be
4153 * in the normal (non-catalog) range, and we lose the ability to detect
4154 * hazardous access to that catalog. Calling RelidByRelfilenumber() would
4155 * close that gap, but RelidByRelfilenumber() might then deadlock with a
4156 * held lock.
4157 */
4158 relid = tag.relNumber;
4159
4160 if (IsCatalogTextUniqueIndexOid(relid)) /* see comments at the callee */
4161 return;
4162
4164}
4165#endif
4166
4167
4168/*
4169 * Helper routine to issue warnings when a buffer is unexpectedly pinned
4170 */
4171char *
4173{
4174 BufferDesc *buf;
4175 int32 loccount;
4176 char *result;
4177 ProcNumber backend;
4178 uint32 buf_state;
4179
4181 if (BufferIsLocal(buffer))
4182 {
4184 loccount = LocalRefCount[-buffer - 1];
4185 backend = MyProcNumber;
4186 }
4187 else
4188 {
4190 loccount = GetPrivateRefCount(buffer);
4191 backend = INVALID_PROC_NUMBER;
4192 }
4193
4194 /* theoretically we should lock the bufhdr here */
4195 buf_state = pg_atomic_read_u32(&buf->state);
4196
4197 result = psprintf("[%03d] (rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
4198 buffer,
4200 BufTagGetForkNum(&buf->tag)).str,
4201 buf->tag.blockNum, buf_state & BUF_FLAG_MASK,
4202 BUF_STATE_GET_REFCOUNT(buf_state), loccount);
4203 return result;
4204}
4205
4206/*
4207 * CheckPointBuffers
4208 *
4209 * Flush all dirty blocks in buffer pool to disk at checkpoint time.
4210 *
4211 * Note: temporary relations do not participate in checkpoints, so they don't
4212 * need to be flushed.
4213 */
4214void
4216{
4217 BufferSync(flags);
4218}
4219
4220/*
4221 * BufferGetBlockNumber
4222 * Returns the block number associated with a buffer.
4223 *
4224 * Note:
4225 * Assumes that the buffer is valid and pinned, else the
4226 * value may be obsolete immediately...
4227 */
4230{
4231 BufferDesc *bufHdr;
4232
4234
4235 if (BufferIsLocal(buffer))
4236 bufHdr = GetLocalBufferDescriptor(-buffer - 1);
4237 else
4238 bufHdr = GetBufferDescriptor(buffer - 1);
4239
4240 /* pinned, so OK to read tag without spinlock */
4241 return bufHdr->tag.blockNum;
4242}
4243
4244/*
4245 * BufferGetTag
4246 * Returns the relfilelocator, fork number and block number associated with
4247 * a buffer.
4248 */
4249void
4251 BlockNumber *blknum)
4252{
4253 BufferDesc *bufHdr;
4254
4255 /* Do the same checks as BufferGetBlockNumber. */
4257
4258 if (BufferIsLocal(buffer))
4259 bufHdr = GetLocalBufferDescriptor(-buffer - 1);
4260 else
4261 bufHdr = GetBufferDescriptor(buffer - 1);
4262
4263 /* pinned, so OK to read tag without spinlock */
4264 *rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
4265 *forknum = BufTagGetForkNum(&bufHdr->tag);
4266 *blknum = bufHdr->tag.blockNum;
4267}
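/*
 * Illustrative sketch (not part of bufmgr.c): the buffer-number convention
 * the two functions above rely on.  Shared buffers are numbered 1..NBuffers
 * and map to descriptor index buffer - 1; local (temp-relation) buffers are
 * negative and map to index -buffer - 1; zero is InvalidBuffer.  The helper
 * name is hypothetical.
 */
#if 0							/* example only, not compiled */
static BufferDesc *
descriptor_for(Buffer buffer)
{
	Assert(BufferIsValid(buffer));

	if (BufferIsLocal(buffer))
		return GetLocalBufferDescriptor(-buffer - 1);
	return GetBufferDescriptor(buffer - 1);
}
#endif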
4268
4269/*
4270 * FlushBuffer
4271 * Physically write out a shared buffer.
4272 *
4273 * NOTE: this actually just passes the buffer contents to the kernel; the
4274 * real write to disk won't happen until the kernel feels like it. This
4275 * is okay from our point of view since we can redo the changes from WAL.
4276 * However, we will need to force the changes to disk via fsync before
4277 * we can checkpoint WAL.
4278 *
4279 * The caller must hold a pin on the buffer and have share-locked the
4280 * buffer contents. (Note: a share-lock does not prevent updates of
4281 * hint bits in the buffer, so the page could change while the write
4282 * is in progress, but we assume that that will not invalidate the data
4283 * written.)
4284 *
4285 * If the caller has an smgr reference for the buffer's relation, pass it
4286 * as the second parameter. If not, pass NULL.
4287 */
4288static void
4290 IOContext io_context)
4291{
4292 XLogRecPtr recptr;
4293 ErrorContextCallback errcallback;
4294 instr_time io_start;
4295 Block bufBlock;
4296 char *bufToWrite;
4297 uint32 buf_state;
4298
4299 /*
4300 * Try to start an I/O operation. If StartBufferIO returns false, then
4301 * someone else flushed the buffer before we could, so we need not do
4302 * anything.
4303 */
4304 if (!StartBufferIO(buf, false, false))
4305 return;
4306
4307 /* Setup error traceback support for ereport() */
4309 errcallback.arg = buf;
4310 errcallback.previous = error_context_stack;
4311 error_context_stack = &errcallback;
4312
4313 /* Find smgr relation for buffer */
4314 if (reln == NULL)
4316
4317 TRACE_POSTGRESQL_BUFFER_FLUSH_START(BufTagGetForkNum(&buf->tag),
4318 buf->tag.blockNum,
4322
4323 buf_state = LockBufHdr(buf);
4324
4325 /*
4326 * Run PageGetLSN while holding header lock, since we don't have the
4327 * buffer locked exclusively in all cases.
4328 */
4329 recptr = BufferGetLSN(buf);
4330
4331 /* To check if block content changes while flushing. - vadim 01/17/97 */
4332 buf_state &= ~BM_JUST_DIRTIED;
4333 UnlockBufHdr(buf, buf_state);
4334
4335 /*
4336 * Force XLOG flush up to buffer's LSN. This implements the basic WAL
4337 * rule that log updates must hit disk before any of the data-file changes
4338 * they describe do.
4339 *
4340 * However, this rule does not apply to unlogged relations, which will be
4341 * lost after a crash anyway. Most unlogged relation pages do not bear
4342 * LSNs since we never emit WAL records for them, and therefore flushing
4343 * up through the buffer LSN would be useless, but harmless. However,
4344 * GiST indexes use LSNs internally to track page-splits, and therefore
4345 * unlogged GiST pages bear "fake" LSNs generated by
4346 * GetFakeLSNForUnloggedRel. It is unlikely but possible that the fake
4347 * LSN counter could advance past the WAL insertion point; and if it did
4348 * happen, attempting to flush WAL through that location would fail, with
4349 * disastrous system-wide consequences. To make sure that can't happen,
4350 * skip the flush if the buffer isn't permanent.
4351 */
4352 if (buf_state & BM_PERMANENT)
4353 XLogFlush(recptr);
4354
4355 /*
4356 * Now it's safe to write the buffer to disk. Note that no one else should
4357 * have been able to write it while we were busy with log flushing,
4358 * because we got the exclusive right to perform I/O by setting the
4359 * BM_IO_IN_PROGRESS bit.
4360 */
4361 bufBlock = BufHdrGetBlock(buf);
4362
4363 /*
4364 * Update page checksum if desired. Since we have only shared lock on the
4365 * buffer, other processes might be updating hint bits in it, so we must
4366 * copy the page to private storage if we do checksumming.
4367 */
4368 bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
4369
4371
4372 /*
4373 * bufToWrite is either the shared buffer or a copy, as appropriate.
4374 */
4375 smgrwrite(reln,
4376 BufTagGetForkNum(&buf->tag),
4377 buf->tag.blockNum,
4378 bufToWrite,
4379 false);
4380
4381 /*
4382 * When a strategy is in use, only flushes of dirty buffers already in the
4383 * strategy ring are counted as strategy writes (IOCONTEXT
4384 * [BULKREAD|BULKWRITE|VACUUM] IOOP_WRITE) for the purpose of IO
4385 * statistics tracking.
4386 *
4387 * If a shared buffer initially added to the ring must be flushed before
4388 * being used, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE.
4389 *
4390 * If a shared buffer which was added to the ring later because the
4391 * current strategy buffer is pinned or in use or because all strategy
4392 * buffers were dirty and rejected (for BAS_BULKREAD operations only)
4393 * requires flushing, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE
4394 * (from_ring will be false).
4395 *
4396 * When a strategy is not in use, the write can only be a "regular" write
4397 * of a dirty shared buffer (IOCONTEXT_NORMAL IOOP_WRITE).
4398 */
4400 IOOP_WRITE, io_start, 1, BLCKSZ);
4401
4403
4404 /*
4405 * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
4406 * end the BM_IO_IN_PROGRESS state.
4407 */
4408 TerminateBufferIO(buf, true, 0, true, false);
4409
4410 TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(BufTagGetForkNum(&buf->tag),
4411 buf->tag.blockNum,
4415
4416 /* Pop the error context stack */
4417 error_context_stack = errcallback.previous;
4418}
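/*
 * Illustrative sketch (not part of bufmgr.c): the ordering FlushBuffer()
 * enforces, reduced to its essentials.  The names stand in for the function's
 * locals above.  WAL describing a permanent page must be durable before the
 * page is handed to the kernel, and the checksum is computed over a private
 * copy because hint-bit setters may still modify the shared copy under a
 * share lock.
 */
#if 0							/* example only, not compiled */
{
	XLogRecPtr	lsn = BufferGetLSN(buf);	/* read under the header lock */

	if (buf_state & BM_PERMANENT)
		XLogFlush(lsn);			/* 1. make the describing WAL durable */

	bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);

	smgrwrite(reln, BufTagGetForkNum(&buf->tag), buf->tag.blockNum,
			  bufToWrite, false);	/* 2. only now write the data page */
}
#endif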
4419
4420/*
4421 * Convenience wrapper around FlushBuffer() that locks/unlocks the buffer
4422 * before/after calling FlushBuffer().
4423 */
4424static void
4426 IOObject io_object, IOContext io_context)
4427{
4431}
4432
4433/*
4434 * RelationGetNumberOfBlocksInFork
4435 * Determines the current number of pages in the specified relation fork.
4436 *
4437 * Note that the accuracy of the result will depend on the details of the
4438 * relation's storage. For builtin AMs it'll be accurate, but for external AMs
4439 * it might not be.
4440 */
4443{
4444 if (RELKIND_HAS_TABLE_AM(relation->rd_rel->relkind))
4445 {
4446 /*
4447 * Not every table AM uses BLCKSZ-wide fixed-size blocks, so the tableam
4448 * API returns the size in bytes - but for the purpose of this routine we
4449 * want the number of blocks. Therefore divide, rounding
4450 * up.
4451 */
4452 uint64 szbytes;
4453
4454 szbytes = table_relation_size(relation, forkNum);
4455
4456 return (szbytes + (BLCKSZ - 1)) / BLCKSZ;
4457 }
4458 else if (RELKIND_HAS_STORAGE(relation->rd_rel->relkind))
4459 {
4460 return smgrnblocks(RelationGetSmgr(relation), forkNum);
4461 }
4462 else
4463 Assert(false);
4464
4465 return 0; /* keep compiler quiet */
4466}
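/*
 * Worked example (not part of bufmgr.c): the round-up division above, with
 * BLCKSZ = 8192.  A 20000-byte fork gives (20000 + 8191) / 8192 = 3 blocks,
 * while exactly 16384 bytes gives (16384 + 8191) / 8192 = 2 blocks, so a
 * partial trailing block is counted and exact multiples are not over-counted.
 */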
4467
4468/*
4469 * BufferIsPermanent
4470 * Determines whether a buffer will potentially still be around after
4471 * a crash. Caller must hold a buffer pin.
4472 */
4473bool
4475{
4476 BufferDesc *bufHdr;
4477
4478 /* Local buffers are used only for temp relations. */
4479 if (BufferIsLocal(buffer))
4480 return false;
4481
4482 /* Make sure we've got a real buffer, and that we hold a pin on it. */
4485
4486 /*
4487 * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
4488 * need not bother with the buffer header spinlock. Even if someone else
4489 * changes the buffer header state while we're doing this, the state is
4490 * changed atomically, so we'll read the old value or the new value, but
4491 * not random garbage.
4492 */
4493 bufHdr = GetBufferDescriptor(buffer - 1);
4494 return (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT) != 0;
4495}
4496
4497/*
4498 * BufferGetLSNAtomic
4499 * Retrieves the LSN of the buffer atomically using a buffer header lock.
4500 * This is necessary for some callers who may not have an exclusive lock
4501 * on the buffer.
4502 */
4505{
4506 char *page = BufferGetPage(buffer);
4507 BufferDesc *bufHdr;
4508 XLogRecPtr lsn;
4509 uint32 buf_state;
4510
4511 /*
4512 * If we don't need locking for correctness, fastpath out.
4513 */
4515 return PageGetLSN(page);
4516
4517 /* Make sure we've got a real buffer, and that we hold a pin on it. */
4520
4521 bufHdr = GetBufferDescriptor(buffer - 1);
4522 buf_state = LockBufHdr(bufHdr);
4523 lsn = PageGetLSN(page);
4524 UnlockBufHdr(bufHdr, buf_state);
4525
4526 return lsn;
4527}
4528
4529/* ---------------------------------------------------------------------
4530 * DropRelationBuffers
4531 *
4532 * This function removes from the buffer pool all the pages of the
4533 * specified relation forks that have block numbers >= firstDelBlock.
4534 * (In particular, with firstDelBlock = 0, all pages are removed.)
4535 * Dirty pages are simply dropped, without bothering to write them
4536 * out first. Therefore, this is NOT rollback-able, and so should be
4537 * used only with extreme caution!
4538 *
4539 * Currently, this is called only from smgr.c when the underlying file
4540 * is about to be deleted or truncated (firstDelBlock is needed for
4541 * the truncation case). The data in the affected pages would therefore
4542 * be deleted momentarily anyway, and there is no point in writing it.
4543 * It is the responsibility of higher-level code to ensure that the
4544 * deletion or truncation does not lose any data that could be needed
4545 * later. It is also the responsibility of higher-level code to ensure
4546 * that no other process could be trying to load more pages of the
4547 * relation into buffers.
4548 * --------------------------------------------------------------------
4549 */
4550void
4552 int nforks, BlockNumber *firstDelBlock)
4553{
4554 int i;
4555 int j;
4556 RelFileLocatorBackend rlocator;
4557 BlockNumber nForkBlock[MAX_FORKNUM];
4558 uint64 nBlocksToInvalidate = 0;
4559
4560 rlocator = smgr_reln->smgr_rlocator;
4561
4562 /* If it's a local relation, it's localbuf.c's problem. */
4563 if (RelFileLocatorBackendIsTemp(rlocator))
4564 {
4565 if (rlocator.backend == MyProcNumber)
4566 DropRelationLocalBuffers(rlocator.locator, forkNum, nforks,
4567 firstDelBlock);
4568
4569 return;
4570 }
4571
4572 /*
4573 * To remove all the pages of the specified relation forks from the buffer
4574 * pool, we would have to scan the entire buffer pool, but we can optimize
4575 * this by looking the buffers up in the BufMapping table, provided we know
4576 * the exact size of each fork of the relation. The exact size is required
4577 * to ensure that we don't leave behind any buffer for the relation being
4578 * dropped, as otherwise the background writer or checkpointer could PANIC
4579 * while flushing buffers that correspond to files that no longer exist.
4580 *
4581 * To know the exact size, we rely on the size we cached for each fork
4582 * during recovery, which limits the optimization to recovery and to
4583 * standbys; but we can easily extend it once we have a shared cache for
4584 * relation sizes.
4585 *
4586 * In recovery, we cache the value returned by the first lseek(SEEK_END),
4587 * and future writes keep the cached value up-to-date. See smgrextend.
4588 * It is possible that the value of the first lseek is smaller than the
4589 * actual number of existing blocks in the file, due to buggy Linux
4590 * kernels that might not have accounted for the recent write. But that
4591 * should be fine, because there must not be any buffers beyond that
4592 * file size.
4593 */
4594 for (i = 0; i < nforks; i++)
4595 {
4596 /* Get the number of blocks for a relation's fork */
4597 nForkBlock[i] = smgrnblocks_cached(smgr_reln, forkNum[i]);
4598
4599 if (nForkBlock[i] == InvalidBlockNumber)
4600 {
4601 nBlocksToInvalidate = InvalidBlockNumber;
4602 break;
4603 }
4604
4605 /* calculate the number of blocks to be invalidated */
4606 nBlocksToInvalidate += (nForkBlock[i] - firstDelBlock[i]);
4607 }
4608
4609 /*
4610 * We apply the optimization iff the total number of blocks to invalidate
4611 * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
4612 */
4613 if (BlockNumberIsValid(nBlocksToInvalidate) &&
4614 nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
4615 {
4616 for (j = 0; j < nforks; j++)
4617 FindAndDropRelationBuffers(rlocator.locator, forkNum[j],
4618 nForkBlock[j], firstDelBlock[j]);
4619 return;
4620 }
4621
4622 for (i = 0; i < NBuffers; i++)
4623 {
4624 BufferDesc *bufHdr = GetBufferDescriptor(i);
4625 uint32 buf_state;
4626
4627 /*
4628 * We can make this a tad faster by prechecking the buffer tag before
4629 * we attempt to lock the buffer; this saves a lot of lock
4630 * acquisitions in typical cases. It should be safe because the
4631 * caller must have AccessExclusiveLock on the relation, or some other
4632 * reason to be certain that no one is loading new pages of the rel
4633 * into the buffer pool. (Otherwise we might well miss such pages
4634 * entirely.) Therefore, while the tag might be changing while we
4635 * look at it, it can't be changing *to* a value we care about, only
4636 * *away* from such a value. So false negatives are impossible, and
4637 * false positives are safe because we'll recheck after getting the
4638 * buffer lock.
4639 *
4640 * We could check forkNum and blockNum as well as the rlocator, but
4641 * the incremental win from doing so seems small.
4642 */
4643 if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator))
4644 continue;
4645
4646 buf_state = LockBufHdr(bufHdr);
4647
4648 for (j = 0; j < nforks; j++)
4649 {
4650 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator) &&
4651 BufTagGetForkNum(&bufHdr->tag) == forkNum[j] &&
4652 bufHdr->tag.blockNum >= firstDelBlock[j])
4653 {
4654 InvalidateBuffer(bufHdr); /* releases spinlock */
4655 break;
4656 }
4657 }
4658 if (j >= nforks)
4659 UnlockBufHdr(bufHdr, buf_state);
4660 }
4661}
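/*
 * Worked example (not part of bufmgr.c): BUF_DROP_FULL_SCAN_THRESHOLD is
 * NBuffers / 32, so with shared_buffers = 128MB (NBuffers = 16384) the
 * threshold is 512 blocks, i.e. 4MB of relation data.  Dropping anything
 * smaller than that, when the fork sizes are cached, costs at most a few
 * hundred BufMapping lookups instead of a scan over all 16384 buffer headers.
 */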
4662
4663/* ---------------------------------------------------------------------
4664 * DropRelationsAllBuffers
4665 *
4666 * This function removes from the buffer pool all the pages of all
4667 * forks of the specified relations. It's equivalent to calling
4668 * DropRelationBuffers once per fork per relation with firstDelBlock = 0.
4669 * --------------------------------------------------------------------
4670 */
4671void
4672DropRelationsAllBuffers(SMgrRelation *smgr_reln, int nlocators)
4673{
4674 int i;
4675 int n = 0;
4676 SMgrRelation *rels;
4677 BlockNumber (*block)[MAX_FORKNUM + 1];
4678 uint64 nBlocksToInvalidate = 0;
4679 RelFileLocator *locators;
4680 bool cached = true;
4681 bool use_bsearch;
4682
4683 if (nlocators == 0)
4684 return;
4685
4686 rels = palloc(sizeof(SMgrRelation) * nlocators); /* non-local relations */
4687
4688 /* If it's a local relation, it's localbuf.c's problem. */
4689 for (i = 0; i < nlocators; i++)
4690 {
4691 if (RelFileLocatorBackendIsTemp(smgr_reln[i]->smgr_rlocator))
4692 {
4693 if (smgr_reln[i]->smgr_rlocator.backend == MyProcNumber)
4694 DropRelationAllLocalBuffers(smgr_reln[i]->smgr_rlocator.locator);
4695 }
4696 else
4697 rels[n++] = smgr_reln[i];
4698 }
4699
4700 /*
4701 * If there are no non-local relations, then we're done. Release the
4702 * memory and return.
4703 */
4704 if (n == 0)
4705 {
4706 pfree(rels);
4707 return;
4708 }
4709
4710 /*
4711 * This is used to remember the number of blocks for all the relations
4712 * forks.
4713 */
4714 block = (BlockNumber (*)[MAX_FORKNUM + 1])
4715 palloc(sizeof(BlockNumber) * n * (MAX_FORKNUM + 1));
4716
4717 /*
4718 * We can avoid scanning the entire buffer pool if we know the exact size
4719 * of each of the given relation forks. See DropRelationBuffers.
4720 */
4721 for (i = 0; i < n && cached; i++)
4722 {
4723 for (int j = 0; j <= MAX_FORKNUM; j++)
4724 {
4725 /* Get the number of blocks for a relation's fork. */
4726 block[i][j] = smgrnblocks_cached(rels[i], j);
4727
4728 /* We only need to consider the relation forks that exist. */
4729 if (block[i][j] == InvalidBlockNumber)
4730 {
4731 if (!smgrexists(rels[i], j))
4732 continue;
4733 cached = false;
4734 break;
4735 }
4736
4737 /* calculate the total number of blocks to be invalidated */
4738 nBlocksToInvalidate += block[i][j];
4739 }
4740 }
4741
4742 /*
4743 * We apply the optimization iff the total number of blocks to invalidate
4744 * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
4745 */
4746 if (cached && nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
4747 {
4748 for (i = 0; i < n; i++)
4749 {
4750 for (int j = 0; j <= MAX_FORKNUM; j++)
4751 {
4752 /* ignore relation forks that don't exist */
4753 if (!BlockNumberIsValid(block[i][j]))
4754 continue;
4755
4756 /* drop all the buffers for a particular relation fork */
4757 FindAndDropRelationBuffers(rels[i]->smgr_rlocator.locator,
4758 j, block[i][j], 0);
4759 }
4760 }
4761
4762 pfree(block);
4763 pfree(rels);
4764 return;
4765 }
4766
4767 pfree(block);
4768 locators = palloc(sizeof(RelFileLocator) * n); /* non-local relations */
4769 for (i = 0; i < n; i++)
4770 locators[i] = rels[i]->smgr_rlocator.locator;
4771
4772 /*
4773 * For a low number of relations to drop, just use a simple walk-through,
4774 * to save the bsearch overhead. The threshold is more of a guess than an
4775 * exactly determined value, as it depends on many factors (CPU and RAM
4776 * speeds, amount of shared buffers, etc.).
4777 */
4778 use_bsearch = n > RELS_BSEARCH_THRESHOLD;
4779
4780 /* sort the list of rlocators if necessary */
4781 if (use_bsearch)
4782 qsort(locators, n, sizeof(RelFileLocator), rlocator_comparator);
4783
4784 for (i = 0; i < NBuffers; i++)
4785 {
4786 RelFileLocator *rlocator = NULL;
4787 BufferDesc *bufHdr = GetBufferDescriptor(i);
4788 uint32 buf_state;
4789
4790 /*
4791 * As in DropRelationBuffers, an unlocked precheck should be safe and
4792 * saves some cycles.
4793 */
4794
4795 if (!use_bsearch)
4796 {
4797 int j;
4798
4799 for (j = 0; j < n; j++)
4800 {
4801 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &locators[j]))
4802 {
4803 rlocator = &locators[j];
4804 break;
4805 }
4806 }
4807 }
4808 else
4809 {
4810 RelFileLocator locator;
4811
4812 locator = BufTagGetRelFileLocator(&bufHdr->tag);
4813 rlocator = bsearch(&locator,
4814 locators, n, sizeof(RelFileLocator),
4816 }
4817
4818 /* buffer doesn't belong to any of the given relfilelocators; skip it */
4819 if (rlocator == NULL)
4820 continue;
4821
4822 buf_state = LockBufHdr(bufHdr);
4823 if (BufTagMatchesRelFileLocator(&bufHdr->tag, rlocator))
4824 InvalidateBuffer(bufHdr); /* releases spinlock */
4825 else
4826 UnlockBufHdr(bufHdr, buf_state);
4827 }
4828
4829 pfree(locators);
4830 pfree(rels);
4831}
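/*
 * Illustrative sketch (not part of bufmgr.c): the sort-then-bsearch pattern
 * used above, shown on a hypothetical two-field key.  The same comparator
 * must be handed to both qsort() and bsearch(), and it has to impose a total
 * order; all names here are made up for the example.
 */
#if 0							/* example only, not compiled */
typedef struct ExampleKey
{
	uint32		dbOid;
	uint32		relNumber;
} ExampleKey;

static int
example_key_cmp(const void *a, const void *b)
{
	const ExampleKey *ka = (const ExampleKey *) a;
	const ExampleKey *kb = (const ExampleKey *) b;

	if (ka->dbOid != kb->dbOid)
		return (ka->dbOid < kb->dbOid) ? -1 : 1;
	if (ka->relNumber != kb->relNumber)
		return (ka->relNumber < kb->relNumber) ? -1 : 1;
	return 0;
}

static bool
example_contains(ExampleKey *sorted_keys, int n, ExampleKey probe)
{
	/* sorted_keys must already have been qsort()ed with example_key_cmp */
	return bsearch(&probe, sorted_keys, n, sizeof(ExampleKey),
				   example_key_cmp) != NULL;
}
#endif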
4832
4833/* ---------------------------------------------------------------------
4834 * FindAndDropRelationBuffers
4835 *
4836 * This function performs look up in BufMapping table and removes from the
4837 * buffer pool all the pages of the specified relation fork that has block
4838 * number >= firstDelBlock. (In particular, with firstDelBlock = 0, all
4839 * pages are removed.)
4840 * --------------------------------------------------------------------
4841 */
4842static void
4844 BlockNumber nForkBlock,
4845 BlockNumber firstDelBlock)
4846{
4847 BlockNumber curBlock;
4848
4849 for (curBlock = firstDelBlock; curBlock < nForkBlock; curBlock++)
4850 {
4851 uint32 bufHash; /* hash value for tag */
4852 BufferTag bufTag; /* identity of requested block */
4853 LWLock *bufPartitionLock; /* buffer partition lock for it */
4854 int buf_id;
4855 BufferDesc *bufHdr;
4856 uint32 buf_state;
4857
4858 /* create a tag so we can lookup the buffer */
4859 InitBufferTag(&bufTag, &rlocator, forkNum, curBlock);
4860
4861 /* determine its hash code and partition lock ID */
4862 bufHash = BufTableHashCode(&bufTag);
4863 bufPartitionLock = BufMappingPartitionLock(bufHash);
4864
4865 /* Check that it is in the buffer pool. If not, do nothing. */
4866 LWLockAcquire(bufPartitionLock, LW_SHARED);
4867 buf_id = BufTableLookup(&bufTag, bufHash);
4868 LWLockRelease(bufPartitionLock);
4869
4870 if (buf_id < 0)
4871 continue;
4872
4873 bufHdr = GetBufferDescriptor(buf_id);
4874
4875 /*
4876 * We need to lock the buffer header and recheck if the buffer is
4877 * still associated with the same block because the buffer could be
4878 * evicted by some other backend loading blocks for a different
4879 * relation after we release lock on the BufMapping table.
4880 */
4881 buf_state = LockBufHdr(bufHdr);
4882
4883 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator) &&
4884 BufTagGetForkNum(&bufHdr->tag) == forkNum &&
4885 bufHdr->tag.blockNum >= firstDelBlock)
4886 InvalidateBuffer(bufHdr); /* releases spinlock */
4887 else
4888 UnlockBufHdr(bufHdr, buf_state);
4889 }
4890}
4891
4892/* ---------------------------------------------------------------------
4893 * DropDatabaseBuffers
4894 *
4895 * This function removes all the buffers in the buffer cache for a
4896 * particular database. Dirty pages are simply dropped, without
4897 * bothering to write them out first. This is used when we destroy a
4898 * database, to avoid trying to flush data to disk when the directory
4899 * tree no longer exists. Implementation is pretty similar to
4900 * DropRelationBuffers() which is for destroying just one relation.
4901 * --------------------------------------------------------------------
4902 */
4903void
4905{
4906 int i;
4907
4908 /*
4909 * We needn't consider local buffers, since by assumption the target
4910 * database isn't our own.
4911 */
4912
4913 for (i = 0; i < NBuffers; i++)
4914 {
4915 BufferDesc *bufHdr = GetBufferDescriptor(i);
4916 uint32 buf_state;
4917
4918 /*
4919 * As in DropRelationBuffers, an unlocked precheck should be safe and
4920 * saves some cycles.
4921 */
4922 if (bufHdr->tag.dbOid != dbid)
4923 continue;
4924
4925 buf_state = LockBufHdr(bufHdr);
4926 if (bufHdr->tag.dbOid == dbid)
4927 InvalidateBuffer(bufHdr); /* releases spinlock */
4928 else
4929 UnlockBufHdr(bufHdr, buf_state);
4930 }
4931}
4932
4933/* ---------------------------------------------------------------------
4934 * FlushRelationBuffers
4935 *
4936 * This function writes all dirty pages of a relation out to disk
4937 * (or more accurately, out to kernel disk buffers), ensuring that the
4938 * kernel has an up-to-date view of the relation.
4939 *
4940 * Generally, the caller should be holding AccessExclusiveLock on the
4941 * target relation to ensure that no other backend is busy dirtying
4942 * more blocks of the relation; the effects can't be expected to last
4943 * after the lock is released.
4944 *
4945 * XXX currently it sequentially searches the buffer pool; this should be
4946 * changed to a more clever way of searching. This routine is not
4947 * used in any performance-critical code paths, so it's not worth
4948 * adding additional overhead to normal paths to make it go faster.
4949 * --------------------------------------------------------------------
4950 */
4951void
4953{
4954 int i;
4955 BufferDesc *bufHdr;
4956 SMgrRelation srel = RelationGetSmgr(rel);
4957
4958 if (RelationUsesLocalBuffers(rel))
4959 {
4960 for (i = 0; i < NLocBuffer; i++)
4961 {
4962 uint32 buf_state;
4963
4964 bufHdr = GetLocalBufferDescriptor(i);
4965 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
4966 ((buf_state = pg_atomic_read_u32(&bufHdr->state)) &
4967 (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
4968 {
4969 ErrorContextCallback errcallback;
4970
4971 /* Setup error traceback support for ereport() */
4973 errcallback.arg = bufHdr;
4974 errcallback.previous = error_context_stack;
4975 error_context_stack = &errcallback;
4976
4977 /* Make sure we can handle the pin */
4980
4981 /*
4982 * Pin/unpin mostly to make valgrind work, but it also seems
4983 * like the right thing to do.
4984 */
4985 PinLocalBuffer(bufHdr, false);
4986
4987
4988 FlushLocalBuffer(bufHdr, srel);
4989
4991
4992 /* Pop the error context stack */
4993 error_context_stack = errcallback.previous;
4994 }
4995 }
4996
4997 return;
4998 }
4999
5000 for (i = 0; i < NBuffers; i++)
5001 {
5002 uint32 buf_state;
5003
5004 bufHdr = GetBufferDescriptor(i);
5005
5006 /*
5007 * As in DropRelationBuffers, an unlocked precheck should be safe and
5008 * saves some cycles.
5009 */
5010 if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator))
5011 continue;
5012
5013 /* Make sure we can handle the pin */
5016
5017 buf_state = LockBufHdr(bufHdr);
5018 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
5019 (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
5020 {
5021 PinBuffer_Locked(bufHdr);
5023 UnpinBuffer(bufHdr);
5024 }
5025 else
5026 UnlockBufHdr(bufHdr, buf_state);
5027 }
5028}
5029
5030/* ---------------------------------------------------------------------
5031 * FlushRelationsAllBuffers
5032 *
5033 * This function flushes out of the buffer pool all the pages of all
5034 * forks of the specified smgr relations. It's equivalent to calling
5035 * FlushRelationBuffers once per relation. The relations are assumed not
5036 * to use local buffers.
5037 * --------------------------------------------------------------------
5038 */
5039void
5041{
5042 int i;
5043 SMgrSortArray *srels;
5044 bool use_bsearch;
5045
5046 if (nrels == 0)
5047 return;
5048
5049 /* fill-in array for qsort */
5050 srels = palloc(sizeof(SMgrSortArray) * nrels);
5051
5052 for (i = 0; i < nrels; i++)
5053 {
5054 Assert(!RelFileLocatorBackendIsTemp(smgrs[i]->smgr_rlocator));
5055
5056 srels[i].rlocator = smgrs[i]->smgr_rlocator.locator;
5057 srels[i].srel = smgrs[i];
5058 }
5059
5060 /*
5061 * Save the bsearch overhead for a low number of relations to sync. See
5062 * DropRelationsAllBuffers for details.
5063 */
5064 use_bsearch = nrels > RELS_BSEARCH_THRESHOLD;
5065
5066 /* sort the list of SMgrRelations if necessary */
5067 if (use_bsearch)
5068 qsort(srels, nrels, sizeof(SMgrSortArray), rlocator_comparator);
5069
5070 for (i = 0; i < NBuffers; i++)
5071 {
5072 SMgrSortArray *srelent = NULL;
5073 BufferDesc *bufHdr = GetBufferDescriptor(i);
5074 uint32 buf_state;
5075
5076 /*
5077 * As in DropRelationBuffers, an unlocked precheck should be safe and
5078 * saves some cycles.
5079 */
5080
5081 if (!use_bsearch)
5082 {
5083 int j;
5084
5085 for (j = 0; j < nrels; j++)
5086 {
5087 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srels[j].rlocator))
5088 {
5089 srelent = &srels[j];
5090 break;
5091 }
5092 }
5093 }
5094 else
5095 {
5096 RelFileLocator rlocator;
5097
5098 rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
5099 srelent = bsearch(&rlocator,
5100 srels, nrels, sizeof(SMgrSortArray),
5102 }
5103
5104 /* buffer doesn't belong to any of the given relfilelocators; skip it */
5105 if (srelent == NULL)
5106 continue;
5107
5108 /* Make sure we can handle the pin */
5111
5112 buf_state = LockBufHdr(bufHdr);
5113 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srelent->rlocator) &&
5114 (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
5115 {
5116 PinBuffer_Locked(bufHdr);
5118 UnpinBuffer(bufHdr);
5119 }
5120 else
5121 UnlockBufHdr(bufHdr, buf_state);
5122 }
5123
5124 pfree(srels);
5125}
5126
5127/* ---------------------------------------------------------------------
5128 * RelationCopyStorageUsingBuffer
5129 *
5130 * Copy fork's data using bufmgr. Same as RelationCopyStorage but instead
5131 * of using smgrread and smgrextend this will copy using bufmgr APIs.
5132 *
5133 * Refer to the comments atop CreateAndCopyRelationData() for details about
5134 * the 'permanent' parameter.
5135 * --------------------------------------------------------------------
5136 */
5137static void
5139 RelFileLocator dstlocator,
5140 ForkNumber forkNum, bool permanent)
5141{
5142 Buffer srcBuf;
5143 Buffer dstBuf;
5144 Page srcPage;
5145 Page dstPage;
5146 bool use_wal;
5147 BlockNumber nblocks;
5148 BlockNumber blkno;
5150 BufferAccessStrategy bstrategy_src;
5151 BufferAccessStrategy bstrategy_dst;
5153 ReadStream *src_stream;
5154 SMgrRelation src_smgr;
5155
5156 /*
5157 * In general, we want to write WAL whenever wal_level > 'minimal', but we
5158 * can skip it when copying any fork of an unlogged relation other than
5159 * the init fork.
5160 */
5161 use_wal = XLogIsNeeded() && (permanent || forkNum == INIT_FORKNUM);
5162
5163 /* Get number of blocks in the source relation. */
5164 nblocks = smgrnblocks(smgropen(srclocator, INVALID_PROC_NUMBER),
5165 forkNum);
5166
5167 /* Nothing to copy; just return. */
5168 if (nblocks == 0)
5169 return;
5170
5171 /*
5172 * Bulk extend the destination relation to the same size as the source
5173 * relation before starting to copy block by block.
5174 */
5175 memset(buf.data, 0, BLCKSZ);
5176 smgrextend(smgropen(dstlocator, INVALID_PROC_NUMBER), forkNum, nblocks - 1,
5177 buf.data, true);
5178
5179 /* This is a bulk operation, so use buffer access strategies. */
5180 bstrategy_src = GetAccessStrategy(BAS_BULKREAD);
5181 bstrategy_dst = GetAccessStrategy(BAS_BULKWRITE);
5182
5183 /* Initialize streaming read */
5184 p.current_blocknum = 0;
5185 p.last_exclusive = nblocks;
5186 src_smgr = smgropen(srclocator, INVALID_PROC_NUMBER);
5187
5188 /*
5189 * It is safe to use batchmode as block_range_read_stream_cb takes no
5190 * locks.
5191 */
5194 bstrategy_src,
5195 src_smgr,
5196 permanent ? RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED,
5197 forkNum,
5199 &p,
5200 0);
5201
5202 /* Iterate over each block of the source relation file. */
5203 for (blkno = 0; blkno < nblocks; blkno++)
5204 {
5206
5207 /* Read block from source relation. */
5208 srcBuf = read_stream_next_buffer(src_stream, NULL);
5210 srcPage = BufferGetPage(srcBuf);
5211
5212 dstBuf = ReadBufferWithoutRelcache(dstlocator, forkNum,
5213 BufferGetBlockNumber(srcBuf),
5214 RBM_ZERO_AND_LOCK, bstrategy_dst,
5215 permanent);
5216 dstPage = BufferGetPage(dstBuf);
5217
5219
5220 /* Copy page data from the source to the destination. */
5221 memcpy(dstPage, srcPage, BLCKSZ);
5222 MarkBufferDirty(dstBuf);
5223
5224 /* WAL-log the copied page. */
5225 if (use_wal)
5226 log_newpage_buffer(dstBuf, true);
5227
5229
5230 UnlockReleaseBuffer(dstBuf);
5231 UnlockReleaseBuffer(srcBuf);
5232 }
5233 Assert(read_stream_next_buffer(src_stream, NULL) == InvalidBuffer);
5234 read_stream_end(src_stream);
5235
5236 FreeAccessStrategy(bstrategy_src);
5237 FreeAccessStrategy(bstrategy_dst);
5238}
5239
5240/* ---------------------------------------------------------------------
5241 * CreateAndCopyRelationData
5242 *
5243 * Create destination relation storage and copy all forks from the
5244 * source relation to the destination.
5245 *
5246 * Pass permanent as true for permanent relations and false for
5247 * unlogged relations. Currently this API is not supported for
5248 * temporary relations.
5249 * --------------------------------------------------------------------
5250 */
5251void
5253 RelFileLocator dst_rlocator, bool permanent)
5254{
5255 char relpersistence;
5256 SMgrRelation src_rel;
5257 SMgrRelation dst_rel;
5258
5259 /* Set the relpersistence. */
5260 relpersistence = permanent ?
5261 RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED;
5262
5263 src_rel = smgropen(src_rlocator, INVALID_PROC_NUMBER);
5264 dst_rel = smgropen(dst_rlocator, INVALID_PROC_NUMBER);
5265
5266 /*
5267 * Create and copy all forks of the relation. During create database we
5268 * have a separate cleanup mechanism which deletes complete database
5269 * directory. Therefore, each individual relation doesn't need to be
5270 * registered for cleanup.
5271 */
5272 RelationCreateStorage(dst_rlocator, relpersistence, false);
5273
5274 /* copy main fork. */
5275 RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, MAIN_FORKNUM,
5276 permanent);
5277
5278 /* copy those extra forks that exist */
5279 for (ForkNumber forkNum = MAIN_FORKNUM + 1;
5280 forkNum <= MAX_FORKNUM; forkNum++)
5281 {
5282 if (smgrexists(src_rel, forkNum))
5283 {
5284 smgrcreate(dst_rel, forkNum, false);
5285
5286 /*
5287 * WAL log creation if the relation is persistent, or this is the
5288 * init fork of an unlogged relation.
5289 */
5290 if (permanent || forkNum == INIT_FORKNUM)
5291 log_smgrcreate(&dst_rlocator, forkNum);
5292
5293 /* Copy a fork's data, block by block. */
5294 RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, forkNum,
5295 permanent);
5296 }
5297 }
5298}
5299
5300/* ---------------------------------------------------------------------
5301 * FlushDatabaseBuffers
5302 *
5303 * This function writes all dirty pages of a database out to disk
5304 * (or more accurately, out to kernel disk buffers), ensuring that the
5305 * kernel has an up-to-date view of the database.
5306 *
5307 * Generally, the caller should be holding an appropriate lock to ensure
5308 * no other backend is active in the target database; otherwise more
5309 * pages could get dirtied.
5310 *
5311 * Note we don't worry about flushing any pages of temporary relations.
5312 * It's assumed these wouldn't be interesting.
5313 * --------------------------------------------------------------------
5314 */
5315void
5317{
5318 int i;
5319 BufferDesc *bufHdr;
5320
5321 for (i = 0; i < NBuffers; i++)
5322 {
5323 uint32 buf_state;
5324
5325 bufHdr = GetBufferDescriptor(i);
5326
5327 /*
5328 * As in DropRelationBuffers, an unlocked precheck should be safe and
5329 * saves some cycles.
5330 */
5331 if (bufHdr->tag.dbOid != dbid)
5332 continue;
5333
5334 /* Make sure we can handle the pin */
5337
5338 buf_state = LockBufHdr(bufHdr);
5339 if (bufHdr->tag.dbOid == dbid &&
5340 (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
5341 {
5342 PinBuffer_Locked(bufHdr);
5344 UnpinBuffer(bufHdr);
5345 }
5346 else
5347 UnlockBufHdr(bufHdr, buf_state);
5348 }
5349}
5350
5351/*
5352 * Flush a previously pinned buffer, locked in either share or exclusive
5353 * mode, to the OS.
5354 */
5355void
5357{
5358 BufferDesc *bufHdr;
5359
5360 /* currently not needed, but no fundamental reason not to support */
5362
5364
5365 bufHdr = GetBufferDescriptor(buffer - 1);
5366
5368
5370}
5371
5372/*
5373 * ReleaseBuffer -- release the pin on a buffer
5374 */
5375void
5377{
5378 if (!BufferIsValid(buffer))
5379 elog(ERROR, "bad buffer ID: %d", buffer);
5380
5381 if (BufferIsLocal(buffer))
5383 else
5385}
5386
5387/*
5388 * UnlockReleaseBuffer -- release the content lock and pin on a buffer
5389 *
5390 * This is just a shorthand for a common combination.
5391 */
5392void
5394{
5397}
5398
5399/*
5400 * IncrBufferRefCount
5401 * Increment the pin count on a buffer that we have *already* pinned
5402 * at least once.
5403 *
5404 * This function cannot be used on a buffer we do not have pinned,
5405 * because it doesn't change the shared buffer state.
5406 */
5407void
5409{
5412 if (BufferIsLocal(buffer))
5413 LocalRefCount[-buffer - 1]++;
5414 else
5415 {
5417
5418 ref = GetPrivateRefCountEntry(buffer, true);
5419 Assert(ref != NULL);
5420 ref->refcount++;
5421 }
5423}
5424
5425/*
5426 * MarkBufferDirtyHint
5427 *
5428 * Mark a buffer dirty for non-critical changes.
5429 *
5430 * This is essentially the same as MarkBufferDirty, except:
5431 *
5432 * 1. The caller does not write WAL; so if checksums are enabled, we may need
5433 * to write an XLOG_FPI_FOR_HINT WAL record to protect against torn pages.
5434 * 2. The caller might have only share-lock instead of exclusive-lock on the
5435 * buffer's content lock.
5436 * 3. This function does not guarantee that the buffer is always marked dirty
5437 * (due to a race condition), so it cannot be used for important changes.
5438 */
5439void
5441{
5442 BufferDesc *bufHdr;
5443 Page page = BufferGetPage(buffer);
5444
5445 if (!BufferIsValid(buffer))
5446 elog(ERROR, "bad buffer ID: %d", buffer);
5447
5448 if (BufferIsLocal(buffer))
5449 {
5451 return;
5452 }
5453
5454 bufHdr = GetBufferDescriptor(buffer - 1);
5455
5457 /* here, either share or exclusive lock is OK */
5459
5460 /*
5461 * This routine might get called many times on the same page, if we are
5462 * making the first scan after commit of an xact that added/deleted many
5463 * tuples. So, be as quick as we can if the buffer is already dirty. We
5464 * do this by not acquiring spinlock if it looks like the status bits are
5465 * already set. Since we make this test unlocked, there's a chance we
5466 * might fail to notice that the flags have just been cleared, and failed
5467 * to reset them, due to memory-ordering issues. But since this function
5468 * is only intended to be used in cases where failing to write out the
5469 * data would be harmless anyway, it doesn't really matter.
5470 */
5471 if ((pg_atomic_read_u32(&bufHdr->state) & (BM_DIRTY | BM_JUST_DIRTIED)) !=
5473 {
5475 bool dirtied = false;
5476 bool delayChkptFlags = false;
5477 uint32 buf_state;
5478
5479 /*
5480 * If we need to protect hint bit updates from torn writes, WAL-log a
5481 * full page image of the page. This full page image is only necessary
5482 * if the hint bit update is the first change to the page since the
5483 * last checkpoint.
5484 *
5485 * We don't check full_page_writes here because that logic is included
5486 * when we call XLogInsert() since the value changes dynamically.
5487 */
5488 if (XLogHintBitIsNeeded() &&
5490 {
5491 /*
5492 * If we must not write WAL, due to a relfilelocator-specific
5493 * condition or being in recovery, don't dirty the page. We can
5494 * set the hint, just not dirty the page as a result, so the hint
5495 * is lost when we evict the page or shut down.
5496 *
5497 * See src/backend/storage/page/README for longer discussion.
5498 */
5499 if (RecoveryInProgress() ||
5501 return;
5502
5503 /*
5504 * If the block is already dirty because we either made a change
5505 * or set a hint already, then we don't need to write a full page
5506 * image. Note that aggressive cleaning of blocks dirtied by hint
5507 * bit setting would increase the call rate. Bulk setting of hint
5508 * bits would reduce the call rate...
5509 *
5510 * We must issue the WAL record before we mark the buffer dirty.
5511 * Otherwise we might write the page before we write the WAL. That
5512 * causes a race condition, since a checkpoint might occur between
5513 * writing the WAL record and marking the buffer dirty. We solve
5514 * that with a kluge, but one that is already in use during
5515 * transaction commit to prevent race conditions. Basically, we
5516 * simply prevent the checkpoint WAL record from being written
5517 * until we have marked the buffer dirty. We don't start the
5518 * checkpoint flush until we have marked dirty, so our checkpoint
5519 * must flush the change to disk successfully or the checkpoint
5520 * never gets written, so crash recovery will fix things up.
5521 *
5522 * It's possible we may enter here without an xid, so it is
5523 * essential that CreateCheckPoint waits for virtual transactions
5524 * rather than full transactionids.
5525 */
5528 delayChkptFlags = true;
5529 lsn = XLogSaveBufferForHint(buffer, buffer_std);
5530 }
5531
5532 buf_state = LockBufHdr(bufHdr);
5533
5534 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
5535
5536 if (!(buf_state & BM_DIRTY))
5537 {
5538 dirtied = true; /* Means "will be dirtied by this action" */
5539
5540 /*
5541 * Set the page LSN if we wrote a backup block. We aren't supposed
5542 * to set this when only holding a share lock but as long as we
5543 * serialise it somehow we're OK. We choose to set LSN while
5544 * holding the buffer header lock, which causes any reader of an
5545 * LSN who holds only a share lock to also obtain a buffer header
5546 * lock before using PageGetLSN(), which is enforced in
5547 * BufferGetLSNAtomic().
5548 *
5549 * If checksums are enabled, you might think we should reset the
5550 * checksum here. That will happen when the page is written
5551 * sometime later in this checkpoint cycle.
5552 */
5553 if (!XLogRecPtrIsInvalid(lsn))
5554 PageSetLSN(page, lsn);
5555 }
5556
5557 buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
5558 UnlockBufHdr(bufHdr, buf_state);
5559
5560 if (delayChkptFlags)
5561 MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
5562
5563 if (dirtied)
5564 {
5566 if (VacuumCostActive)
5568 }
5569 }
5570}
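/*
 * Illustrative sketch (not part of bufmgr.c): a typical MarkBufferDirtyHint()
 * caller.  The page is modified under a share lock only and the caller writes
 * no WAL itself; MarkBufferDirtyHint() decides whether the change may be made
 * durable.  The hint-setting step is only indicated by a comment here.
 */
#if 0							/* example only, not compiled */
{
	LockBuffer(buffer, BUFFER_LOCK_SHARE);

	/* ... set a hint bit on BufferGetPage(buffer) ... */

	/* report the non-critical change; "true" means standard page layout */
	MarkBufferDirtyHint(buffer, true);

	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
}
#endif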
5571
5572/*
5573 * Release buffer content locks for shared buffers.
5574 *
5575 * Used to clean up after errors.
5576 *
5577 * Currently, we can expect that lwlock.c's LWLockReleaseAll() took care
5578 * of releasing buffer content locks per se; the only thing we need to deal
5579 * with here is clearing any PIN_COUNT request that was in progress.
5580 */
5581void
5583{
5585
5586 if (buf)
5587 {
5588 uint32 buf_state;
5589
5590 buf_state = LockBufHdr(buf);
5591
5592 /*
5593 * Don't complain if flag bit not set; it could have been reset but we
5594 * got a cancel/die interrupt before getting the signal.
5595 */
5596 if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
5597 buf->wait_backend_pgprocno == MyProcNumber)
5598 buf_state &= ~BM_PIN_COUNT_WAITER;
5599
5600 UnlockBufHdr(buf, buf_state);
5601
5602 PinCountWaitBuf = NULL;
5603 }
5604}
5605
5606/*
5607 * Acquire or release the content_lock for the buffer.
5608 */
5609void
5611{
5612 BufferDesc *buf;
5613
5615 if (BufferIsLocal(buffer))
5616 return; /* local buffers need no lock */
5617
5619
5620 if (mode == BUFFER_LOCK_UNLOCK)
5622 else if (mode == BUFFER_LOCK_SHARE)
5624 else if (mode == BUFFER_LOCK_EXCLUSIVE)
5626 else
5627 elog(ERROR, "unrecognized buffer lock mode: %d", mode);
5628}
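/*
 * Illustrative sketch (not part of bufmgr.c): the usual pin/lock discipline
 * around the API above.  A buffer is pinned first (ReadBuffer), then
 * content-locked for as short a time as possible, then unlocked and unpinned;
 * UnlockReleaseBuffer() collapses the last two steps.  The function name is
 * hypothetical.
 */
#if 0							/* example only, not compiled */
static void
example_read_page(Relation rel, BlockNumber blkno)
{
	Buffer		buf = ReadBuffer(rel, blkno);	/* pin */

	LockBuffer(buf, BUFFER_LOCK_SHARE);	/* content lock */
	/* ... inspect BufferGetPage(buf) while the lock is held ... */
	UnlockReleaseBuffer(buf);	/* drop the content lock and the pin */
}
#endif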
5629
5630/*
5631 * Acquire the content_lock for the buffer, but only if we don't have to wait.
5632 *
5633 * This assumes the caller wants BUFFER_LOCK_EXCLUSIVE mode.
5634 */
5635bool
5637{
5638 BufferDesc *buf;
5639
5641 if (BufferIsLocal(buffer))
5642 return true; /* act as though we got it */
5643
5645
5647 LW_EXCLUSIVE);
5648}
5649
5650/*
5651 * Verify that this backend is pinning the buffer exactly once.
5652 *
5653 * NOTE: Like in BufferIsPinned(), what we check here is that *this* backend
5654 * holds a pin on the buffer. We do not care whether some other backend does.
5655 */
5656void
5658{
5659 if (BufferIsLocal(buffer))
5660 {
5661 if (LocalRefCount[-buffer - 1] != 1)
5662 elog(ERROR, "incorrect local pin count: %d",
5663 LocalRefCount[-buffer - 1]);
5664 }
5665 else
5666 {
5667 if (GetPrivateRefCount(buffer) != 1)
5668 elog(ERROR, "incorrect local pin count: %d",
5670 }
5671}
5672
5673/*
5674 * LockBufferForCleanup - lock a buffer in preparation for deleting items
5675 *
5676 * Items may be deleted from a disk page only when the caller (a) holds an
5677 * exclusive lock on the buffer and (b) has observed that no other backend
5678 * holds a pin on the buffer. If there is a pin, then the other backend
5679 * might have a pointer into the buffer (for example, a heapscan reference
5680 * to an item --- see README for more details). It's OK if a pin is added
5681 * after the cleanup starts, however; the newly-arrived backend will be
5682 * unable to look at the page until we release the exclusive lock.
5683 *
5684 * To implement this protocol, a would-be deleter must pin the buffer and
5685 * then call LockBufferForCleanup(). LockBufferForCleanup() is similar to
5686 * LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE), except that it loops until
5687 * it has successfully observed pin count = 1.
5688 */
5689void
5691{
5692 BufferDesc *bufHdr;
5693 TimestampTz waitStart = 0;
5694 bool waiting = false;
5695 bool logged_recovery_conflict = false;
5696
5698 Assert(PinCountWaitBuf == NULL);
5699
5701
5702 /*
5703 * We do not yet need to be worried about in-progress AIOs holding a pin,
5704 * as we, so far, only support doing reads via AIO and this function can
5705 * only be called once the buffer is valid (i.e. no read can be in
5706 * flight).
5707 */
5708
5709 /* Nobody else to wait for */
5710 if (BufferIsLocal(buffer))
5711 return;
5712
5713 bufHdr = GetBufferDescriptor(buffer - 1);
5714
5715 for (;;)
5716 {
5717 uint32 buf_state;
5718
5719 /* Try to acquire lock */
5721 buf_state = LockBufHdr(bufHdr);
5722
5723 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
5724 if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
5725 {
5726 /* Successfully acquired exclusive lock with pincount 1 */
5727 UnlockBufHdr(bufHdr, buf_state);
5728
5729 /*
5730 * Emit the log message if recovery conflict on buffer pin was
5731 * resolved but the startup process waited longer than
5732 * deadlock_timeout for it.
5733 */
5734 if (logged_recovery_conflict)
5735 LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN,
5736 waitStart, GetCurrentTimestamp(),
5737 NULL, false);
5738
5739 if (waiting)
5740 {
5741 /* reset ps display to remove the suffix if we added one */
5742 set_ps_display_remove_suffix();
5743 waiting = false;
5744 }
5745 return;
5746 }
5747 /* Failed, so mark myself as waiting for pincount 1 */
5748 if (buf_state & BM_PIN_COUNT_WAITER)
5749 {
5750 UnlockBufHdr(bufHdr, buf_state);
5751 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
5752 elog(ERROR, "multiple backends attempting to wait for pincount 1");
5753 }
5754 bufHdr->wait_backend_pgprocno = MyProcNumber;
5755 PinCountWaitBuf = bufHdr;
5756 buf_state |= BM_PIN_COUNT_WAITER;
5757 UnlockBufHdr(bufHdr, buf_state);
5758 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
5759
5760 /* Wait to be signaled by UnpinBuffer() */
5761 if (InHotStandby)
5762 {
5763 if (!waiting)
5764 {
5765 /* adjust the process title to indicate that it's waiting */
5766 set_ps_display_suffix("waiting");
5767 waiting = true;
5768 }
5769
5770 /*
5771 * Emit the log message if the startup process is waiting longer
5772 * than deadlock_timeout for recovery conflict on buffer pin.
5773 *
5774 * Skip this if first time through because the startup process has
5775 * not started waiting yet in this case. So, the wait start
5776 * timestamp is set after this logic.
5777 */
5778 if (waitStart != 0 && !logged_recovery_conflict)
5779 {
5780 TimestampTz now = GetCurrentTimestamp();
5781
5782 if (TimestampDifferenceExceeds(waitStart, now,
5783 DeadlockTimeout))
5784 {
5785 LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN,
5786 waitStart, now, NULL, true);
5787 logged_recovery_conflict = true;
5788 }
5789 }
5790
5791 /*
5792 * Set the wait start timestamp if logging is enabled and first
5793 * time through.
5794 */
5795 if (log_recovery_conflict_waits && waitStart == 0)
5796 waitStart = GetCurrentTimestamp();
5797
5798 /* Publish the bufid that Startup process waits on */
5799 SetStartupBufferPinWaitBufId(buffer - 1);
5800 /* Set alarm and then wait to be signaled by UnpinBuffer() */
5801 ResolveRecoveryConflictWithBufferPin();
5802 /* Reset the published bufid */
5803 SetStartupBufferPinWaitBufId(-1);
5804 }
5805 else
5806 ProcWaitForSignal(WAIT_EVENT_BUFFER_PIN);
5807
5808 /*
5809 * Remove flag marking us as waiter. Normally this will not be set
5810 * anymore, but ProcWaitForSignal() can return for other signals as
5811 * well. We take care to only reset the flag if we're the waiter, as
5812 * theoretically another backend could have started waiting. That's
5813 * impossible with the current usages due to table level locking, but
5814 * better be safe.
5815 */
5816 buf_state = LockBufHdr(bufHdr);
5817 if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
5818 bufHdr->wait_backend_pgprocno == MyProcNumber)
5819 buf_state &= ~BM_PIN_COUNT_WAITER;
5820 UnlockBufHdr(bufHdr, buf_state);
5821
5822 PinCountWaitBuf = NULL;
5823 /* Loop back and try again */
5824 }
5825}
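
/*
 * Illustrative sketch (not part of bufmgr.c): the cleanup-lock protocol
 * described above. The caller must hold a pin before calling
 * LockBufferForCleanup(); prune_page_example() is a hypothetical helper.
 */
#ifdef BUFMGR_USAGE_SKETCH
static void
cleanup_page_example(Relation rel, BlockNumber blkno)
{
	Buffer		buf;

	/* (a) acquire a pin */
	buf = ReadBuffer(rel, blkno);

	/* (b) wait until we hold the exclusive lock and ours is the only pin */
	LockBufferForCleanup(buf);

	prune_page_example(BufferGetPage(buf));		/* hypothetical */

	UnlockReleaseBuffer(buf);
}
#endif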
5826
5827/*
5828 * Check called from ProcessRecoveryConflictInterrupts() when Startup process
5829 * requests cancellation of all pin holders that are blocking it.
5830 */
5831bool
5832HoldingBufferPinThatDelaysRecovery(void)
5833{
5834 int bufid = GetStartupBufferPinWaitBufId();
5835
5836 /*
5837 * If we get woken slowly then it's possible that the Startup process was
5838 * already woken by other backends before we got here. Also possible that
5839 * we get here by multiple interrupts or interrupts at inappropriate
5840 * times, so make sure we do nothing if the bufid is not set.
5841 */
5842 if (bufid < 0)
5843 return false;
5844
5845 if (GetPrivateRefCount(bufid + 1) > 0)
5846 return true;
5847
5848 return false;
5849}
5850
5851/*
5852 * ConditionalLockBufferForCleanup - as above, but don't wait to get the lock
5853 *
5854 * We won't loop, but just check once to see if the pin count is OK. If
5855 * not, return false with no lock held.
5856 */
5857bool
5858ConditionalLockBufferForCleanup(Buffer buffer)
5859{
5860 BufferDesc *bufHdr;
5861 uint32 buf_state,
5862 refcount;
5863
5864 Assert(BufferIsValid(buffer));
5865
5866 /* see AIO related comment in LockBufferForCleanup() */
5867
5868 if (BufferIsLocal(buffer))
5869 {
5870 refcount = LocalRefCount[-buffer - 1];
5871 /* There should be exactly one pin */
5872 Assert(refcount > 0);
5873 if (refcount != 1)
5874 return false;
5875 /* Nobody else to wait for */
5876 return true;
5877 }
5878
5879 /* There should be exactly one local pin */
5880 refcount = GetPrivateRefCount(buffer);
5881 Assert(refcount);
5882 if (refcount != 1)
5883 return false;
5884
5885 /* Try to acquire lock */
5886 if (!ConditionalLockBuffer(buffer))
5887 return false;
5888
5889 bufHdr = GetBufferDescriptor(buffer - 1);
5890 buf_state = LockBufHdr(bufHdr);
5891 refcount = BUF_STATE_GET_REFCOUNT(buf_state);
5892
5893 Assert(refcount > 0);
5894 if (refcount == 1)
5895 {
5896 /* Successfully acquired exclusive lock with pincount 1 */
5897 UnlockBufHdr(bufHdr, buf_state);
5898 return true;
5899 }
5900
5901 /* Failed, so release the lock */
5902 UnlockBufHdr(bufHdr, buf_state);
5903 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
5904 return false;
5905}
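
/*
 * Illustrative sketch (not part of bufmgr.c): a caller that prefers not to
 * wait can combine ConditionalLockBufferForCleanup() with a fallback, in the
 * spirit of what lazy VACUUM does when it skips pages it cannot cleanup-lock.
 * The fallback path here is hypothetical.
 */
#ifdef BUFMGR_USAGE_SKETCH
static void
maybe_cleanup_page_example(Buffer buf)
{
	if (ConditionalLockBufferForCleanup(buf))
	{
		/* exclusive cleanup lock held: items may be removed from the page */
		/* ... prune/defragment ... */
		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
	}
	else
	{
		/* couldn't get a cleanup lock without waiting; skip this page */
	}
	ReleaseBuffer(buf);
}
#endif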
5906
5907/*
5908 * IsBufferCleanupOK - as above, but we already have the lock
5909 *
5910 * Check whether it's OK to perform cleanup on a buffer we've already
5911 * locked. If we observe that the pin count is 1, our exclusive lock
5912 * happens to be a cleanup lock, and we can proceed with anything that
5913 * would have been allowable had we sought a cleanup lock originally.
5914 */
5915bool
5916IsBufferCleanupOK(Buffer buffer)
5917{
5918 BufferDesc *bufHdr;
5919 uint32 buf_state;
5920
5921 Assert(BufferIsValid(buffer));
5922
5923 /* see AIO related comment in LockBufferForCleanup() */
5924
5925 if (BufferIsLocal(buffer))
5926 {
5927 /* There should be exactly one pin */
5928 if (LocalRefCount[-buffer - 1] != 1)
5929 return false;
5930 /* Nobody else to wait for */
5931 return true;
5932 }
5933
5934 /* There should be exactly one local pin */
5935 if (GetPrivateRefCount(buffer) != 1)
5936 return false;
5937
5938 bufHdr = GetBufferDescriptor(buffer - 1);
5939
5940 /* caller must hold exclusive lock on buffer */
5942
5943 buf_state = LockBufHdr(bufHdr);
5944
5945 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
5946 if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
5947 {
5948 /* pincount is OK. */
5949 UnlockBufHdr(bufHdr, buf_state);
5950 return true;
5951 }
5952
5953 UnlockBufHdr(bufHdr, buf_state);
5954 return false;
5955}
5956
5957
5958/*
5959 * Functions for buffer I/O handling
5960 *
5961 * Also note that these are used only for shared buffers, not local ones.
5962 */
5963
5964/*
5965 * WaitIO -- Block until the IO_IN_PROGRESS flag on 'buf' is cleared.
5966 */
5967static void
5968WaitIO(BufferDesc *buf)
5969{
5970 ConditionVariable *cv = BufferDescriptorGetIOCV(buf);
5971
5972 ConditionVariablePrepareToSleep(cv);
5973 for (;;)
5974 {
5975 uint32 buf_state;
5976 PgAioWaitRef iow;
5977
5978 /*
5979 * It may not be necessary to acquire the spinlock to check the flag
5980 * here, but since this test is essential for correctness, we'd better
5981 * play it safe.
5982 */
5983 buf_state = LockBufHdr(buf);
5984
5985 /*
5986 * Copy the wait reference while holding the spinlock. This prevents a
5987 * concurrent TerminateBufferIO() in another backend from clearing the
5988 * wref while it's being read.
5989 */
5990 iow = buf->io_wref;
5991 UnlockBufHdr(buf, buf_state);
5992
5993 /* no IO in progress, we don't need to wait */
5994 if (!(buf_state & BM_IO_IN_PROGRESS))
5995 break;
5996
5997 /*
5998 * The buffer has asynchronous IO in progress, wait for it to
5999 * complete.
6000 */
6001 if (pgaio_wref_valid(&iow))
6002 {
6003 pgaio_wref_wait(&iow);
6004
6005 /*
6006 * The AIO subsystem internally uses condition variables and thus
6007 * might remove this backend from the BufferDesc's CV. While that
6008 * wouldn't cause a correctness issue (the first CV sleep just
6009 * immediately returns if not already registered), it seems worth
6010 * avoiding unnecessary loop iterations, given that we take care
6011 * to do so at the start of the function.
6012 */
6013 ConditionVariablePrepareToSleep(cv);
6014 continue;
6015 }
6016
6017 /* wait on BufferDesc->cv, e.g. for concurrent synchronous IO */
6018 ConditionVariableSleep(cv, WAIT_EVENT_BUFFER_IO);
6019 }
6020 ConditionVariableCancelSleep();
6021}
6022
6023/*
6024 * StartBufferIO: begin I/O on this buffer
6025 * (Assumptions)
6026 * My process is executing no IO on this buffer
6027 * The buffer is Pinned
6028 *
6029 * In some scenarios multiple backends could attempt the same I/O operation
6030 * concurrently. If someone else has already started I/O on this buffer then
6031 * we will wait for completion of the IO using WaitIO().
6032 *
6033 * Input operations are only attempted on buffers that are not BM_VALID,
6034 * and output operations only on buffers that are BM_VALID and BM_DIRTY,
6035 * so we can always tell if the work is already done.
6036 *
6037 * Returns true if we successfully marked the buffer as I/O busy,
6038 * false if someone else already did the work.
6039 *
6040 * If nowait is true, then we don't wait for an I/O to be finished by another
6041 * backend. In that case, false indicates either that the I/O was already
6042 * finished, or is still in progress. This is useful for callers that want to
6043 * find out if they can perform the I/O as part of a larger operation, without
6044 * waiting for the answer or distinguishing the reasons why not.
6045 */
6046bool
6047StartBufferIO(BufferDesc *buf, bool forInput, bool nowait)
6048{
6049 uint32 buf_state;
6050
6051 ResourceOwnerEnlarge(CurrentResourceOwner);
6052
6053 for (;;)
6054 {
6055 buf_state = LockBufHdr(buf);
6056
6057 if (!(buf_state & BM_IO_IN_PROGRESS))
6058 break;
6059 UnlockBufHdr(buf, buf_state);
6060 if (nowait)
6061 return false;
6062 WaitIO(buf);
6063 }
6064
6065 /* Once we get here, there is definitely no I/O active on this buffer */
6066
6067 /* Check if someone else already did the I/O */
6068 if (forInput ? (buf_state & BM_VALID) : !(buf_state & BM_DIRTY))
6069 {
6070 UnlockBufHdr(buf, buf_state);
6071 return false;
6072 }
6073
6074 buf_state |= BM_IO_IN_PROGRESS;
6075 UnlockBufHdr(buf, buf_state);
6076
6077 ResourceOwnerRememberBufferIO(CurrentResourceOwner,
6078 BufferDescriptorGetBuffer(buf));
6079
6080 return true;
6081}
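
/*
 * Illustrative sketch (not part of bufmgr.c): how code inside the buffer
 * manager pairs StartBufferIO() with TerminateBufferIO() around a write-out,
 * in the spirit of FlushBuffer(). perform_write_of_buffer() is a hypothetical
 * stand-in for the actual smgr call.
 */
#ifdef BUFMGR_USAGE_SKETCH
static void
write_buffer_example(BufferDesc *buf)
{
	/* claim the buffer for output; false means someone else did the work */
	if (!StartBufferIO(buf, false, false))
		return;

	perform_write_of_buffer(buf);	/* hypothetical */

	/* clear BM_DIRTY (unless re-dirtied) and drop BM_IO_IN_PROGRESS */
	TerminateBufferIO(buf, true, 0, true, false);
}
#endif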
6082
6083/*
6084 * TerminateBufferIO: release a buffer we were doing I/O on
6085 * (Assumptions)
6086 * My process is executing IO for the buffer
6087 * BM_IO_IN_PROGRESS bit is set for the buffer
6088 * The buffer is Pinned
6089 *
6090 * If clear_dirty is true and BM_JUST_DIRTIED is not set, we clear the
6091 * buffer's BM_DIRTY flag. This is appropriate when terminating a
6092 * successful write. The check on BM_JUST_DIRTIED is necessary to avoid
6093 * marking the buffer clean if it was re-dirtied while we were writing.
6094 *
6095 * set_flag_bits gets ORed into the buffer's flags. It must include
6096 * BM_IO_ERROR in a failure case. For successful completion it could
6097 * be 0, or BM_VALID if we just finished reading in the page.
6098 *
6099 * If forget_owner is true, we release the buffer I/O from the current
6100 * resource owner. (forget_owner=false is used when the resource owner itself
6101 * is being released)
6102 */
6103void
6104TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits,
6105 bool forget_owner, bool release_aio)
6106{
6107 uint32 buf_state;
6108
6109 buf_state = LockBufHdr(buf);
6110
6111 Assert(buf_state & BM_IO_IN_PROGRESS);
6112 buf_state &= ~BM_IO_IN_PROGRESS;
6113
6114 /* Clear earlier errors, if this IO failed, it'll be marked again */
6115 buf_state &= ~BM_IO_ERROR;
6116
6117 if (clear_dirty && !(buf_state & BM_JUST_DIRTIED))
6118 buf_state &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
6119
6120 if (release_aio)
6121 {
6122 /* release ownership by the AIO subsystem */
6123 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
6124 buf_state -= BUF_REFCOUNT_ONE;
6125 pgaio_wref_clear(&buf->io_wref);
6126 }
6127
6128 buf_state |= set_flag_bits;
6129 UnlockBufHdr(buf, buf_state);
6130
6131 if (forget_owner)
6132 ResourceOwnerForgetBufferIO(CurrentResourceOwner,
6133 BufferDescriptorGetBuffer(buf));
6134
6135 ConditionVariableBroadcast(BufferDescriptorGetIOCV(buf));
6136
6137 /*
6138 * Support LockBufferForCleanup()
6139 *
6140 * We may have just released the last pin other than the waiter's. In most
6141 * cases, this backend holds another pin on the buffer. But, if, for
6142 * example, this backend is completing an IO issued by another backend, it
6143 * may be time to wake the waiter.
6144 */
6145 if (release_aio && (buf_state & BM_PIN_COUNT_WAITER))
6146 WakePinCountWaiter(buf);
6147}
6148
6149/*
6150 * AbortBufferIO: Clean up active buffer I/O after an error.
6151 *
6152 * All LWLocks we might have held have been released,
6153 * but we haven't yet released buffer pins, so the buffer is still pinned.
6154 *
6155 * If I/O was in progress, we always set BM_IO_ERROR, even though it's
6156 * possible the error condition wasn't related to the I/O.
6157 *
6158 * Note: this does not remove the buffer I/O from the resource owner.
6159 * That's correct when we're releasing the whole resource owner, but
6160 * beware if you use this in other contexts.
6161 */
6162static void
6163AbortBufferIO(Buffer buffer)
6164{
6165 BufferDesc *buf_hdr = GetBufferDescriptor(buffer - 1);
6166 uint32 buf_state;
6167
6168 buf_state = LockBufHdr(buf_hdr);
6169 Assert(buf_state & (BM_IO_IN_PROGRESS | BM_TAG_VALID));
6170
6171 if (!(buf_state & BM_VALID))
6172 {
6173 Assert(!(buf_state & BM_DIRTY));
6174 UnlockBufHdr(buf_hdr, buf_state);
6175 }
6176 else
6177 {
6178 Assert(buf_state & BM_DIRTY);
6179 UnlockBufHdr(buf_hdr, buf_state);
6180
6181 /* Issue notice if this is not the first failure... */
6182 if (buf_state & BM_IO_ERROR)
6183 {
6184 /* Buffer is pinned, so we can read tag without spinlock */
6185 ereport(WARNING,
6186 (errcode(ERRCODE_IO_ERROR),
6187 errmsg("could not write block %u of %s",
6188 buf_hdr->tag.blockNum,
6189 relpathperm(BufTagGetRelFileLocator(&buf_hdr->tag),
6190 BufTagGetForkNum(&buf_hdr->tag)).str),
6191 errdetail("Multiple failures --- write error might be permanent.")));
6192 }
6193 }
6194
6195 TerminateBufferIO(buf_hdr, false, BM_IO_ERROR, false, false);
6196}
6197
6198/*
6199 * Error context callback for errors occurring during shared buffer writes.
6200 */
6201static void
6202shared_buffer_write_error_callback(void *arg)
6203{
6204 BufferDesc *bufHdr = (BufferDesc *) arg;
6205
6206 /* Buffer is pinned, so we can read the tag without locking the spinlock */
6207 if (bufHdr != NULL)
6208 errcontext("writing block %u of relation \"%s\"",
6209 bufHdr->tag.blockNum,
6210 relpathperm(BufTagGetRelFileLocator(&bufHdr->tag),
6211 BufTagGetForkNum(&bufHdr->tag)).str);
6212}
6213
6214/*
6215 * Error context callback for errors occurring during local buffer writes.
6216 */
6217static void
6218local_buffer_write_error_callback(void *arg)
6219{
6220 BufferDesc *bufHdr = (BufferDesc *) arg;
6221
6222 if (bufHdr != NULL)
6223 errcontext("writing block %u of relation \"%s\"",
6224 bufHdr->tag.blockNum,
6225 relpathbackend(BufTagGetRelFileLocator(&bufHdr->tag),
6226 MyProcNumber,
6227 BufTagGetForkNum(&bufHdr->tag)).str);
6228}
6229
6230/*
6231 * RelFileLocator qsort/bsearch comparator; see RelFileLocatorEquals.
6232 */
6233static int
6234rlocator_comparator(const void *p1, const void *p2)
6235{
6236 RelFileLocator n1 = *(const RelFileLocator *) p1;
6237 RelFileLocator n2 = *(const RelFileLocator *) p2;
6238
6239 if (n1.relNumber < n2.relNumber)
6240 return -1;
6241 else if (n1.relNumber > n2.relNumber)
6242 return 1;
6243
6244 if (n1.dbOid < n2.dbOid)
6245 return -1;
6246 else if (n1.dbOid > n2.dbOid)
6247 return 1;
6248
6249 if (n1.spcOid < n2.spcOid)
6250 return -1;
6251 else if (n1.spcOid > n2.spcOid)
6252 return 1;
6253 else
6254 return 0;
6255}
6256
6257/*
6258 * Lock buffer header - set BM_LOCKED in buffer state.
6259 */
6260uint32
6261LockBufHdr(BufferDesc *desc)
6262{
6263 SpinDelayStatus delayStatus;
6264 uint32 old_buf_state;
6265
6266 Assert(!BufferIsLocal(BufferDescriptorGetBuffer(desc)));
6267
6268 init_local_spin_delay(&delayStatus);
6269
6270 while (true)
6271 {
6272 /* set BM_LOCKED flag */
6273 old_buf_state = pg_atomic_fetch_or_u32(&desc->state, BM_LOCKED);
6274 /* if it wasn't set before we're OK */
6275 if (!(old_buf_state & BM_LOCKED))
6276 break;
6277 perform_spin_delay(&delayStatus);
6278 }
6279 finish_spin_delay(&delayStatus);
6280 return old_buf_state | BM_LOCKED;
6281}
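
/*
 * Illustrative sketch (not part of bufmgr.c): the usual pattern for code that
 * needs a consistent view of a buffer header, e.g. to test refcount and flag
 * bits together. check_header_example() is hypothetical.
 */
#ifdef BUFMGR_USAGE_SKETCH
static bool
check_header_example(BufferDesc *desc)
{
	uint32		buf_state;
	bool		idle;

	buf_state = LockBufHdr(desc);	/* sets BM_LOCKED, spinning if needed */
	idle = (buf_state & BM_VALID) != 0 &&
		BUF_STATE_GET_REFCOUNT(buf_state) == 0;
	UnlockBufHdr(desc, buf_state);	/* clears BM_LOCKED, keeps other bits */

	return idle;
}
#endif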
6282
6283/*
6284 * Wait until the BM_LOCKED flag isn't set anymore and return the buffer's
6285 * state at that point.
6286 *
6287 * Obviously the buffer could be locked by the time the value is returned, so
6288 * this is primarily useful in CAS style loops.
6289 */
6290static uint32
6291WaitBufHdrUnlocked(BufferDesc *buf)
6292{
6293 SpinDelayStatus delayStatus;
6294 uint32 buf_state;
6295
6296 init_local_spin_delay(&delayStatus);
6297
6298 buf_state = pg_atomic_read_u32(&buf->state);
6299
6300 while (buf_state & BM_LOCKED)
6301 {
6302 perform_spin_delay(&delayStatus);
6303 buf_state = pg_atomic_read_u32(&buf->state);
6304 }
6305
6306 finish_spin_delay(&delayStatus);
6307
6308 return buf_state;
6309}
6310
6311/*
6312 * BufferTag comparator.
6313 */
6314static inline int
6315buffertag_comparator(const BufferTag *ba, const BufferTag *bb)
6316{
6317 int ret;
6318 RelFileLocator rlocatora;
6319 RelFileLocator rlocatorb;
6320
6321 rlocatora = BufTagGetRelFileLocator(ba);
6322 rlocatorb = BufTagGetRelFileLocator(bb);
6323
6324 ret = rlocator_comparator(&rlocatora, &rlocatorb);
6325
6326 if (ret != 0)
6327 return ret;
6328
6329 if (BufTagGetForkNum(ba) < BufTagGetForkNum(bb))
6330 return -1;
6331 if (BufTagGetForkNum(ba) > BufTagGetForkNum(bb))
6332 return 1;
6333
6334 if (ba->blockNum < bb->blockNum)
6335 return -1;
6336 if (ba->blockNum > bb->blockNum)
6337 return 1;
6338
6339 return 0;
6340}
6341
6342/*
6343 * Comparator determining the writeout order in a checkpoint.
6344 *
6345 * It is important that tablespaces are compared first, the logic balancing
6346 * writes between tablespaces relies on it.
6347 */
6348static inline int
6349ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b)
6350{
6351 /* compare tablespace */
6352 if (a->tsId < b->tsId)
6353 return -1;
6354 else if (a->tsId > b->tsId)
6355 return 1;
6356 /* compare relation */
6357 if (a->relNumber < b->relNumber)
6358 return -1;
6359 else if (a->relNumber > b->relNumber)
6360 return 1;
6361 /* compare fork */
6362 else if (a->forkNum < b->forkNum)
6363 return -1;
6364 else if (a->forkNum > b->forkNum)
6365 return 1;
6366 /* compare block number */
6367 else if (a->blockNum < b->blockNum)
6368 return -1;
6369 else if (a->blockNum > b->blockNum)
6370 return 1;
6371 /* equal page IDs are unlikely, but not impossible */
6372 return 0;
6373}
6374
6375/*
6376 * Comparator for a Min-Heap over the per-tablespace checkpoint completion
6377 * progress.
6378 */
6379static int
6380ts_ckpt_progress_comparator(Datum a, Datum b, void *arg)
6381{
6384
6385 /* we want a min-heap, so return 1 if a < b */
6386 if (sa->progress < sb->progress)
6387 return 1;
6388 else if (sa->progress == sb->progress)
6389 return 0;
6390 else
6391 return -1;
6392}
6393
6394/*
6395 * Initialize a writeback context, discarding potential previous state.
6396 *
6397 * *max_pending is a pointer instead of an immediate value, so the coalesce
6398 * limits can easily be changed by the GUC mechanism, and so calling code does
6399 * not have to check the current configuration. A value of 0 means that no
6400 * writeback control will be performed.
6401 */
6402void
6403WritebackContextInit(WritebackContext *context, int *max_pending)
6404{
6405 Assert(*max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
6406
6407 context->max_pending = max_pending;
6408 context->nr_pending = 0;
6409}
6410
6411/*
6412 * Add buffer to list of pending writeback requests.
6413 */
6414void
6415ScheduleBufferTagForWriteback(WritebackContext *wb_context, IOContext io_context,
6416 BufferTag *tag)
6417{
6418 PendingWriteback *pending;
6419
6420 /*
6421 * As pg_flush_data() doesn't do anything with fsync disabled, there's no
6422 * point in tracking in that case.
6423 */
6424 if (io_direct_flags & IO_DIRECT_DATA ||
6425 !enableFsync)
6426 return;
6427
6428 /*
6429 * Add buffer to the pending writeback array, unless writeback control is
6430 * disabled.
6431 */
6432 if (*wb_context->max_pending > 0)
6433 {
6434 Assert(*wb_context->max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
6435
6436 pending = &wb_context->pending_writebacks[wb_context->nr_pending++];
6437
6438 pending->tag = *tag;
6439 }
6440
6441 /*
6442 * Perform pending flushes if the writeback limit is exceeded. This
6443 * includes the case where previously an item has been added, but control
6444 * is now disabled.
6445 */
6446 if (wb_context->nr_pending >= *wb_context->max_pending)
6447 IssuePendingWritebacks(wb_context, io_context);
6448}
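
/*
 * Illustrative sketch (not part of bufmgr.c): how a writer accumulates
 * writeback requests and flushes them, roughly in the spirit of what
 * BufferSync() does with the checkpoint_flush_after GUC. The loop and the
 * tags array are hypothetical.
 */
#ifdef BUFMGR_USAGE_SKETCH
static void
writeback_example(BufferTag *tags, int ntags)
{
	WritebackContext wb_context;

	/* a *max_pending of 0 would disable writeback control entirely */
	WritebackContextInit(&wb_context, &checkpoint_flush_after);

	for (int i = 0; i < ntags; i++)
		ScheduleBufferTagForWriteback(&wb_context, IOCONTEXT_NORMAL, &tags[i]);

	/* hint the kernel about anything still pending */
	IssuePendingWritebacks(&wb_context, IOCONTEXT_NORMAL);
}
#endif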
6449
6450#define ST_SORT sort_pending_writebacks
6451#define ST_ELEMENT_TYPE PendingWriteback
6452#define ST_COMPARE(a, b) buffertag_comparator(&a->tag, &b->tag)
6453#define ST_SCOPE static
6454#define ST_DEFINE
6455#include "lib/sort_template.h"
6456
6457/*
6458 * Issue all pending writeback requests, previously scheduled with
6459 * ScheduleBufferTagForWriteback, to the OS.
6460 *
6461 * Because this is only used to improve the OS's IO scheduling we try to never
6462 * error out - it's just a hint.
6463 */
6464void
6465IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context)
6466{
6467 instr_time io_start;
6468 int i;
6469
6470 if (wb_context->nr_pending == 0)
6471 return;
6472
6473 /*
6474 * Executing the writes in-order can make them a lot faster, and allows us to
6475 * merge writeback requests for consecutive blocks into larger writebacks.
6476 */
6477 sort_pending_writebacks(wb_context->pending_writebacks,
6478 wb_context->nr_pending);
6479
6480 io_start = pgstat_prepare_io_time(track_io_timing);
6481
6482 /*
6483 * Coalesce neighbouring writes, but nothing else. For that we iterate
6484 * through the now-sorted array of pending flushes, and look forward to
6485 * find all neighbouring (or identical) writes.
6486 */
6487 for (i = 0; i < wb_context->nr_pending; i++)
6488 {
6489 PendingWriteback *cur;
6490 PendingWriteback *next;
6491 SMgrRelation reln;
6492 int ahead;
6493 BufferTag tag;
6494 RelFileLocator currlocator;
6495 Size nblocks = 1;
6496
6497 cur = &wb_context->pending_writebacks[i];
6498 tag = cur->tag;
6499 currlocator = BufTagGetRelFileLocator(&tag);
6500
6501 /*
6502 * Peek ahead, into following writeback requests, to see if they can
6503 * be combined with the current one.
6504 */
6505 for (ahead = 0; i + ahead + 1 < wb_context->nr_pending; ahead++)
6506 {
6507
6508 next = &wb_context->pending_writebacks[i + ahead + 1];
6509
6510 /* different file, stop */
6511 if (!RelFileLocatorEquals(currlocator,
6512 BufTagGetRelFileLocator(&next->tag)) ||
6513 BufTagGetForkNum(&cur->tag) != BufTagGetForkNum(&next->tag))
6514 break;
6515
6516 /* ok, block queued twice, skip */
6517 if (cur->tag.blockNum == next->tag.blockNum)
6518 continue;
6519
6520 /* only merge consecutive writes */
6521 if (cur->tag.blockNum + 1 != next->tag.blockNum)
6522 break;
6523
6524 nblocks++;
6525 cur = next;
6526 }
6527
6528 i += ahead;
6529
6530 /* and finally tell the kernel to write the data to storage */
6531 reln = smgropen(currlocator, INVALID_PROC_NUMBER);
6532 smgrwriteback(reln, BufTagGetForkNum(&tag), tag.blockNum, nblocks);
6533 }
6534
6535 /*
6536 * Assume that writeback requests are only issued for buffers containing
6537 * blocks of permanent relations.
6538 */
6539 pgstat_count_io_op_time(IOOBJECT_RELATION, io_context,
6540 IOOP_WRITEBACK, io_start, wb_context->nr_pending, 0);
6541
6542 wb_context->nr_pending = 0;
6543}
6544
6545/* ResourceOwner callbacks */
6546
6547static void
6548ResOwnerReleaseBufferIO(Datum res)
6549{
6550 Buffer buffer = DatumGetInt32(res);
6551
6552 AbortBufferIO(buffer);
6553}
6554
6555static char *
6556ResOwnerPrintBufferIO(Datum res)
6557{
6558 Buffer buffer = DatumGetInt32(res);
6559
6560 return psprintf("lost track of buffer IO on buffer %d", buffer);
6561}
6562
6563static void
6564ResOwnerReleaseBufferPin(Datum res)
6565{
6566 Buffer buffer = DatumGetInt32(res);
6567
6568 /* Like ReleaseBuffer, but don't call ResourceOwnerForgetBuffer */
6569 if (!BufferIsValid(buffer))
6570 elog(ERROR, "bad buffer ID: %d", buffer);
6571
6572 if (BufferIsLocal(buffer))
6573 UnpinLocalBufferNoOwner(buffer);
6574 else
6575 UnpinBufferNoOwner(GetBufferDescriptor(buffer - 1));
6576}
6577
6578static char *
6579ResOwnerPrintBufferPin(Datum res)
6580{
6581 return DebugPrintBufferRefcount(DatumGetInt32(res));
6582}
6583
6584/*
6585 * Helper function to evict unpinned buffer whose buffer header lock is
6586 * already acquired.
6587 */
6588static bool
6589EvictUnpinnedBufferInternal(BufferDesc *desc, bool *buffer_flushed)
6590{
6591 uint32 buf_state;
6592 bool result;
6593
6594 *buffer_flushed = false;
6595
6596 buf_state = pg_atomic_read_u32(&(desc->state));
6597 Assert(buf_state & BM_LOCKED);
6598
6599 if ((buf_state & BM_VALID) == 0)
6600 {
6601 UnlockBufHdr(desc, buf_state);
6602 return false;
6603 }
6604
6605 /* Check that it's not pinned already. */
6606 if (BUF_STATE_GET_REFCOUNT(buf_state) > 0)
6607 {
6608 UnlockBufHdr(desc, buf_state);
6609 return false;
6610 }
6611
6612 PinBuffer_Locked(desc); /* releases spinlock */
6613
6614 /* If it was dirty, try to clean it once. */
6615 if (buf_state & BM_DIRTY)
6616 {
6618 *buffer_flushed = true;
6619 }
6620
6621 /* This will return false if it becomes dirty or someone else pins it. */
6622 result = InvalidateVictimBuffer(desc);
6623
6624 UnpinBuffer(desc);
6625
6626 return result;
6627}
6628
6629/*
6630 * Try to evict the current block in a shared buffer.
6631 *
6632 * This function is intended for testing/development use only!
6633 *
6634 * To succeed, the buffer must not be pinned on entry, so if the caller had a
6635 * particular block in mind, it might already have been replaced by some other
6636 * block by the time this function runs. It's also unpinned on return, so the
6637 * buffer might be occupied again by the time control is returned, potentially
6638 * even by the same block. This inherent raciness without other interlocking
6639 * makes the function unsuitable for non-testing usage.
6640 *
6641 * *buffer_flushed is set to true if the buffer was dirty and has been
6642 * flushed, false otherwise. However, *buffer_flushed=true does not
6643 * necessarily mean that we flushed the buffer, it could have been flushed by
6644 * someone else.
6645 *
6646 * Returns true if the buffer was valid and it has now been made invalid.
6647 * Returns false if it wasn't valid, if it couldn't be evicted due to a pin,
6648 * or if the buffer becomes dirty again while we're trying to write it out.
6649 */
6650bool
6651EvictUnpinnedBuffer(Buffer buf, bool *buffer_flushed)
6652{
6653 BufferDesc *desc;
6654
6656
6657 /* Make sure we can pin the buffer. */
6658 ResourceOwnerEnlarge(CurrentResourceOwner);
6659 ReservePrivateRefCountEntry();
6660
6661 desc = GetBufferDescriptor(buf - 1);
6662 LockBufHdr(desc);
6663
6664 return EvictUnpinnedBufferInternal(desc, buffer_flushed);
6665}
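
/*
 * Illustrative sketch (not part of bufmgr.c): a testing helper built on
 * EvictUnpinnedBuffer(), similar in spirit to what the pg_buffercache
 * extension exposes. The NOTICE reporting is hypothetical.
 */
#ifdef BUFMGR_USAGE_SKETCH
static void
evict_one_example(Buffer buf)
{
	bool		flushed;

	if (EvictUnpinnedBuffer(buf, &flushed))
		elog(NOTICE, "buffer %d evicted%s", buf,
			 flushed ? " (was dirty, flushed first)" : "");
	else
		elog(NOTICE, "buffer %d could not be evicted", buf);
}
#endif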
6666
6667/*
6668 * Try to evict all the shared buffers.
6669 *
6670 * This function is intended for testing/development use only! See
6671 * EvictUnpinnedBuffer().
6672 *
6673 * The buffers_* parameters are mandatory and indicate the total count of
6674 * buffers that:
6675 * - buffers_evicted - were evicted
6676 * - buffers_flushed - were flushed
6677 * - buffers_skipped - could not be evicted
6678 */
6679void
6680EvictAllUnpinnedBuffers(int32 *buffers_evicted, int32 *buffers_flushed,
6681 int32 *buffers_skipped)
6682{
6683 *buffers_evicted = 0;
6684 *buffers_skipped = 0;
6685 *buffers_flushed = 0;
6686
6687 for (int buf = 1; buf <= NBuffers; buf++)
6688 {
6689 BufferDesc *desc = GetBufferDescriptor(buf - 1);
6690 uint32 buf_state;
6691 bool buffer_flushed;
6692
6693 buf_state = pg_atomic_read_u32(&desc->state);
6694 if (!(buf_state & BM_VALID))
6695 continue;
6696
6697 ResourceOwnerEnlarge(CurrentResourceOwner);
6698 ReservePrivateRefCountEntry();
6699
6700 LockBufHdr(desc);
6701
6702 if (EvictUnpinnedBufferInternal(desc, &buffer_flushed))
6703 (*buffers_evicted)++;
6704 else
6705 (*buffers_skipped)++;
6706
6707 if (buffer_flushed)
6708 (*buffers_flushed)++;
6709 }
6710}
6711
6712/*
6713 * Try to evict all the shared buffers containing provided relation's pages.
6714 *
6715 * This function is intended for testing/development use only! See
6716 * EvictUnpinnedBuffer().
6717 *
6718 * The caller must hold at least AccessShareLock on the relation to prevent
6719 * the relation from being dropped.
6720 *
6721 * The buffers_* parameters are mandatory and indicate the total count of
6722 * buffers that:
6723 * - buffers_evicted - were evicted
6724 * - buffers_flushed - were flushed
6725 * - buffers_skipped - could not be evicted
6726 */
6727void
6728EvictRelUnpinnedBuffers(Relation rel, int32 *buffers_evicted,
6729 int32 *buffers_flushed, int32 *buffers_skipped)
6730{
6731 Assert(!RelationUsesLocalBuffers(rel));
6732
6733 *buffers_skipped = 0;
6734 *buffers_evicted = 0;
6735 *buffers_flushed = 0;
6736
6737 for (int buf = 1; buf <= NBuffers; buf++)
6738 {
6739 BufferDesc *desc = GetBufferDescriptor(buf - 1);
6740 uint32 buf_state = pg_atomic_read_u32(&(desc->state));
6741 bool buffer_flushed;
6742
6743 /* An unlocked precheck should be safe and saves some cycles. */
6744 if ((buf_state & BM_VALID) == 0 ||
6745 !BufTagMatchesRelFileLocator(&desc->tag, &rel->rd_locator))
6746 continue;
6747
6748 /* Make sure we can pin the buffer. */
6749 ResourceOwnerEnlarge(CurrentResourceOwner);
6750 ReservePrivateRefCountEntry();
6751
6752 buf_state = LockBufHdr(desc);
6753
6754 /* recheck, could have changed without the lock */
6755 if ((buf_state & BM_VALID) == 0 ||
6756 !BufTagMatchesRelFileLocator(&desc->tag, &rel->rd_locator))
6757 {
6758 UnlockBufHdr(desc, buf_state);
6759 continue;
6760 }
6761
6762 if (EvictUnpinnedBufferInternal(desc, &buffer_flushed))
6763 (*buffers_evicted)++;
6764 else
6765 (*buffers_skipped)++;
6766
6767 if (buffer_flushed)
6768 (*buffers_flushed)++;
6769 }
6770}
6771
6772/*
6773 * Generic implementation of the AIO handle staging callback for readv/writev
6774 * on local/shared buffers.
6775 *
6776 * Each readv/writev can target multiple buffers. The buffers have already
6777 * been registered with the IO handle.
6778 *
6779 * To make the IO ready for execution ("staging"), we need to ensure that the
6780 * targeted buffers are in an appropriate state while the IO is ongoing. For
6781 * that the AIO subsystem needs to have its own buffer pin, otherwise an error
6782 * in this backend could lead to this backend's buffer pin being released as
6783 * part of error handling, which in turn could lead to the buffer being
6784 * replaced while IO is ongoing.
6785 */
6786static pg_attribute_always_inline void
6787buffer_stage_common(PgAioHandle *ioh, bool is_write, bool is_temp)
6788{
6789 uint64 *io_data;
6790 uint8 handle_data_len;
6791 PgAioWaitRef io_ref;
6793
6794 io_data = pgaio_io_get_handle_data(ioh, &handle_data_len);
6795
6796 pgaio_io_get_wref(ioh, &io_ref);
6797
6798 /* iterate over all buffers affected by the vectored readv/writev */
6799 for (int i = 0; i < handle_data_len; i++)
6800 {
6801 Buffer buffer = (Buffer) io_data[i];
6802 BufferDesc *buf_hdr = is_temp ?
6803 GetLocalBufferDescriptor(-buffer - 1) :
6804 GetBufferDescriptor(buffer - 1);
6805 uint32 buf_state;
6806
6807 /*
6808 * Check that all the buffers are actually ones that could conceivably
6809 * be done in one IO, i.e. are sequential. This is the last
6810 * buffer-aware code before IO is actually executed and confusion
6811 * about which buffers are targeted by IO can be hard to debug, making
6812 * it worth doing extra-paranoid checks.
6813 */
6814 if (i == 0)
6815 first = buf_hdr->tag;
6816 else
6817 {
6818 Assert(buf_hdr->tag.relNumber == first.relNumber);
6819 Assert(buf_hdr->tag.blockNum == first.blockNum + i);
6820 }
6821
6822 if (is_temp)
6823 buf_state = pg_atomic_read_u32(&buf_hdr->state);
6824 else
6825 buf_state = LockBufHdr(buf_hdr);
6826
6827 /* verify the buffer is in the expected state */
6828 Assert(buf_state & BM_TAG_VALID);
6829 if (is_write)
6830 {
6831 Assert(buf_state & BM_VALID);
6832 Assert(buf_state & BM_DIRTY);
6833 }
6834 else
6835 {
6836 Assert(!(buf_state & BM_VALID));
6837 Assert(!(buf_state & BM_DIRTY));
6838 }
6839
6840 /* temp buffers don't use BM_IO_IN_PROGRESS */
6841 if (!is_temp)
6842 Assert(buf_state & BM_IO_IN_PROGRESS);
6843
6844 Assert(BUF_STATE_GET_REFCOUNT(buf_state) >= 1);
6845
6846 /*
6847 * Reflect that the buffer is now owned by the AIO subsystem.
6848 *
6849 * For local buffers: This can't be done just via LocalRefCount, as
6850 * one might initially think, as this backend could error out while
6851 * AIO is still in progress, releasing all the pins by the backend
6852 * itself.
6853 *
6854 * This pin is released again in TerminateBufferIO().
6855 */
6856 buf_state += BUF_REFCOUNT_ONE;
6857 buf_hdr->io_wref = io_ref;
6858
6859 if (is_temp)
6860 pg_atomic_unlocked_write_u32(&buf_hdr->state, buf_state);
6861 else
6862 UnlockBufHdr(buf_hdr, buf_state);
6863
6864 /*
6865 * Ensure the content lock that prevents buffer modifications while
6866 * the buffer is being written out is not released early due to an
6867 * error.
6868 */
6869 if (is_write && !is_temp)
6870 {
6871 LWLock *content_lock;
6872
6873 content_lock = BufferDescriptorGetContentLock(buf_hdr);
6874
6875 Assert(LWLockHeldByMe(content_lock));
6876
6877 /*
6878 * Lock is now owned by AIO subsystem.
6879 */
6880 LWLockDisown(content_lock);
6881 }
6882
6883 /*
6884 * Stop tracking this buffer via the resowner - the AIO system now
6885 * keeps track.
6886 */
6887 if (!is_temp)
6889 }
6890}
6891
6892/*
6893 * Decode readv errors as encoded by buffer_readv_encode_error().
6894 */
6895static inline void
6896buffer_readv_decode_error(PgAioResult result,
6897 bool *zeroed_any,
6898 bool *ignored_any,
6899 uint8 *zeroed_or_error_count,
6900 uint8 *checkfail_count,
6901 uint8 *first_off)
6902{
6903 uint32 rem_error = result.error_data;
6904
6905 /* see static asserts in buffer_readv_encode_error */
6906#define READV_COUNT_BITS 7
6907#define READV_COUNT_MASK ((1 << READV_COUNT_BITS) - 1)
6908
6909 *zeroed_any = rem_error & 1;
6910 rem_error >>= 1;
6911
6912 *ignored_any = rem_error & 1;
6913 rem_error >>= 1;
6914
6915 *zeroed_or_error_count = rem_error & READV_COUNT_MASK;
6916 rem_error >>= READV_COUNT_BITS;
6917
6918 *checkfail_count = rem_error & READV_COUNT_MASK;
6919 rem_error >>= READV_COUNT_BITS;
6920
6921 *first_off = rem_error & READV_COUNT_MASK;
6922 rem_error >>= READV_COUNT_BITS;
6923}
6924
6925/*
6926 * Helper to encode errors for buffer_readv_complete()
6927 *
6928 * Errors are encoded as follows:
6929 * - bit 0 indicates whether any page was zeroed (1) or not (0)
6930 * - bit 1 indicates whether any checksum failure was ignored (1) or not (0)
6931 * - next READV_COUNT_BITS bits indicate the number of errored or zeroed pages
6932 * - next READV_COUNT_BITS bits indicate the number of checksum failures
6933 * - next READV_COUNT_BITS bits indicate the first offset of the first page
6934 * that was errored or zeroed or, if no errors/zeroes, the first ignored
6935 * checksum
6936 */
6937static inline void
6938buffer_readv_encode_error(PgAioResult *result,
6939 bool is_temp,
6940 bool zeroed_any,
6941 bool ignored_any,
6942 uint8 error_count,
6943 uint8 zeroed_count,
6944 uint8 checkfail_count,
6945 uint8 first_error_off,
6946 uint8 first_zeroed_off,
6947 uint8 first_ignored_off)
6948{
6949
6950 uint8 shift = 0;
6951 uint8 zeroed_or_error_count =
6952 error_count > 0 ? error_count : zeroed_count;
6953 uint8 first_off;
6954
6956 "PG_IOV_MAX is bigger than reserved space for error data");
6958 "PGAIO_RESULT_ERROR_BITS is insufficient for buffer_readv");
6959
6960 /*
6961 * We only have space to encode one offset - but luckily that's good
6962 * enough. If there is an error, the error is the interesting offset, same
6963 * with a zeroed buffer vs an ignored buffer.
6964 */
6965 if (error_count > 0)
6966 first_off = first_error_off;
6967 else if (zeroed_count > 0)
6968 first_off = first_zeroed_off;
6969 else
6970 first_off = first_ignored_off;
6971
6972 Assert(!zeroed_any || error_count == 0);
6973
6974 result->error_data = 0;
6975
6976 result->error_data |= zeroed_any << shift;
6977 shift += 1;
6978
6979 result->error_data |= ignored_any << shift;
6980 shift += 1;
6981
6982 result->error_data |= ((uint32) zeroed_or_error_count) << shift;
6983 shift += READV_COUNT_BITS;
6984
6985 result->error_data |= ((uint32) checkfail_count) << shift;
6986 shift += READV_COUNT_BITS;
6987
6988 result->error_data |= ((uint32) first_off) << shift;
6989 shift += READV_COUNT_BITS;
6990
6991 result->id = is_temp ? PGAIO_HCB_LOCAL_BUFFER_READV :
6993
6994 if (error_count > 0)
6995 result->status = PGAIO_RS_ERROR;
6996 else
6997 result->status = PGAIO_RS_WARNING;
6998
6999 /*
7000 * The encoding is complicated enough to warrant cross-checking it against
7001 * the decode function.
7002 */
7003#ifdef USE_ASSERT_CHECKING
7004 {
7005 bool zeroed_any_2,
7006 ignored_any_2;
7007 uint8 zeroed_or_error_count_2,
7008 checkfail_count_2,
7009 first_off_2;
7010
7012 &zeroed_any_2, &ignored_any_2,
7013 &zeroed_or_error_count_2,
7014 &checkfail_count_2,
7015 &first_off_2);
7016 Assert(zeroed_any == zeroed_any_2);
7017 Assert(ignored_any == ignored_any_2);
7018 Assert(zeroed_or_error_count == zeroed_or_error_count_2);
7019 Assert(checkfail_count == checkfail_count_2);
7020 Assert(first_off == first_off_2);
7021 }
7022#endif
7023
7024#undef READV_COUNT_BITS
7025#undef READV_COUNT_MASK
7026}
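
/*
 * Illustrative sketch (not part of bufmgr.c): a round-trip through the error
 * encoding above, mirroring the cross-check the function performs under
 * USE_ASSERT_CHECKING. The particular counts and offsets are arbitrary
 * example values.
 */
#ifdef BUFMGR_USAGE_SKETCH
static void
readv_error_encoding_example(void)
{
	PgAioResult result = {0};
	bool		zeroed_any,
				ignored_any;
	uint8		zeroed_or_error_count,
				checkfail_count,
				first_off;

	/* two pages zeroed, one checksum failure, first zeroed page at offset 3 */
	buffer_readv_encode_error(&result, false /* shared buffers */ ,
							  true /* zeroed_any */ , false /* ignored_any */ ,
							  0 /* error_count */ , 2 /* zeroed_count */ ,
							  1 /* checkfail_count */ ,
							  0, 3, 0);

	buffer_readv_decode_error(result, &zeroed_any, &ignored_any,
							  &zeroed_or_error_count, &checkfail_count,
							  &first_off);
	Assert(zeroed_any && !ignored_any);
	Assert(zeroed_or_error_count == 2 && checkfail_count == 1);
	Assert(first_off == 3);
}
#endif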
7027
7028/*
7029 * Helper for AIO readv completion callbacks, supporting both shared and temp
7030 * buffers. Gets called once for each buffer in a multi-page read.
7031 */
7032static pg_attribute_always_inline void
7033buffer_readv_complete_one(PgAioTargetData *td, uint8 buf_off, Buffer buffer,
7034 uint8 flags, bool failed, bool is_temp,
7035 bool *buffer_invalid,
7036 bool *failed_checksum,
7037 bool *ignored_checksum,
7038 bool *zeroed_buffer)
7039{
7040 BufferDesc *buf_hdr = is_temp ?
7041 GetLocalBufferDescriptor(-buffer - 1) :
7042 GetBufferDescriptor(buffer - 1);
7043 BufferTag tag = buf_hdr->tag;
7044 char *bufdata = BufferGetBlock(buffer);
7045 uint32 set_flag_bits;
7046 int piv_flags;
7047
7048 /* check that the buffer is in the expected state for a read */
7049#ifdef USE_ASSERT_CHECKING
7050 {
7051 uint32 buf_state = pg_atomic_read_u32(&buf_hdr->state);
7052
7053 Assert(buf_state & BM_TAG_VALID);
7054 Assert(!(buf_state & BM_VALID));
7055 /* temp buffers don't use BM_IO_IN_PROGRESS */
7056 if (!is_temp)
7057 Assert(buf_state & BM_IO_IN_PROGRESS);
7058 Assert(!(buf_state & BM_DIRTY));
7059 }
7060#endif
7061
7062 *buffer_invalid = false;
7063 *failed_checksum = false;
7064 *ignored_checksum = false;
7065 *zeroed_buffer = false;
7066
7067 /*
7068 * We ask PageIsVerified() to only log the message about checksum errors,
7069 * as the completion might be run in any backend (or IO workers). We will
7070 * report checksum errors in buffer_readv_report().
7071 */
7072 piv_flags = PIV_LOG_LOG;
7073
7074 /* the local zero_damaged_pages may differ from the definer's */
7076 piv_flags |= PIV_IGNORE_CHECKSUM_FAILURE;
7077
7078 /* Check for garbage data. */
7079 if (!failed)
7080 {
7081 /*
7082 * If the buffer is not currently pinned by this backend, e.g. because
7083 * we're completing this IO after an error, the buffer data will have
7084 * been marked as inaccessible when the buffer was unpinned. The AIO
7085 * subsystem holds a pin, but that doesn't prevent the buffer from
7086 * having been marked as inaccessible. The completion might also be
7087 * executed in a different process.
7088 */
7089#ifdef USE_VALGRIND
7090 if (!BufferIsPinned(buffer))
7091 VALGRIND_MAKE_MEM_DEFINED(bufdata, BLCKSZ);
7092#endif
7093
7094 if (!PageIsVerified((Page) bufdata, tag.blockNum, piv_flags,
7095 failed_checksum))
7096 {
7097 if (flags & READ_BUFFERS_ZERO_ON_ERROR)
7098 {
7099 memset(bufdata, 0, BLCKSZ);
7100 *zeroed_buffer = true;
7101 }
7102 else
7103 {
7104 *buffer_invalid = true;
7105 /* mark buffer as having failed */
7106 failed = true;
7107 }
7108 }
7109 else if (*failed_checksum)
7110 *ignored_checksum = true;
7111
7112 /* undo what we did above */
7113#ifdef USE_VALGRIND
7114 if (!BufferIsPinned(buffer))
7115 VALGRIND_MAKE_MEM_NOACCESS(bufdata, BLCKSZ);
7116#endif
7117
7118 /*
7119 * Immediately log a message about the invalid page, but only to the
7120 * server log. The reason to log immediately is that this may be
7121 * executed in a different backend than the one that originated the
7122 * request. The reason to log only to the server log is that the
7123 * originator might not process the query result immediately (because it
7124 * is busy doing another part of query processing) or at all (e.g. if it
7125 * was cancelled or errored out due to another IO also failing). The
7126 * definer of the IO will emit an ERROR or WARNING when processing the
7127 * IO's results.
7128 *
7129 * To avoid duplicating the code to emit these log messages, we reuse
7130 * buffer_readv_report().
7131 */
7132 if (*buffer_invalid || *failed_checksum || *zeroed_buffer)
7133 {
7134 PgAioResult result_one = {0};
7135
7136 buffer_readv_encode_error(&result_one, is_temp,
7137 *zeroed_buffer,
7138 *ignored_checksum,
7139 *buffer_invalid,
7140 *zeroed_buffer ? 1 : 0,
7141 *failed_checksum ? 1 : 0,
7142 buf_off, buf_off, buf_off);
7143 pgaio_result_report(result_one, td, LOG_SERVER_ONLY);
7144 }
7145 }
7146
7147 /* Terminate I/O and set BM_VALID. */
7148 set_flag_bits = failed ? BM_IO_ERROR : BM_VALID;
7149 if (is_temp)
7150 TerminateLocalBufferIO(buf_hdr, false, set_flag_bits, true);
7151 else
7152 TerminateBufferIO(buf_hdr, false, set_flag_bits, false, true);
7153
7154 /*
7155 * Call the BUFFER_READ_DONE tracepoint in the callback, even though the
7156 * callback may not be executed in the same backend that called
7157 * BUFFER_READ_START. The alternative would be to defer calling the
7158 * tracepoint to a later point (e.g. the local completion callback for
7159 * shared buffer reads), which seems even less helpful.
7160 */
7161 TRACE_POSTGRESQL_BUFFER_READ_DONE(tag.forkNum,
7162 tag.blockNum,
7163 tag.spcOid,
7164 tag.dbOid,
7165 tag.relNumber,
7167 false);
7168}
7169
7170/*
7171 * Perform completion handling of a single AIO read. This read may cover
7172 * multiple blocks / buffers.
7173 *
7174 * Shared between shared and local buffers, to reduce code duplication.
7175 */
7176static pg_attribute_always_inline PgAioResult
7177buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result,
7178 uint8 cb_data, bool is_temp)
7179{
7180 PgAioResult result = prior_result;
7181 PgAioTargetData *td = pgaio_io_get_target_data(ioh);
7182 uint8 first_error_off = 0;
7183 uint8 first_zeroed_off = 0;
7184 uint8 first_ignored_off = 0;
7185 uint8 error_count = 0;
7186 uint8 zeroed_count = 0;
7187 uint8 ignored_count = 0;
7188 uint8 checkfail_count = 0;
7189 uint64 *io_data;
7190 uint8 handle_data_len;
7191
7192 if (is_temp)
7193 {
7194 Assert(td->smgr.is_temp);
7195 Assert(pgaio_io_get_owner(ioh) == MyProcNumber);
7196 }
7197 else
7198 Assert(!td->smgr.is_temp);
7199
7200 /*
7201 * Iterate over all the buffers affected by this IO and call the
7202 * per-buffer completion function for each buffer.
7203 */
7204 io_data = pgaio_io_get_handle_data(ioh, &handle_data_len);
7205 for (uint8 buf_off = 0; buf_off < handle_data_len; buf_off++)
7206 {
7207 Buffer buf = io_data[buf_off];
7208 bool failed;
7209 bool failed_verification = false;
7210 bool failed_checksum = false;
7211 bool zeroed_buffer = false;
7212 bool ignored_checksum = false;
7213
7215
7216 /*
7217 * If the entire I/O failed on a lower-level, each buffer needs to be
7218 * marked as failed. In case of a partial read, the first few buffers
7219 * may be ok.
7220 */
7221 failed =
7222 prior_result.status == PGAIO_RS_ERROR
7223 || prior_result.result <= buf_off;
7224
7225 buffer_readv_complete_one(td, buf_off, buf, cb_data, failed, is_temp,
7226 &failed_verification,
7227 &failed_checksum,
7228 &ignored_checksum,
7229 &zeroed_buffer);
7230
7231 /*
7232 * Track information about the number of different kinds of error
7233 * conditions across all pages, as there can be multiple pages failing
7234 * verification as part of one IO.
7235 */
7236 if (failed_verification && !zeroed_buffer && error_count++ == 0)
7237 first_error_off = buf_off;
7238 if (zeroed_buffer && zeroed_count++ == 0)
7239 first_zeroed_off = buf_off;
7240 if (ignored_checksum && ignored_count++ == 0)
7241 first_ignored_off = buf_off;
7242 if (failed_checksum)
7243 checkfail_count++;
7244 }
7245
7246 /*
7247 * If the smgr read succeeded [partially] and page verification failed for
7248 * some of the pages, adjust the IO's result state appropriately.
7249 */
7250 if (prior_result.status != PGAIO_RS_ERROR &&
7251 (error_count > 0 || ignored_count > 0 || zeroed_count > 0))
7252 {
7253 buffer_readv_encode_error(&result, is_temp,
7254 zeroed_count > 0, ignored_count > 0,
7255 error_count, zeroed_count, checkfail_count,
7256 first_error_off, first_zeroed_off,
7257 first_ignored_off);
7258 pgaio_result_report(result, td, DEBUG1);
7259 }
7260
7261 /*
7262 * For shared relations this reporting is done in
7263 * shared_buffer_readv_complete_local().
7264 */
7265 if (is_temp && checkfail_count > 0)
7266 pgstat_report_checksum_failures_in_db(MyDatabaseId,
7267 checkfail_count);
7268
7269 return result;
7270}
7271
7272/*
7273 * AIO error reporting callback for aio_shared_buffer_readv_cb and
7274 * aio_local_buffer_readv_cb.
7275 *
7276 * The error is encoded / decoded in buffer_readv_encode_error() /
7277 * buffer_readv_decode_error().
7278 */
7279static void
7280buffer_readv_report(PgAioResult result, const PgAioTargetData *td,
7281 int elevel)
7282{
7283 int nblocks = td->smgr.nblocks;
7284 BlockNumber first = td->smgr.blockNum;
7285 BlockNumber last = first + nblocks - 1;
7286 ProcNumber errProc =
7287 td->smgr.is_temp ? MyProcNumber : INVALID_PROC_NUMBER;
7288 RelPathStr rpath =
7289 relpathbackend(td->smgr.rlocator, errProc, td->smgr.forkNum);
7290 bool zeroed_any,
7291 ignored_any;
7292 uint8 zeroed_or_error_count,
7293 checkfail_count,
7294 first_off;
7295 uint8 affected_count;
7296 const char *msg_one,
7297 *msg_mult,
7298 *det_mult,
7299 *hint_mult;
7300
7301 buffer_readv_decode_error(result, &zeroed_any, &ignored_any,
7302 &zeroed_or_error_count,
7303 &checkfail_count,
7304 &first_off);
7305
7306 /*
7307 * Treat a read that had both zeroed buffers *and* ignored checksums as a
7308 * special case; it's too irregular to be emitted the same way as the
7309 * other cases.
7310 */
7311 if (zeroed_any && ignored_any)
7312 {
7313 Assert(zeroed_any && ignored_any);
7314 Assert(nblocks > 1); /* same block can't be both zeroed and ignored */
7315 Assert(result.status != PGAIO_RS_ERROR);
7316 affected_count = zeroed_or_error_count;
7317
7318 ereport(elevel,
7319 errcode(ERRCODE_DATA_CORRUPTED),
7320 errmsg("zeroing %u page(s) and ignoring %u checksum failure(s) among blocks %u..%u of relation \"%s\"",
7321 affected_count, checkfail_count, first, last, rpath.str),
7322 affected_count > 1 ?
7323 errdetail("Block %u held the first zeroed page.",
7324 first + first_off) : 0,
7325 errhint_plural("See server log for details about the other %d invalid block.",
7326 "See server log for details about the other %d invalid blocks.",
7327 affected_count + checkfail_count - 1,
7328 affected_count + checkfail_count - 1));
7329 return;
7330 }
7331
7332 /*
7333 * The other messages are highly repetitive. To avoid duplicating a long
7334 * and complicated ereport(), gather the translated format strings
7335 * separately and then do one common ereport.
7336 */
7337 if (result.status == PGAIO_RS_ERROR)
7338 {
7339 Assert(!zeroed_any); /* can't have invalid pages when zeroing them */
7340 affected_count = zeroed_or_error_count;
7341 msg_one = _("invalid page in block %u of relation \"%s\"");
7342 msg_mult = _("%u invalid pages among blocks %u..%u of relation \"%s\"");
7343 det_mult = _("Block %u held the first invalid page.");
7344 hint_mult = _("See server log for the other %u invalid block(s).");
7345 }
7346 else if (zeroed_any && !ignored_any)
7347 {
7348 affected_count = zeroed_or_error_count;
7349 msg_one = _("invalid page in block %u of relation \"%s\"; zeroing out page");
7350 msg_mult = _("zeroing out %u invalid pages among blocks %u..%u of relation \"%s\"");
7351 det_mult = _("Block %u held the first zeroed page.");
7352 hint_mult = _("See server log for the other %u zeroed block(s).");
7353 }
7354 else if (!zeroed_any && ignored_any)
7355 {
7356 affected_count = checkfail_count;
7357 msg_one = _("ignoring checksum failure in block %u of relation \"%s\"");
7358 msg_mult = _("ignoring %u checksum failures among blocks %u..%u of relation \"%s\"");
7359 det_mult = _("Block %u held the first ignored page.");
7360 hint_mult = _("See server log for the other %u ignored block(s).");
7361 }
7362 else
7364
7365 ereport(elevel,
7366 errcode(ERRCODE_DATA_CORRUPTED),
7367 affected_count == 1 ?
7368 errmsg_internal(msg_one, first + first_off, rpath.str) :
7369 errmsg_internal(msg_mult, affected_count, first, last, rpath.str),
7370 affected_count > 1 ? errdetail_internal(det_mult, first + first_off) : 0,
7371 affected_count > 1 ? errhint_internal(hint_mult, affected_count - 1) : 0);
7372}
7373
7374static void
7375shared_buffer_readv_stage(PgAioHandle *ioh, uint8 cb_data)
7376{
7377 buffer_stage_common(ioh, false, false);
7378}
7379
7380static PgAioResult
7381shared_buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result,
7382 uint8 cb_data)
7383{
7384 return buffer_readv_complete(ioh, prior_result, cb_data, false);
7385}
7386
7387/*
7388 * We need a backend-local completion callback for shared buffers, to be able
7389 * to report checksum errors correctly. Unfortunately that can only safely
7390 * happen if the reporting backend has previously called
7391 * pgstat_prepare_report_checksum_failure(), which we can only guarantee in
7392 * the backend that started the IO. Hence this callback.
7393 */
7394static PgAioResult
7395shared_buffer_readv_complete_local(PgAioHandle *ioh, PgAioResult prior_result,
7396 uint8 cb_data)
7397{
7398 bool zeroed_any,
7399 ignored_any;
7400 uint8 zeroed_or_error_count,
7401 checkfail_count,
7402 first_off;
7403
7404 if (prior_result.status == PGAIO_RS_OK)
7405 return prior_result;
7406
7407 buffer_readv_decode_error(prior_result,
7408 &zeroed_any,
7409 &ignored_any,
7410 &zeroed_or_error_count,
7411 &checkfail_count,
7412 &first_off);
7413
7414 if (checkfail_count)
7415 {
7416 PgAioTargetData *td = pgaio_io_get_target_data(ioh);
7417
7418 pgstat_report_checksum_failures_in_db(td->smgr.rlocator.dbOid,
7419 checkfail_count);
7420 }
7421
7422 return prior_result;
7423}
7424
7425static void
7426local_buffer_readv_stage(PgAioHandle *ioh, uint8 cb_data)
7427{
7428 buffer_stage_common(ioh, false, true);
7429}
7430
7431static PgAioResult
7432local_buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result,
7433 uint8 cb_data)
7434{
7435 return buffer_readv_complete(ioh, prior_result, cb_data, true);
7436}
7437
7438/* readv callback is passed READ_BUFFERS_* flags as callback data */
7439const PgAioHandleCallbacks aio_shared_buffer_readv_cb = {
7440 .stage = shared_buffer_readv_stage,
7441 .complete_shared = shared_buffer_readv_complete,
7442 /* need a local callback to report checksum failures */
7443 .complete_local = shared_buffer_readv_complete_local,
7444 .report = buffer_readv_report,
7445};
7446
7447/* readv callback is passed READ_BUFFERS_* flags as callback data */
7448const PgAioHandleCallbacks aio_local_buffer_readv_cb = {
7449 .stage = local_buffer_readv_stage,
7450
7451 /*
7452 * Note that this, in contrast to the shared_buffers case, uses
7453 * complete_local, as only the issuing backend has access to the required
7454 * datastructures. This matters because the IO completion may be consumed
7455 * incidentally by another backend.
7456 */
7457 .complete_local = local_buffer_readv_complete,
7458 .report = buffer_readv_report,
7459};
int io_method
Definition: aio.c:74
bool pgaio_wref_valid(PgAioWaitRef *iow)
Definition: aio.c:968
PgAioHandle * pgaio_io_acquire(struct ResourceOwnerData *resowner, PgAioReturn *ret)
Definition: aio.c:159
void pgaio_wref_clear(PgAioWaitRef *iow)
Definition: aio.c:961
void pgaio_io_get_wref(PgAioHandle *ioh, PgAioWaitRef *iow)
Definition: aio.c:363
void pgaio_io_set_flag(PgAioHandle *ioh, PgAioHandleFlags flag)
Definition: aio.c:327
bool pgaio_have_staged(void)
Definition: aio.c:1104
bool pgaio_wref_check_done(PgAioWaitRef *iow)
Definition: aio.c:1002
ProcNumber pgaio_io_get_owner(PgAioHandle *ioh)
Definition: aio.c:352
void pgaio_submit_staged(void)
Definition: aio.c:1120
void pgaio_wref_wait(PgAioWaitRef *iow)
Definition: aio.c:988
void pgaio_io_release(PgAioHandle *ioh)
Definition: aio.c:237
PgAioHandle * pgaio_io_acquire_nb(struct ResourceOwnerData *resowner, PgAioReturn *ret)
Definition: aio.c:185
@ PGAIO_HCB_LOCAL_BUFFER_READV
Definition: aio.h:200
@ PGAIO_HCB_SHARED_BUFFER_READV
Definition: aio.h:198
@ IOMETHOD_SYNC
Definition: aio.h:34
@ PGAIO_HF_SYNCHRONOUS
Definition: aio.h:70
@ PGAIO_HF_REFERENCES_LOCAL
Definition: aio.h:60
void pgaio_io_set_handle_data_32(PgAioHandle *ioh, uint32 *data, uint8 len)
Definition: aio_callback.c:140
void pgaio_io_register_callbacks(PgAioHandle *ioh, PgAioHandleCallbackID cb_id, uint8 cb_data)
Definition: aio_callback.c:86
uint64 * pgaio_io_get_handle_data(PgAioHandle *ioh, uint8 *len)
Definition: aio_callback.c:156
void pgaio_result_report(PgAioResult result, const PgAioTargetData *target_data, int elevel)
Definition: aio_callback.c:173
PgAioTargetData * pgaio_io_get_target_data(PgAioHandle *ioh)
Definition: aio_target.c:73
#define PGAIO_RESULT_ERROR_BITS
Definition: aio_types.h:98
PgAioResultStatus
Definition: aio_types.h:79
@ PGAIO_RS_OK
Definition: aio_types.h:81
@ PGAIO_RS_UNKNOWN
Definition: aio_types.h:80
@ PGAIO_RS_PARTIAL
Definition: aio_types.h:82
@ PGAIO_RS_ERROR
Definition: aio_types.h:84
@ PGAIO_RS_WARNING
Definition: aio_types.h:83
static bool pg_atomic_compare_exchange_u32(volatile pg_atomic_uint32 *ptr, uint32 *expected, uint32 newval)
Definition: atomics.h:347
static uint32 pg_atomic_fetch_or_u32(volatile pg_atomic_uint32 *ptr, uint32 or_)
Definition: atomics.h:408
static void pg_atomic_unlocked_write_u32(volatile pg_atomic_uint32 *ptr, uint32 val)
Definition: atomics.h:293
static uint32 pg_atomic_read_u32(volatile pg_atomic_uint32 *ptr)
Definition: atomics.h:237
bool TimestampDifferenceExceeds(TimestampTz start_time, TimestampTz stop_time, int msec)
Definition: timestamp.c:1781
TimestampTz GetCurrentTimestamp(void)
Definition: timestamp.c:1645
Datum now(PG_FUNCTION_ARGS)
Definition: timestamp.c:1609
int BgWriterDelay
Definition: bgwriter.c:58
void binaryheap_build(binaryheap *heap)
Definition: binaryheap.c:138
void binaryheap_replace_first(binaryheap *heap, bh_node_type d)
Definition: binaryheap.c:255
bh_node_type binaryheap_first(binaryheap *heap)
Definition: binaryheap.c:177
bh_node_type binaryheap_remove_first(binaryheap *heap)
Definition: binaryheap.c:192
void binaryheap_free(binaryheap *heap)
Definition: binaryheap.c:75
void binaryheap_add_unordered(binaryheap *heap, bh_node_type d)
Definition: binaryheap.c:116
binaryheap * binaryheap_allocate(int capacity, binaryheap_comparator compare, void *arg)
Definition: binaryheap.c:39
#define binaryheap_empty(h)
Definition: binaryheap.h:65
uint32 BlockNumber
Definition: block.h:31
#define InvalidBlockNumber
Definition: block.h:33
static bool BlockNumberIsValid(BlockNumber blockNumber)
Definition: block.h:71
#define MaxBlockNumber
Definition: block.h:35
static int32 next
Definition: blutils.c:224
int Buffer
Definition: buf.h:23
#define InvalidBuffer
Definition: buf.h:25
#define BufferIsLocal(buffer)
Definition: buf.h:37
CkptSortItem * CkptBufferIds
Definition: buf_init.c:25
WritebackContext BackendWritebackContext
Definition: buf_init.c:24
BufferDescPadded * BufferDescriptors
Definition: buf_init.c:21
#define BM_MAX_USAGE_COUNT
Definition: buf_internals.h:86
static void InitBufferTag(BufferTag *tag, const RelFileLocator *rlocator, ForkNumber forkNum, BlockNumber blockNum)
#define BM_TAG_VALID
Definition: buf_internals.h:71
#define BM_PERMANENT
Definition: buf_internals.h:77
#define BUF_USAGECOUNT_MASK
Definition: buf_internals.h:53
static ForkNumber BufTagGetForkNum(const BufferTag *tag)
#define BUF_REFCOUNT_ONE
Definition: buf_internals.h:51
static ConditionVariable * BufferDescriptorGetIOCV(const BufferDesc *bdesc)
static void UnlockBufHdr(BufferDesc *desc, uint32 buf_state)
static bool BufferTagsEqual(const BufferTag *tag1, const BufferTag *tag2)
static RelFileNumber BufTagGetRelNumber(const BufferTag *tag)
static LWLock * BufferDescriptorGetContentLock(const BufferDesc *bdesc)
static bool BufTagMatchesRelFileLocator(const BufferTag *tag, const RelFileLocator *rlocator)
#define BUF_FLAG_MASK
Definition: buf_internals.h:56
#define BM_PIN_COUNT_WAITER
Definition: buf_internals.h:75
#define BM_DIRTY
Definition: buf_internals.h:69
static void ResourceOwnerRememberBufferIO(ResourceOwner owner, Buffer buffer)
#define BM_LOCKED
Definition: buf_internals.h:68
#define BM_JUST_DIRTIED
Definition: buf_internals.h:74
#define BUF_STATE_GET_USAGECOUNT(state)
Definition: buf_internals.h:60
static void ResourceOwnerForgetBufferIO(ResourceOwner owner, Buffer buffer)
#define BM_IO_IN_PROGRESS
Definition: buf_internals.h:72
static void ClearBufferTag(BufferTag *tag)
static void ResourceOwnerRememberBuffer(ResourceOwner owner, Buffer buffer)
static void ResourceOwnerForgetBuffer(ResourceOwner owner, Buffer buffer)
#define BUF_USAGECOUNT_ONE
Definition: buf_internals.h:54
#define BUF_STATE_GET_REFCOUNT(state)
Definition: buf_internals.h:59
static LWLock * BufMappingPartitionLock(uint32 hashcode)
static RelFileLocator BufTagGetRelFileLocator(const BufferTag *tag)
#define BM_VALID
Definition: buf_internals.h:70
#define BM_IO_ERROR
Definition: buf_internals.h:73
static BufferDesc * GetLocalBufferDescriptor(uint32 id)
static BufferDesc * GetBufferDescriptor(uint32 id)
static Buffer BufferDescriptorGetBuffer(const BufferDesc *bdesc)
#define BM_CHECKPOINT_NEEDED
Definition: buf_internals.h:76
void BufTableDelete(BufferTag *tagPtr, uint32 hashcode)
Definition: buf_table.c:148
int BufTableLookup(BufferTag *tagPtr, uint32 hashcode)
Definition: buf_table.c:90
uint32 BufTableHashCode(BufferTag *tagPtr)
Definition: buf_table.c:78
int BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id)
Definition: buf_table.c:118
bool track_io_timing
Definition: bufmgr.c:147
void CheckBufferIsPinnedOnce(Buffer buffer)
Definition: bufmgr.c:5657
void FlushRelationsAllBuffers(SMgrRelation *smgrs, int nrels)
Definition: bufmgr.c:5040
void IncrBufferRefCount(Buffer buffer)
Definition: bufmgr.c:5408
void DropDatabaseBuffers(Oid dbid)
Definition: bufmgr.c:4904
static int ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b)
Definition: bufmgr.c:6349
static pg_attribute_always_inline PgAioResult buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data, bool is_temp)
Definition: bufmgr.c:7177
const ResourceOwnerDesc buffer_pin_resowner_desc
Definition: bufmgr.c:244
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:4229
static PrivateRefCountEntry * NewPrivateRefCountEntry(Buffer buffer)
Definition: bufmgr.c:325
static bool ReadBuffersCanStartIO(Buffer buffer, bool nowait)
Definition: bufmgr.c:1551
void DropRelationBuffers(SMgrRelation smgr_reln, ForkNumber *forkNum, int nforks, BlockNumber *firstDelBlock)
Definition: bufmgr.c:4551
Buffer ReleaseAndReadBuffer(Buffer buffer, Relation relation, BlockNumber blockNum)
Definition: bufmgr.c:3003
static PgAioResult shared_buffer_readv_complete_local(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
Definition: bufmgr.c:7395
static pg_attribute_always_inline bool StartReadBuffersImpl(ReadBuffersOperation *operation, Buffer *buffers, BlockNumber blockNum, int *nblocks, int flags, bool allow_forwarding)
Definition: bufmgr.c:1249
static void CheckReadBuffersOperation(ReadBuffersOperation *operation, bool is_complete)
Definition: bufmgr.c:1514
PrefetchBufferResult PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
Definition: bufmgr.c:653
bool BufferIsLockedByMeInMode(Buffer buffer, int mode)
Definition: bufmgr.c:2868
static uint32 PrivateRefCountClock
Definition: bufmgr.c:218
static void FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object, IOContext io_context)
Definition: bufmgr.c:4289
static void ResOwnerReleaseBufferIO(Datum res)
Definition: bufmgr.c:6548
static PgAioResult local_buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
Definition: bufmgr.c:7432
bool StartReadBuffers(ReadBuffersOperation *operation, Buffer *buffers, BlockNumber blockNum, int *nblocks, int flags)
Definition: bufmgr.c:1476
void EvictAllUnpinnedBuffers(int32 *buffers_evicted, int32 *buffers_flushed, int32 *buffers_skipped)
Definition: bufmgr.c:6680
int io_max_combine_limit
Definition: bufmgr.c:172
static void FlushUnlockedBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object, IOContext io_context)
Definition: bufmgr.c:4425
const ResourceOwnerDesc buffer_io_resowner_desc
Definition: bufmgr.c:235
bool zero_damaged_pages
Definition: bufmgr.c:144
#define BUF_DROP_FULL_SCAN_THRESHOLD
Definition: bufmgr.c:91
static void PinBuffer_Locked(BufferDesc *buf)
Definition: bufmgr.c:3179
void EvictRelUnpinnedBuffers(Relation rel, int32 *buffers_evicted, int32 *buffers_flushed, int32 *buffers_skipped)
Definition: bufmgr.c:6728
static pg_attribute_always_inline void buffer_readv_complete_one(PgAioTargetData *td, uint8 buf_off, Buffer buffer, uint8 flags, bool failed, bool is_temp, bool *buffer_invalid, bool *failed_checksum, bool *ignored_checksum, bool *zeroed_buffer)
Definition: bufmgr.c:7033
static int buffertag_comparator(const BufferTag *ba, const BufferTag *bb)
Definition: bufmgr.c:6315
bool IsBufferCleanupOK(Buffer buffer)
Definition: bufmgr.c:5916
#define BufferGetLSN(bufHdr)
Definition: bufmgr.c:73
static char * ResOwnerPrintBufferIO(Datum res)
Definition: bufmgr.c:6556
Buffer ExtendBufferedRel(BufferManagerRelation bmr, ForkNumber forkNum, BufferAccessStrategy strategy, uint32 flags)
Definition: bufmgr.c:845
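ExtendBufferedRel() and its variants replaced the old P_NEW convention for adding pages. A minimal sketch, assuming the caller may take the relation extension lock (no EB_SKIP_EXTENSION_LOCK) and wants the new page handed back exclusively locked; the helper name add_new_page() is hypothetical, and a real caller of a logged relation would WAL-log the initialized page inside a critical section.

#include "postgres.h"
#include "common/relpath.h"
#include "storage/bufmgr.h"
#include "storage/bufpage.h"

/* Illustrative only: append one zeroed page to the main fork and init it. */
static BlockNumber
add_new_page(Relation rel)
{
    Buffer      buf;
    Page        page;
    BlockNumber blkno;

    /* EB_LOCK_FIRST returns the new buffer already exclusive-locked. */
    buf = ExtendBufferedRel(BMR_REL(rel), MAIN_FORKNUM, NULL, EB_LOCK_FIRST);
    blkno = BufferGetBlockNumber(buf);

    page = BufferGetPage(buf);
    PageInit(page, BufferGetPageSize(buf), 0);
    MarkBufferDirty(buf);

    /* WAL logging (e.g. log_newpage_buffer) would go here for logged rels. */

    UnlockReleaseBuffer(buf);
    return blkno;
}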
void AtEOXact_Buffers(bool isCommit)
Definition: bufmgr.c:3996
static void AbortBufferIO(Buffer buffer)
Definition: bufmgr.c:6163
const PgAioHandleCallbacks aio_shared_buffer_readv_cb
Definition: bufmgr.c:7439
BlockNumber ExtendBufferedRelBy(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, Buffer *buffers, uint32 *extended_by)
Definition: bufmgr.c:877
static Buffer ReadBuffer_common(Relation rel, SMgrRelation smgr, char smgr_persistence, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
Definition: bufmgr.c:1180
static void ProcessReadBuffersResult(ReadBuffersOperation *operation)
Definition: bufmgr.c:1580
static void ZeroAndLockBuffer(Buffer buffer, ReadBufferMode mode, bool already_valid)
Definition: bufmgr.c:1018
static BufferDesc * BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, bool *foundPtr, IOContext io_context)
Definition: bufmgr.c:1987
static void CheckForBufferLeaks(void)
Definition: bufmgr.c:4065
static bool ReadBuffersCanStartIOOnce(Buffer buffer, bool nowait)
Definition: bufmgr.c:1538
void CreateAndCopyRelationData(RelFileLocator src_rlocator, RelFileLocator dst_rlocator, bool permanent)
Definition: bufmgr.c:5252
void DropRelationsAllBuffers(SMgrRelation *smgr_reln, int nlocators)
Definition: bufmgr.c:4672
static int rlocator_comparator(const void *p1, const void *p2)
Definition: bufmgr.c:6234
Buffer ExtendBufferedRelTo(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, BlockNumber extend_to, ReadBufferMode mode)
Definition: bufmgr.c:909
struct SMgrSortArray SMgrSortArray
const PgAioHandleCallbacks aio_local_buffer_readv_cb
Definition: bufmgr.c:7448
static bool InvalidateVictimBuffer(BufferDesc *buf_hdr)
Definition: bufmgr.c:2252
static void AtProcExit_Buffers(int code, Datum arg)
Definition: bufmgr.c:4047
int io_combine_limit_guc
Definition: bufmgr.c:171
static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg)
Definition: bufmgr.c:6380
void BufferGetTag(Buffer buffer, RelFileLocator *rlocator, ForkNumber *forknum, BlockNumber *blknum)
Definition: bufmgr.c:4250
#define BufHdrGetBlock(bufHdr)
Definition: bufmgr.c:72
static pg_attribute_always_inline void buffer_stage_common(PgAioHandle *ioh, bool is_write, bool is_temp)
Definition: bufmgr.c:6787
#define BUF_REUSABLE
Definition: bufmgr.c:81
static void local_buffer_write_error_callback(void *arg)
Definition: bufmgr.c:6218
static void BufferSync(int flags)
Definition: bufmgr.c:3352
static bool AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress)
Definition: bufmgr.c:1751
static void local_buffer_readv_stage(PgAioHandle *ioh, uint8 cb_data)
Definition: bufmgr.c:7426
char * DebugPrintBufferRefcount(Buffer buffer)
Definition: bufmgr.c:4172
static char * ResOwnerPrintBufferPin(Datum res)
Definition: bufmgr.c:6579
void CheckPointBuffers(int flags)
Definition: bufmgr.c:4215
bool BufferIsDirty(Buffer buffer)
Definition: bufmgr.c:2910
static uint32 MaxProportionalPins
Definition: bufmgr.c:221
static BlockNumber ExtendBufferedRelShared(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
Definition: bufmgr.c:2575
bool BgBufferSync(WritebackContext *wb_context)
Definition: bufmgr.c:3628
static void WakePinCountWaiter(BufferDesc *buf)
Definition: bufmgr.c:3218
bool BufferIsPermanent(Buffer buffer)
Definition: bufmgr.c:4474
#define REFCOUNT_ARRAY_ENTRIES
Definition: bufmgr.c:100
static void shared_buffer_readv_stage(PgAioHandle *ioh, uint8 cb_data)
Definition: bufmgr.c:7375
void UnlockBuffers(void)
Definition: bufmgr.c:5582
PrefetchBufferResult PrefetchSharedBuffer(SMgrRelation smgr_reln, ForkNumber forkNum, BlockNumber blockNum)
Definition: bufmgr.c:563
static PgAioResult shared_buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
Definition: bufmgr.c:7381
static Buffer GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
Definition: bufmgr.c:2320
bool ConditionalLockBuffer(Buffer buffer)
Definition: bufmgr.c:5636
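ConditionalLockBuffer() is the non-blocking counterpart of LockBuffer(..., BUFFER_LOCK_EXCLUSIVE). A sketch of the usual try-then-fall-back shape, with a hypothetical helper name:

#include "postgres.h"
#include "storage/bufmgr.h"

/*
 * Illustrative only: take the exclusive content lock on an already-pinned
 * buffer, preferring not to sleep if the lock is immediately available.
 */
static void
lock_buffer_opportunistically(Buffer buf)
{
    if (ConditionalLockBuffer(buf))
        return;                 /* got the exclusive lock without waiting */

    /* Could defer, batch, or reorder other work here before blocking ... */

    LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
}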
BlockNumber RelationGetNumberOfBlocksInFork(Relation relation, ForkNumber forkNum)
Definition: bufmgr.c:4442
int bgwriter_flush_after
Definition: bufmgr.c:179
void ReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:5376
pg_noinline uint32 WaitBufHdrUnlocked(BufferDesc *buf)
Definition: bufmgr.c:6291
bool BufferIsLockedByMe(Buffer buffer)
Definition: bufmgr.c:2842
static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy, bool skip_if_not_valid)
Definition: bufmgr.c:3063
static void FindAndDropRelationBuffers(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber nForkBlock, BlockNumber firstDelBlock)
Definition: bufmgr.c:4843
XLogRecPtr BufferGetLSNAtomic(Buffer buffer)
Definition: bufmgr.c:4504
bool HoldingBufferPinThatDelaysRecovery(void)
Definition: bufmgr.c:5832
int checkpoint_flush_after
Definition: bufmgr.c:178
void UnlockReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:5393
static pg_attribute_always_inline Buffer PinBufferForBlock(Relation rel, SMgrRelation smgr, char persistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, bool *foundPtr)
Definition: bufmgr.c:1097
void TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits, bool forget_owner, bool release_aio)
Definition: bufmgr.c:6104
static void UnpinBufferNoOwner(BufferDesc *buf)
Definition: bufmgr.c:3262
static void shared_buffer_write_error_callback(void *arg)
Definition: bufmgr.c:6202
void ScheduleBufferTagForWriteback(WritebackContext *wb_context, IOContext io_context, BufferTag *tag)
Definition: bufmgr.c:6415
void WaitReadBuffers(ReadBuffersOperation *operation)
Definition: bufmgr.c:1619
void WritebackContextInit(WritebackContext *context, int *max_pending)
Definition: bufmgr.c:6403
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:2942
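MarkBufferDirty() must be called while holding both a pin and the exclusive content lock, inside the critical section that covers the page change. A minimal sketch assuming a change that needs no WAL record; callers modifying a logged relation would insert WAL and call PageSetLSN() before leaving the critical section. The helper name init_if_new() is hypothetical.

#include "postgres.h"
#include "miscadmin.h"
#include "storage/bufmgr.h"
#include "storage/bufpage.h"

/* Illustrative only: initialize a page that turns out to be brand new. */
static void
init_if_new(Relation rel, BlockNumber blkno)
{
    Buffer      buf;
    Page        page;

    buf = ReadBuffer(rel, blkno);
    LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

    page = BufferGetPage(buf);
    if (PageIsNew(page))
    {
        START_CRIT_SECTION();
        PageInit(page, BufferGetPageSize(buf), 0);
        MarkBufferDirty(buf);
        /* WAL insertion and PageSetLSN() would go here for logged relations */
        END_CRIT_SECTION();
    }

    UnlockReleaseBuffer(buf);
}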
#define BufferIsPinned(bufnum)
Definition: bufmgr.c:483
double bgwriter_lru_multiplier
Definition: bufmgr.c:146
static bool EvictUnpinnedBufferInternal(BufferDesc *desc, bool *buffer_flushed)
Definition: bufmgr.c:6589
int backend_flush_after
Definition: bufmgr.c:180
void LimitAdditionalPins(uint32 *additional_pins)
Definition: bufmgr.c:2513
static void buffer_readv_report(PgAioResult result, const PgAioTargetData *td, int elevel)
Definition: bufmgr.c:7280
static void ReservePrivateRefCountEntry(void)
Definition: bufmgr.c:259
static BufferDesc * PinCountWaitBuf
Definition: bufmgr.c:183
static int32 GetPrivateRefCount(Buffer buffer)
Definition: bufmgr.c:425
static BlockNumber ExtendBufferedRelCommon(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
Definition: bufmgr.c:2531
void LockBufferForCleanup(Buffer buffer)
Definition: bufmgr.c:5690
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:5610
static PrivateRefCountEntry * ReservedRefCountEntry
Definition: bufmgr.c:219
void MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
Definition: bufmgr.c:5440
void FlushRelationBuffers(Relation rel)
Definition: bufmgr.c:4952
#define READV_COUNT_BITS
void IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context)
Definition: bufmgr.c:6465
static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref)
Definition: bufmgr.c:448
bool EvictUnpinnedBuffer(Buffer buf, bool *buffer_flushed)
Definition: bufmgr.c:6651
Buffer ReadBufferWithoutRelcache(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy, bool permanent)
Definition: bufmgr.c:829
bool ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum, Buffer recent_buffer)
Definition: bufmgr.c:684
#define RELS_BSEARCH_THRESHOLD
Definition: bufmgr.c:83
int maintenance_io_concurrency
Definition: bufmgr.c:162
static void UnpinBuffer(BufferDesc *buf)
Definition: bufmgr.c:3253
void FlushDatabaseBuffers(Oid dbid)
Definition: bufmgr.c:5316
static void InvalidateBuffer(BufferDesc *buf)
Definition: bufmgr.c:2158
static void RelationCopyStorageUsingBuffer(RelFileLocator srclocator, RelFileLocator dstlocator, ForkNumber forkNum, bool permanent)
Definition: bufmgr.c:5138
int effective_io_concurrency
Definition: bufmgr.c:155
static PrivateRefCountEntry * GetPrivateRefCountEntry(Buffer buffer, bool do_move)
Definition: bufmgr.c:351
bool StartBufferIO(BufferDesc *buf, bool forInput, bool nowait)
Definition: bufmgr.c:6047
struct PrivateRefCountEntry PrivateRefCountEntry
struct CkptTsStatus CkptTsStatus
bool StartReadBuffer(ReadBuffersOperation *operation, Buffer *buffer, BlockNumber blocknum, int flags)
Definition: bufmgr.c:1495
Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
Definition: bufmgr.c:792
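ReadBufferExtended() is the workhorse behind ReadBuffer(); the extra arguments select the fork, the read mode, and an optional ring-buffer access strategy so that a large scan does not evict the entire shared buffer pool. A sketch of a bulk-read loop, where the page_callback type and process_page argument are hypothetical stand-ins for the per-page work:

#include "postgres.h"
#include "common/relpath.h"
#include "miscadmin.h"
#include "storage/bufmgr.h"

typedef void (*page_callback) (Page page);

/* Illustrative only: visit every main-fork page using a BAS_BULKREAD ring. */
static void
scan_relation_pages(Relation rel, page_callback process_page)
{
    BufferAccessStrategy strategy = GetAccessStrategy(BAS_BULKREAD);
    BlockNumber nblocks = RelationGetNumberOfBlocks(rel);
    BlockNumber blkno;

    for (blkno = 0; blkno < nblocks; blkno++)
    {
        Buffer      buf;

        CHECK_FOR_INTERRUPTS();

        buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
                                 strategy);
        LockBuffer(buf, BUFFER_LOCK_SHARE);
        process_page(BufferGetPage(buf));
        UnlockReleaseBuffer(buf);
    }

    FreeAccessStrategy(strategy);
}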
uint32 LockBufHdr(BufferDesc *desc)
Definition: bufmgr.c:6261
static void ResOwnerReleaseBufferPin(Datum res)
Definition: bufmgr.c:6564
static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES]
Definition: bufmgr.c:215
static void buffer_readv_decode_error(PgAioResult result, bool *zeroed_any, bool *ignored_any, uint8 *zeroed_or_error_count, uint8 *checkfail_count, uint8 *first_off)
Definition: bufmgr.c:6896
#define READV_COUNT_MASK
int io_combine_limit
Definition: bufmgr.c:170
void InitBufferManagerAccess(void)
Definition: bufmgr.c:4013
static void buffer_readv_encode_error(PgAioResult *result, bool is_temp, bool zeroed_any, bool ignored_any, uint8 error_count, uint8 zeroed_count, uint8 checkfail_count, uint8 first_error_off, uint8 first_zeroed_off, uint8 first_ignored_off)
Definition: bufmgr.c:6938
static int SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
Definition: bufmgr.c:3926
uint32 GetAdditionalPinLimit(void)
Definition: bufmgr.c:2487
Buffer ReadBuffer(Relation reln, BlockNumber blockNum)
Definition: bufmgr.c:745
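ReadBuffer() only pins the page; the caller still has to take a content lock before inspecting it and must release both afterwards. The canonical pin/lock/inspect/release shape looks roughly like the sketch below (page_is_empty() is a hypothetical helper name):

#include "postgres.h"
#include "storage/bufmgr.h"
#include "storage/bufpage.h"

/* Illustrative only: pin, share-lock, inspect, and release one page. */
static bool
page_is_empty(Relation rel, BlockNumber blkno)
{
    Buffer      buf;
    Page        page;
    bool        result;

    buf = ReadBuffer(rel, blkno);       /* pin; performs I/O if not cached */
    LockBuffer(buf, BUFFER_LOCK_SHARE); /* content lock for reading */

    page = BufferGetPage(buf);
    result = PageIsNew(page) || PageIsEmpty(page);

    UnlockReleaseBuffer(buf);           /* drop lock and pin together */
    return result;
}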
void TrackNewBufferPin(Buffer buf)
Definition: bufmgr.c:3324
static HTAB * PrivateRefCountHash
Definition: bufmgr.c:216
static int32 PrivateRefCountOverflowed
Definition: bufmgr.c:217
bool ConditionalLockBufferForCleanup(Buffer buffer)
Definition: bufmgr.c:5858
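ConditionalLockBufferForCleanup() is how lazy VACUUM avoids stalling behind concurrent pins: the caller must already hold exactly one pin, and on success it ends up with a cleanup-strength exclusive lock. A sketch of that skip-if-busy pattern, where prune_page() is a hypothetical stand-in for the per-page work:

#include "postgres.h"
#include "common/relpath.h"
#include "storage/bufmgr.h"

extern void prune_page(Relation rel, Buffer buf);   /* hypothetical */

/* Illustrative only: clean one page, skipping it if someone else has a pin. */
static bool
try_cleanup_page(Relation rel, BlockNumber blkno, BufferAccessStrategy strategy)
{
    Buffer      buf;

    buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, strategy);

    if (!ConditionalLockBufferForCleanup(buf))
    {
        ReleaseBuffer(buf);     /* another backend holds a pin; retry later */
        return false;
    }

    prune_page(rel, buf);       /* cleanup lock held: safe to defragment */

    UnlockReleaseBuffer(buf);
    return true;
}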
int bgwriter_lru_maxpages
Definition: bufmgr.c:145
uint32 GetPinLimit(void)
Definition: bufmgr.c:2475
static void WaitIO(BufferDesc *buf)
Definition: bufmgr.c:5968
#define BUF_WRITTEN
Definition: bufmgr.c:80
void FlushOneBuffer(Buffer buffer)
Definition: bufmgr.c:5356
@ BAS_BULKREAD
Definition: bufmgr.h:37
@ BAS_BULKWRITE
Definition: bufmgr.h:39
#define BUFFER_LOCK_UNLOCK
Definition: bufmgr.h:196
#define BUFFER_LOCK_SHARE
Definition: bufmgr.h:197
#define P_NEW
Definition: bufmgr.h:191
#define READ_BUFFERS_ZERO_ON_ERROR
Definition: bufmgr.h:115
static Page BufferGetPage(Buffer buffer)
Definition: bufmgr.h:418
#define DEFAULT_IO_COMBINE_LIMIT
Definition: bufmgr.h:167
static Block BufferGetBlock(Buffer buffer)
Definition: bufmgr.h:385
#define READ_BUFFERS_ISSUE_ADVICE
Definition: bufmgr.h:117
#define MAX_IO_COMBINE_LIMIT
Definition: bufmgr.h:166
#define DEFAULT_EFFECTIVE_IO_CONCURRENCY
Definition: bufmgr.h:161
#define READ_BUFFERS_IGNORE_CHECKSUM_FAILURES
Definition: bufmgr.h:119
#define DEFAULT_MAINTENANCE_IO_CONCURRENCY
Definition: bufmgr.h:162
void * Block
Definition: bufmgr.h:26
@ EB_LOCK_TARGET
Definition: bufmgr.h:93
@ EB_CLEAR_SIZE_CACHE
Definition: bufmgr.h:90
@ EB_PERFORMING_RECOVERY
Definition: bufmgr.h:78
@ EB_CREATE_FORK_IF_NEEDED
Definition: bufmgr.h:84
@ EB_SKIP_EXTENSION_LOCK
Definition: bufmgr.h:75
@ EB_LOCK_FIRST
Definition: bufmgr.h:87
#define READ_BUFFERS_SYNCHRONOUSLY
Definition: bufmgr.h:121
#define BUFFER_LOCK_EXCLUSIVE
Definition: bufmgr.h:198
ReadBufferMode
Definition: bufmgr.h:45
@ RBM_ZERO_ON_ERROR
Definition: bufmgr.h:51
@ RBM_ZERO_AND_CLEANUP_LOCK
Definition: bufmgr.h:49
@ RBM_ZERO_AND_LOCK
Definition: bufmgr.h:47
@ RBM_NORMAL
Definition: bufmgr.h:46
#define BMR_REL(p_rel)
Definition: bufmgr.h:111
static bool BufferIsValid(Buffer bufnum)
Definition: bufmgr.h:369
bool ignore_checksum_failure
Definition: bufpage.c:27
char * PageSetChecksumCopy(Page page, BlockNumber blkno)
Definition: bufpage.c:1509
bool PageIsVerified(PageData *page, BlockNumber blkno, int flags, bool *checksum_failure_p)
Definition: bufpage.c:94
#define PIV_LOG_LOG
Definition: bufpage.h:469
static bool PageIsNew(const PageData *page)
Definition: bufpage.h:234
static void PageSetLSN(Page page, XLogRecPtr lsn)
Definition: bufpage.h:391
PageData * Page
Definition: bufpage.h:82
static XLogRecPtr PageGetLSN(const PageData *page)
Definition: bufpage.h:386
#define PIV_IGNORE_CHECKSUM_FAILURE
Definition: bufpage.h:470
#define pg_noinline
Definition: c.h:285
#define likely(x)
Definition: c.h:401
uint8_t uint8
Definition: c.h:536
#define PG_USED_FOR_ASSERTS_ONLY
Definition: c.h:223
#define Max(x, y)
Definition: c.h:997
double float8
Definition: c.h:635
#define pg_attribute_always_inline
Definition: c.h:269
int16_t int16
Definition: c.h:533
int32_t int32
Definition: c.h:534
uint64_t uint64
Definition: c.h:539
#define pg_unreachable()
Definition: c.h:331
#define unlikely(x)
Definition: c.h:402
uint32_t uint32
Definition: c.h:538
#define lengthof(array)
Definition: c.h:787
#define MemSet(start, val, len)
Definition: c.h:1019
#define StaticAssertStmt(condition, errmessage)
Definition: c.h:937
size_t Size
Definition: c.h:610
bool IsCatalogRelationOid(Oid relid)
Definition: catalog.c:121
bool IsCatalogTextUniqueIndexOid(Oid relid)
Definition: catalog.c:156
void CheckpointWriteDelay(int flags, double progress)
Definition: checkpointer.c:785
bool ConditionVariableCancelSleep(void)
void ConditionVariableBroadcast(ConditionVariable *cv)
void ConditionVariablePrepareToSleep(ConditionVariable *cv)
void ConditionVariableSleep(ConditionVariable *cv, uint32 wait_event_info)
int64 TimestampTz
Definition: timestamp.h:39
void * hash_search(HTAB *hashp, const void *keyPtr, HASHACTION action, bool *foundPtr)
Definition: dynahash.c:952
HTAB * hash_create(const char *tabname, int64 nelem, const HASHCTL *info, int flags)
Definition: dynahash.c:358
void * hash_seq_search(HASH_SEQ_STATUS *status)
Definition: dynahash.c:1415
void hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
Definition: dynahash.c:1380
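bufmgr.c uses a backend-local dynahash table (PrivateRefCountHash) as the overflow store for per-buffer reference counts once the small fixed array is full. A minimal sketch of the same hash_create()/hash_search() pattern, with hypothetical names, mapping a Buffer to a counter:

#include "postgres.h"
#include "storage/buf.h"
#include "utils/hsearch.h"

typedef struct RefCountEntry
{
    Buffer      buffer;         /* hash key; must be the first field */
    int32       refcount;
} RefCountEntry;

static HTAB *refcount_hash = NULL;

/* Illustrative only: create the table lazily and bump an entry's count. */
static void
bump_refcount(Buffer buffer)
{
    RefCountEntry *entry;
    bool        found;

    if (refcount_hash == NULL)
    {
        HASHCTL     ctl;

        ctl.keysize = sizeof(Buffer);
        ctl.entrysize = sizeof(RefCountEntry);
        refcount_hash = hash_create("example refcount table", 64, &ctl,
                                    HASH_ELEM | HASH_BLOBS);
    }

    entry = hash_search(refcount_hash, &buffer, HASH_ENTER, &found);
    if (!found)
        entry->refcount = 0;
    entry->refcount++;
}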
struct cursor * cur
Definition: ecpg.c:29
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1161
int errdetail_internal(const char *fmt,...)
Definition: elog.c:1234
int errdetail(const char *fmt,...)
Definition: elog.c:1207
ErrorContextCallback * error_context_stack
Definition: elog.c:95
int errhint_internal(const char *fmt,...)
Definition: elog.c:1343
int errcode(int sqlerrcode)
Definition: elog.c:854
int errmsg(const char *fmt,...)
Definition: elog.c:1071
int errhint_plural(const char *fmt_singular, const char *fmt_plural, unsigned long n,...)
Definition: elog.c:1364
#define _(x)
Definition: elog.c:91
#define errcontext
Definition: elog.h:198
#define DEBUG3
Definition: elog.h:28
#define LOG_SERVER_ONLY
Definition: elog.h:32
#define WARNING
Definition: elog.h:36
#define DEBUG2
Definition: elog.h:29
#define DEBUG1
Definition: elog.h:30
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:226
#define ereport(elevel,...)
Definition: elog.h:150
int io_direct_flags
Definition: fd.c:168
#define IO_DIRECT_DATA
Definition: fd.h:54
int StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
Definition: freelist.c:321
BufferAccessStrategy GetAccessStrategy(BufferAccessStrategyType btype)
Definition: freelist.c:461
void FreeAccessStrategy(BufferAccessStrategy strategy)
Definition: freelist.c:643
IOContext IOContextForStrategy(BufferAccessStrategy strategy)
Definition: freelist.c:746
BufferDesc * StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_ring)
Definition: freelist.c:174
bool StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_ring)
Definition: freelist.c:786
volatile sig_atomic_t ProcSignalBarrierPending
Definition: globals.c:40
int NBuffers
Definition: globals.c:142
bool enableFsync
Definition: globals.c:129
ProcNumber MyProcNumber
Definition: globals.c:90
int VacuumCostPageMiss
Definition: globals.c:152
bool VacuumCostActive
Definition: globals.c:158
int VacuumCostBalance
Definition: globals.c:157
int MaxBackends
Definition: globals.c:146
int VacuumCostPageDirty
Definition: globals.c:153
int VacuumCostPageHit
Definition: globals.c:151
Assert(PointerIsAligned(start, uint64))
#define free(a)
Definition: header.h:65
@ HASH_FIND
Definition: hsearch.h:113
@ HASH_REMOVE
Definition: hsearch.h:115
@ HASH_ENTER
Definition: hsearch.h:114
#define HASH_ELEM
Definition: hsearch.h:95
#define HASH_BLOBS
Definition: hsearch.h:97
BufferUsage pgBufferUsage
Definition: instrument.c:20
void on_shmem_exit(pg_on_exit_callback function, Datum arg)
Definition: ipc.c:365
int b
Definition: isn.c:74
int a
Definition: isn.c:73
int j
Definition: isn.c:78
int i
Definition: isn.c:77
if(TABLE==NULL||TABLE_index==NULL)
Definition: isn.c:81
void LockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition: lmgr.c:424
void UnlockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition: lmgr.c:474
int32 * LocalRefCount
Definition: localbuf.c:48
void FlushLocalBuffer(BufferDesc *bufHdr, SMgrRelation reln)
Definition: localbuf.c:182
void UnpinLocalBuffer(Buffer buffer)
Definition: localbuf.c:839
bool StartLocalBufferIO(BufferDesc *bufHdr, bool forInput, bool nowait)
Definition: localbuf.c:521
void AtEOXact_LocalBuffers(bool isCommit)
Definition: localbuf.c:1001
void AtProcExit_LocalBuffers(void)
Definition: localbuf.c:1012
bool PinLocalBuffer(BufferDesc *buf_hdr, bool adjust_usagecount)
Definition: localbuf.c:803
void MarkLocalBufferDirty(Buffer buffer)
Definition: localbuf.c:489
void DropRelationAllLocalBuffers(RelFileLocator rlocator)
Definition: localbuf.c:700
void TerminateLocalBufferIO(BufferDesc *bufHdr, bool clear_dirty, uint32 set_flag_bits, bool release_aio)
Definition: localbuf.c:560
int NLocBuffer
Definition: localbuf.c:44
PrefetchBufferResult PrefetchLocalBuffer(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum)
Definition: localbuf.c:71
BlockNumber ExtendBufferedRelLocal(BufferManagerRelation bmr, ForkNumber fork, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
Definition: localbuf.c:345
void UnpinLocalBufferNoOwner(Buffer buffer)
Definition: localbuf.c:846
void DropRelationLocalBuffers(RelFileLocator rlocator, ForkNumber *forkNum, int nforks, BlockNumber *firstDelBlock)
Definition: localbuf.c:663
BufferDesc * LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, bool *foundPtr)
Definition: localbuf.c:118
#define ExclusiveLock
Definition: lockdefs.h:42
bool LWLockHeldByMe(LWLock *lock)
Definition: lwlock.c:1977
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1174
void LWLockDisown(LWLock *lock)
Definition: lwlock.c:1883
bool LWLockHeldByMeInMode(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:2021
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1894
bool LWLockConditionalAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1345
void ForEachLWLockHeldByMe(void(*callback)(LWLock *, LWLockMode, void *), void *context)
Definition: lwlock.c:1962
LWLockMode
Definition: lwlock.h:111
@ LW_SHARED
Definition: lwlock.h:113
@ LW_EXCLUSIVE
Definition: lwlock.h:112
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1610
void pfree(void *pointer)
Definition: mcxt.c:1594
void * palloc(Size size)
Definition: mcxt.c:1365
#define VALGRIND_MAKE_MEM_DEFINED(addr, size)
Definition: memdebug.h:26
#define VALGRIND_MAKE_MEM_NOACCESS(addr, size)
Definition: memdebug.h:27
#define START_CRIT_SECTION()
Definition: miscadmin.h:149
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:122
#define END_CRIT_SECTION()
Definition: miscadmin.h:151
void * arg
#define ERRCODE_DATA_CORRUPTED
Definition: pg_basebackup.c:42
static PgChecksumMode mode
Definition: pg_checksums.c:55
static int64 current_size
Definition: pg_checksums.c:63
#define WRITEBACK_MAX_PENDING_FLUSHES
#define DEFAULT_BACKEND_FLUSH_AFTER
#define DEFAULT_CHECKPOINT_FLUSH_AFTER
#define DEFAULT_BGWRITER_FLUSH_AFTER
#define PG_IOV_MAX
Definition: pg_iovec.h:47
static char * buf
Definition: pg_test_fsync.c:72
IOObject
Definition: pgstat.h:274
@ IOOBJECT_RELATION
Definition: pgstat.h:275
@ IOOBJECT_TEMP_RELATION
Definition: pgstat.h:276
#define pgstat_count_buffer_read(rel)
Definition: pgstat.h:709
IOContext
Definition: pgstat.h:283
@ IOCONTEXT_NORMAL
Definition: pgstat.h:287
@ IOOP_EXTEND
Definition: pgstat.h:312
@ IOOP_READ
Definition: pgstat.h:313
@ IOOP_WRITEBACK
Definition: pgstat.h:309
@ IOOP_HIT
Definition: pgstat.h:307
@ IOOP_EVICT
Definition: pgstat.h:305
@ IOOP_REUSE
Definition: pgstat.h:308
@ IOOP_WRITE
Definition: pgstat.h:314
#define pgstat_count_buffer_hit(rel)
Definition: pgstat.h:714
PgStat_BgWriterStats PendingBgWriterStats
PgStat_CheckpointerStats PendingCheckpointerStats
void pgstat_prepare_report_checksum_failure(Oid dboid)
void pgstat_report_checksum_failures_in_db(Oid dboid, int failurecount)
instr_time pgstat_prepare_io_time(bool track_io_guc)
Definition: pgstat_io.c:91
void pgstat_count_io_op(IOObject io_object, IOContext io_context, IOOp io_op, uint32 cnt, uint64 bytes)
Definition: pgstat_io.c:68
void pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op, instr_time start_time, uint32 cnt, uint64 bytes)
Definition: pgstat_io.c:122
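The pgstat_prepare_io_time()/pgstat_count_io_op_time() pair is how buffer I/O gets attributed to pg_stat_io, honoring track_io_timing. A hedged sketch of instrumenting a single block write; do_write_block() is a hypothetical stand-in for the actual smgrwrite() call and its arguments:

#include "postgres.h"
#include "pgstat.h"
#include "storage/bufmgr.h"

extern void do_write_block(void);   /* hypothetical: performs the real I/O */

/* Illustrative only: time one block write and report it to pg_stat_io. */
static void
count_one_write(IOContext io_context)
{
    instr_time  io_start;

    io_start = pgstat_prepare_io_time(track_io_timing);

    do_write_block();

    pgstat_count_io_op_time(IOOBJECT_RELATION, io_context, IOOP_WRITE,
                            io_start, 1, BLCKSZ);
}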
#define qsort(a, b, c, d)
Definition: port.h:479
static Datum PointerGetDatum(const void *X)
Definition: postgres.h:332
uint64_t Datum
Definition: postgres.h:70
static Pointer DatumGetPointer(Datum X)
Definition: postgres.h:322
static int32 DatumGetInt32(Datum X)
Definition: postgres.h:212
#define InvalidOid
Definition: postgres_ext.h:37
unsigned int Oid
Definition: postgres_ext.h:32
#define NUM_AUXILIARY_PROCS
Definition: proc.h:463
#define DELAY_CHKPT_START
Definition: proc.h:135
#define INVALID_PROC_NUMBER
Definition: procnumber.h:26
int ProcNumber
Definition: procnumber.h:24
void ProcessProcSignalBarrier(void)
Definition: procsignal.c:499
@ PROCSIG_RECOVERY_CONFLICT_BUFFERPIN
Definition: procsignal.h:47
void set_ps_display_remove_suffix(void)
Definition: ps_status.c:439
void set_ps_display_suffix(const char *suffix)
Definition: ps_status.c:387
char * psprintf(const char *fmt,...)
Definition: psprintf.c:43
ReadStream * read_stream_begin_smgr_relation(int flags, BufferAccessStrategy strategy, SMgrRelation smgr, char smgr_persistence, ForkNumber forknum, ReadStreamBlockNumberCB callback, void *callback_private_data, size_t per_buffer_data_size)
Definition: read_stream.c:761
Buffer read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
Definition: read_stream.c:791
void read_stream_end(ReadStream *stream)
Definition: read_stream.c:1089
BlockNumber block_range_read_stream_cb(ReadStream *stream, void *callback_private_data, void *per_buffer_data)
Definition: read_stream.c:162
#define READ_STREAM_USE_BATCHING
Definition: read_stream.h:64
#define READ_STREAM_FULL
Definition: read_stream.h:43
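StartReadBuffers()/WaitReadBuffers() are usually driven indirectly through the read stream layer, which combines neighboring blocks and issues reads ahead of the consumer. A sketch that streams a block range of a relation, assuming read_stream_begin_relation() and BlockRangeReadStreamPrivate from read_stream.h (the Relation-based siblings of the smgr variant listed above) together with the exported block_range_read_stream_cb() callback:

#include "postgres.h"
#include "common/relpath.h"
#include "storage/bufmgr.h"
#include "storage/read_stream.h"

/* Illustrative only: visit every buffer in [first, last_exclusive). */
static void
stream_block_range(Relation rel, BlockNumber first, BlockNumber last_exclusive)
{
    BlockRangeReadStreamPrivate p;
    ReadStream *stream;
    Buffer      buf;

    p.current_blocknum = first;
    p.last_exclusive = last_exclusive;

    stream = read_stream_begin_relation(READ_STREAM_FULL,
                                        NULL,   /* default strategy */
                                        rel,
                                        MAIN_FORKNUM,
                                        block_range_read_stream_cb,
                                        &p,
                                        0);     /* no per-buffer data */

    while ((buf = read_stream_next_buffer(stream, NULL)) != InvalidBuffer)
    {
        /* buffer is pinned here; lock, inspect, and release as usual */
        ReleaseBuffer(buf);
    }

    read_stream_end(stream);
}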
static unsigned hash(unsigned *uv, int n)
Definition: rege_dfa.c:715
static SMgrRelation RelationGetSmgr(Relation rel)
Definition: rel.h:576
#define RelationUsesLocalBuffers(relation)
Definition: rel.h:646
#define RELATION_IS_OTHER_TEMP(relation)
Definition: rel.h:667
#define RelationIsValid(relation)
Definition: rel.h:489
#define RelFileLocatorBackendIsTemp(rlocator)
#define RelFileLocatorEquals(locator1, locator2)
ForkNumber
Definition: relpath.h:56
@ MAIN_FORKNUM
Definition: relpath.h:58
@ INIT_FORKNUM
Definition: relpath.h:61
#define MAX_FORKNUM
Definition: relpath.h:70
#define relpath(rlocator, forknum)
Definition: relpath.h:150
#define relpathbackend(rlocator, backend, forknum)
Definition: relpath.h:141
#define relpathperm(rlocator, forknum)
Definition: relpath.h:146
ResourceOwner CurrentResourceOwner
Definition: resowner.c:173
void ResourceOwnerEnlarge(ResourceOwner owner)
Definition: resowner.c:449
#define RELEASE_PRIO_BUFFER_IOS
Definition: resowner.h:62
@ RESOURCE_RELEASE_BEFORE_LOCKS
Definition: resowner.h:54
#define RELEASE_PRIO_BUFFER_PINS
Definition: resowner.h:63
void perform_spin_delay(SpinDelayStatus *status)
Definition: s_lock.c:126
void finish_spin_delay(SpinDelayStatus *status)
Definition: s_lock.c:186
#define init_local_spin_delay(status)
Definition: s_lock.h:733
BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:819
void smgrstartreadv(PgAioHandle *ioh, SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void **buffers, BlockNumber nblocks)
Definition: smgr.c:753
void smgrwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks)
Definition: smgr.c:805
SMgrRelation smgropen(RelFileLocator rlocator, ProcNumber backend)
Definition: smgr.c:240
void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
Definition: smgr.c:481
BlockNumber smgrnblocks_cached(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:847
uint32 smgrmaxcombine(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
Definition: smgr.c:697
void smgrzeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks, bool skipFsync)
Definition: smgr.c:649
void smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync)
Definition: smgr.c:620
bool smgrexists(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:462
bool smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks)
Definition: smgr.c:678
static void smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync)
Definition: smgr.h:131
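RelationGetSmgr() should be called afresh each time it is needed, because the returned SMgrRelation can be closed by relcache invalidation at any interruptible point. The size probe that bufmgr.c performs then reduces to a one-liner, sketched here with a hypothetical helper name:

#include "postgres.h"
#include "common/relpath.h"
#include "storage/smgr.h"
#include "utils/rel.h"

/* Illustrative only: physical size of the main fork, in blocks. */
static BlockNumber
main_fork_size(Relation rel)
{
    /* Do not cache the SMgrRelation pointer across interruptible calls. */
    return smgrnblocks(RelationGetSmgr(rel), MAIN_FORKNUM);
}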
void ProcSendSignal(ProcNumber procNumber)
Definition: proc.c:1986
PGPROC * MyProc
Definition: proc.c:66
int GetStartupBufferPinWaitBufId(void)
Definition: proc.c:766
int DeadlockTimeout
Definition: proc.c:57
void SetStartupBufferPinWaitBufId(int bufid)
Definition: proc.c:754
void ProcWaitForSignal(uint32 wait_event_info)
Definition: proc.c:1974
void ResolveRecoveryConflictWithBufferPin(void)
Definition: standby.c:793
bool log_recovery_conflict_waits
Definition: standby.c:42
void LogRecoveryConflict(ProcSignalReason reason, TimestampTz wait_start, TimestampTz now, VirtualTransactionId *wait_list, bool still_waiting)
Definition: standby.c:274
bool RelFileLocatorSkippingWAL(RelFileLocator rlocator)
Definition: storage.c:573
SMgrRelation RelationCreateStorage(RelFileLocator rlocator, char relpersistence, bool register_delete)
Definition: storage.c:122
void log_smgrcreate(const RelFileLocator *rlocator, ForkNumber forkNum)
Definition: storage.c:187
int wait_backend_pgprocno
BufferTag tag
pg_atomic_uint32 state
PgAioWaitRef io_wref
SMgrRelation smgr
Definition: bufmgr.h:107
int64 shared_blks_dirtied
Definition: instrument.h:28
int64 local_blks_hit
Definition: instrument.h:30
int64 shared_blks_read
Definition: instrument.h:27
int64 shared_blks_written
Definition: instrument.h:29
int64 local_blks_read
Definition: instrument.h:31
int64 shared_blks_hit
Definition: instrument.h:26
int ckpt_bufs_written
Definition: xlog.h:167
ForkNumber forkNum
RelFileNumber relNumber
BlockNumber blockNum
float8 progress_slice
Definition: bufmgr.c:119
int index
Definition: bufmgr.c:127
int num_scanned
Definition: bufmgr.c:124
float8 progress
Definition: bufmgr.c:118
int num_to_scan
Definition: bufmgr.c:122
Oid tsId
Definition: bufmgr.c:109
struct ErrorContextCallback * previous
Definition: elog.h:297
void(* callback)(void *arg)
Definition: elog.h:298
Size keysize
Definition: hsearch.h:75
Size entrysize
Definition: hsearch.h:76
Definition: dynahash.c:222
Definition: lwlock.h:42
int delayChkptFlags
Definition: proc.h:257
PgAioHandleCallbackStage stage
Definition: aio.h:219
uint32 status
Definition: aio_types.h:108
uint32 error_data
Definition: aio_types.h:111
int32 result
Definition: aio_types.h:113
uint32 id
Definition: aio_types.h:105
PgAioResult result
Definition: aio_types.h:132
PgAioTargetData target_data
Definition: aio_types.h:133
PgStat_Counter buf_written_clean
Definition: pgstat.h:240
PgStat_Counter maxwritten_clean
Definition: pgstat.h:241
PgStat_Counter buf_alloc
Definition: pgstat.h:242
PgStat_Counter buffers_written
Definition: pgstat.h:264
Buffer recent_buffer
Definition: bufmgr.h:61
ForkNumber forknum
Definition: bufmgr.h:130
PgAioWaitRef io_wref
Definition: bufmgr.h:143
Buffer * buffers
Definition: bufmgr.h:138
SMgrRelation smgr
Definition: bufmgr.h:128
BufferAccessStrategy strategy
Definition: bufmgr.h:131
BlockNumber blocknum
Definition: bufmgr.h:139
PgAioReturn io_return
Definition: bufmgr.h:144
RelFileLocator locator
RelFileNumber relNumber
char str[REL_PATH_STR_MAXLEN+1]
Definition: relpath.h:123
RelFileLocator rd_locator
Definition: rel.h:57
Form_pg_class rd_rel
Definition: rel.h:111
const char * name
Definition: resowner.h:93
BlockNumber smgr_cached_nblocks[MAX_FORKNUM+1]
Definition: smgr.h:47
RelFileLocatorBackend smgr_rlocator
Definition: smgr.h:38
SMgrRelation srel
Definition: bufmgr.c:140
RelFileLocator rlocator
Definition: bufmgr.c:139
PendingWriteback pending_writebacks[WRITEBACK_MAX_PENDING_FLUSHES]
BlockNumber blockNum
RelFileNumber relNumber
ForkNumber forkNum
Oid spcOid
static uint64 table_relation_size(Relation rel, ForkNumber forkNumber)
Definition: tableam.h:1837
BlockNumber blockNum
Definition: aio_types.h:66
RelFileLocator rlocator
Definition: aio_types.h:65
struct PgAioTargetData::@125 smgr
BlockNumber nblocks
Definition: aio_types.h:67
ForkNumber forkNum
Definition: aio_types.h:68
static volatile sig_atomic_t waiting
Definition: waiteventset.c:171
bool RecoveryInProgress(void)
Definition: xlog.c:6386
bool XLogNeedsFlush(XLogRecPtr record)
Definition: xlog.c:3124
CheckpointStatsData CheckpointStats
Definition: xlog.c:210
void XLogFlush(XLogRecPtr record)
Definition: xlog.c:2780
#define CHECKPOINT_FLUSH_UNLOGGED
Definition: xlog.h:143
#define CHECKPOINT_END_OF_RECOVERY
Definition: xlog.h:140
#define CHECKPOINT_IS_SHUTDOWN
Definition: xlog.h:139
#define XLogIsNeeded()
Definition: xlog.h:109
#define XLogHintBitIsNeeded()
Definition: xlog.h:120
#define XLogRecPtrIsInvalid(r)
Definition: xlogdefs.h:29
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define InvalidXLogRecPtr
Definition: xlogdefs.h:28
XLogRecPtr XLogSaveBufferForHint(Buffer buffer, bool buffer_std)
Definition: xloginsert.c:1078
XLogRecPtr log_newpage_buffer(Buffer buffer, bool page_std)
Definition: xloginsert.c:1250
#define InHotStandby
Definition: xlogutils.h:60