PostgreSQL Source Code git master
bufmgr.c
1/*-------------------------------------------------------------------------
2 *
3 * bufmgr.c
4 * buffer manager interface routines
5 *
6 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 *
10 * IDENTIFICATION
11 * src/backend/storage/buffer/bufmgr.c
12 *
13 *-------------------------------------------------------------------------
14 */
15/*
16 * Principal entry points:
17 *
18 * ReadBuffer() -- find or create a buffer holding the requested page,
19 * and pin it so that no one can destroy it while this process
20 * is using it.
21 *
22 * StartReadBuffer() -- as above, with separate wait step
23 * StartReadBuffers() -- multiple block version
24 * WaitReadBuffers() -- second step of above
25 *
26 * ReleaseBuffer() -- unpin a buffer
27 *
28 * MarkBufferDirty() -- mark a pinned buffer's contents as "dirty".
29 * The disk write is delayed until buffer replacement or checkpoint.
30 *
31 * See also these files:
32 * freelist.c -- chooses victim for buffer replacement
33 * buf_table.c -- manages the buffer lookup table
34 */
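/*
 * Editor's illustrative sketch -- not part of bufmgr.c.  It shows the usual
 * caller-side pattern built from the entry points listed above; the function
 * name and the "rel"/"blkno" arguments are assumptions made for the example.
 */
static void
example_touch_page(Relation rel, BlockNumber blkno)
{
    Buffer      buf;

    /* find or create a buffer holding the block, and pin it */
    buf = ReadBuffer(rel, blkno);

    /* the content lock is separate from the pin */
    LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

    /* ... modify BufferGetPage(buf), normally WAL-logging the change ... */
    MarkBufferDirty(buf);

    /* drop the content lock and the pin; the disk write happens later */
    UnlockReleaseBuffer(buf);
}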
35#include "postgres.h"
36
37#include <sys/file.h>
38#include <unistd.h>
39
40#include "access/tableam.h"
41#include "access/xloginsert.h"
42#include "access/xlogutils.h"
43#include "catalog/storage.h"
44#include "catalog/storage_xlog.h"
45#include "executor/instrument.h"
46#include "lib/binaryheap.h"
47#include "miscadmin.h"
48#include "pg_trace.h"
49#include "pgstat.h"
50#include "postmaster/bgwriter.h"
51#include "storage/aio.h"
52#include "storage/buf_internals.h"
53#include "storage/bufmgr.h"
54#include "storage/fd.h"
55#include "storage/ipc.h"
56#include "storage/lmgr.h"
57#include "storage/proc.h"
58#include "storage/read_stream.h"
59#include "storage/smgr.h"
60#include "storage/standby.h"
61#include "utils/memdebug.h"
62#include "utils/ps_status.h"
63#include "utils/rel.h"
64#include "utils/resowner.h"
65#include "utils/timestamp.h"
66
67
68/* Note: these two macros only work on shared buffers, not local ones! */
69#define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
70#define BufferGetLSN(bufHdr) (PageGetLSN(BufHdrGetBlock(bufHdr)))
71
72/* Note: this macro only works on local buffers, not shared ones! */
73#define LocalBufHdrGetBlock(bufHdr) \
74 LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
75
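/*
 * Editor's note: local buffer descriptors carry negative buf_id values (the
 * descriptor at local array index i has buf_id == -i - 2), so the expression
 * -((bufHdr)->buf_id + 2) above recovers the array index i.
 */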
76/* Bits in SyncOneBuffer's return value */
77#define BUF_WRITTEN 0x01
78#define BUF_REUSABLE 0x02
79
80#define RELS_BSEARCH_THRESHOLD 20
81
82/*
83 * This is the size (in the number of blocks) above which we scan the
84 * entire buffer pool to remove the buffers for all the pages of the relation
85 * being dropped. For relations with size below this threshold, we find
86 * the buffers by doing lookups in the BufMapping table.
87 */
88#define BUF_DROP_FULL_SCAN_THRESHOLD (uint64) (NBuffers / 32)
89
90typedef struct PrivateRefCountEntry
91{
92 Buffer buffer;
93 int32 refcount;
94} PrivateRefCountEntry;
95
96/* 64 bytes, about the size of a cache line on common systems */
97#define REFCOUNT_ARRAY_ENTRIES 8
98
99/*
100 * Status of buffers to checkpoint for a particular tablespace, used
101 * internally in BufferSync.
102 */
103typedef struct CkptTsStatus
104{
105 /* oid of the tablespace */
106 Oid tsId;
107
108 /*
109 * Checkpoint progress for this tablespace. To make progress comparable
110 * between tablespaces the progress is, for each tablespace, measured as a
111 * number between 0 and the total number of to-be-checkpointed pages. Each
112 * page checkpointed in this tablespace increments this space's progress
113 * by progress_slice.
114 */
115 float8 progress;
116 float8 progress_slice;
117
118 /* number of to-be checkpointed pages in this tablespace */
119 int num_to_scan;
120 /* already processed pages in this tablespace */
121 int num_scanned;
122
123 /* current offset in CkptBufferIds for this tablespace */
124 int index;
125} CkptTsStatus;
126
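/*
 * Editor's illustrative sketch -- not part of bufmgr.c.  It spells out the
 * progress_slice arithmetic described in the struct comment; the function
 * name and the "total_to_scan" parameter are assumptions for the example.
 */
static inline void
example_advance_ckpt_progress(CkptTsStatus *ts, int total_to_scan)
{
    /* computed once per tablespace before writing starts */
    ts->progress_slice = (float8) total_to_scan / ts->num_to_scan;

    /* applied once per page checkpointed in this tablespace */
    ts->num_scanned++;
    ts->progress += ts->progress_slice;
}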
127/*
128 * Type for array used to sort SMgrRelations
129 *
130 * FlushRelationsAllBuffers shares the same comparator function with
131 * DropRelationsAllBuffers. Pointer to this struct and RelFileLocator must be
132 * compatible.
133 */
134typedef struct SMgrSortArray
135{
136 RelFileLocator rlocator; /* This must be the first member */
137 SMgrRelation srel;
138} SMgrSortArray;
139
140/* GUC variables */
141bool zero_damaged_pages = false;
142int bgwriter_lru_maxpages = 100;
143double bgwriter_lru_multiplier = 2.0;
144bool track_io_timing = false;
145
146/*
147 * How many buffers PrefetchBuffer callers should try to stay ahead of their
148 * ReadBuffer calls by. Zero means "never prefetch". This value is only used
149 * for buffers not belonging to tablespaces that have their
150 * effective_io_concurrency parameter set.
151 */
152int effective_io_concurrency = DEFAULT_EFFECTIVE_IO_CONCURRENCY;
153
154/*
155 * Like effective_io_concurrency, but used by maintenance code paths that might
156 * benefit from a higher setting because they work on behalf of many sessions.
157 * Overridden by the tablespace setting of the same name.
158 */
159int maintenance_io_concurrency = DEFAULT_MAINTENANCE_IO_CONCURRENCY;
160
161/*
162 * Limit on how many blocks should be handled in single I/O operations.
163 * StartReadBuffers() callers should respect it, as should other operations
164 * that call smgr APIs directly. It is computed as the minimum of underlying
165 * GUCs io_combine_limit_guc and io_max_combine_limit.
166 */
167int io_combine_limit = DEFAULT_IO_COMBINE_LIMIT;
168int io_combine_limit_guc = DEFAULT_IO_COMBINE_LIMIT;
169int io_max_combine_limit = DEFAULT_IO_COMBINE_LIMIT;
170
171/*
172 * GUC variables about triggering kernel writeback for buffers written; OS
173 * dependent defaults are set via the GUC mechanism.
174 */
175int checkpoint_flush_after = DEFAULT_CHECKPOINT_FLUSH_AFTER;
176int bgwriter_flush_after = DEFAULT_BGWRITER_FLUSH_AFTER;
177int backend_flush_after = DEFAULT_BACKEND_FLUSH_AFTER;
178
179/* local state for LockBufferForCleanup */
180static BufferDesc *PinCountWaitBuf = NULL;
181
182/*
183 * Backend-Private refcount management:
184 *
185 * Each buffer also has a private refcount that keeps track of the number of
186 * times the buffer is pinned in the current process. This is so that the
187 * shared refcount needs to be modified only once if a buffer is pinned more
188 * than once by an individual backend. It's also used to check that no buffers
189 * are still pinned at the end of transactions and when exiting.
190 *
191 *
192 * To avoid - as we used to - requiring an array with NBuffers entries to keep
193 * track of local buffers, we use a small sequentially searched array
194 * (PrivateRefCountArray) and an overflow hash table (PrivateRefCountHash) to
195 * keep track of backend local pins.
196 *
197 * Until no more than REFCOUNT_ARRAY_ENTRIES buffers are pinned at once, all
198 * refcounts are kept track of in the array; after that, new array entries
199 * displace old ones into the hash table. That way a frequently used entry
200 * can't get "stuck" in the hashtable while infrequent ones clog the array.
201 *
202 * Note that in most scenarios the number of pinned buffers will not exceed
203 * REFCOUNT_ARRAY_ENTRIES.
204 *
205 *
206 * To enter a buffer into the refcount tracking mechanism first reserve a free
207 * entry using ReservePrivateRefCountEntry() and then later, if necessary,
208 * fill it with NewPrivateRefCountEntry(). That split lets us avoid doing
209 * memory allocations in NewPrivateRefCountEntry() which can be important
210 * because in some scenarios it's called with a spinlock held...
211 */
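/*
 * Editor's illustrative sketch -- not part of bufmgr.c.  It shows the
 * reserve-then-fill protocol described above from the point of view of a
 * caller inside this module; the function name and "buf" argument are
 * assumptions, and error handling is elided.
 */
static void
example_track_new_pin(Buffer buf)
{
    PrivateRefCountEntry *ref;

    /* reserve while no spinlock is held; this may allocate or evict */
    ReservePrivateRefCountEntry();

    /* filling the entry is then safe even with a buffer header lock held */
    ref = NewPrivateRefCountEntry(buf);
    ref->refcount++;
}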
217
219
220static void ReservePrivateRefCountEntry(void);
225
226/* ResourceOwner callbacks to hold in-progress I/Os and buffer pins */
227static void ResOwnerReleaseBufferIO(Datum res);
228static char *ResOwnerPrintBufferIO(Datum res);
229static void ResOwnerReleaseBufferPin(Datum res);
230static char *ResOwnerPrintBufferPin(Datum res);
231
232static const ResourceOwnerDesc buffer_io_resowner_desc =
233{
234 .name = "buffer io",
235 .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
236 .release_priority = RELEASE_PRIO_BUFFER_IOS,
237 .ReleaseResource = ResOwnerReleaseBufferIO,
238 .DebugPrint = ResOwnerPrintBufferIO
239};
240
241static const ResourceOwnerDesc buffer_pin_resowner_desc =
242{
243 .name = "buffer pin",
244 .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
245 .release_priority = RELEASE_PRIO_BUFFER_PINS,
246 .ReleaseResource = ResOwnerReleaseBufferPin,
247 .DebugPrint = ResOwnerPrintBufferPin
248};
249
250/*
251 * Ensure that the PrivateRefCountArray has sufficient space to store one more
252 * entry. This has to be called before using NewPrivateRefCountEntry() to fill
253 * a new entry - but it's perfectly fine to not use a reserved entry.
254 */
255static void
256ReservePrivateRefCountEntry(void)
257{
258 /* Already reserved (or freed), nothing to do */
259 if (ReservedRefCountEntry != NULL)
260 return;
261
262 /*
263 * First search for a free entry in the array; that'll be sufficient in the
264 * majority of cases.
265 */
266 {
267 int i;
268
269 for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
270 {
272
273 res = &PrivateRefCountArray[i];
274
275 if (res->buffer == InvalidBuffer)
276 {
278 return;
279 }
280 }
281 }
282
283 /*
284 * No luck. All array entries are full. Move one array entry into the hash
285 * table.
286 */
287 {
288 /*
289 * Move entry from the current clock position in the array into the
290 * hashtable. Use that slot.
291 */
292 PrivateRefCountEntry *hashent;
293 bool found;
294
295 /* select victim slot */
298
299 /* Better be used, otherwise we shouldn't get here. */
301
302 /* enter victim array entry into hashtable */
306 &found);
307 Assert(!found);
309
310 /* clear the now free array slot */
313
315 }
316}
317
318/*
319 * Fill a previously reserved refcount entry.
320 */
323{
325
326 /* only allowed to be called when a reservation has been made */
328
329 /* use up the reserved entry */
332
333 /* and fill it */
334 res->buffer = buffer;
335 res->refcount = 0;
336
337 return res;
338}
339
340/*
341 * Return the PrivateRefCount entry for the passed buffer.
342 *
343 * Returns NULL if a buffer doesn't have a refcount entry. Otherwise, if
344 * do_move is true, and the entry resides in the hashtable the entry is
345 * optimized for frequent access by moving it to the array.
346 */
349{
351 int i;
352
355
356 /*
357 * First search for references in the array, that'll be sufficient in the
358 * majority of cases.
359 */
360 for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
361 {
362 res = &PrivateRefCountArray[i];
363
364 if (res->buffer == buffer)
365 return res;
366 }
367
368 /*
369 * By here we know that the buffer, if already pinned, isn't residing in
370 * the array.
371 *
372 * Only look up the buffer in the hashtable if we've previously overflowed
373 * into it.
374 */
376 return NULL;
377
379
380 if (res == NULL)
381 return NULL;
382 else if (!do_move)
383 {
384 /* caller doesn't want us to move the hash entry into the array */
385 return res;
386 }
387 else
388 {
389 /* move buffer from hashtable into the free array slot */
390 bool found;
392
393 /* Ensure there's a free array slot */
395
396 /* Use up the reserved slot */
400 Assert(free->buffer == InvalidBuffer);
401
402 /* and fill it */
403 free->buffer = buffer;
404 free->refcount = res->refcount;
405
406 /* delete from hashtable */
408 Assert(found);
411
412 return free;
413 }
414}
415
416/*
417 * Returns how many times the passed buffer is pinned by this backend.
418 *
419 * Only works for shared memory buffers!
420 */
421static inline int32
423{
425
428
429 /*
430 * Not moving the entry - that's ok for the current users, but we might
431 * want to change this one day.
432 */
433 ref = GetPrivateRefCountEntry(buffer, false);
434
435 if (ref == NULL)
436 return 0;
437 return ref->refcount;
438}
439
440/*
441 * Release resources used to track the reference count of a buffer which we no
442 * longer have pinned and don't want to pin again immediately.
443 */
444static void
446{
447 Assert(ref->refcount == 0);
448
449 if (ref >= &PrivateRefCountArray[0] &&
451 {
452 ref->buffer = InvalidBuffer;
453
454 /*
455 * Mark the just used entry as reserved - in many scenarios that
456 * allows us to avoid ever having to search the array/hash for free
457 * entries.
458 */
460 }
461 else
462 {
463 bool found;
464 Buffer buffer = ref->buffer;
465
467 Assert(found);
470 }
471}
472
473/*
474 * BufferIsPinned
475 * True iff the buffer is pinned (also checks for valid buffer number).
476 *
477 * NOTE: what we check here is that *this* backend holds a pin on
478 * the buffer. We do not care whether some other backend does.
479 */
480#define BufferIsPinned(bufnum) \
481( \
482 !BufferIsValid(bufnum) ? \
483 false \
484 : \
485 BufferIsLocal(bufnum) ? \
486 (LocalRefCount[-(bufnum) - 1] > 0) \
487 : \
488 (GetPrivateRefCount(bufnum) > 0) \
489)
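/*
 * Editor's illustrative sketch -- not part of bufmgr.c: BufferIsPinned() is
 * the guard the routines below rely on before touching buffer contents.  The
 * function name is an assumption for the example.
 */
static inline Page
example_get_pinned_page(Buffer buffer)
{
    Assert(BufferIsPinned(buffer));
    return BufferGetPage(buffer);
}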
490
491
493 SMgrRelation smgr, char smgr_persistence,
494 ForkNumber forkNum, BlockNumber blockNum,
497 ForkNumber fork,
498 BufferAccessStrategy strategy,
499 uint32 flags,
500 uint32 extend_by,
501 BlockNumber extend_upto,
502 Buffer *buffers,
503 uint32 *extended_by);
505 ForkNumber fork,
506 BufferAccessStrategy strategy,
507 uint32 flags,
508 uint32 extend_by,
509 BlockNumber extend_upto,
510 Buffer *buffers,
511 uint32 *extended_by);
512static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy);
513static void PinBuffer_Locked(BufferDesc *buf);
514static void UnpinBuffer(BufferDesc *buf);
515static void UnpinBufferNoOwner(BufferDesc *buf);
516static void BufferSync(int flags);
518static int SyncOneBuffer(int buf_id, bool skip_recently_used,
519 WritebackContext *wb_context);
520static void WaitIO(BufferDesc *buf);
521static void AbortBufferIO(Buffer buffer);
522static void shared_buffer_write_error_callback(void *arg);
523static void local_buffer_write_error_callback(void *arg);
524static inline BufferDesc *BufferAlloc(SMgrRelation smgr,
525 char relpersistence,
526 ForkNumber forkNum,
527 BlockNumber blockNum,
528 BufferAccessStrategy strategy,
529 bool *foundPtr, IOContext io_context);
530static bool AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress);
531static void CheckReadBuffersOperation(ReadBuffersOperation *operation, bool is_complete);
532static Buffer GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context);
533static void FlushBuffer(BufferDesc *buf, SMgrRelation reln,
534 IOObject io_object, IOContext io_context);
535static void FindAndDropRelationBuffers(RelFileLocator rlocator,
536 ForkNumber forkNum,
537 BlockNumber nForkBlock,
538 BlockNumber firstDelBlock);
540 RelFileLocator dstlocator,
541 ForkNumber forkNum, bool permanent);
542static void AtProcExit_Buffers(int code, Datum arg);
543static void CheckForBufferLeaks(void);
544static int rlocator_comparator(const void *p1, const void *p2);
545static inline int buffertag_comparator(const BufferTag *ba, const BufferTag *bb);
546static inline int ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b);
547static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg);
548
549
550/*
551 * Implementation of PrefetchBuffer() for shared buffers.
552 */
555 ForkNumber forkNum,
556 BlockNumber blockNum)
557{
558 PrefetchBufferResult result = {InvalidBuffer, false};
559 BufferTag newTag; /* identity of requested block */
560 uint32 newHash; /* hash value for newTag */
561 LWLock *newPartitionLock; /* buffer partition lock for it */
562 int buf_id;
563
564 Assert(BlockNumberIsValid(blockNum));
565
566 /* create a tag so we can lookup the buffer */
567 InitBufferTag(&newTag, &smgr_reln->smgr_rlocator.locator,
568 forkNum, blockNum);
569
570 /* determine its hash code and partition lock ID */
571 newHash = BufTableHashCode(&newTag);
572 newPartitionLock = BufMappingPartitionLock(newHash);
573
574 /* see if the block is in the buffer pool already */
575 LWLockAcquire(newPartitionLock, LW_SHARED);
576 buf_id = BufTableLookup(&newTag, newHash);
577 LWLockRelease(newPartitionLock);
578
579 /* If not in buffers, initiate prefetch */
580 if (buf_id < 0)
581 {
582#ifdef USE_PREFETCH
583 /*
584 * Try to initiate an asynchronous read. This returns false in
585 * recovery if the relation file doesn't exist.
586 */
587 if ((io_direct_flags & IO_DIRECT_DATA) == 0 &&
588 smgrprefetch(smgr_reln, forkNum, blockNum, 1))
589 {
590 result.initiated_io = true;
591 }
592#endif /* USE_PREFETCH */
593 }
594 else
595 {
596 /*
597 * Report the buffer it was in at that time. The caller may be able
598 * to avoid a buffer table lookup, but it's not pinned and it must be
599 * rechecked!
600 */
601 result.recent_buffer = buf_id + 1;
602 }
603
604 /*
605 * If the block *is* in buffers, we do nothing. This is not really ideal:
606 * the block might be just about to be evicted, which would be stupid
607 * since we know we are going to need it soon. But the only easy answer
608 * is to bump the usage_count, which does not seem like a great solution:
609 * when the caller does ultimately touch the block, usage_count would get
610 * bumped again, resulting in too much favoritism for blocks that are
611 * involved in a prefetch sequence. A real fix would involve some
612 * additional per-buffer state, and it's not clear that there's enough of
613 * a problem to justify that.
614 */
615
616 return result;
617}
618
619/*
620 * PrefetchBuffer -- initiate asynchronous read of a block of a relation
621 *
622 * This is named by analogy to ReadBuffer but doesn't actually allocate a
623 * buffer. Instead it tries to ensure that a future ReadBuffer for the given
624 * block will not be delayed by the I/O. Prefetching is optional.
625 *
626 * There are three possible outcomes:
627 *
628 * 1. If the block is already cached, the result includes a valid buffer that
629 * could be used by the caller to avoid the need for a later buffer lookup, but
630 * it's not pinned, so the caller must recheck it.
631 *
632 * 2. If the kernel has been asked to initiate I/O, the initiated_io member is
633 * true. Currently there is no way to know if the data was already cached by
634 * the kernel and therefore didn't really initiate I/O, and no way to know when
635 * the I/O completes other than using synchronous ReadBuffer().
636 *
637 * 3. Otherwise, the buffer wasn't already cached by PostgreSQL, and
638 * USE_PREFETCH is not defined (this build doesn't support prefetching due to
639 * lack of a kernel facility), direct I/O is enabled, or the underlying
640 * relation file wasn't found and we are in recovery. (If the relation file
641 * wasn't found and we are not in recovery, an error is raised).
642 */
645{
646 Assert(RelationIsValid(reln));
647 Assert(BlockNumberIsValid(blockNum));
648
649 if (RelationUsesLocalBuffers(reln))
650 {
651 /* see comments in ReadBufferExtended */
652 if (RELATION_IS_OTHER_TEMP(reln))
654 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
655 errmsg("cannot access temporary tables of other sessions")));
656
657 /* pass it off to localbuf.c */
658 return PrefetchLocalBuffer(RelationGetSmgr(reln), forkNum, blockNum);
659 }
660 else
661 {
662 /* pass it to the shared buffer version */
663 return PrefetchSharedBuffer(RelationGetSmgr(reln), forkNum, blockNum);
664 }
665}
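/*
 * Editor's illustrative sketch -- not part of bufmgr.c.  It shows one way a
 * caller can combine PrefetchBuffer() with ReadRecentBuffer() (defined
 * below); the function name and the "rel"/"blkno" arguments are assumptions.
 */
static Buffer
example_prefetch_then_read(Relation rel, BlockNumber blkno)
{
    PrefetchBufferResult pf = PrefetchBuffer(rel, MAIN_FORKNUM, blkno);

    /* ... do other useful work while the kernel (maybe) reads the block ... */

    if (BufferIsValid(pf.recent_buffer) &&
        ReadRecentBuffer(rel->rd_locator, MAIN_FORKNUM, blkno,
                         pf.recent_buffer))
        return pf.recent_buffer;    /* pinned, tag re-verified */

    return ReadBuffer(rel, blkno);  /* fall back to the normal lookup */
}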
666
667/*
668 * ReadRecentBuffer -- try to pin a block in a recently observed buffer
669 *
670 * Compared to ReadBuffer(), this avoids a buffer mapping lookup when it's
671 * successful. Return true if the buffer is valid and still has the expected
672 * tag. In that case, the buffer is pinned and the usage count is bumped.
673 */
674bool
676 Buffer recent_buffer)
677{
678 BufferDesc *bufHdr;
679 BufferTag tag;
680 uint32 buf_state;
681 bool have_private_ref;
682
683 Assert(BufferIsValid(recent_buffer));
684
687 InitBufferTag(&tag, &rlocator, forkNum, blockNum);
688
689 if (BufferIsLocal(recent_buffer))
690 {
691 int b = -recent_buffer - 1;
692
693 bufHdr = GetLocalBufferDescriptor(b);
694 buf_state = pg_atomic_read_u32(&bufHdr->state);
695
696 /* Is it still valid and holding the right tag? */
697 if ((buf_state & BM_VALID) && BufferTagsEqual(&tag, &bufHdr->tag))
698 {
699 PinLocalBuffer(bufHdr, true);
700
702
703 return true;
704 }
705 }
706 else
707 {
708 bufHdr = GetBufferDescriptor(recent_buffer - 1);
709 have_private_ref = GetPrivateRefCount(recent_buffer) > 0;
710
711 /*
712 * Do we already have this buffer pinned with a private reference? If
713 * so, it must be valid and it is safe to check the tag without
714 * locking. If not, we have to lock the header first and then check.
715 */
716 if (have_private_ref)
717 buf_state = pg_atomic_read_u32(&bufHdr->state);
718 else
719 buf_state = LockBufHdr(bufHdr);
720
721 if ((buf_state & BM_VALID) && BufferTagsEqual(&tag, &bufHdr->tag))
722 {
723 /*
724 * It's now safe to pin the buffer. We can't pin first and ask
725 * questions later, because it might confuse code paths like
726 * InvalidateBuffer() if we pinned a random non-matching buffer.
727 */
728 if (have_private_ref)
729 PinBuffer(bufHdr, NULL); /* bump pin count */
730 else
731 PinBuffer_Locked(bufHdr); /* pin for first time */
732
734
735 return true;
736 }
737
738 /* If we locked the header above, now unlock. */
739 if (!have_private_ref)
740 UnlockBufHdr(bufHdr, buf_state);
741 }
742
743 return false;
744}
745
746/*
747 * ReadBuffer -- a shorthand for ReadBufferExtended, for reading from main
748 * fork with RBM_NORMAL mode and default strategy.
749 */
750Buffer
752{
753 return ReadBufferExtended(reln, MAIN_FORKNUM, blockNum, RBM_NORMAL, NULL);
754}
755
756/*
757 * ReadBufferExtended -- returns a buffer containing the requested
758 * block of the requested relation. If the blknum
759 * requested is P_NEW, extend the relation file and
760 * allocate a new block. (Caller is responsible for
761 * ensuring that only one backend tries to extend a
762 * relation at the same time!)
763 *
764 * Returns: the buffer number for the buffer containing
765 * the block read. The returned buffer has been pinned.
766 * Does not return on error --- elog's instead.
767 *
768 * Assume when this function is called, that reln has been opened already.
769 *
770 * In RBM_NORMAL mode, the page is read from disk, and the page header is
771 * validated. An error is thrown if the page header is not valid. (But
772 * note that an all-zero page is considered "valid"; see
773 * PageIsVerified().)
774 *
775 * RBM_ZERO_ON_ERROR is like the normal mode, but if the page header is not
776 * valid, the page is zeroed instead of throwing an error. This is intended
777 * for non-critical data, where the caller is prepared to repair errors.
778 *
779 * In RBM_ZERO_AND_LOCK mode, if the page isn't in buffer cache already, it's
780 * filled with zeros instead of reading it from disk. Useful when the caller
781 * is going to fill the page from scratch, since this saves I/O and avoids
782 * unnecessary failure if the page-on-disk has corrupt page headers.
783 * The page is returned locked to ensure that the caller has a chance to
784 * initialize the page before it's made visible to others.
785 * Caution: do not use this mode to read a page that is beyond the relation's
786 * current physical EOF; that is likely to cause problems in md.c when
787 * the page is modified and written out. P_NEW is OK, though.
788 *
789 * RBM_ZERO_AND_CLEANUP_LOCK is the same as RBM_ZERO_AND_LOCK, but acquires
790 * a cleanup-strength lock on the page.
791 *
792 * RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here.
793 *
794 * If strategy is not NULL, a nondefault buffer access strategy is used.
795 * See buffer/README for details.
796 */
797inline Buffer
800{
801 Buffer buf;
802
803 /*
804 * Reject attempts to read non-local temporary relations; we would be
805 * likely to get wrong data since we have no visibility into the owning
806 * session's local buffers.
807 */
808 if (RELATION_IS_OTHER_TEMP(reln))
810 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
811 errmsg("cannot access temporary tables of other sessions")));
812
813 /*
814 * Read the buffer, and update pgstat counters to reflect a cache hit or
815 * miss.
816 */
817 buf = ReadBuffer_common(reln, RelationGetSmgr(reln), 0,
818 forkNum, blockNum, mode, strategy);
819
820 return buf;
821}
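/*
 * Editor's illustrative sketch -- not part of bufmgr.c.  It contrasts the
 * RBM_* modes described above from a caller's perspective; the function name
 * and its arguments are assumptions for the example.
 */
static void
example_read_modes(Relation rel, BlockNumber blkno, BufferAccessStrategy strategy)
{
    Buffer      buf;

    /* normal read: the page header is validated, corruption is an error */
    buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, strategy);
    ReleaseBuffer(buf);

    /* non-critical data: a damaged header is zeroed instead of erroring */
    buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_ZERO_ON_ERROR, strategy);
    ReleaseBuffer(buf);

    /* about to overwrite the page: skip the read, get the buffer back locked */
    buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_ZERO_AND_LOCK, strategy);
    /* ... initialize the page ... */
    MarkBufferDirty(buf);
    UnlockReleaseBuffer(buf);
}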
822
823
824/*
825 * ReadBufferWithoutRelcache -- like ReadBufferExtended, but doesn't require
826 * a relcache entry for the relation.
827 *
828 * Pass permanent = true for a RELPERSISTENCE_PERMANENT relation, and
829 * permanent = false for a RELPERSISTENCE_UNLOGGED relation. This function
830 * cannot be used for temporary relations (and making that work might be
831 * difficult, unless we only want to read temporary relations for our own
832 * ProcNumber).
833 */
834Buffer
837 BufferAccessStrategy strategy, bool permanent)
838{
839 SMgrRelation smgr = smgropen(rlocator, INVALID_PROC_NUMBER);
840
841 return ReadBuffer_common(NULL, smgr,
842 permanent ? RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED,
843 forkNum, blockNum,
844 mode, strategy);
845}
846
847/*
848 * Convenience wrapper around ExtendBufferedRelBy() extending by one block.
849 */
850Buffer
852 ForkNumber forkNum,
853 BufferAccessStrategy strategy,
854 uint32 flags)
855{
856 Buffer buf;
857 uint32 extend_by = 1;
858
859 ExtendBufferedRelBy(bmr, forkNum, strategy, flags, extend_by,
860 &buf, &extend_by);
861
862 return buf;
863}
864
865/*
866 * Extend relation by multiple blocks.
867 *
868 * Tries to extend the relation by extend_by blocks. Depending on the
869 * availability of resources the relation may end up being extended by a
870 * smaller number of pages (unless an error is thrown, always by at least one
871 * page). *extended_by is updated to the number of pages the relation has been
872 * extended to.
873 *
874 * buffers needs to be an array that is at least extend_by long. Upon
875 * completion, the first extend_by array elements will point to a pinned
876 * buffer.
877 *
878 * If EB_LOCK_FIRST is part of flags, the first returned buffer is
879 * locked. This is useful for callers that want a buffer that is guaranteed to
880 * be empty.
881 */
884 ForkNumber fork,
885 BufferAccessStrategy strategy,
886 uint32 flags,
887 uint32 extend_by,
888 Buffer *buffers,
889 uint32 *extended_by)
890{
891 Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
892 Assert(bmr.smgr == NULL || bmr.relpersistence != 0);
893 Assert(extend_by > 0);
894
895 if (bmr.smgr == NULL)
896 {
897 bmr.smgr = RelationGetSmgr(bmr.rel);
898 bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
899 }
900
901 return ExtendBufferedRelCommon(bmr, fork, strategy, flags,
902 extend_by, InvalidBlockNumber,
903 buffers, extended_by);
904}
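/*
 * Editor's illustrative sketch -- not part of bufmgr.c.  It extends a
 * relation by several blocks at once, as described above; the function name
 * and the "rel" argument are assumptions, and error handling is elided.
 */
static BlockNumber
example_bulk_extend(Relation rel)
{
    Buffer      buffers[8];
    uint32      extended_by = 0;
    BlockNumber first_block;

    first_block = ExtendBufferedRelBy(BMR_REL(rel), MAIN_FORKNUM,
                                      NULL /* default strategy */ ,
                                      EB_LOCK_FIRST,
                                      lengthof(buffers),
                                      buffers, &extended_by);

    /* EB_LOCK_FIRST: the first buffer is locked, the rest are only pinned */
    UnlockReleaseBuffer(buffers[0]);
    for (uint32 i = 1; i < extended_by; i++)
        ReleaseBuffer(buffers[i]);

    return first_block;
}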
905
906/*
907 * Extend the relation so it is at least extend_to blocks large, return buffer
908 * (extend_to - 1).
909 *
910 * This is useful for callers that want to write a specific page, regardless
911 * of the current size of the relation (e.g. useful for visibilitymap and for
912 * crash recovery).
913 */
914Buffer
916 ForkNumber fork,
917 BufferAccessStrategy strategy,
918 uint32 flags,
919 BlockNumber extend_to,
921{
923 uint32 extended_by = 0;
925 Buffer buffers[64];
926
927 Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
928 Assert(bmr.smgr == NULL || bmr.relpersistence != 0);
929 Assert(extend_to != InvalidBlockNumber && extend_to > 0);
930
931 if (bmr.smgr == NULL)
932 {
933 bmr.smgr = RelationGetSmgr(bmr.rel);
934 bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
935 }
936
937 /*
938 * If desired, create the file if it doesn't exist. If
939 * smgr_cached_nblocks[fork] is positive then it must exist, no need for
940 * an smgrexists call.
941 */
942 if ((flags & EB_CREATE_FORK_IF_NEEDED) &&
943 (bmr.smgr->smgr_cached_nblocks[fork] == 0 ||
945 !smgrexists(bmr.smgr, fork))
946 {
948
949 /* recheck, fork might have been created concurrently */
950 if (!smgrexists(bmr.smgr, fork))
951 smgrcreate(bmr.smgr, fork, flags & EB_PERFORMING_RECOVERY);
952
954 }
955
956 /*
957 * If requested, invalidate size cache, so that smgrnblocks asks the
958 * kernel.
959 */
960 if (flags & EB_CLEAR_SIZE_CACHE)
962
963 /*
964 * Estimate how many pages we'll need to extend by. This avoids acquiring
965 * unnecessarily many victim buffers.
966 */
967 current_size = smgrnblocks(bmr.smgr, fork);
968
969 /*
970 * Since no-one else can be looking at the page contents yet, there is no
971 * difference between an exclusive lock and a cleanup-strength lock. Note
972 * that we pass the original mode to ReadBuffer_common() below, when
973 * falling back to reading the buffer to a concurrent relation extension.
974 */
976 flags |= EB_LOCK_TARGET;
977
978 while (current_size < extend_to)
979 {
980 uint32 num_pages = lengthof(buffers);
981 BlockNumber first_block;
982
983 if ((uint64) current_size + num_pages > extend_to)
984 num_pages = extend_to - current_size;
985
986 first_block = ExtendBufferedRelCommon(bmr, fork, strategy, flags,
987 num_pages, extend_to,
988 buffers, &extended_by);
989
990 current_size = first_block + extended_by;
991 Assert(num_pages != 0 || current_size >= extend_to);
992
993 for (uint32 i = 0; i < extended_by; i++)
994 {
995 if (first_block + i != extend_to - 1)
996 ReleaseBuffer(buffers[i]);
997 else
998 buffer = buffers[i];
999 }
1000 }
1001
1002 /*
1003 * It's possible that another backend concurrently extended the relation.
1004 * In that case read the buffer.
1005 *
1006 * XXX: Should we control this via a flag?
1007 */
1008 if (buffer == InvalidBuffer)
1009 {
1010 Assert(extended_by == 0);
1012 fork, extend_to - 1, mode, strategy);
1013 }
1014
1015 return buffer;
1016}
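/*
 * Editor's illustrative sketch -- not part of bufmgr.c.  It makes sure that
 * block "target" exists and returns a pinned buffer for it, roughly the way
 * the visibility map uses this function; the names are assumptions.
 */
static Buffer
example_extend_to(Relation rel, BlockNumber target)
{
    return ExtendBufferedRelTo(BMR_REL(rel), MAIN_FORKNUM,
                               NULL /* default strategy */ ,
                               EB_CREATE_FORK_IF_NEEDED | EB_CLEAR_SIZE_CACHE,
                               target + 1,
                               RBM_ZERO_ON_ERROR);
}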
1017
1018/*
1019 * Lock and optionally zero a buffer, as part of the implementation of
1020 * RBM_ZERO_AND_LOCK or RBM_ZERO_AND_CLEANUP_LOCK. The buffer must be already
1021 * pinned. If the buffer is not already valid, it is zeroed and made valid.
1022 */
1023static void
1025{
1026 BufferDesc *bufHdr;
1027 bool need_to_zero;
1028 bool isLocalBuf = BufferIsLocal(buffer);
1029
1031
1032 if (already_valid)
1033 {
1034 /*
1035 * If the caller already knew the buffer was valid, we can skip some
1036 * header interaction. The caller just wants to lock the buffer.
1037 */
1038 need_to_zero = false;
1039 }
1040 else if (isLocalBuf)
1041 {
1042 /* Simple case for non-shared buffers. */
1043 bufHdr = GetLocalBufferDescriptor(-buffer - 1);
1044 need_to_zero = StartLocalBufferIO(bufHdr, true, false);
1045 }
1046 else
1047 {
1048 /*
1049 * Take BM_IO_IN_PROGRESS, or discover that BM_VALID has been set
1050 * concurrently. Even though we aren't doing I/O, that ensures that
1051 * we don't zero a page that someone else has pinned. An exclusive
1052 * content lock wouldn't be enough, because readers are allowed to
1053 * drop the content lock after determining that a tuple is visible
1054 * (see buffer access rules in README).
1055 */
1056 bufHdr = GetBufferDescriptor(buffer - 1);
1057 need_to_zero = StartBufferIO(bufHdr, true, false);
1058 }
1059
1060 if (need_to_zero)
1061 {
1062 memset(BufferGetPage(buffer), 0, BLCKSZ);
1063
1064 /*
1065 * Grab the buffer content lock before marking the page as valid, to
1066 * make sure that no other backend sees the zeroed page before the
1067 * caller has had a chance to initialize it.
1068 *
1069 * Since no-one else can be looking at the page contents yet, there is
1070 * no difference between an exclusive lock and a cleanup-strength
1071 * lock. (Note that we cannot use LockBuffer() or
1072 * LockBufferForCleanup() here, because they assert that the buffer is
1073 * already valid.)
1074 */
1075 if (!isLocalBuf)
1077
1078 /* Set BM_VALID, terminate IO, and wake up any waiters */
1079 if (isLocalBuf)
1080 TerminateLocalBufferIO(bufHdr, false, BM_VALID, false);
1081 else
1082 TerminateBufferIO(bufHdr, false, BM_VALID, true, false);
1083 }
1084 else if (!isLocalBuf)
1085 {
1086 /*
1087 * The buffer is valid, so we can't zero it. The caller still expects
1088 * the page to be locked on return.
1089 */
1090 if (mode == RBM_ZERO_AND_LOCK)
1092 else
1094 }
1095}
1096
1097/*
1098 * Pin a buffer for a given block. *foundPtr is set to true if the block was
1099 * already present, or false if more work is required to either read it in or
1100 * zero it.
1101 */
1104 SMgrRelation smgr,
1105 char persistence,
1106 ForkNumber forkNum,
1107 BlockNumber blockNum,
1108 BufferAccessStrategy strategy,
1109 bool *foundPtr)
1110{
1111 BufferDesc *bufHdr;
1112 IOContext io_context;
1113 IOObject io_object;
1114
1115 Assert(blockNum != P_NEW);
1116
1117 /* Persistence should be set before */
1118 Assert((persistence == RELPERSISTENCE_TEMP ||
1119 persistence == RELPERSISTENCE_PERMANENT ||
1120 persistence == RELPERSISTENCE_UNLOGGED));
1121
1122 if (persistence == RELPERSISTENCE_TEMP)
1123 {
1124 io_context = IOCONTEXT_NORMAL;
1125 io_object = IOOBJECT_TEMP_RELATION;
1126 }
1127 else
1128 {
1129 io_context = IOContextForStrategy(strategy);
1130 io_object = IOOBJECT_RELATION;
1131 }
1132
1133 TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
1137 smgr->smgr_rlocator.backend);
1138
1139 if (persistence == RELPERSISTENCE_TEMP)
1140 {
1141 bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, foundPtr);
1142 if (*foundPtr)
1144 }
1145 else
1146 {
1147 bufHdr = BufferAlloc(smgr, persistence, forkNum, blockNum,
1148 strategy, foundPtr, io_context);
1149 if (*foundPtr)
1151 }
1152 if (rel)
1153 {
1154 /*
1155 * While pgBufferUsage's "read" counter isn't bumped unless we reach
1156 * WaitReadBuffers() (so, not for hits, and not for buffers that are
1157 * zeroed instead), the per-relation stats always count them.
1158 */
1160 if (*foundPtr)
1162 }
1163 if (*foundPtr)
1164 {
1165 pgstat_count_io_op(io_object, io_context, IOOP_HIT, 1, 0);
1166 if (VacuumCostActive)
1168
1169 TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
1173 smgr->smgr_rlocator.backend,
1174 true);
1175 }
1176
1177 return BufferDescriptorGetBuffer(bufHdr);
1178}
1179
1180/*
1181 * ReadBuffer_common -- common logic for all ReadBuffer variants
1182 *
1183 * smgr is required, rel is optional unless using P_NEW.
1184 */
1186ReadBuffer_common(Relation rel, SMgrRelation smgr, char smgr_persistence,
1187 ForkNumber forkNum,
1189 BufferAccessStrategy strategy)
1190{
1191 ReadBuffersOperation operation;
1192 Buffer buffer;
1193 int flags;
1194 char persistence;
1195
1196 /*
1197 * Backward compatibility path, most code should use ExtendBufferedRel()
1198 * instead, as acquiring the extension lock inside ExtendBufferedRel()
1199 * scales a lot better.
1200 */
1201 if (unlikely(blockNum == P_NEW))
1202 {
1204
1205 /*
1206 * Since no-one else can be looking at the page contents yet, there is
1207 * no difference between an exclusive lock and a cleanup-strength
1208 * lock.
1209 */
1211 flags |= EB_LOCK_FIRST;
1212
1213 return ExtendBufferedRel(BMR_REL(rel), forkNum, strategy, flags);
1214 }
1215
1216 if (rel)
1217 persistence = rel->rd_rel->relpersistence;
1218 else
1219 persistence = smgr_persistence;
1220
1223 {
1224 bool found;
1225
1226 buffer = PinBufferForBlock(rel, smgr, persistence,
1227 forkNum, blockNum, strategy, &found);
1228 ZeroAndLockBuffer(buffer, mode, found);
1229 return buffer;
1230 }
1231
1232 /*
1233 * Signal that we are going to immediately wait. If we're immediately
1234 * waiting, there is no benefit in actually executing the IO
1235 * asynchronously, it would just add dispatch overhead.
1236 */
1238 if (mode == RBM_ZERO_ON_ERROR)
1240 operation.smgr = smgr;
1241 operation.rel = rel;
1242 operation.persistence = persistence;
1243 operation.forknum = forkNum;
1244 operation.strategy = strategy;
1245 if (StartReadBuffer(&operation,
1246 &buffer,
1247 blockNum,
1248 flags))
1249 WaitReadBuffers(&operation);
1250
1251 return buffer;
1252}
1253
1256 Buffer *buffers,
1257 BlockNumber blockNum,
1258 int *nblocks,
1259 int flags,
1260 bool allow_forwarding)
1261{
1262 int actual_nblocks = *nblocks;
1263 int maxcombine = 0;
1264 bool did_start_io;
1265
1266 Assert(*nblocks == 1 || allow_forwarding);
1267 Assert(*nblocks > 0);
1268 Assert(*nblocks <= MAX_IO_COMBINE_LIMIT);
1269
1270 for (int i = 0; i < actual_nblocks; ++i)
1271 {
1272 bool found;
1273
1274 if (allow_forwarding && buffers[i] != InvalidBuffer)
1275 {
1276 BufferDesc *bufHdr;
1277
1278 /*
1279 * This is a buffer that was pinned by an earlier call to
1280 * StartReadBuffers(), but couldn't be handled in one operation at
1281 * that time. The operation was split, and the caller has passed
1282 * an already pinned buffer back to us to handle the rest of the
1283 * operation. It must continue at the expected block number.
1284 */
1285 Assert(BufferGetBlockNumber(buffers[i]) == blockNum + i);
1286
1287 /*
1288 * It might be an already valid buffer (a hit) that followed the
1289 * final contiguous block of an earlier I/O (a miss) marking the
1290 * end of it, or a buffer that some other backend has since made
1291 * valid by performing the I/O for us, in which case we can handle
1292 * it as a hit now. It is safe to check for a BM_VALID flag with
1293 * a relaxed load, because we got a fresh view of it while pinning
1294 * it in the previous call.
1295 *
1296 * On the other hand if we don't see BM_VALID yet, it must be an
1297 * I/O that was split by the previous call and we need to try to
1298 * start a new I/O from this block. We're also racing against any
1299 * other backend that might start the I/O or even manage to mark
1300 * it BM_VALID after this check, but StartBufferIO() will handle
1301 * those cases.
1302 */
1303 if (BufferIsLocal(buffers[i]))
1304 bufHdr = GetLocalBufferDescriptor(-buffers[i] - 1);
1305 else
1306 bufHdr = GetBufferDescriptor(buffers[i] - 1);
1308 found = pg_atomic_read_u32(&bufHdr->state) & BM_VALID;
1309 }
1310 else
1311 {
1312 buffers[i] = PinBufferForBlock(operation->rel,
1313 operation->smgr,
1314 operation->persistence,
1315 operation->forknum,
1316 blockNum + i,
1317 operation->strategy,
1318 &found);
1319 }
1320
1321 if (found)
1322 {
1323 /*
1324 * We have a hit. If it's the first block in the requested range,
1325 * we can return it immediately and report that WaitReadBuffers()
1326 * does not need to be called. If the initial value of *nblocks
1327 * was larger, the caller will have to call again for the rest.
1328 */
1329 if (i == 0)
1330 {
1331 *nblocks = 1;
1332
1333#ifdef USE_ASSERT_CHECKING
1334
1335 /*
1336 * Initialize enough of ReadBuffersOperation to make
1337 * CheckReadBuffersOperation() work. Outside of assertions
1338 * that's not necessary when no IO is issued.
1339 */
1340 operation->buffers = buffers;
1341 operation->blocknum = blockNum;
1342 operation->nblocks = 1;
1343 operation->nblocks_done = 1;
1344 CheckReadBuffersOperation(operation, true);
1345#endif
1346 return false;
1347 }
1348
1349 /*
1350 * Otherwise we already have an I/O to perform, but this block
1351 * can't be included as it is already valid. Split the I/O here.
1352 * There may or may not be more blocks requiring I/O after this
1353 * one, we haven't checked, but they can't be contiguous with this
1354 * one in the way. We'll leave this buffer pinned, forwarding it
1355 * to the next call, avoiding the need to unpin it here and re-pin
1356 * it in the next call.
1357 */
1358 actual_nblocks = i;
1359 break;
1360 }
1361 else
1362 {
1363 /*
1364 * Check how many blocks we can cover with the same IO. The smgr
1365 * implementation might e.g. be limited due to a segment boundary.
1366 */
1367 if (i == 0 && actual_nblocks > 1)
1368 {
1369 maxcombine = smgrmaxcombine(operation->smgr,
1370 operation->forknum,
1371 blockNum);
1372 if (unlikely(maxcombine < actual_nblocks))
1373 {
1374 elog(DEBUG2, "limiting nblocks at %u from %u to %u",
1375 blockNum, actual_nblocks, maxcombine);
1376 actual_nblocks = maxcombine;
1377 }
1378 }
1379 }
1380 }
1381 *nblocks = actual_nblocks;
1382
1383 /* Populate information needed for I/O. */
1384 operation->buffers = buffers;
1385 operation->blocknum = blockNum;
1386 operation->flags = flags;
1387 operation->nblocks = actual_nblocks;
1388 operation->nblocks_done = 0;
1389 pgaio_wref_clear(&operation->io_wref);
1390
1391 /*
1392 * When using AIO, start the IO in the background. If not, issue prefetch
1393 * requests if desired by the caller.
1394 *
1395 * The reason we have a dedicated path for IOMETHOD_SYNC here is to
1396 * de-risk the introduction of AIO somewhat. It's a large architectural
1397 * change, with lots of chances for unanticipated performance effects.
1398 *
1399 * Use of IOMETHOD_SYNC already leads to not actually performing IO
1400 * asynchronously, but without the check here we'd execute IO earlier than
1401 * we used to. Eventually this IOMETHOD_SYNC specific path should go away.
1402 */
1403 if (io_method != IOMETHOD_SYNC)
1404 {
1405 /*
1406 * Try to start IO asynchronously. It's possible that no IO needs to
1407 * be started, if another backend already performed the IO.
1408 *
1409 * Note that if an IO is started, it might not cover the entire
1410 * requested range, e.g. because an intermediary block has been read
1411 * in by another backend. In that case any "trailing" buffers we
1412 * already pinned above will be "forwarded" by read_stream.c to the
1413 * next call to StartReadBuffers().
1414 *
1415 * This is signalled to the caller by decrementing *nblocks *and*
1416 * reducing operation->nblocks. The latter is done here, but not below
1417 * WaitReadBuffers(), as in WaitReadBuffers() we can't "shorten" the
1418 * overall read size anymore, we need to retry until done in its
1419 * entirety or until failed.
1420 */
1421 did_start_io = AsyncReadBuffers(operation, nblocks);
1422
1423 operation->nblocks = *nblocks;
1424 }
1425 else
1426 {
1427 operation->flags |= READ_BUFFERS_SYNCHRONOUSLY;
1428
1429 if (flags & READ_BUFFERS_ISSUE_ADVICE)
1430 {
1431 /*
1432 * In theory we should only do this if PinBufferForBlock() had to
1433 * allocate new buffers above. That way, if two calls to
1434 * StartReadBuffers() were made for the same blocks before
1435 * WaitReadBuffers(), only the first would issue the advice.
1436 * That'd be a better simulation of true asynchronous I/O, which
1437 * would only start the I/O once, but isn't done here for
1438 * simplicity.
1439 */
1440 smgrprefetch(operation->smgr,
1441 operation->forknum,
1442 blockNum,
1443 actual_nblocks);
1444 }
1445
1446 /*
1447 * Indicate that WaitReadBuffers() should be called. WaitReadBuffers()
1448 * will initiate the necessary IO.
1449 */
1450 did_start_io = true;
1451 }
1452
1453 CheckReadBuffersOperation(operation, !did_start_io);
1454
1455 return did_start_io;
1456}
1457
1458/*
1459 * Begin reading a range of blocks beginning at blockNum and extending for
1460 * *nblocks. *nblocks and the buffers array are in/out parameters. On entry,
1461 * the buffers elements covered by *nblocks must hold either InvalidBuffer or
1462 * buffers forwarded by an earlier call to StartReadBuffers() that was split
1463 * and is now being continued. On return, *nblocks holds the number of blocks
1464 * accepted by this operation. If it is less than the original number then
1465 * this operation has been split, but buffer elements up to the original
1466 * requested size may hold forwarded buffers to be used for a continuing
1467 * operation. The caller must either start a new I/O beginning at the block
1468 * immediately following the blocks accepted by this call and pass those
1469 * buffers back in, or release them if it chooses not to. It shouldn't make
1470 * any other use of or assumptions about forwarded buffers.
1471 *
1472 * If false is returned, no I/O is necessary and the buffers covered by
1473 * *nblocks on exit are valid and ready to be accessed. If true is returned,
1474 * an I/O has been started, and WaitReadBuffers() must be called with the same
1475 * operation object before the buffers covered by *nblocks on exit can be
1476 * accessed. Along with the operation object, the caller-supplied array of
1477 * buffers must remain valid until WaitReadBuffers() is called, and any
1478 * forwarded buffers must also be preserved for a continuing call unless
1479 * they are explicitly released.
1480 *
1481 * Currently the I/O is only started with optional operating system advice if
1482 * requested by the caller with READ_BUFFERS_ISSUE_ADVICE, and the real I/O
1483 * happens synchronously in WaitReadBuffers(). In future work, true I/O could
1484 * be initiated here.
1485 */
1486bool
1488 Buffer *buffers,
1489 BlockNumber blockNum,
1490 int *nblocks,
1491 int flags)
1492{
1493 return StartReadBuffersImpl(operation, buffers, blockNum, nblocks, flags,
1494 true /* expect forwarded buffers */ );
1495}
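/*
 * Editor's illustrative sketch -- not part of bufmgr.c.  It drives
 * StartReadBuffers()/WaitReadBuffers() by hand for a short contiguous range,
 * including the forwarded-buffer handling described above.  Real callers
 * normally go through read_stream.c instead; the function name and the
 * "rel"/"start_block" arguments are assumptions, and error handling is
 * elided.
 */
static void
example_read_range(Relation rel, BlockNumber start_block)
{
    ReadBuffersOperation op;
    Buffer      buffers[4];
    BlockNumber blocknum = start_block;
    int         remaining = lengthof(buffers);

    /* no forwarded buffers yet */
    for (int i = 0; i < remaining; i++)
        buffers[i] = InvalidBuffer;

    op.rel = rel;
    op.smgr = RelationGetSmgr(rel);
    op.persistence = rel->rd_rel->relpersistence;
    op.forknum = MAIN_FORKNUM;
    op.strategy = NULL;

    while (remaining > 0)
    {
        int         nblocks = remaining;

        if (StartReadBuffers(&op, buffers, blocknum, &nblocks, 0))
            WaitReadBuffers(&op);

        /* buffers[0 .. nblocks - 1] are now valid and pinned */
        for (int i = 0; i < nblocks; i++)
            ReleaseBuffer(buffers[i]);

        /*
         * If the operation was split, any forwarded (already pinned) buffers
         * sit just past the accepted ones; shift them to the front so they
         * are passed back in for the continuation starting at the next block.
         */
        for (int i = nblocks; i < remaining; i++)
            buffers[i - nblocks] = buffers[i];
        for (int i = remaining - nblocks; i < remaining; i++)
            buffers[i] = InvalidBuffer;

        blocknum += nblocks;
        remaining -= nblocks;
    }
}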
1496
1497/*
1498 * Single block version of the StartReadBuffers(). This might save a few
1499 * instructions when called from another translation unit, because it is
1500 * specialized for nblocks == 1.
1501 *
1502 * This version does not support "forwarded" buffers: they cannot be created
1503 * by reading only one block and *buffer is ignored on entry.
1504 */
1505bool
1507 Buffer *buffer,
1508 BlockNumber blocknum,
1509 int flags)
1510{
1511 int nblocks = 1;
1512 bool result;
1513
1514 result = StartReadBuffersImpl(operation, buffer, blocknum, &nblocks, flags,
1515 false /* single block, no forwarding */ );
1516 Assert(nblocks == 1); /* single block can't be short */
1517
1518 return result;
1519}
1520
1521/*
1522 * Perform sanity checks on the ReadBuffersOperation.
1523 */
1524static void
1526{
1527#ifdef USE_ASSERT_CHECKING
1528 Assert(operation->nblocks_done <= operation->nblocks);
1529 Assert(!is_complete || operation->nblocks == operation->nblocks_done);
1530
1531 for (int i = 0; i < operation->nblocks; i++)
1532 {
1533 Buffer buffer = operation->buffers[i];
1534 BufferDesc *buf_hdr = BufferIsLocal(buffer) ?
1537
1538 Assert(BufferGetBlockNumber(buffer) == operation->blocknum + i);
1540
1541 if (i < operation->nblocks_done)
1543 }
1544#endif
1545}
1546
1547/* helper for ReadBuffersCanStartIO(), to avoid repetition */
1548static inline bool
1550{
1551 if (BufferIsLocal(buffer))
1553 true, nowait);
1554 else
1555 return StartBufferIO(GetBufferDescriptor(buffer - 1), true, nowait);
1556}
1557
1558/*
1559 * Helper for AsyncReadBuffers that tries to get the buffer ready for IO.
1560 */
1561static inline bool
1563{
1564 /*
1565 * If this backend currently has staged IO, we need to submit the pending
1566 * IO before waiting for the right to issue IO, to avoid the potential for
1567 * deadlocks (and, more commonly, unnecessary delays for other backends).
1568 */
1569 if (!nowait && pgaio_have_staged())
1570 {
1572 return true;
1573
1574 /*
1575 * Unfortunately StartBufferIO() returning false doesn't allow us to
1576 * distinguish between the buffer already being valid and IO already
1577 * being in progress. Since IO already being in progress is quite
1578 * rare, this approach seems fine.
1579 */
1581 }
1582
1583 return ReadBuffersCanStartIOOnce(buffer, nowait);
1584}
1585
1586/*
1587 * Helper for WaitReadBuffers() that processes the results of a readv
1588 * operation, raising an error if necessary.
1589 */
1590static void
1592{
1593 PgAioReturn *aio_ret = &operation->io_return;
1594 PgAioResultStatus rs = aio_ret->result.status;
1595 int newly_read_blocks = 0;
1596
1597 Assert(pgaio_wref_valid(&operation->io_wref));
1598 Assert(aio_ret->result.status != PGAIO_RS_UNKNOWN);
1599
1600 /*
1601 * SMGR reports the number of blocks successfully read as the result of
1602 * the IO operation. Thus we can simply add that to ->nblocks_done.
1603 */
1604
1605 if (likely(rs != PGAIO_RS_ERROR))
1606 newly_read_blocks = aio_ret->result.result;
1607
1608 if (rs == PGAIO_RS_ERROR || rs == PGAIO_RS_WARNING)
1609 pgaio_result_report(aio_ret->result, &aio_ret->target_data,
1610 rs == PGAIO_RS_ERROR ? ERROR : WARNING);
1611 else if (aio_ret->result.status == PGAIO_RS_PARTIAL)
1612 {
1613 /*
1614 * We'll retry, so we just emit a debug message to the server log (or
1615 * not even that in prod scenarios).
1616 */
1617 pgaio_result_report(aio_ret->result, &aio_ret->target_data, DEBUG1);
1618 elog(DEBUG3, "partial read, will retry");
1619 }
1620
1621 Assert(newly_read_blocks > 0);
1622 Assert(newly_read_blocks <= MAX_IO_COMBINE_LIMIT);
1623
1624 operation->nblocks_done += newly_read_blocks;
1625
1626 Assert(operation->nblocks_done <= operation->nblocks);
1627}
1628
1629void
1631{
1632 PgAioReturn *aio_ret = &operation->io_return;
1633 IOContext io_context;
1634 IOObject io_object;
1635
1636 if (operation->persistence == RELPERSISTENCE_TEMP)
1637 {
1638 io_context = IOCONTEXT_NORMAL;
1639 io_object = IOOBJECT_TEMP_RELATION;
1640 }
1641 else
1642 {
1643 io_context = IOContextForStrategy(operation->strategy);
1644 io_object = IOOBJECT_RELATION;
1645 }
1646
1647 /*
1648 * If we get here without an IO operation having been issued, the
1649 * io_method == IOMETHOD_SYNC path must have been used. Otherwise the
1650 * caller should not have called WaitReadBuffers().
1651 *
1652 * In the case of IOMETHOD_SYNC, we start - as we used to before the
1653 * introduction of AIO - the IO in WaitReadBuffers(). This is done as part
1654 * of the retry logic below, no extra code is required.
1655 *
1656 * This path is expected to eventually go away.
1657 */
1658 if (!pgaio_wref_valid(&operation->io_wref) && io_method != IOMETHOD_SYNC)
1659 elog(ERROR, "waiting for read operation that didn't read");
1660
1661 /*
1662 * To handle partial reads, and IOMETHOD_SYNC, we re-issue IO until we're
1663 * done. We may need multiple retries, not just because we could get
1664 * multiple partial reads, but also because some of the remaining
1665 * to-be-read buffers may have been read in by other backends, limiting
1666 * the IO size.
1667 */
1668 while (true)
1669 {
1670 int ignored_nblocks_progress;
1671
1672 CheckReadBuffersOperation(operation, false);
1673
1674 /*
1675 * If there is an IO associated with the operation, we may need to
1676 * wait for it.
1677 */
1678 if (pgaio_wref_valid(&operation->io_wref))
1679 {
1680 /*
1681 * Track the time spent waiting for the IO to complete. As
1682 * tracking a wait even if we don't actually need to wait
1683 *
1684 * a) is not cheap, due to the timestamping overhead
1685 *
1686 * b) reports some time as waiting, even if we never waited
1687 *
1688 * we first check if we already know the IO is complete.
1689 */
1690 if (aio_ret->result.status == PGAIO_RS_UNKNOWN &&
1691 !pgaio_wref_check_done(&operation->io_wref))
1692 {
1694
1695 pgaio_wref_wait(&operation->io_wref);
1696
1697 /*
1698 * The IO operation itself was already counted earlier, in
1699 * AsyncReadBuffers(), this just accounts for the wait time.
1700 */
1701 pgstat_count_io_op_time(io_object, io_context, IOOP_READ,
1702 io_start, 0, 0);
1703 }
1704 else
1705 {
1706 Assert(pgaio_wref_check_done(&operation->io_wref));
1707 }
1708
1709 /*
1710 * We now are sure the IO completed. Check the results. This
1711 * includes reporting on errors if there were any.
1712 */
1713 ProcessReadBuffersResult(operation);
1714 }
1715
1716 /*
1717 * Most of the time, the one IO we already started, will read in
1718 * everything. But we need to deal with partial reads and buffers not
1719 * needing IO anymore.
1720 */
1721 if (operation->nblocks_done == operation->nblocks)
1722 break;
1723
1725
1726 /*
1727 * This may only complete the IO partially, either because some
1728 * buffers were already valid, or because of a partial read.
1729 *
1730 * NB: In contrast to after the AsyncReadBuffers() call in
1731 * StartReadBuffers(), we do *not* reduce
1732 * ReadBuffersOperation->nblocks here, callers expect the full
1733 * operation to be completed at this point (as more operations may
1734 * have been queued).
1735 */
1736 AsyncReadBuffers(operation, &ignored_nblocks_progress);
1737 }
1738
1739 CheckReadBuffersOperation(operation, true);
1740
1741 /* NB: READ_DONE tracepoint was already executed in completion callback */
1742}
1743
1744/*
1745 * Initiate IO for the ReadBuffersOperation
1746 *
1747 * This function only starts a single IO at a time. The size of the IO may be
1748 * limited to below the to-be-read blocks, if one of the buffers has
1749 * concurrently been read in. If the first to-be-read buffer is already valid,
1750 * no IO will be issued.
1751 *
1752 * To support retries after partial reads, the first operation->nblocks_done
1753 * buffers are skipped.
1754 *
1755 * On return *nblocks_progress is updated to reflect the number of buffers
1756 * affected by the call. If the first buffer is valid, *nblocks_progress is
1757 * set to 1 and operation->nblocks_done is incremented.
1758 *
1759 * Returns true if IO was initiated, false if no IO was necessary.
1760 */
1761static bool
1762AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress)
1763{
1764 Buffer *buffers = &operation->buffers[0];
1765 int flags = operation->flags;
1766 BlockNumber blocknum = operation->blocknum;
1767 ForkNumber forknum = operation->forknum;
1768 char persistence = operation->persistence;
1769 int16 nblocks_done = operation->nblocks_done;
1770 Buffer *io_buffers = &operation->buffers[nblocks_done];
1771 int io_buffers_len = 0;
1772 PgAioHandle *ioh;
1773 uint32 ioh_flags = 0;
1774 void *io_pages[MAX_IO_COMBINE_LIMIT];
1775 IOContext io_context;
1776 IOObject io_object;
1777 bool did_start_io;
1778
1779 /*
1780 * When this IO is executed synchronously, either because the caller will
1781 * immediately block waiting for the IO or because IOMETHOD_SYNC is used,
1782 * the AIO subsystem needs to know.
1783 */
1784 if (flags & READ_BUFFERS_SYNCHRONOUSLY)
1785 ioh_flags |= PGAIO_HF_SYNCHRONOUS;
1786
1787 if (persistence == RELPERSISTENCE_TEMP)
1788 {
1789 io_context = IOCONTEXT_NORMAL;
1790 io_object = IOOBJECT_TEMP_RELATION;
1791 ioh_flags |= PGAIO_HF_REFERENCES_LOCAL;
1792 }
1793 else
1794 {
1795 io_context = IOContextForStrategy(operation->strategy);
1796 io_object = IOOBJECT_RELATION;
1797 }
1798
1799 /*
1800 * If zero_damaged_pages is enabled, add the READ_BUFFERS_ZERO_ON_ERROR
1801 * flag. The reason for that is that, hopefully, zero_damaged_pages isn't
1802 * set globally, but on a per-session basis. The completion callback,
1803 * which may be run in other processes, e.g. in IO workers, may have a
1804 * different value of the zero_damaged_pages GUC.
1805 *
1806 * XXX: We probably should eventually use a different flag for
1807 * zero_damaged_pages, so we can report different log levels / error codes
1808 * for zero_damaged_pages and ZERO_ON_ERROR.
1809 */
1812
1813 /*
1814 * For the same reason as with zero_damaged_pages we need to use this
1815 * backend's ignore_checksum_failure value.
1816 */
1819
1820
1821 /*
1822 * To be allowed to report stats in the local completion callback we need
1823 * to prepare to report stats now. This ensures we can safely report the
1824 * checksum failure even in a critical section.
1825 */
1827
1828 /*
1829 * Get IO handle before ReadBuffersCanStartIO(), as pgaio_io_acquire()
1830 * might block, which we don't want after setting IO_IN_PROGRESS.
1831 *
1832 * If we need to wait for IO before we can get a handle, submit
1833 * already-staged IO first, so that other backends don't need to wait.
1834 * There wouldn't be a deadlock risk, as pgaio_io_acquire() just needs to
1835 * wait for already submitted IO, which doesn't require additional locks,
1836 * but it could still cause undesirable waits.
1837 *
1838 * A secondary benefit is that this would allow us to measure the time in
1839 * pgaio_io_acquire() without causing undue timer overhead in the common,
1840 * non-blocking, case. However, currently the pgstats infrastructure
1841 * doesn't really allow that, as it a) asserts that an operation can't
1842 * have time without operations b) doesn't have an API to report
1843 * "accumulated" time.
1844 */
1846 if (unlikely(!ioh))
1847 {
1849
1851 }
1852
1853 /*
1854 * Check if we can start IO on the first to-be-read buffer.
1855 *
1856 * If an I/O is already in progress in another backend, we want to wait
1857 * for the outcome: either done, or something went wrong and we will
1858 * retry.
1859 */
1860 if (!ReadBuffersCanStartIO(buffers[nblocks_done], false))
1861 {
1862 /*
1863 * Someone else has already completed this block, we're done.
1864 *
1865 * When IO is necessary, ->nblocks_done is updated in
1866 * ProcessReadBuffersResult(), but that is not called if no IO is
1867 * necessary. Thus update here.
1868 */
1869 operation->nblocks_done += 1;
1870 *nblocks_progress = 1;
1871
1872 pgaio_io_release(ioh);
1873 pgaio_wref_clear(&operation->io_wref);
1874 did_start_io = false;
1875
1876 /*
1877 * Report and track this as a 'hit' for this backend, even though it
1878 * must have started out as a miss in PinBufferForBlock(). The other
1879 * backend will track this as a 'read'.
1880 */
1881 TRACE_POSTGRESQL_BUFFER_READ_DONE(forknum, blocknum + operation->nblocks_done,
1882 operation->smgr->smgr_rlocator.locator.spcOid,
1883 operation->smgr->smgr_rlocator.locator.dbOid,
1884 operation->smgr->smgr_rlocator.locator.relNumber,
1885 operation->smgr->smgr_rlocator.backend,
1886 true);
1887
1888 if (persistence == RELPERSISTENCE_TEMP)
1890 else
1892
1893 if (operation->rel)
1894 pgstat_count_buffer_hit(operation->rel);
1895
1896 pgstat_count_io_op(io_object, io_context, IOOP_HIT, 1, 0);
1897
1898 if (VacuumCostActive)
1900 }
1901 else
1902 {
1903 instr_time io_start;
1904
1905 /* We found a buffer that we need to read in. */
1906 Assert(io_buffers[0] == buffers[nblocks_done]);
1907 io_pages[0] = BufferGetBlock(buffers[nblocks_done]);
1908 io_buffers_len = 1;
1909
1910 /*
1911 * How many neighboring-on-disk blocks can we scatter-read into other
1912 * buffers at the same time? In this case we don't wait if we see an
1913 * I/O already in progress. We already set BM_IO_IN_PROGRESS for the
1914 * head block, so we should get on with that I/O as soon as possible.
1915 */
1916 for (int i = nblocks_done + 1; i < operation->nblocks; i++)
1917 {
1918 if (!ReadBuffersCanStartIO(buffers[i], true))
1919 break;
1920 /* Must be consecutive block numbers. */
1921 Assert(BufferGetBlockNumber(buffers[i - 1]) ==
1922 BufferGetBlockNumber(buffers[i]) - 1);
1923 Assert(io_buffers[io_buffers_len] == buffers[i]);
1924
1925 io_pages[io_buffers_len++] = BufferGetBlock(buffers[i]);
1926 }
1927
1928 /* get a reference to wait for in WaitReadBuffers() */
1929 pgaio_io_get_wref(ioh, &operation->io_wref);
1930
1931 /* provide the list of buffers to the completion callbacks */
1932 pgaio_io_set_handle_data_32(ioh, (uint32 *) io_buffers, io_buffers_len);
1933
1935 persistence == RELPERSISTENCE_TEMP ?
1938 flags);
1939
1940 pgaio_io_set_flag(ioh, ioh_flags);
1941
1942 /* ---
1943 * Even though we're trying to issue IO asynchronously, track the time
1944 * in smgrstartreadv():
1945 * - if io_method == IOMETHOD_SYNC, we will always perform the IO
1946 * immediately
1947 * - the io method might not support the IO (e.g. worker IO for a temp
1948 * table)
1949 * ---
1950 */
1952 smgrstartreadv(ioh, operation->smgr, forknum,
1953 blocknum + nblocks_done,
1954 io_pages, io_buffers_len);
1955 pgstat_count_io_op_time(io_object, io_context, IOOP_READ,
1956 io_start, 1, io_buffers_len * BLCKSZ);
1957
1958 if (persistence == RELPERSISTENCE_TEMP)
1959 pgBufferUsage.local_blks_read += io_buffers_len;
1960 else
1961 pgBufferUsage.shared_blks_read += io_buffers_len;
1962
1963 /*
1964 * Track vacuum cost when issuing IO, not after waiting for it.
1965 * Otherwise we could end up issuing a lot of IO in a short timespan,
1966 * despite a low cost limit.
1967 */
1968 if (VacuumCostActive)
1969 VacuumCostBalance += VacuumCostPageMiss * io_buffers_len;
1970
1971 *nblocks_progress = io_buffers_len;
1972 did_start_io = true;
1973 }
1974
1975 return did_start_io;
1976}
1977
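/*
 * Illustrative sketch, not part of bufmgr.c: how a caller might drive the
 * two-step read API (StartReadBuffers()/WaitReadBuffers()) that the
 * IO-issuing code above serves.  Initialization of "op" (relation, fork,
 * strategy) is elided, and "buffers", "blkno" and "nblocks" are
 * hypothetical; see read_stream.c for a real caller.
 *
 *		if (StartReadBuffers(&op, buffers, blkno, &nblocks, 0))
 *			WaitReadBuffers(&op);
 *
 *		(at this point all "nblocks" buffers are pinned and valid)
 */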
1978/*
1979 * BufferAlloc -- subroutine for PinBufferForBlock. Handles lookup of a shared
1980 * buffer. If no buffer exists already, selects a replacement victim and
1981 * evicts the old page, but does NOT read in new page.
1982 *
1983 * "strategy" can be a buffer replacement strategy object, or NULL for
1984 * the default strategy. The selected buffer's usage_count is advanced when
1985 * using the default strategy, but otherwise possibly not (see PinBuffer).
1986 *
1987 * The returned buffer is pinned and is already marked as holding the
1988 * desired page. If it already did have the desired page, *foundPtr is
1989 * set true. Otherwise, *foundPtr is set false.
1990 *
1991 * io_context is determined by the caller via IOContextForStrategy() and is
1992 * used here only when a victim buffer must be acquired for a miss; it is
1993 * not consulted on a shared buffers hit.
1994 *
1995 * No locks are held either at entry or exit.
1996 */
1998BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
1999 BlockNumber blockNum,
2000 BufferAccessStrategy strategy,
2001 bool *foundPtr, IOContext io_context)
2002{
2003 BufferTag newTag; /* identity of requested block */
2004 uint32 newHash; /* hash value for newTag */
2005 LWLock *newPartitionLock; /* buffer partition lock for it */
2006 int existing_buf_id;
2007 Buffer victim_buffer;
2008 BufferDesc *victim_buf_hdr;
2009 uint32 victim_buf_state;
2010
2011 /* Make sure we will have room to remember the buffer pin */
2014
2015 /* create a tag so we can lookup the buffer */
2016 InitBufferTag(&newTag, &smgr->smgr_rlocator.locator, forkNum, blockNum);
2017
2018 /* determine its hash code and partition lock ID */
2019 newHash = BufTableHashCode(&newTag);
2020 newPartitionLock = BufMappingPartitionLock(newHash);
2021
2022 /* see if the block is in the buffer pool already */
2023 LWLockAcquire(newPartitionLock, LW_SHARED);
2024 existing_buf_id = BufTableLookup(&newTag, newHash);
2025 if (existing_buf_id >= 0)
2026 {
2027 BufferDesc *buf;
2028 bool valid;
2029
2030 /*
2031 * Found it. Now, pin the buffer so no one can steal it from the
2032 * buffer pool, and check to see if the correct data has been loaded
2033 * into the buffer.
2034 */
2035 buf = GetBufferDescriptor(existing_buf_id);
2036
2037 valid = PinBuffer(buf, strategy);
2038
2039 /* Can release the mapping lock as soon as we've pinned it */
2040 LWLockRelease(newPartitionLock);
2041
2042 *foundPtr = true;
2043
2044 if (!valid)
2045 {
2046 /*
2047 * We can only get here if (a) someone else is still reading in
2048 * the page, (b) a previous read attempt failed, or (c) someone
2049 * called StartReadBuffers() but not yet WaitReadBuffers().
2050 */
2051 *foundPtr = false;
2052 }
2053
2054 return buf;
2055 }
2056
2057 /*
2058 * Didn't find it in the buffer pool. We'll have to initialize a new
2059 * buffer. Remember to unlock the mapping lock while doing the work.
2060 */
2061 LWLockRelease(newPartitionLock);
2062
2063 /*
2064 * Acquire a victim buffer. Somebody else might try to do the same, as we
2065 * don't hold any conflicting locks. If so, we'll have to undo our work
2066 * later.
2067 */
2068 victim_buffer = GetVictimBuffer(strategy, io_context);
2069 victim_buf_hdr = GetBufferDescriptor(victim_buffer - 1);
2070
2071 /*
2072 * Try to make a hashtable entry for the buffer under its new tag. If
2073 * somebody else inserted another buffer for the tag, we'll release the
2074 * victim buffer we acquired and use the already inserted one.
2075 */
2076 LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
2077 existing_buf_id = BufTableInsert(&newTag, newHash, victim_buf_hdr->buf_id);
2078 if (existing_buf_id >= 0)
2079 {
2080 BufferDesc *existing_buf_hdr;
2081 bool valid;
2082
2083 /*
2084 * Got a collision. Someone has already done what we were about to do.
2085 * We'll just handle this as if it were found in the buffer pool in
2086 * the first place. First, give up the buffer we were planning to
2087 * use.
2088 *
2089 * We could do this after releasing the partition lock, but then we'd
2090 * have to call ResourceOwnerEnlarge() & ReservePrivateRefCountEntry()
2091 * before acquiring the lock, for the rare case of such a collision.
2092 */
2093 UnpinBuffer(victim_buf_hdr);
2094
2095 /*
2096 * The victim buffer we acquired previously is clean and unused, let
2097 * it be found again quickly
2098 */
2099 StrategyFreeBuffer(victim_buf_hdr);
2100
2101 /* remaining code should match code at top of routine */
2102
2103 existing_buf_hdr = GetBufferDescriptor(existing_buf_id);
2104
2105 valid = PinBuffer(existing_buf_hdr, strategy);
2106
2107 /* Can release the mapping lock as soon as we've pinned it */
2108 LWLockRelease(newPartitionLock);
2109
2110 *foundPtr = true;
2111
2112 if (!valid)
2113 {
2114 /*
2115 * We can only get here if (a) someone else is still reading in
2116 * the page, (b) a previous read attempt failed, or (c) someone
2117 * called StartReadBuffers() but not yet WaitReadBuffers().
2118 */
2119 *foundPtr = false;
2120 }
2121
2122 return existing_buf_hdr;
2123 }
2124
2125 /*
2126 * Need to lock the buffer header too in order to change its tag.
2127 */
2128 victim_buf_state = LockBufHdr(victim_buf_hdr);
2129
2130 /* some sanity checks while we hold the buffer header lock */
2131 Assert(BUF_STATE_GET_REFCOUNT(victim_buf_state) == 1);
2132 Assert(!(victim_buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY | BM_IO_IN_PROGRESS)));
2133
2134 victim_buf_hdr->tag = newTag;
2135
2136 /*
2137 * Make sure BM_PERMANENT is set for buffers that must be written at every
2138 * checkpoint. Unlogged buffers only need to be written at shutdown
2139 * checkpoints, except for their "init" forks, which need to be treated
2140 * just like permanent relations.
2141 */
2142 victim_buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
2143 if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM)
2144 victim_buf_state |= BM_PERMANENT;
2145
2146 UnlockBufHdr(victim_buf_hdr, victim_buf_state);
2147
2148 LWLockRelease(newPartitionLock);
2149
2150 /*
2151 * Buffer contents are currently invalid.
2152 */
2153 *foundPtr = false;
2154
2155 return victim_buf_hdr;
2156}
2157
2158/*
2159 * InvalidateBuffer -- mark a shared buffer invalid and return it to the
2160 * freelist.
2161 *
2162 * The buffer header spinlock must be held at entry. We drop it before
2163 * returning. (This is sane because the caller must have locked the
2164 * buffer in order to be sure it should be dropped.)
2165 *
2166 * This is used only in contexts such as dropping a relation. We assume
2167 * that no other backend could possibly be interested in using the page,
2168 * so the only reason the buffer might be pinned is if someone else is
2169 * trying to write it out. We have to let them finish before we can
2170 * reclaim the buffer.
2171 *
2172 * The buffer could get reclaimed by someone else while we are waiting
2173 * to acquire the necessary locks; if so, don't mess it up.
2174 */
2175static void
2177{
2178 BufferTag oldTag;
2179 uint32 oldHash; /* hash value for oldTag */
2180 LWLock *oldPartitionLock; /* buffer partition lock for it */
2181 uint32 oldFlags;
2182 uint32 buf_state;
2183
2184 /* Save the original buffer tag before dropping the spinlock */
2185 oldTag = buf->tag;
2186
2187 buf_state = pg_atomic_read_u32(&buf->state);
2188 Assert(buf_state & BM_LOCKED);
2189 UnlockBufHdr(buf, buf_state);
2190
2191 /*
2192 * Need to compute the old tag's hashcode and partition lock ID. XXX is it
2193 * worth storing the hashcode in BufferDesc so we need not recompute it
2194 * here? Probably not.
2195 */
2196 oldHash = BufTableHashCode(&oldTag);
2197 oldPartitionLock = BufMappingPartitionLock(oldHash);
2198
2199retry:
2200
2201 /*
2202 * Acquire exclusive mapping lock in preparation for changing the buffer's
2203 * association.
2204 */
2205 LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
2206
2207 /* Re-lock the buffer header */
2208 buf_state = LockBufHdr(buf);
2209
2210 /* If it's changed while we were waiting for lock, do nothing */
2211 if (!BufferTagsEqual(&buf->tag, &oldTag))
2212 {
2213 UnlockBufHdr(buf, buf_state);
2214 LWLockRelease(oldPartitionLock);
2215 return;
2216 }
2217
2218 /*
2219 * We assume the reason for it to be pinned is that either we were
2220 * asynchronously reading the page in before erroring out or someone else
2221 * is flushing the page out. Wait for the IO to finish. (This could be
2222 * an infinite loop if the refcount is messed up... it would be nice to
2223 * time out after awhile, but there seems no way to be sure how many loops
2224 * may be needed. Note that if the other guy has pinned the buffer but
2225 * not yet done StartBufferIO, WaitIO will fall through and we'll
2226 * effectively be busy-looping here.)
2227 */
2228 if (BUF_STATE_GET_REFCOUNT(buf_state) != 0)
2229 {
2230 UnlockBufHdr(buf, buf_state);
2231 LWLockRelease(oldPartitionLock);
2232 /* safety check: should definitely not be our *own* pin */
2234 elog(ERROR, "buffer is pinned in InvalidateBuffer");
2235 WaitIO(buf);
2236 goto retry;
2237 }
2238
2239 /*
2240 * Clear out the buffer's tag and flags. We must do this to ensure that
2241 * linear scans of the buffer array don't think the buffer is valid.
2242 */
2243 oldFlags = buf_state & BUF_FLAG_MASK;
2244 ClearBufferTag(&buf->tag);
2245 buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
2246 UnlockBufHdr(buf, buf_state);
2247
2248 /*
2249 * Remove the buffer from the lookup hashtable, if it was in there.
2250 */
2251 if (oldFlags & BM_TAG_VALID)
2252 BufTableDelete(&oldTag, oldHash);
2253
2254 /*
2255 * Done with mapping lock.
2256 */
2257 LWLockRelease(oldPartitionLock);
2258
2259 /*
2260 * Insert the buffer at the head of the list of free buffers.
2261 */
2263}
2264
2265/*
2266 * Helper routine for GetVictimBuffer()
2267 *
2268 * Needs to be called on a buffer with a valid tag, pinned, but without the
2269 * buffer header spinlock held.
2270 *
2271 * Returns true if the buffer can be reused, in which case the buffer is only
2272 * pinned by this backend and marked as invalid, false otherwise.
2273 */
2274static bool
2276{
2277 uint32 buf_state;
2278 uint32 hash;
2279 LWLock *partition_lock;
2280 BufferTag tag;
2281
2283
2284 /* have buffer pinned, so it's safe to read tag without lock */
2285 tag = buf_hdr->tag;
2286
2287 hash = BufTableHashCode(&tag);
2288 partition_lock = BufMappingPartitionLock(hash);
2289
2290 LWLockAcquire(partition_lock, LW_EXCLUSIVE);
2291
2292 /* lock the buffer header */
2293 buf_state = LockBufHdr(buf_hdr);
2294
2295 /*
2296 * We have the buffer pinned, so nobody else should have been able to
2297 * unset this concurrently.
2298 */
2299 Assert(buf_state & BM_TAG_VALID);
2300 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
2301 Assert(BufferTagsEqual(&buf_hdr->tag, &tag));
2302
2303 /*
2304 * If somebody else pinned the buffer since, or even worse, dirtied it,
2305 * give up on this buffer: It's clearly in use.
2306 */
2307 if (BUF_STATE_GET_REFCOUNT(buf_state) != 1 || (buf_state & BM_DIRTY))
2308 {
2309 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
2310
2311 UnlockBufHdr(buf_hdr, buf_state);
2312 LWLockRelease(partition_lock);
2313
2314 return false;
2315 }
2316
2317 /*
2318 * Clear out the buffer's tag and flags and usagecount. This is not
2319 * strictly required, as BM_TAG_VALID/BM_VALID needs to be checked before
2320 * doing anything with the buffer. But currently it's beneficial, as the
2321 * cheaper pre-check used by several linear scans of shared buffers relies
2322 * on the tag (see e.g. FlushDatabaseBuffers()).
2323 */
2324 ClearBufferTag(&buf_hdr->tag);
2325 buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
2326 UnlockBufHdr(buf_hdr, buf_state);
2327
2328 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
2329
2330 /* finally delete buffer from the buffer mapping table */
2331 BufTableDelete(&tag, hash);
2332
2333 LWLockRelease(partition_lock);
2334
2335 Assert(!(buf_state & (BM_DIRTY | BM_VALID | BM_TAG_VALID)));
2336 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
2338
2339 return true;
2340}
2341
2342static Buffer
2344{
2345 BufferDesc *buf_hdr;
2346 Buffer buf;
2347 uint32 buf_state;
2348 bool from_ring;
2349
2350 /*
2351 * Ensure, while the spinlock's not yet held, that there's a free refcount
2352 * entry, and a resource owner slot for the pin.
2353 */
2356
2357 /* we return here if a prospective victim buffer gets used concurrently */
2358again:
2359
2360 /*
2361 * Select a victim buffer. The buffer is returned with its header
2362 * spinlock still held!
2363 */
2364 buf_hdr = StrategyGetBuffer(strategy, &buf_state, &from_ring);
2365 buf = BufferDescriptorGetBuffer(buf_hdr);
2366
2367 Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 0);
2368
2369 /* Pin the buffer and then release the buffer spinlock */
2370 PinBuffer_Locked(buf_hdr);
2371
2372 /*
2373 * We shouldn't have any other pins for this buffer.
2374 */
2376
2377 /*
2378 * If the buffer was dirty, try to write it out. There is a race
2379 * condition here, in that someone might dirty it after we released the
2380 * buffer header lock above, or even while we are writing it out (since
2381 * our share-lock won't prevent hint-bit updates). We will recheck the
2382 * dirty bit after re-locking the buffer header.
2383 */
2384 if (buf_state & BM_DIRTY)
2385 {
2386 LWLock *content_lock;
2387
2388 Assert(buf_state & BM_TAG_VALID);
2389 Assert(buf_state & BM_VALID);
2390
2391 /*
2392 * We need a share-lock on the buffer contents to write it out (else
2393 * we might write invalid data, eg because someone else is compacting
2394 * the page contents while we write). We must use a conditional lock
2395 * acquisition here to avoid deadlock. Even though the buffer was not
2396 * pinned (and therefore surely not locked) when StrategyGetBuffer
2397 * returned it, someone else could have pinned and exclusive-locked it
2398 * by the time we get here. If we try to get the lock unconditionally,
2399 * we'd block waiting for them; if they later block waiting for us,
2400 * deadlock ensues. (This has been observed to happen when two
2401 * backends are both trying to split btree index pages, and the second
2402 * one just happens to be trying to split the page the first one got
2403 * from StrategyGetBuffer.)
2404 */
2405 content_lock = BufferDescriptorGetContentLock(buf_hdr);
2406 if (!LWLockConditionalAcquire(content_lock, LW_SHARED))
2407 {
2408 /*
2409 * Someone else has locked the buffer, so give it up and loop back
2410 * to get another one.
2411 */
2412 UnpinBuffer(buf_hdr);
2413 goto again;
2414 }
2415
2416 /*
2417 * If using a nondefault strategy, and writing the buffer would
2418 * require a WAL flush, let the strategy decide whether to go ahead
2419 * and write/reuse the buffer or to choose another victim. We need a
2420 * lock to inspect the page LSN, so this can't be done inside
2421 * StrategyGetBuffer.
2422 */
2423 if (strategy != NULL)
2424 {
2425 XLogRecPtr lsn;
2426
2427 /* Read the LSN while holding buffer header lock */
2428 buf_state = LockBufHdr(buf_hdr);
2429 lsn = BufferGetLSN(buf_hdr);
2430 UnlockBufHdr(buf_hdr, buf_state);
2431
2432 if (XLogNeedsFlush(lsn)
2433 && StrategyRejectBuffer(strategy, buf_hdr, from_ring))
2434 {
2435 LWLockRelease(content_lock);
2436 UnpinBuffer(buf_hdr);
2437 goto again;
2438 }
2439 }
2440
2441 /* OK, do the I/O */
2442 FlushBuffer(buf_hdr, NULL, IOOBJECT_RELATION, io_context);
2443 LWLockRelease(content_lock);
2444
2446 &buf_hdr->tag);
2447 }
2448
2449
2450 if (buf_state & BM_VALID)
2451 {
2452 /*
2453 * When a BufferAccessStrategy is in use, blocks evicted from shared
2454 * buffers are counted as IOOP_EVICT in the corresponding context
2455 * (e.g. IOCONTEXT_BULKWRITE). Shared buffers are evicted by a
2456 * strategy in two cases: 1) while initially claiming buffers for the
2457 * strategy ring 2) to replace an existing strategy ring buffer
2458 * because it is pinned or in use and cannot be reused.
2459 *
2460 * Blocks evicted from buffers already in the strategy ring are
2461 * counted as IOOP_REUSE in the corresponding strategy context.
2462 *
2463 * At this point, we can accurately count evictions and reuses,
2464 * because we have successfully claimed the valid buffer. Previously,
2465 * we may have been forced to release the buffer due to concurrent
2466 * pinners or erroring out.
2467 */
2469 from_ring ? IOOP_REUSE : IOOP_EVICT, 1, 0);
2470 }
2471
2472 /*
2473 * If the buffer has an entry in the buffer mapping table, delete it. This
2474 * can fail because another backend could have pinned or dirtied the
2475 * buffer.
2476 */
2477 if ((buf_state & BM_TAG_VALID) && !InvalidateVictimBuffer(buf_hdr))
2478 {
2479 UnpinBuffer(buf_hdr);
2480 goto again;
2481 }
2482
2483 /* a final set of sanity checks */
2484#ifdef USE_ASSERT_CHECKING
2485 buf_state = pg_atomic_read_u32(&buf_hdr->state);
2486
2487 Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1);
2488 Assert(!(buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY)));
2489
2491#endif
2492
2493 return buf;
2494}
2495
2496/*
2497 * Return the maximum number of buffers that a backend should try to pin at
2498 * once, to avoid exceeding its fair share. This is the highest value that
2499 * GetAdditionalPinLimit() could ever return. Note that it may be zero on a
2500 * system with a very small buffer pool relative to max_connections.
2501 */
2502uint32
2504{
2505 return MaxProportionalPins;
2506}
2507
2508/*
2509 * Return the maximum number of additional buffers that this backend should
2510 * pin if it wants to stay under the per-backend limit, considering the number
2511 * of buffers it has already pinned. Unlike LimitAdditionalPins(), the limit
2512 * returned by this function can be zero.
2513 */
2514uint32
2516{
2517 uint32 estimated_pins_held;
2518
2519 /*
2520 * We get the number of "overflowed" pins for free, but don't know the
2521 * number of pins in PrivateRefCountArray. The cost of calculating that
2522 * exactly doesn't seem worth it, so just assume the max.
2523 */
2524 estimated_pins_held = PrivateRefCountOverflowed + REFCOUNT_ARRAY_ENTRIES;
2525
2526 /* Is this backend already holding more than its fair share? */
2527 if (estimated_pins_held > MaxProportionalPins)
2528 return 0;
2529
2530 return MaxProportionalPins - estimated_pins_held;
2531}
2532
2533/*
2534 * Limit the number of pins a batch operation may additionally acquire, to
2535 * avoid running out of pinnable buffers.
2536 *
2537 * One additional pin is always allowed, on the assumption that the operation
2538 * requires at least one to make progress.
2539 */
2540void
2542{
2543 uint32 limit;
2544
2545 if (*additional_pins <= 1)
2546 return;
2547
2548 limit = GetAdditionalPinLimit();
2549 limit = Max(limit, 1);
2550 if (limit < *additional_pins)
2551 *additional_pins = limit;
2552}
2553
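/*
 * Illustrative sketch, not part of bufmgr.c: how a batch operation might use
 * the pin-limit helpers above.  "desired" is a hypothetical batch size of at
 * least one buffer.
 *
 *		uint32		npins = desired;
 *
 *		LimitAdditionalPins(&npins);
 *		Assert(npins >= 1 && npins <= desired);
 *		(pin and process at most npins buffers in this batch)
 *
 * Callers whose pin count grows and shrinks over time can instead poll
 * GetAdditionalPinLimit() as they go.
 */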
2554/*
2555 * Logic shared between ExtendBufferedRelBy() and ExtendBufferedRelTo(), just
2556 * to avoid duplicating the tracing and relpersistence-related logic.
2557 */
2558static BlockNumber
2560 ForkNumber fork,
2561 BufferAccessStrategy strategy,
2562 uint32 flags,
2563 uint32 extend_by,
2564 BlockNumber extend_upto,
2565 Buffer *buffers,
2566 uint32 *extended_by)
2567{
2568 BlockNumber first_block;
2569
2570 TRACE_POSTGRESQL_BUFFER_EXTEND_START(fork,
2575 extend_by);
2576
2577 if (bmr.relpersistence == RELPERSISTENCE_TEMP)
2578 first_block = ExtendBufferedRelLocal(bmr, fork, flags,
2579 extend_by, extend_upto,
2580 buffers, &extend_by);
2581 else
2582 first_block = ExtendBufferedRelShared(bmr, fork, strategy, flags,
2583 extend_by, extend_upto,
2584 buffers, &extend_by);
2585 *extended_by = extend_by;
2586
2587 TRACE_POSTGRESQL_BUFFER_EXTEND_DONE(fork,
2592 *extended_by,
2593 first_block);
2594
2595 return first_block;
2596}
2597
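/*
 * Illustrative sketch, not part of bufmgr.c: extending a relation via
 * ExtendBufferedRelBy(), which funnels into the common logic above.  "rel"
 * is a hypothetical open relation; error handling is omitted.
 *
 *		Buffer		bufs[16];
 *		uint32		extended_by = 0;
 *		BlockNumber first_new;
 *
 *		first_new = ExtendBufferedRelBy(BMR_REL(rel), MAIN_FORKNUM,
 *										NULL, EB_LOCK_FIRST, 16,
 *										bufs, &extended_by);
 *
 * extended_by may come back smaller than requested if LimitAdditionalPins()
 * clamped the batch, and the first returned buffer is exclusive-locked
 * because of EB_LOCK_FIRST.
 */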
2598/*
2599 * Implementation of ExtendBufferedRelBy() and ExtendBufferedRelTo() for
2600 * shared buffers.
2601 */
2602static BlockNumber
2604 ForkNumber fork,
2605 BufferAccessStrategy strategy,
2606 uint32 flags,
2607 uint32 extend_by,
2608 BlockNumber extend_upto,
2609 Buffer *buffers,
2610 uint32 *extended_by)
2611{
2612 BlockNumber first_block;
2613 IOContext io_context = IOContextForStrategy(strategy);
2614 instr_time io_start;
2615
2616 LimitAdditionalPins(&extend_by);
2617
2618 /*
2619 * Acquire victim buffers for extension without holding extension lock.
2620 * Writing out victim buffers is the most expensive part of extending the
2621 * relation, particularly when doing so requires WAL flushes. Zeroing out
2622 * the buffers is also quite expensive, so do that before holding the
2623 * extension lock as well.
2624 *
2625 * These pages are pinned by us and not valid. While we hold the pin they
2626 * can't be acquired as victim buffers by another backend.
2627 */
2628 for (uint32 i = 0; i < extend_by; i++)
2629 {
2630 Block buf_block;
2631
2632 buffers[i] = GetVictimBuffer(strategy, io_context);
2633 buf_block = BufHdrGetBlock(GetBufferDescriptor(buffers[i] - 1));
2634
2635 /* new buffers are zero-filled */
2636 MemSet(buf_block, 0, BLCKSZ);
2637 }
2638
2639 /*
2640 * Lock relation against concurrent extensions, unless requested not to.
2641 *
2642 * We use the same extension lock for all forks. That's unnecessarily
2643 * restrictive, but currently extensions for forks don't happen often
2644 * enough to make it worth locking more granularly.
2645 *
2646 * Note that another backend might have extended the relation by the time
2647 * we get the lock.
2648 */
2649 if (!(flags & EB_SKIP_EXTENSION_LOCK))
2651
2652 /*
2653 * If requested, invalidate size cache, so that smgrnblocks asks the
2654 * kernel.
2655 */
2656 if (flags & EB_CLEAR_SIZE_CACHE)
2658
2659 first_block = smgrnblocks(bmr.smgr, fork);
2660
2661 /*
2662 * Now that we have the accurate relation size, check if the caller wants
2663 * us to extend to only up to a specific size. If there were concurrent
2664 * extensions, we might have acquired too many buffers and need to release
2665 * them.
2666 */
2667 if (extend_upto != InvalidBlockNumber)
2668 {
2669 uint32 orig_extend_by = extend_by;
2670
2671 if (first_block > extend_upto)
2672 extend_by = 0;
2673 else if ((uint64) first_block + extend_by > extend_upto)
2674 extend_by = extend_upto - first_block;
2675
2676 for (uint32 i = extend_by; i < orig_extend_by; i++)
2677 {
2678 BufferDesc *buf_hdr = GetBufferDescriptor(buffers[i] - 1);
2679
2680 /*
2681 * The victim buffer we acquired previously is clean and unused,
2682 * let it be found again quickly
2683 */
2684 StrategyFreeBuffer(buf_hdr);
2685 UnpinBuffer(buf_hdr);
2686 }
2687
2688 if (extend_by == 0)
2689 {
2690 if (!(flags & EB_SKIP_EXTENSION_LOCK))
2692 *extended_by = extend_by;
2693 return first_block;
2694 }
2695 }
2696
2697 /* Fail if relation is already at maximum possible length */
2698 if ((uint64) first_block + extend_by >= MaxBlockNumber)
2699 ereport(ERROR,
2700 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
2701 errmsg("cannot extend relation %s beyond %u blocks",
2702 relpath(bmr.smgr->smgr_rlocator, fork).str,
2703 MaxBlockNumber)));
2704
2705 /*
2706 * Insert buffers into buffer table, mark as IO_IN_PROGRESS.
2707 *
2708 * This needs to happen before we extend the relation, because as soon as
2709 * we do, other backends can start to read in those pages.
2710 */
2711 for (uint32 i = 0; i < extend_by; i++)
2712 {
2713 Buffer victim_buf = buffers[i];
2714 BufferDesc *victim_buf_hdr = GetBufferDescriptor(victim_buf - 1);
2715 BufferTag tag;
2716 uint32 hash;
2717 LWLock *partition_lock;
2718 int existing_id;
2719
2720 /* in case we need to pin an existing buffer below */
2723
2724 InitBufferTag(&tag, &bmr.smgr->smgr_rlocator.locator, fork, first_block + i);
2725 hash = BufTableHashCode(&tag);
2726 partition_lock = BufMappingPartitionLock(hash);
2727
2728 LWLockAcquire(partition_lock, LW_EXCLUSIVE);
2729
2730 existing_id = BufTableInsert(&tag, hash, victim_buf_hdr->buf_id);
2731
2732 /*
2733 * We get here only in the corner case where we are trying to extend
2734 * the relation but we found a pre-existing buffer. This can happen
2735 * because a prior attempt at extending the relation failed, and
2736 * because mdread doesn't complain about reads beyond EOF (when
2737 * zero_damaged_pages is ON) and so a previous attempt to read a block
2738 * beyond EOF could have left a "valid" zero-filled buffer.
2739 * Unfortunately, we have also seen this case occurring because of
2740 * buggy Linux kernels that sometimes return an lseek(SEEK_END) result
2741 * that doesn't account for a recent write. In that situation, the
2742 * pre-existing buffer would contain valid data that we don't want to
2743 * overwrite. Since the legitimate cases should always have left a
2744 * zero-filled buffer, complain if not PageIsNew.
2745 */
2746 if (existing_id >= 0)
2747 {
2748 BufferDesc *existing_hdr = GetBufferDescriptor(existing_id);
2749 Block buf_block;
2750 bool valid;
2751
2752 /*
2753 * Pin the existing buffer before releasing the partition lock,
2754 * preventing it from being evicted.
2755 */
2756 valid = PinBuffer(existing_hdr, strategy);
2757
2758 LWLockRelease(partition_lock);
2759
2760 /*
2761 * The victim buffer we acquired previously is clean and unused,
2762 * let it be found again quickly
2763 */
2764 StrategyFreeBuffer(victim_buf_hdr);
2765 UnpinBuffer(victim_buf_hdr);
2766
2767 buffers[i] = BufferDescriptorGetBuffer(existing_hdr);
2768 buf_block = BufHdrGetBlock(existing_hdr);
2769
2770 if (valid && !PageIsNew((Page) buf_block))
2771 ereport(ERROR,
2772 (errmsg("unexpected data beyond EOF in block %u of relation %s",
2773 existing_hdr->tag.blockNum,
2774 relpath(bmr.smgr->smgr_rlocator, fork).str),
2775 errhint("This has been seen to occur with buggy kernels; consider updating your system.")));
2776
2777 /*
2778 * We *must* do smgr[zero]extend before succeeding, else the page
2779 * will not be reserved by the kernel, and the next P_NEW call
2780 * will decide to return the same page. Clear the BM_VALID bit,
2781 * do StartBufferIO() and proceed.
2782 *
2783 * Loop to handle the very small possibility that someone re-sets
2784 * BM_VALID between our clearing it and StartBufferIO inspecting
2785 * it.
2786 */
2787 do
2788 {
2789 uint32 buf_state = LockBufHdr(existing_hdr);
2790
2791 buf_state &= ~BM_VALID;
2792 UnlockBufHdr(existing_hdr, buf_state);
2793 } while (!StartBufferIO(existing_hdr, true, false));
2794 }
2795 else
2796 {
2797 uint32 buf_state;
2798
2799 buf_state = LockBufHdr(victim_buf_hdr);
2800
2801 /* some sanity checks while we hold the buffer header lock */
2802 Assert(!(buf_state & (BM_VALID | BM_TAG_VALID | BM_DIRTY | BM_JUST_DIRTIED)));
2803 Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1);
2804
2805 victim_buf_hdr->tag = tag;
2806
2807 buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
2808 if (bmr.relpersistence == RELPERSISTENCE_PERMANENT || fork == INIT_FORKNUM)
2809 buf_state |= BM_PERMANENT;
2810
2811 UnlockBufHdr(victim_buf_hdr, buf_state);
2812
2813 LWLockRelease(partition_lock);
2814
2815 /* XXX: could combine the locked operations in it with the above */
2816 StartBufferIO(victim_buf_hdr, true, false);
2817 }
2818 }
2819
2821
2822 /*
2823 * Note: if smgrzeroextend fails, we will end up with buffers that are
2824 * allocated but not marked BM_VALID. The next relation extension will
2825 * still select the same block number (because the relation didn't get any
2826 * longer on disk) and so future attempts to extend the relation will find
2827 * the same buffers (if they have not been recycled) but come right back
2828 * here to try smgrzeroextend again.
2829 *
2830 * We don't need to set checksum for all-zero pages.
2831 */
2832 smgrzeroextend(bmr.smgr, fork, first_block, extend_by, false);
2833
2834 /*
2835 * Release the file-extension lock; it's now OK for someone else to extend
2836 * the relation some more.
2837 *
2838 * We remove IO_IN_PROGRESS after this, as waking up waiting backends can
2839 * take noticeable time.
2840 */
2841 if (!(flags & EB_SKIP_EXTENSION_LOCK))
2843
2845 io_start, 1, extend_by * BLCKSZ);
2846
2847 /* Set BM_VALID, terminate IO, and wake up any waiters */
2848 for (uint32 i = 0; i < extend_by; i++)
2849 {
2850 Buffer buf = buffers[i];
2851 BufferDesc *buf_hdr = GetBufferDescriptor(buf - 1);
2852 bool lock = false;
2853
2854 if (flags & EB_LOCK_FIRST && i == 0)
2855 lock = true;
2856 else if (flags & EB_LOCK_TARGET)
2857 {
2858 Assert(extend_upto != InvalidBlockNumber);
2859 if (first_block + i + 1 == extend_upto)
2860 lock = true;
2861 }
2862
2863 if (lock)
2865
2866 TerminateBufferIO(buf_hdr, false, BM_VALID, true, false);
2867 }
2868
2870
2871 *extended_by = extend_by;
2872
2873 return first_block;
2874}
2875
2876/*
2877 * BufferIsExclusiveLocked
2878 *
2879 * Checks if buffer is exclusive-locked.
2880 *
2881 * Buffer must be pinned.
2882 */
2883bool
2885{
2886 BufferDesc *bufHdr;
2887
2889
2890 if (BufferIsLocal(buffer))
2891 {
2892 /* Content locks are not maintained for local buffers. */
2893 return true;
2894 }
2895 else
2896 {
2897 bufHdr = GetBufferDescriptor(buffer - 1);
2899 LW_EXCLUSIVE);
2900 }
2901}
2902
2903/*
2904 * BufferIsDirty
2905 *
2906 * Checks if buffer is already dirty.
2907 *
2908 * Buffer must be pinned and exclusive-locked. (Without an exclusive lock,
2909 * the result may be stale before it's returned.)
2910 */
2911bool
2913{
2914 BufferDesc *bufHdr;
2915
2917
2918 if (BufferIsLocal(buffer))
2919 {
2920 int bufid = -buffer - 1;
2921
2922 bufHdr = GetLocalBufferDescriptor(bufid);
2923 /* Content locks are not maintained for local buffers. */
2924 }
2925 else
2926 {
2927 bufHdr = GetBufferDescriptor(buffer - 1);
2929 LW_EXCLUSIVE));
2930 }
2931
2932 return pg_atomic_read_u32(&bufHdr->state) & BM_DIRTY;
2933}
2934
2935/*
2936 * MarkBufferDirty
2937 *
2938 * Marks buffer contents as dirty (actual write happens later).
2939 *
2940 * Buffer must be pinned and exclusive-locked. (If caller does not hold
2941 * exclusive lock, then somebody could be in process of writing the buffer,
2942 * leading to risk of bad data written to disk.)
2943 */
2944void
2946{
2947 BufferDesc *bufHdr;
2948 uint32 buf_state;
2949 uint32 old_buf_state;
2950
2951 if (!BufferIsValid(buffer))
2952 elog(ERROR, "bad buffer ID: %d", buffer);
2953
2954 if (BufferIsLocal(buffer))
2955 {
2957 return;
2958 }
2959
2960 bufHdr = GetBufferDescriptor(buffer - 1);
2961
2964 LW_EXCLUSIVE));
2965
2966 old_buf_state = pg_atomic_read_u32(&bufHdr->state);
2967 for (;;)
2968 {
2969 if (old_buf_state & BM_LOCKED)
2970 old_buf_state = WaitBufHdrUnlocked(bufHdr);
2971
2972 buf_state = old_buf_state;
2973
2974 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
2975 buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
2976
2977 if (pg_atomic_compare_exchange_u32(&bufHdr->state, &old_buf_state,
2978 buf_state))
2979 break;
2980 }
2981
2982 /*
2983 * If the buffer was not dirty already, do vacuum accounting.
2984 */
2985 if (!(old_buf_state & BM_DIRTY))
2986 {
2988 if (VacuumCostActive)
2990 }
2991}
2992
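/*
 * Illustrative sketch, not part of bufmgr.c: the typical calling pattern
 * around MarkBufferDirty() when modifying a page.  "rel" and "blkno" are
 * hypothetical, and WAL logging / critical-section handling are omitted;
 * see src/backend/access/transam/README for the full protocol.
 *
 *		Buffer		buf = ReadBuffer(rel, blkno);
 *
 *		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
 *		(modify the page contents)
 *		MarkBufferDirty(buf);
 *		UnlockReleaseBuffer(buf);
 */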
2993/*
2994 * ReleaseAndReadBuffer -- combine ReleaseBuffer() and ReadBuffer()
2995 *
2996 * Formerly, this saved one cycle of acquiring/releasing the BufMgrLock
2997 * compared to calling the two routines separately. Now it's mainly just
2998 * a convenience function. However, if the passed buffer is valid and
2999 * already contains the desired block, we just return it as-is; and that
3000 * does save considerable work compared to a full release and reacquire.
3001 *
3002 * Note: it is OK to pass buffer == InvalidBuffer, indicating that no old
3003 * buffer actually needs to be released. This case is the same as ReadBuffer,
3004 * but can save some tests in the caller.
3005 */
3006Buffer
3008 Relation relation,
3009 BlockNumber blockNum)
3010{
3011 ForkNumber forkNum = MAIN_FORKNUM;
3012 BufferDesc *bufHdr;
3013
3014 if (BufferIsValid(buffer))
3015 {
3017 if (BufferIsLocal(buffer))
3018 {
3019 bufHdr = GetLocalBufferDescriptor(-buffer - 1);
3020 if (bufHdr->tag.blockNum == blockNum &&
3021 BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
3022 BufTagGetForkNum(&bufHdr->tag) == forkNum)
3023 return buffer;
3025 }
3026 else
3027 {
3028 bufHdr = GetBufferDescriptor(buffer - 1);
3029 /* we have pin, so it's ok to examine tag without spinlock */
3030 if (bufHdr->tag.blockNum == blockNum &&
3031 BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
3032 BufTagGetForkNum(&bufHdr->tag) == forkNum)
3033 return buffer;
3034 UnpinBuffer(bufHdr);
3035 }
3036 }
3037
3038 return ReadBuffer(relation, blockNum);
3039}
3040
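/*
 * Illustrative sketch, not part of bufmgr.c: a caller fetching heap blocks
 * for a stream of TIDs, where consecutive TIDs often land in the same
 * block.  "rel", "tids" and "ntids" are hypothetical.
 *
 *		Buffer		buf = InvalidBuffer;
 *
 *		for (int i = 0; i < ntids; i++)
 *		{
 *			BlockNumber blkno = ItemPointerGetBlockNumber(&tids[i]);
 *
 *			buf = ReleaseAndReadBuffer(buf, rel, blkno);
 *			(lock, examine the tuple, unlock)
 *		}
 *		if (BufferIsValid(buf))
 *			ReleaseBuffer(buf);
 *
 * When the block number repeats, ReleaseAndReadBuffer() returns the same
 * pinned buffer without touching the buffer mapping table.
 */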
3041/*
3042 * PinBuffer -- make buffer unavailable for replacement.
3043 *
3044 * For the default access strategy, the buffer's usage_count is incremented
3045 * when we first pin it; for other strategies we just make sure the usage_count
3046 * isn't zero. (The idea of the latter is that we don't want synchronized
3047 * heap scans to inflate the count, but we need it to not be zero to discourage
3048 * other backends from stealing buffers from our ring. As long as we cycle
3049 * through the ring faster than the global clock-sweep cycles, buffers in
3050 * our ring won't be chosen as victims for replacement by other backends.)
3051 *
3052 * This should be applied only to shared buffers, never local ones.
3053 *
3054 * Since buffers are pinned/unpinned very frequently, pin buffers without
3055 * taking the buffer header lock; instead update the state variable in a
3056 * loop of CAS operations. Hopefully it's just a single CAS.
3057 *
3058 * Note that ResourceOwnerEnlarge() and ReservePrivateRefCountEntry()
3059 * must have been done already.
3060 *
3061 * Returns true if buffer is BM_VALID, else false. This provision allows
3062 * some callers to avoid an extra spinlock cycle.
3063 */
3064static bool
3066{
3068 bool result;
3070
3073
3074 ref = GetPrivateRefCountEntry(b, true);
3075
3076 if (ref == NULL)
3077 {
3078 uint32 buf_state;
3079 uint32 old_buf_state;
3080
3082
3083 old_buf_state = pg_atomic_read_u32(&buf->state);
3084 for (;;)
3085 {
3086 if (old_buf_state & BM_LOCKED)
3087 old_buf_state = WaitBufHdrUnlocked(buf);
3088
3089 buf_state = old_buf_state;
3090
3091 /* increase refcount */
3092 buf_state += BUF_REFCOUNT_ONE;
3093
3094 if (strategy == NULL)
3095 {
3096 /* Default case: increase usagecount unless already max. */
3098 buf_state += BUF_USAGECOUNT_ONE;
3099 }
3100 else
3101 {
3102 /*
3103 * Ring buffers shouldn't evict others from the pool. Thus we
3104 * don't make the usagecount more than 1.
3105 */
3106 if (BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
3107 buf_state += BUF_USAGECOUNT_ONE;
3108 }
3109
3110 if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
3111 buf_state))
3112 {
3113 result = (buf_state & BM_VALID) != 0;
3114
3115 /*
3116 * Assume that we acquired a buffer pin for the purposes of
3117 * Valgrind buffer client checks (even in !result case) to
3118 * keep things simple. Buffers that are unsafe to access are
3119 * not generally guaranteed to be marked undefined or
3120 * non-accessible in any case.
3121 */
3123 break;
3124 }
3125 }
3126 }
3127 else
3128 {
3129 /*
3130 * If we previously pinned the buffer, it is likely to be valid, but
3131 * it may not be if StartReadBuffers() was called and
3132 * WaitReadBuffers() hasn't been called yet. We'll check by loading
3133 * the flags without locking. This is racy, but it's OK to return
3134 * false spuriously: when WaitReadBuffers() calls StartBufferIO(),
3135 * it'll see that it's now valid.
3136 *
3137 * Note: We deliberately avoid a Valgrind client request here.
3138 * Individual access methods can optionally superimpose buffer page
3139 * client requests on top of our client requests to enforce that
3140 * buffers are only accessed while locked (and pinned). It's possible
3141 * that the buffer page is legitimately non-accessible here. We
3142 * cannot meddle with that.
3143 */
3144 result = (pg_atomic_read_u32(&buf->state) & BM_VALID) != 0;
3145 }
3146
3147 ref->refcount++;
3148 Assert(ref->refcount > 0);
3150 return result;
3151}
3152
3153/*
3154 * PinBuffer_Locked -- as above, but caller already locked the buffer header.
3155 * The spinlock is released before return.
3156 *
3157 * As this function is called with the spinlock held, the caller has to
3158 * previously call ReservePrivateRefCountEntry() and
3159 * ResourceOwnerEnlarge(CurrentResourceOwner);
3160 *
3161 * Currently, no callers of this function want to modify the buffer's
3162 * usage_count at all, so there's no need for a strategy parameter.
3163 * Also we don't bother with a BM_VALID test (the caller could check that for
3164 * itself).
3165 *
3166 * Also all callers only ever use this function when it's known that the
3167 * buffer can't have a preexisting pin by this backend. That allows us to skip
3168 * searching the private refcount array & hash, which is a boon, because the
3169 * spinlock is still held.
3170 *
3171 * Note: use of this routine is frequently mandatory, not just an optimization
3172 * to save a spin lock/unlock cycle, because we need to pin a buffer before
3173 * its state can change under us.
3174 */
3175static void
3177{
3178 Buffer b;
3180 uint32 buf_state;
3181
3182 /*
3183 * As explained, we don't expect any preexisting pins. That allows us to
3184 * manipulate the PrivateRefCount after releasing the spinlock.
3185 */
3187
3188 /*
3189 * Buffer can't have a preexisting pin, so mark its page as defined to
3190 * Valgrind (this is similar to the PinBuffer() case where the backend
3191 * doesn't already have a buffer pin)
3192 */
3194
3195 /*
3196 * Since we hold the buffer spinlock, we can update the buffer state and
3197 * release the lock in one operation.
3198 */
3199 buf_state = pg_atomic_read_u32(&buf->state);
3200 Assert(buf_state & BM_LOCKED);
3201 buf_state += BUF_REFCOUNT_ONE;
3202 UnlockBufHdr(buf, buf_state);
3203
3205
3207 ref->refcount++;
3208
3210}
3211
3212/*
3213 * Support for waking up another backend that is waiting for the cleanup lock
3214 * to be released using BM_PIN_COUNT_WAITER.
3215 *
3216 * See LockBufferForCleanup().
3217 *
3218 * Expected to be called just after releasing a buffer pin (in a BufferDesc,
3219 * not just reducing the backend-local pincount for the buffer).
3220 */
3221static void
3223{
3224 /*
3225 * Acquire the buffer header lock, re-check that there's a waiter. Another
3226 * backend could have unpinned this buffer, and already woken up the
3227 * waiter.
3228 *
3229 * There's no danger of the buffer being replaced after we unpinned it
3230 * above, as it's pinned by the waiter. The waiter removes
3231 * BM_PIN_COUNT_WAITER if it stops waiting for a reason other than this
3232 * backend waking it up.
3233 */
3234 uint32 buf_state = LockBufHdr(buf);
3235
3236 if ((buf_state & BM_PIN_COUNT_WAITER) &&
3237 BUF_STATE_GET_REFCOUNT(buf_state) == 1)
3238 {
3239 /* we just released the last pin other than the waiter's */
3240 int wait_backend_pgprocno = buf->wait_backend_pgprocno;
3241
3242 buf_state &= ~BM_PIN_COUNT_WAITER;
3243 UnlockBufHdr(buf, buf_state);
3244 ProcSendSignal(wait_backend_pgprocno);
3245 }
3246 else
3247 UnlockBufHdr(buf, buf_state);
3248}
3249
3250/*
3251 * UnpinBuffer -- make buffer available for replacement.
3252 *
3253 * This should be applied only to shared buffers, never local ones. This
3254 * always adjusts CurrentResourceOwner.
3255 */
3256static void
3258{
3260
3263}
3264
3265static void
3267{
3270
3272
3273 /* not moving as we're likely deleting it soon anyway */
3274 ref = GetPrivateRefCountEntry(b, false);
3275 Assert(ref != NULL);
3276 Assert(ref->refcount > 0);
3277 ref->refcount--;
3278 if (ref->refcount == 0)
3279 {
3280 uint32 buf_state;
3281 uint32 old_buf_state;
3282
3283 /*
3284 * Mark buffer non-accessible to Valgrind.
3285 *
3286 * Note that the buffer may have already been marked non-accessible
3287 * within access method code that enforces that buffers are only
3288 * accessed while a buffer lock is held.
3289 */
3291
3292 /* I'd better not still hold the buffer content lock */
3294
3295 /*
3296 * Decrement the shared reference count.
3297 *
3298 * Since the buffer spinlock holder can update the status with just a
3299 * write, it's not safe to use an atomic decrement here; use a CAS loop.
3300 */
3301 old_buf_state = pg_atomic_read_u32(&buf->state);
3302 for (;;)
3303 {
3304 if (old_buf_state & BM_LOCKED)
3305 old_buf_state = WaitBufHdrUnlocked(buf);
3306
3307 buf_state = old_buf_state;
3308
3309 buf_state -= BUF_REFCOUNT_ONE;
3310
3311 if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
3312 buf_state))
3313 break;
3314 }
3315
3316 /* Support LockBufferForCleanup() */
3317 if (buf_state & BM_PIN_COUNT_WAITER)
3319
3321 }
3322}
3323
3324#define ST_SORT sort_checkpoint_bufferids
3325#define ST_ELEMENT_TYPE CkptSortItem
3326#define ST_COMPARE(a, b) ckpt_buforder_comparator(a, b)
3327#define ST_SCOPE static
3328#define ST_DEFINE
3329#include "lib/sort_template.h"
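/*
 * The sort_template.h inclusion above generates a specialized in-place sort
 * routine, sort_checkpoint_bufferids(), taking a CkptSortItem array and its
 * length; BufferSync() below uses it to order the to-be-written buffers.
 */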
3330
3331/*
3332 * BufferSync -- Write out all dirty buffers in the pool.
3333 *
3334 * This is called at checkpoint time to write out all dirty shared buffers.
3335 * The checkpoint request flags should be passed in. If CHECKPOINT_IMMEDIATE
3336 * is set, we disable delays between writes; if CHECKPOINT_IS_SHUTDOWN,
3337 * CHECKPOINT_END_OF_RECOVERY or CHECKPOINT_FLUSH_ALL is set, we write even
3338 * unlogged buffers, which are otherwise skipped. The remaining flags
3339 * currently have no effect here.
3340 */
3341static void
3342BufferSync(int flags)
3343{
3344 uint32 buf_state;
3345 int buf_id;
3346 int num_to_scan;
3347 int num_spaces;
3348 int num_processed;
3349 int num_written;
3350 CkptTsStatus *per_ts_stat = NULL;
3351 Oid last_tsid;
3352 binaryheap *ts_heap;
3353 int i;
3354 int mask = BM_DIRTY;
3355 WritebackContext wb_context;
3356
3357 /*
3358 * Unless this is a shutdown checkpoint or we have been explicitly told,
3359 * we write only permanent, dirty buffers. But at shutdown or end of
3360 * recovery, we write all dirty buffers.
3361 */
3364 mask |= BM_PERMANENT;
3365
3366 /*
3367 * Loop over all buffers, and mark the ones that need to be written with
3368 * BM_CHECKPOINT_NEEDED. Count them as we go (num_to_scan), so that we
3369 * can estimate how much work needs to be done.
3370 *
3371 * This allows us to write only those pages that were dirty when the
3372 * checkpoint began, and not those that get dirtied while it proceeds.
3373 * Whenever a page with BM_CHECKPOINT_NEEDED is written out, either by us
3374 * later in this function, or by normal backends or the bgwriter cleaning
3375 * scan, the flag is cleared. Any buffer dirtied after this point won't
3376 * have the flag set.
3377 *
3378 * Note that if we fail to write some buffer, we may leave buffers with
3379 * BM_CHECKPOINT_NEEDED still set. This is OK since any such buffer would
3380 * certainly need to be written for the next checkpoint attempt, too.
3381 */
3382 num_to_scan = 0;
3383 for (buf_id = 0; buf_id < NBuffers; buf_id++)
3384 {
3385 BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
3386
3387 /*
3388 * Header spinlock is enough to examine BM_DIRTY, see comment in
3389 * SyncOneBuffer.
3390 */
3391 buf_state = LockBufHdr(bufHdr);
3392
3393 if ((buf_state & mask) == mask)
3394 {
3395 CkptSortItem *item;
3396
3397 buf_state |= BM_CHECKPOINT_NEEDED;
3398
3399 item = &CkptBufferIds[num_to_scan++];
3400 item->buf_id = buf_id;
3401 item->tsId = bufHdr->tag.spcOid;
3402 item->relNumber = BufTagGetRelNumber(&bufHdr->tag);
3403 item->forkNum = BufTagGetForkNum(&bufHdr->tag);
3404 item->blockNum = bufHdr->tag.blockNum;
3405 }
3406
3407 UnlockBufHdr(bufHdr, buf_state);
3408
3409 /* Check for barrier events in case NBuffers is large. */
3412 }
3413
3414 if (num_to_scan == 0)
3415 return; /* nothing to do */
3416
3418
3419 TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_scan);
3420
3421 /*
3422 * Sort buffers that need to be written to reduce the likelihood of random
3423 * IO. The sorting is also important for the implementation of balancing
3424 * writes between tablespaces. Without balancing writes we'd potentially
3425 * end up writing to the tablespaces one-by-one, possibly overloading the
3426 * underlying system.
3427 */
3428 sort_checkpoint_bufferids(CkptBufferIds, num_to_scan);
3429
3430 num_spaces = 0;
3431
3432 /*
3433 * Allocate progress status for each tablespace with buffers that need to
3434 * be flushed. This requires the to-be-flushed array to be sorted.
3435 */
3436 last_tsid = InvalidOid;
3437 for (i = 0; i < num_to_scan; i++)
3438 {
3439 CkptTsStatus *s;
3440 Oid cur_tsid;
3441
3442 cur_tsid = CkptBufferIds[i].tsId;
3443
3444 /*
3445 * Grow array of per-tablespace status structs, every time a new
3446 * tablespace is found.
3447 */
3448 if (last_tsid == InvalidOid || last_tsid != cur_tsid)
3449 {
3450 Size sz;
3451
3452 num_spaces++;
3453
3454 /*
3455 * Not worth adding grow-by-power-of-2 logic here - even with a
3456 * few hundred tablespaces this should be fine.
3457 */
3458 sz = sizeof(CkptTsStatus) * num_spaces;
3459
3460 if (per_ts_stat == NULL)
3461 per_ts_stat = (CkptTsStatus *) palloc(sz);
3462 else
3463 per_ts_stat = (CkptTsStatus *) repalloc(per_ts_stat, sz);
3464
3465 s = &per_ts_stat[num_spaces - 1];
3466 memset(s, 0, sizeof(*s));
3467 s->tsId = cur_tsid;
3468
3469 /*
3470 * The first buffer in this tablespace. As CkptBufferIds is sorted
3471 * by tablespace, all (s->num_to_scan) buffers in this tablespace
3472 * will follow afterwards.
3473 */
3474 s->index = i;
3475
3476 /*
3477 * progress_slice will be determined once we know how many buffers
3478 * are in each tablespace, i.e. after this loop.
3479 */
3480
3481 last_tsid = cur_tsid;
3482 }
3483 else
3484 {
3485 s = &per_ts_stat[num_spaces - 1];
3486 }
3487
3488 s->num_to_scan++;
3489
3490 /* Check for barrier events. */
3493 }
3494
3495 Assert(num_spaces > 0);
3496
3497 /*
3498 * Build a min-heap over the write-progress in the individual tablespaces,
3499 * and compute how large a portion of the total progress a single
3500 * processed buffer is.
3501 */
3502 ts_heap = binaryheap_allocate(num_spaces,
3504 NULL);
3505
3506 for (i = 0; i < num_spaces; i++)
3507 {
3508 CkptTsStatus *ts_stat = &per_ts_stat[i];
3509
3510 ts_stat->progress_slice = (float8) num_to_scan / ts_stat->num_to_scan;
3511
3512 binaryheap_add_unordered(ts_heap, PointerGetDatum(ts_stat));
3513 }
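	/*
	 * Worked example (illustrative, not from the source): num_to_scan = 1000
	 * dirty buffers split 900/100 between two tablespaces gives slices of
	 * 1000/900 = 1.11 and 1000/100 = 10.  After 450 writes in the first and
	 * 50 in the second, both have progress 500, so the min-heap keeps
	 * interleaving them roughly in proportion to their share of the dirty
	 * buffers.
	 */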
3514
3515 binaryheap_build(ts_heap);
3516
3517 /*
3518 * Iterate through to-be-checkpointed buffers and write the ones (still)
3519 * marked with BM_CHECKPOINT_NEEDED. The writes are balanced between
3520 * tablespaces; otherwise the sorting would lead to only one tablespace
3521 * receiving writes at a time, making inefficient use of the hardware.
3522 */
3523 num_processed = 0;
3524 num_written = 0;
3525 while (!binaryheap_empty(ts_heap))
3526 {
3527 BufferDesc *bufHdr = NULL;
3528 CkptTsStatus *ts_stat = (CkptTsStatus *)
3530
3531 buf_id = CkptBufferIds[ts_stat->index].buf_id;
3532 Assert(buf_id != -1);
3533
3534 bufHdr = GetBufferDescriptor(buf_id);
3535
3536 num_processed++;
3537
3538 /*
3539 * We don't need to acquire the lock here, because we're only looking
3540 * at a single bit. It's possible that someone else writes the buffer
3541 * and clears the flag right after we check, but that doesn't matter
3542 * since SyncOneBuffer will then do nothing. However, there is a
3543 * further race condition: it's conceivable that between the time we
3544 * examine the bit here and the time SyncOneBuffer acquires the lock,
3545 * someone else not only wrote the buffer but replaced it with another
3546 * page and dirtied it. In that improbable case, SyncOneBuffer will
3547 * write the buffer though we didn't need to. It doesn't seem worth
3548 * guarding against this, though.
3549 */
3551 {
3552 if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
3553 {
3554 TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
3556 num_written++;
3557 }
3558 }
3559
3560 /*
3561 * Measure progress independently of actually having to flush the buffer
3562 * - otherwise writing becomes unbalanced.
3563 */
3564 ts_stat->progress += ts_stat->progress_slice;
3565 ts_stat->num_scanned++;
3566 ts_stat->index++;
3567
3568 /* Have all the buffers from the tablespace been processed? */
3569 if (ts_stat->num_scanned == ts_stat->num_to_scan)
3570 {
3571 binaryheap_remove_first(ts_heap);
3572 }
3573 else
3574 {
3575 /* update heap with the new progress */
3576 binaryheap_replace_first(ts_heap, PointerGetDatum(ts_stat));
3577 }
3578
3579 /*
3580 * Sleep to throttle our I/O rate.
3581 *
3582 * (This will check for barrier events even if it doesn't sleep.)
3583 */
3584 CheckpointWriteDelay(flags, (double) num_processed / num_to_scan);
3585 }
3586
3587 /*
3588 * Issue all pending flushes. Only checkpointer calls BufferSync(), so
3589 * IOContext will always be IOCONTEXT_NORMAL.
3590 */
3592
3593 pfree(per_ts_stat);
3594 per_ts_stat = NULL;
3595 binaryheap_free(ts_heap);
3596
3597 /*
3598 * Update checkpoint statistics. As noted above, this doesn't include
3599 * buffers written by other backends or bgwriter scan.
3600 */
3601 CheckpointStats.ckpt_bufs_written += num_written;
3602
3603 TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_scan);
3604}
3605
3606/*
3607 * BgBufferSync -- Write out some dirty buffers in the pool.
3608 *
3609 * This is called periodically by the background writer process.
3610 *
3611 * Returns true if it's appropriate for the bgwriter process to go into
3612 * low-power hibernation mode. (This happens if the strategy clock sweep
3613 * has been "lapped" and no buffer allocations have occurred recently,
3614 * or if the bgwriter has been effectively disabled by setting
3615 * bgwriter_lru_maxpages to 0.)
3616 */
3617bool
3619{
3620 /* info obtained from freelist.c */
3621 int strategy_buf_id;
3622 uint32 strategy_passes;
3623 uint32 recent_alloc;
3624
3625 /*
3626 * Information saved between calls so we can determine the strategy
3627 * point's advance rate and avoid scanning already-cleaned buffers.
3628 */
3629 static bool saved_info_valid = false;
3630 static int prev_strategy_buf_id;
3631 static uint32 prev_strategy_passes;
3632 static int next_to_clean;
3633 static uint32 next_passes;
3634
3635 /* Moving averages of allocation rate and clean-buffer density */
3636 static float smoothed_alloc = 0;
3637 static float smoothed_density = 10.0;
3638
3639 /* Potentially these could be tunables, but for now, not */
3640 float smoothing_samples = 16;
3641 float scan_whole_pool_milliseconds = 120000.0;
3642
3643 /* Used to compute how far we scan ahead */
3644 long strategy_delta;
3645 int bufs_to_lap;
3646 int bufs_ahead;
3647 float scans_per_alloc;
3648 int reusable_buffers_est;
3649 int upcoming_alloc_est;
3650 int min_scan_buffers;
3651
3652 /* Variables for the scanning loop proper */
3653 int num_to_scan;
3654 int num_written;
3655 int reusable_buffers;
3656
3657 /* Variables for final smoothed_density update */
3658 long new_strategy_delta;
3659 uint32 new_recent_alloc;
3660
3661 /*
3662 * Find out where the freelist clock sweep currently is, and how many
3663 * buffer allocations have happened since our last call.
3664 */
3665 strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc);
3666
3667 /* Report buffer alloc counts to pgstat */
3668 PendingBgWriterStats.buf_alloc += recent_alloc;
3669
3670 /*
3671 * If we're not running the LRU scan, just stop after doing the stats
3672 * stuff. We mark the saved state invalid so that we can recover sanely
3673 * if LRU scan is turned back on later.
3674 */
3675 if (bgwriter_lru_maxpages <= 0)
3676 {
3677 saved_info_valid = false;
3678 return true;
3679 }
3680
3681 /*
3682 * Compute strategy_delta = how many buffers have been scanned by the
3683 * clock sweep since last time. If first time through, assume none. Then
3684 * see if we are still ahead of the clock sweep, and if so, how many
3685 * buffers we could scan before we'd catch up with it and "lap" it. Note:
3686 * the weird-looking coding of the xxx_passes comparisons is to avoid bogus
3687 * behavior when the passes counts wrap around.
3688 */
3689 if (saved_info_valid)
3690 {
3691 int32 passes_delta = strategy_passes - prev_strategy_passes;
3692
3693 strategy_delta = strategy_buf_id - prev_strategy_buf_id;
3694 strategy_delta += (long) passes_delta * NBuffers;
3695
3696 Assert(strategy_delta >= 0);
3697
3698 if ((int32) (next_passes - strategy_passes) > 0)
3699 {
3700 /* we're one pass ahead of the strategy point */
3701 bufs_to_lap = strategy_buf_id - next_to_clean;
3702#ifdef BGW_DEBUG
3703 elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
3704 next_passes, next_to_clean,
3705 strategy_passes, strategy_buf_id,
3706 strategy_delta, bufs_to_lap);
3707#endif
3708 }
3709 else if (next_passes == strategy_passes &&
3710 next_to_clean >= strategy_buf_id)
3711 {
3712 /* on same pass, but ahead or at least not behind */
3713 bufs_to_lap = NBuffers - (next_to_clean - strategy_buf_id);
3714#ifdef BGW_DEBUG
3715 elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
3716 next_passes, next_to_clean,
3717 strategy_passes, strategy_buf_id,
3718 strategy_delta, bufs_to_lap);
3719#endif
3720 }
3721 else
3722 {
3723 /*
3724 * We're behind, so skip forward to the strategy point and start
3725 * cleaning from there.
3726 */
3727#ifdef BGW_DEBUG
3728 elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
3729 next_passes, next_to_clean,
3730 strategy_passes, strategy_buf_id,
3731 strategy_delta);
3732#endif
3733 next_to_clean = strategy_buf_id;
3734 next_passes = strategy_passes;
3735 bufs_to_lap = NBuffers;
3736 }
3737 }
3738 else
3739 {
3740 /*
3741 * Initializing at startup or after LRU scanning had been off. Always
3742 * start at the strategy point.
3743 */
3744#ifdef BGW_DEBUG
3745 elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
3746 strategy_passes, strategy_buf_id);
3747#endif
3748 strategy_delta = 0;
3749 next_to_clean = strategy_buf_id;
3750 next_passes = strategy_passes;
3751 bufs_to_lap = NBuffers;
3752 }
3753
3754 /* Update saved info for next time */
3755 prev_strategy_buf_id = strategy_buf_id;
3756 prev_strategy_passes = strategy_passes;
3757 saved_info_valid = true;
3758
3759 /*
3760 * Compute how many buffers had to be scanned for each new allocation, ie,
3761 * 1/density of reusable buffers, and track a moving average of that.
3762 *
3763 * If the strategy point didn't move, we don't update the density estimate
3764 */
3765 if (strategy_delta > 0 && recent_alloc > 0)
3766 {
3767 scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
3768 smoothed_density += (scans_per_alloc - smoothed_density) /
3769 smoothing_samples;
3770 }
3771
3772 /*
3773 * Estimate how many reusable buffers there are between the current
3774 * strategy point and where we've scanned ahead to, based on the smoothed
3775 * density estimate.
3776 */
3777 bufs_ahead = NBuffers - bufs_to_lap;
3778 reusable_buffers_est = (float) bufs_ahead / smoothed_density;
3779
3780 /*
3781 * Track a moving average of recent buffer allocations. Here, rather than
3782 * a true average we want a fast-attack, slow-decline behavior: we
3783 * immediately follow any increase.
3784 */
3785 if (smoothed_alloc <= (float) recent_alloc)
3786 smoothed_alloc = recent_alloc;
3787 else
3788 smoothed_alloc += ((float) recent_alloc - smoothed_alloc) /
3789 smoothing_samples;
3790
3791 /* Scale the estimate by a GUC to allow more aggressive tuning. */
3792 upcoming_alloc_est = (int) (smoothed_alloc * bgwriter_lru_multiplier);
3793
3794 /*
3795 * If recent_alloc remains at zero for many cycles, smoothed_alloc will
3796 * eventually underflow to zero, and the underflows produce annoying
3797 * kernel warnings on some platforms. Once upcoming_alloc_est has gone to
3798 * zero, there's no point in tracking smaller and smaller values of
3799 * smoothed_alloc, so just reset it to exactly zero to avoid this
3800 * syndrome. It will pop back up as soon as recent_alloc increases.
3801 */
3802 if (upcoming_alloc_est == 0)
3803 smoothed_alloc = 0;
3804
3805 /*
3806 * Even in cases where there's been little or no buffer allocation
3807 * activity, we want to make a small amount of progress through the buffer
3808 * cache so that as many reusable buffers as possible are clean after an
3809 * idle period.
3810 *
3811 * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many times
3812 * the BGW will be called during the scan_whole_pool time; slice the
3813 * buffer pool into that many sections.
3814 */
3815 min_scan_buffers = (int) (NBuffers / (scan_whole_pool_milliseconds / BgWriterDelay));
3816
3817 if (upcoming_alloc_est < (min_scan_buffers + reusable_buffers_est))
3818 {
3819#ifdef BGW_DEBUG
3820 elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
3821 upcoming_alloc_est, min_scan_buffers, reusable_buffers_est);
3822#endif
3823 upcoming_alloc_est = min_scan_buffers + reusable_buffers_est;
3824 }
3825
3826 /*
3827 * Now write out dirty reusable buffers, working forward from the
3828 * next_to_clean point, until we have lapped the strategy scan, or cleaned
3829 * enough buffers to match our estimate of the next cycle's allocation
3830 * requirements, or hit the bgwriter_lru_maxpages limit.
3831 */
3832
3833 num_to_scan = bufs_to_lap;
3834 num_written = 0;
3835 reusable_buffers = reusable_buffers_est;
3836
3837 /* Execute the LRU scan */
3838 while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
3839 {
3840 int sync_state = SyncOneBuffer(next_to_clean, true,
3841 wb_context);
3842
3843 if (++next_to_clean >= NBuffers)
3844 {
3845 next_to_clean = 0;
3846 next_passes++;
3847 }
3848 num_to_scan--;
3849
3850 if (sync_state & BUF_WRITTEN)
3851 {
3852 reusable_buffers++;
3853 if (++num_written >= bgwriter_lru_maxpages)
3854 {
3856 break;
3857 }
3858 }
3859 else if (sync_state & BUF_REUSABLE)
3860 reusable_buffers++;
3861 }
3862
3864
3865#ifdef BGW_DEBUG
3866 elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
3867 recent_alloc, smoothed_alloc, strategy_delta, bufs_ahead,
3868 smoothed_density, reusable_buffers_est, upcoming_alloc_est,
3869 bufs_to_lap - num_to_scan,
3870 num_written,
3871 reusable_buffers - reusable_buffers_est);
3872#endif
3873
3874 /*
3875 * Consider the above scan as being like a new allocation scan.
3876 * Characterize its density and update the smoothed one based on it. This
3877 * effectively halves the moving average period in cases where both the
3878 * strategy and the background writer are doing some useful scanning,
3879 * which is helpful because a long memory isn't as desirable on the
3880 * density estimates.
3881 */
3882 new_strategy_delta = bufs_to_lap - num_to_scan;
3883 new_recent_alloc = reusable_buffers - reusable_buffers_est;
3884 if (new_strategy_delta > 0 && new_recent_alloc > 0)
3885 {
3886 scans_per_alloc = (float) new_strategy_delta / (float) new_recent_alloc;
3887 smoothed_density += (scans_per_alloc - smoothed_density) /
3888 smoothing_samples;
3889
3890#ifdef BGW_DEBUG
3891 elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f",
3892 new_recent_alloc, new_strategy_delta,
3893 scans_per_alloc, smoothed_density);
3894#endif
3895 }
3896
3897 /* Return true if OK to hibernate */
3898 return (bufs_to_lap == 0 && recent_alloc == 0);
3899}
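
The two moving averages above (smoothed_density and smoothed_alloc) are simple exponential filters; smoothed_alloc additionally applies the fast-attack, slow-decline rule described in the comments. A minimal standalone sketch of those update rules, assuming an arbitrary window of "samples" iterations (illustrative only, not part of bufmgr.c):

static float
smooth_symmetric(float smoothed, float sample, int samples)
{
	/* move 1/samples of the way toward the new sample (used for density) */
	return smoothed + (sample - smoothed) / samples;
}

static float
smooth_fast_attack(float smoothed, float sample, int samples)
{
	/* follow any increase immediately, decay slowly (used for allocations) */
	if (sample >= smoothed)
		return sample;
	return smoothed + (sample - smoothed) / samples;
}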
3900
3901/*
3902 * SyncOneBuffer -- process a single buffer during syncing.
3903 *
3904 * If skip_recently_used is true, we don't write currently-pinned buffers, nor
3905 * buffers marked recently used, as these are not replacement candidates.
3906 *
3907 * Returns a bitmask containing the following flag bits:
3908 * BUF_WRITTEN: we wrote the buffer.
3909 * BUF_REUSABLE: buffer is available for replacement, ie, it has
3910 * pin count 0 and usage count 0.
3911 *
3912 * (BUF_WRITTEN could be set in error if FlushBuffer finds the buffer clean
3913 * after locking it, but we don't care all that much.)
3914 */
3915static int
3916SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
3917{
3918 BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
3919 int result = 0;
3920 uint32 buf_state;
3921 BufferTag tag;
3922
3923 /* Make sure we can handle the pin */
3926
3927 /*
3928 * Check whether buffer needs writing.
3929 *
3930 * We can make this check without taking the buffer content lock so long
3931 * as we mark pages dirty in access methods *before* logging changes with
3932 * XLogInsert(): if someone marks the buffer dirty just after our check we
3933 * don't worry, because our checkpoint.redo points before the log record for
3934 * the upcoming changes, and so we are not required to write such a dirty buffer.
3935 */
3936 buf_state = LockBufHdr(bufHdr);
3937
3938 if (BUF_STATE_GET_REFCOUNT(buf_state) == 0 &&
3939 BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
3940 {
3941 result |= BUF_REUSABLE;
3942 }
3943 else if (skip_recently_used)
3944 {
3945 /* Caller told us not to write recently-used buffers */
3946 UnlockBufHdr(bufHdr, buf_state);
3947 return result;
3948 }
3949
3950 if (!(buf_state & BM_VALID) || !(buf_state & BM_DIRTY))
3951 {
3952 /* It's clean, so nothing to do */
3953 UnlockBufHdr(bufHdr, buf_state);
3954 return result;
3955 }
3956
3957 /*
3958 * Pin it, share-lock it, write it. (FlushBuffer will do nothing if the
3959 * buffer is clean by the time we've locked it.)
3960 */
3961 PinBuffer_Locked(bufHdr);
3963
3965
3967
3968 tag = bufHdr->tag;
3969
3970 UnpinBuffer(bufHdr);
3971
3972 /*
3973 * SyncOneBuffer() is only called by checkpointer and bgwriter, so
3974 * IOContext will always be IOCONTEXT_NORMAL.
3975 */
3977
3978 return result | BUF_WRITTEN;
3979}
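
For reference, a hypothetical caller (a sketch only, not part of this file) would test the two result bits independently, since BUF_WRITTEN can be returned for a buffer that is not reusable when skip_recently_used is false:

static void
tally_sync_result(int sync_state, int *written, int *reusable)
{
	if (sync_state & BUF_WRITTEN)
		(*written)++;		/* the buffer's contents were handed to the kernel */
	if (sync_state & BUF_REUSABLE)
		(*reusable)++;		/* pin count and usage count were both zero */
}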
3980
3981/*
3982 * AtEOXact_Buffers - clean up at end of transaction.
3983 *
3984 * As of PostgreSQL 8.0, buffer pins should get released by the
3985 * ResourceOwner mechanism. This routine is just a debugging
3986 * cross-check that no pins remain.
3987 */
3988void
3989AtEOXact_Buffers(bool isCommit)
3990{
3992
3993 AtEOXact_LocalBuffers(isCommit);
3994
3996}
3997
3998/*
3999 * Initialize access to shared buffer pool
4000 *
4001 * This is called during backend startup (whether standalone or under the
4002 * postmaster). It sets up for this backend's access to the already-existing
4003 * buffer pool.
4004 */
4005void
4007{
4008 HASHCTL hash_ctl;
4009
4010 /*
4011 * An advisory limit on the number of pins each backend should hold, based
4012 * on shared_buffers and the maximum number of connections possible.
4013 * That's very pessimistic, but outside toy-sized shared_buffers it should
4014 * allow plenty of pins. LimitAdditionalPins() and
4015 * GetAdditionalPinLimit() can be used to check the remaining balance.
4016 */
4018
4019 memset(&PrivateRefCountArray, 0, sizeof(PrivateRefCountArray));
4020
4021 hash_ctl.keysize = sizeof(int32);
4022 hash_ctl.entrysize = sizeof(PrivateRefCountEntry);
4023
4024 PrivateRefCountHash = hash_create("PrivateRefCount", 100, &hash_ctl,
4026
4027 /*
4028 * AtProcExit_Buffers needs LWLock access, and therefore has to be called at
4029 * the corresponding phase of backend shutdown.
4030 */
4031 Assert(MyProc != NULL);
4033}
4034
4035/*
4036 * During backend exit, ensure that we released all shared-buffer locks and
4037 * assert that we have no remaining pins.
4038 */
4039static void
4041{
4042 UnlockBuffers();
4043
4045
4046 /* localbuf.c needs a chance too */
4048}
4049
4050/*
4051 * CheckForBufferLeaks - ensure this backend holds no buffer pins
4052 *
4053 * As of PostgreSQL 8.0, buffer pins should get released by the
4054 * ResourceOwner mechanism. This routine is just a debugging
4055 * cross-check that no pins remain.
4056 */
4057static void
4059{
4060#ifdef USE_ASSERT_CHECKING
4061 int RefCountErrors = 0;
4063 int i;
4064 char *s;
4065
4066 /* check the array */
4067 for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
4068 {
4069 res = &PrivateRefCountArray[i];
4070
4071 if (res->buffer != InvalidBuffer)
4072 {
4074 elog(WARNING, "buffer refcount leak: %s", s);
4075 pfree(s);
4076
4077 RefCountErrors++;
4078 }
4079 }
4080
4081 /* if necessary search the hash */
4083 {
4084 HASH_SEQ_STATUS hstat;
4085
4087 while ((res = (PrivateRefCountEntry *) hash_seq_search(&hstat)) != NULL)
4088 {
4090 elog(WARNING, "buffer refcount leak: %s", s);
4091 pfree(s);
4092 RefCountErrors++;
4093 }
4094 }
4095
4096 Assert(RefCountErrors == 0);
4097#endif
4098}
4099
4100/*
4101 * Helper routine to issue warnings when a buffer is unexpectedly pinned
4102 */
4103char *
4105{
4106 BufferDesc *buf;
4107 int32 loccount;
4108 char *result;
4109 ProcNumber backend;
4110 uint32 buf_state;
4111
4113 if (BufferIsLocal(buffer))
4114 {
4116 loccount = LocalRefCount[-buffer - 1];
4117 backend = MyProcNumber;
4118 }
4119 else
4120 {
4122 loccount = GetPrivateRefCount(buffer);
4123 backend = INVALID_PROC_NUMBER;
4124 }
4125
4126 /* theoretically we should lock the bufhdr here */
4127 buf_state = pg_atomic_read_u32(&buf->state);
4128
4129 result = psprintf("[%03d] (rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
4130 buffer,
4132 BufTagGetForkNum(&buf->tag)).str,
4133 buf->tag.blockNum, buf_state & BUF_FLAG_MASK,
4134 BUF_STATE_GET_REFCOUNT(buf_state), loccount);
4135 return result;
4136}
4137
4138/*
4139 * CheckPointBuffers
4140 *
4141 * Flush all dirty blocks in buffer pool to disk at checkpoint time.
4142 *
4143 * Note: temporary relations do not participate in checkpoints, so they don't
4144 * need to be flushed.
4145 */
4146void
4148{
4149 BufferSync(flags);
4150}
4151
4152/*
4153 * BufferGetBlockNumber
4154 * Returns the block number associated with a buffer.
4155 *
4156 * Note:
4157 * Assumes that the buffer is valid and pinned, else the
4158 * value may be obsolete immediately...
4159 */
4162{
4163 BufferDesc *bufHdr;
4164
4166
4167 if (BufferIsLocal(buffer))
4168 bufHdr = GetLocalBufferDescriptor(-buffer - 1);
4169 else
4170 bufHdr = GetBufferDescriptor(buffer - 1);
4171
4172 /* pinned, so OK to read tag without spinlock */
4173 return bufHdr->tag.blockNum;
4174}
4175
4176/*
4177 * BufferGetTag
4178 * Returns the relfilelocator, fork number and block number associated with
4179 * a buffer.
4180 */
4181void
4183 BlockNumber *blknum)
4184{
4185 BufferDesc *bufHdr;
4186
4187 /* Do the same checks as BufferGetBlockNumber. */
4189
4190 if (BufferIsLocal(buffer))
4191 bufHdr = GetLocalBufferDescriptor(-buffer - 1);
4192 else
4193 bufHdr = GetBufferDescriptor(buffer - 1);
4194
4195 /* pinned, so OK to read tag without spinlock */
4196 *rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
4197 *forknum = BufTagGetForkNum(&bufHdr->tag);
4198 *blknum = bufHdr->tag.blockNum;
4199}
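
A minimal caller-side sketch (hypothetical helper, not part of bufmgr.c) showing how a pinned buffer's identity can be reported with BufferGetTag():

static void
report_buffer_identity(Buffer buf)
{
	RelFileLocator rlocator;
	ForkNumber	forknum;
	BlockNumber blkno;

	/* caller must hold a pin, per the comments above */
	BufferGetTag(buf, &rlocator, &forknum, &blkno);
	elog(DEBUG1, "buffer %d holds block %u of fork %d",
		 buf, blkno, (int) forknum);
}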
4200
4201/*
4202 * FlushBuffer
4203 * Physically write out a shared buffer.
4204 *
4205 * NOTE: this actually just passes the buffer contents to the kernel; the
4206 * real write to disk won't happen until the kernel feels like it. This
4207 * is okay from our point of view since we can redo the changes from WAL.
4208 * However, we will need to force the changes to disk via fsync before
4209 * we can checkpoint WAL.
4210 *
4211 * The caller must hold a pin on the buffer and have share-locked the
4212 * buffer contents. (Note: a share-lock does not prevent updates of
4213 * hint bits in the buffer, so the page could change while the write
4214 * is in progress, but we assume that that will not invalidate the data
4215 * written.)
4216 *
4217 * If the caller has an smgr reference for the buffer's relation, pass it
4218 * as the second parameter. If not, pass NULL.
4219 */
4220static void
4222 IOContext io_context)
4223{
4224 XLogRecPtr recptr;
4225 ErrorContextCallback errcallback;
4226 instr_time io_start;
4227 Block bufBlock;
4228 char *bufToWrite;
4229 uint32 buf_state;
4230
4231 /*
4232 * Try to start an I/O operation. If StartBufferIO returns false, then
4233 * someone else flushed the buffer before we could, so we need not do
4234 * anything.
4235 */
4236 if (!StartBufferIO(buf, false, false))
4237 return;
4238
4239 /* Setup error traceback support for ereport() */
4241 errcallback.arg = buf;
4242 errcallback.previous = error_context_stack;
4243 error_context_stack = &errcallback;
4244
4245 /* Find smgr relation for buffer */
4246 if (reln == NULL)
4248
4249 TRACE_POSTGRESQL_BUFFER_FLUSH_START(BufTagGetForkNum(&buf->tag),
4250 buf->tag.blockNum,
4254
4255 buf_state = LockBufHdr(buf);
4256
4257 /*
4258 * Run PageGetLSN while holding header lock, since we don't have the
4259 * buffer locked exclusively in all cases.
4260 */
4261 recptr = BufferGetLSN(buf);
4262
4263 /* To check if block content changes while flushing. - vadim 01/17/97 */
4264 buf_state &= ~BM_JUST_DIRTIED;
4265 UnlockBufHdr(buf, buf_state);
4266
4267 /*
4268 * Force XLOG flush up to buffer's LSN. This implements the basic WAL
4269 * rule that log updates must hit disk before any of the data-file changes
4270 * they describe do.
4271 *
4272 * However, this rule does not apply to unlogged relations, which will be
4273 * lost after a crash anyway. Most unlogged relation pages do not bear
4274 * LSNs since we never emit WAL records for them, and therefore flushing
4275 * up through the buffer LSN would be useless, but harmless. However,
4276 * GiST indexes use LSNs internally to track page-splits, and therefore
4277 * unlogged GiST pages bear "fake" LSNs generated by
4278 * GetFakeLSNForUnloggedRel. It is unlikely but possible that the fake
4279 * LSN counter could advance past the WAL insertion point; and if it did
4280 * happen, attempting to flush WAL through that location would fail, with
4281 * disastrous system-wide consequences. To make sure that can't happen,
4282 * skip the flush if the buffer isn't permanent.
4283 */
4284 if (buf_state & BM_PERMANENT)
4285 XLogFlush(recptr);
4286
4287 /*
4288 * Now it's safe to write the buffer to disk. Note that no one else should
4289 * have been able to write it, while we were busy with log flushing,
4290 * because we got the exclusive right to perform I/O by setting the
4291 * BM_IO_IN_PROGRESS bit.
4292 */
4293 bufBlock = BufHdrGetBlock(buf);
4294
4295 /*
4296 * Update page checksum if desired. Since we have only shared lock on the
4297 * buffer, other processes might be updating hint bits in it, so we must
4298 * copy the page to private storage if we do checksumming.
4299 */
4300 bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
4301
4303
4304 /*
4305 * bufToWrite is either the shared buffer or a copy, as appropriate.
4306 */
4307 smgrwrite(reln,
4308 BufTagGetForkNum(&buf->tag),
4309 buf->tag.blockNum,
4310 bufToWrite,
4311 false);
4312
4313 /*
4314 * When a strategy is in use, only flushes of dirty buffers already in the
4315 * strategy ring are counted as strategy writes (IOCONTEXT
4316 * [BULKREAD|BULKWRITE|VACUUM] IOOP_WRITE) for the purpose of IO
4317 * statistics tracking.
4318 *
4319 * If a shared buffer initially added to the ring must be flushed before
4320 * being used, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE.
4321 *
4322 * If a shared buffer which was added to the ring later because the
4323 * current strategy buffer is pinned or in use or because all strategy
4324 * buffers were dirty and rejected (for BAS_BULKREAD operations only)
4325 * requires flushing, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE
4326 * (from_ring will be false).
4327 *
4328 * When a strategy is not in use, the write can only be a "regular" write
4329 * of a dirty shared buffer (IOCONTEXT_NORMAL IOOP_WRITE).
4330 */
4332 IOOP_WRITE, io_start, 1, BLCKSZ);
4333
4335
4336 /*
4337 * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
4338 * end the BM_IO_IN_PROGRESS state.
4339 */
4340 TerminateBufferIO(buf, true, 0, true, false);
4341
4342 TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(BufTagGetForkNum(&buf->tag),
4343 buf->tag.blockNum,
4347
4348 /* Pop the error context stack */
4349 error_context_stack = errcallback.previous;
4350}
4351
4352/*
4353 * RelationGetNumberOfBlocksInFork
4354 * Determines the current number of pages in the specified relation fork.
4355 *
4356 * Note that the accuracy of the result will depend on the details of the
4357 * relation's storage. For builtin AMs it'll be accurate, but for external AMs
4358 * it might not be.
4359 */
4362{
4363 if (RELKIND_HAS_TABLE_AM(relation->rd_rel->relkind))
4364 {
4365 /*
4366 * Not every table AM uses BLCKSZ-wide, fixed-size blocks. Therefore the
4367 * tableam returns the size in bytes - but for the purpose of this
4368 * routine, we want the number of blocks, so divide, rounding
4369 * up.
4370 */
4371 uint64 szbytes;
4372
4373 szbytes = table_relation_size(relation, forkNum);
4374
4375 return (szbytes + (BLCKSZ - 1)) / BLCKSZ;
4376 }
4377 else if (RELKIND_HAS_STORAGE(relation->rd_rel->relkind))
4378 {
4379 return smgrnblocks(RelationGetSmgr(relation), forkNum);
4380 }
4381 else
4382 Assert(false);
4383
4384 return 0; /* keep compiler quiet */
4385}
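
The round-up division above means a partially filled trailing block still counts as a whole block. A tiny illustration (assumes the default BLCKSZ of 8192; not part of this file):

static void
blocks_roundup_example(void)
{
	/* 20000 bytes -> 3 blocks: the partial third block is counted */
	Assert((20000 + (BLCKSZ - 1)) / BLCKSZ == 3);
	/* an exact multiple is unchanged: 2 * BLCKSZ bytes -> 2 blocks */
	Assert((2 * BLCKSZ + (BLCKSZ - 1)) / BLCKSZ == 2);
}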
4386
4387/*
4388 * BufferIsPermanent
4389 * Determines whether a buffer will potentially still be around after
4390 * a crash. Caller must hold a buffer pin.
4391 */
4392bool
4394{
4395 BufferDesc *bufHdr;
4396
4397 /* Local buffers are used only for temp relations. */
4398 if (BufferIsLocal(buffer))
4399 return false;
4400
4401 /* Make sure we've got a real buffer, and that we hold a pin on it. */
4404
4405 /*
4406 * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
4407 * need not bother with the buffer header spinlock. Even if someone else
4408 * changes the buffer header state while we're doing this, the state is
4409 * changed atomically, so we'll read the old value or the new value, but
4410 * not random garbage.
4411 */
4412 bufHdr = GetBufferDescriptor(buffer - 1);
4413 return (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT) != 0;
4414}
4415
4416/*
4417 * BufferGetLSNAtomic
4418 * Retrieves the LSN of the buffer atomically using a buffer header lock.
4419 * This is necessary for some callers who may not have an exclusive lock
4420 * on the buffer.
4421 */
4424{
4425 char *page = BufferGetPage(buffer);
4426 BufferDesc *bufHdr;
4427 XLogRecPtr lsn;
4428 uint32 buf_state;
4429
4430 /*
4431 * If we don't need locking for correctness, fastpath out.
4432 */
4434 return PageGetLSN(page);
4435
4436 /* Make sure we've got a real buffer, and that we hold a pin on it. */
4439
4440 bufHdr = GetBufferDescriptor(buffer - 1);
4441 buf_state = LockBufHdr(bufHdr);
4442 lsn = PageGetLSN(page);
4443 UnlockBufHdr(bufHdr, buf_state);
4444
4445 return lsn;
4446}
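
A sketch of the intended usage (hypothetical caller; "cutoff" is an arbitrary caller-supplied LSN): with only a pin and a share lock held, the page LSN should be read through BufferGetLSNAtomic() rather than PageGetLSN().

static bool
page_newer_than(Buffer buf, XLogRecPtr cutoff)
{
	XLogRecPtr	lsn = BufferGetLSNAtomic(buf);

	return lsn > cutoff;
}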
4447
4448/* ---------------------------------------------------------------------
4449 * DropRelationBuffers
4450 *
4451 * This function removes from the buffer pool all the pages of the
4452 * specified relation forks that have block numbers >= firstDelBlock.
4453 * (In particular, with firstDelBlock = 0, all pages are removed.)
4454 * Dirty pages are simply dropped, without bothering to write them
4455 * out first. Therefore, this is NOT rollback-able, and so should be
4456 * used only with extreme caution!
4457 *
4458 * Currently, this is called only from smgr.c when the underlying file
4459 * is about to be deleted or truncated (firstDelBlock is needed for
4460 * the truncation case). The data in the affected pages would therefore
4461 * be deleted momentarily anyway, and there is no point in writing it.
4462 * It is the responsibility of higher-level code to ensure that the
4463 * deletion or truncation does not lose any data that could be needed
4464 * later. It is also the responsibility of higher-level code to ensure
4465 * that no other process could be trying to load more pages of the
4466 * relation into buffers.
4467 * --------------------------------------------------------------------
4468 */
4469void
4471 int nforks, BlockNumber *firstDelBlock)
4472{
4473 int i;
4474 int j;
4475 RelFileLocatorBackend rlocator;
4476 BlockNumber nForkBlock[MAX_FORKNUM];
4477 uint64 nBlocksToInvalidate = 0;
4478
4479 rlocator = smgr_reln->smgr_rlocator;
4480
4481 /* If it's a local relation, it's localbuf.c's problem. */
4482 if (RelFileLocatorBackendIsTemp(rlocator))
4483 {
4484 if (rlocator.backend == MyProcNumber)
4485 {
4486 for (j = 0; j < nforks; j++)
4487 DropRelationLocalBuffers(rlocator.locator, forkNum[j],
4488 firstDelBlock[j]);
4489 }
4490 return;
4491 }
4492
4493 /*
4494 * To remove all the pages of the specified relation forks from the buffer
4495 * pool, we would need to scan the entire buffer pool, but we can optimize
4496 * that by finding the buffers through the BufMapping table, provided we
4497 * know the exact size of each fork of the relation. The exact size is
4498 * required to ensure that we don't leave behind any buffer for the
4499 * relation being dropped, as otherwise the background writer or
4500 * checkpointer could PANIC while flushing buffers that correspond to
4501 * files that no longer exist.
4502 *
4503 * To know the exact size, we rely on the size we cache for each fork
4504 * during recovery, which limits the optimization to recovery and to
4505 * standbys, but we can easily extend it once we have a shared cache for
4506 * relation sizes.
4507 *
4508 * In recovery, we cache the value returned by the first lseek(SEEK_END),
4509 * and future writes keep the cached value up-to-date. See smgrextend.
4510 * It is possible that the value of the first lseek is smaller than the
4511 * actual number of existing blocks in the file, due to buggy Linux
4512 * kernels that might not have accounted for the recent write. But that
4513 * should be fine, because there must not be any buffers beyond that file size.
4514 */
4515 for (i = 0; i < nforks; i++)
4516 {
4517 /* Get the number of blocks for a relation's fork */
4518 nForkBlock[i] = smgrnblocks_cached(smgr_reln, forkNum[i]);
4519
4520 if (nForkBlock[i] == InvalidBlockNumber)
4521 {
4522 nBlocksToInvalidate = InvalidBlockNumber;
4523 break;
4524 }
4525
4526 /* calculate the number of blocks to be invalidated */
4527 nBlocksToInvalidate += (nForkBlock[i] - firstDelBlock[i]);
4528 }
4529
4530 /*
4531 * We apply the optimization iff the total number of blocks to invalidate
4532 * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
4533 */
4534 if (BlockNumberIsValid(nBlocksToInvalidate) &&
4535 nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
4536 {
4537 for (j = 0; j < nforks; j++)
4538 FindAndDropRelationBuffers(rlocator.locator, forkNum[j],
4539 nForkBlock[j], firstDelBlock[j]);
4540 return;
4541 }
4542
4543 for (i = 0; i < NBuffers; i++)
4544 {
4545 BufferDesc *bufHdr = GetBufferDescriptor(i);
4546 uint32 buf_state;
4547
4548 /*
4549 * We can make this a tad faster by prechecking the buffer tag before
4550 * we attempt to lock the buffer; this saves a lot of lock
4551 * acquisitions in typical cases. It should be safe because the
4552 * caller must have AccessExclusiveLock on the relation, or some other
4553 * reason to be certain that no one is loading new pages of the rel
4554 * into the buffer pool. (Otherwise we might well miss such pages
4555 * entirely.) Therefore, while the tag might be changing while we
4556 * look at it, it can't be changing *to* a value we care about, only
4557 * *away* from such a value. So false negatives are impossible, and
4558 * false positives are safe because we'll recheck after getting the
4559 * buffer lock.
4560 *
4561 * We could check forkNum and blockNum as well as the rlocator, but
4562 * the incremental win from doing so seems small.
4563 */
4564 if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator))
4565 continue;
4566
4567 buf_state = LockBufHdr(bufHdr);
4568
4569 for (j = 0; j < nforks; j++)
4570 {
4571 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator) &&
4572 BufTagGetForkNum(&bufHdr->tag) == forkNum[j] &&
4573 bufHdr->tag.blockNum >= firstDelBlock[j])
4574 {
4575 InvalidateBuffer(bufHdr); /* releases spinlock */
4576 break;
4577 }
4578 }
4579 if (j >= nforks)
4580 UnlockBufHdr(bufHdr, buf_state);
4581 }
4582}
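
As a worked example of the threshold check above: with a hypothetical pool of 16384 shared buffers (128MB at the default 8kB block size), BUF_DROP_FULL_SCAN_THRESHOLD evaluates to 16384 / 32 = 512, so the per-block BufMapping lookups are used only when fewer than 512 blocks in total need invalidating; larger drops fall through to the full pool scan.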
4583
4584/* ---------------------------------------------------------------------
4585 * DropRelationsAllBuffers
4586 *
4587 * This function removes from the buffer pool all the pages of all
4588 * forks of the specified relations. It's equivalent to calling
4589 * DropRelationBuffers once per fork per relation with firstDelBlock = 0.
4590 * --------------------------------------------------------------------
4591 */
4592void
4593DropRelationsAllBuffers(SMgrRelation *smgr_reln, int nlocators)
4594{
4595 int i;
4596 int n = 0;
4597 SMgrRelation *rels;
4598 BlockNumber (*block)[MAX_FORKNUM + 1];
4599 uint64 nBlocksToInvalidate = 0;
4600 RelFileLocator *locators;
4601 bool cached = true;
4602 bool use_bsearch;
4603
4604 if (nlocators == 0)
4605 return;
4606
4607 rels = palloc(sizeof(SMgrRelation) * nlocators); /* non-local relations */
4608
4609 /* If it's a local relation, it's localbuf.c's problem. */
4610 for (i = 0; i < nlocators; i++)
4611 {
4612 if (RelFileLocatorBackendIsTemp(smgr_reln[i]->smgr_rlocator))
4613 {
4614 if (smgr_reln[i]->smgr_rlocator.backend == MyProcNumber)
4615 DropRelationAllLocalBuffers(smgr_reln[i]->smgr_rlocator.locator);
4616 }
4617 else
4618 rels[n++] = smgr_reln[i];
4619 }
4620
4621 /*
4622 * If there are no non-local relations, then we're done. Release the
4623 * memory and return.
4624 */
4625 if (n == 0)
4626 {
4627 pfree(rels);
4628 return;
4629 }
4630
4631 /*
4632 * This is used to remember the number of blocks for all the relations
4633 * forks.
4634 */
4635 block = (BlockNumber (*)[MAX_FORKNUM + 1])
4636 palloc(sizeof(BlockNumber) * n * (MAX_FORKNUM + 1));
4637
4638 /*
4639 * We can avoid scanning the entire buffer pool if we know the exact size
4640 * of each of the given relation forks. See DropRelationBuffers.
4641 */
4642 for (i = 0; i < n && cached; i++)
4643 {
4644 for (int j = 0; j <= MAX_FORKNUM; j++)
4645 {
4646 /* Get the number of blocks for a relation's fork. */
4647 block[i][j] = smgrnblocks_cached(rels[i], j);
4648
4649 /* We only need to consider relation forks that exist. */
4650 if (block[i][j] == InvalidBlockNumber)
4651 {
4652 if (!smgrexists(rels[i], j))
4653 continue;
4654 cached = false;
4655 break;
4656 }
4657
4658 /* calculate the total number of blocks to be invalidated */
4659 nBlocksToInvalidate += block[i][j];
4660 }
4661 }
4662
4663 /*
4664 * We apply the optimization iff the total number of blocks to invalidate
4665 * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
4666 */
4667 if (cached && nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
4668 {
4669 for (i = 0; i < n; i++)
4670 {
4671 for (int j = 0; j <= MAX_FORKNUM; j++)
4672 {
4673 /* ignore relation forks that don't exist */
4674 if (!BlockNumberIsValid(block[i][j]))
4675 continue;
4676
4677 /* drop all the buffers for a particular relation fork */
4678 FindAndDropRelationBuffers(rels[i]->smgr_rlocator.locator,
4679 j, block[i][j], 0);
4680 }
4681 }
4682
4683 pfree(block);
4684 pfree(rels);
4685 return;
4686 }
4687
4688 pfree(block);
4689 locators = palloc(sizeof(RelFileLocator) * n); /* non-local relations */
4690 for (i = 0; i < n; i++)
4691 locators[i] = rels[i]->smgr_rlocator.locator;
4692
4693 /*
4694 * For a low number of relations to drop, just use a simple walk-through to
4695 * save the bsearch overhead. The threshold to use is more a guess than
4696 * an exactly determined value, as it depends on many factors (CPU and RAM
4697 * speeds, amount of shared buffers etc.).
4698 */
4699 use_bsearch = n > RELS_BSEARCH_THRESHOLD;
4700
4701 /* sort the list of rlocators if necessary */
4702 if (use_bsearch)
4703 qsort(locators, n, sizeof(RelFileLocator), rlocator_comparator);
4704
4705 for (i = 0; i < NBuffers; i++)
4706 {
4707 RelFileLocator *rlocator = NULL;
4708 BufferDesc *bufHdr = GetBufferDescriptor(i);
4709 uint32 buf_state;
4710
4711 /*
4712 * As in DropRelationBuffers, an unlocked precheck should be safe and
4713 * saves some cycles.
4714 */
4715
4716 if (!use_bsearch)
4717 {
4718 int j;
4719
4720 for (j = 0; j < n; j++)
4721 {
4722 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &locators[j]))
4723 {
4724 rlocator = &locators[j];
4725 break;
4726 }
4727 }
4728 }
4729 else
4730 {
4731 RelFileLocator locator;
4732
4733 locator = BufTagGetRelFileLocator(&bufHdr->tag);
4734 rlocator = bsearch(&locator,
4735 locators, n, sizeof(RelFileLocator),
4737 }
4738
4739 /* buffer doesn't belong to any of the given relfilelocators; skip it */
4740 if (rlocator == NULL)
4741 continue;
4742
4743 buf_state = LockBufHdr(bufHdr);
4744 if (BufTagMatchesRelFileLocator(&bufHdr->tag, rlocator))
4745 InvalidateBuffer(bufHdr); /* releases spinlock */
4746 else
4747 UnlockBufHdr(bufHdr, buf_state);
4748 }
4749
4750 pfree(locators);
4751 pfree(rels);
4752}
4753
4754/* ---------------------------------------------------------------------
4755 * FindAndDropRelationBuffers
4756 *
4757 * This function performs lookups in the BufMapping table and removes from
4758 * the buffer pool all the pages of the specified relation fork that have
4759 * block numbers >= firstDelBlock. (In particular, with firstDelBlock = 0,
4760 * all pages are removed.)
4761 * --------------------------------------------------------------------
4762 */
4763static void
4765 BlockNumber nForkBlock,
4766 BlockNumber firstDelBlock)
4767{
4768 BlockNumber curBlock;
4769
4770 for (curBlock = firstDelBlock; curBlock < nForkBlock; curBlock++)
4771 {
4772 uint32 bufHash; /* hash value for tag */
4773 BufferTag bufTag; /* identity of requested block */
4774 LWLock *bufPartitionLock; /* buffer partition lock for it */
4775 int buf_id;
4776 BufferDesc *bufHdr;
4777 uint32 buf_state;
4778
4779 /* create a tag so we can lookup the buffer */
4780 InitBufferTag(&bufTag, &rlocator, forkNum, curBlock);
4781
4782 /* determine its hash code and partition lock ID */
4783 bufHash = BufTableHashCode(&bufTag);
4784 bufPartitionLock = BufMappingPartitionLock(bufHash);
4785
4786 /* Check that it is in the buffer pool. If not, do nothing. */
4787 LWLockAcquire(bufPartitionLock, LW_SHARED);
4788 buf_id = BufTableLookup(&bufTag, bufHash);
4789 LWLockRelease(bufPartitionLock);
4790
4791 if (buf_id < 0)
4792 continue;
4793
4794 bufHdr = GetBufferDescriptor(buf_id);
4795
4796 /*
4797 * We need to lock the buffer header and recheck if the buffer is
4798 * still associated with the same block because the buffer could be
4799 * evicted by some other backend loading blocks for a different
4800 * relation after we release the lock on the BufMapping table.
4801 */
4802 buf_state = LockBufHdr(bufHdr);
4803
4804 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator) &&
4805 BufTagGetForkNum(&bufHdr->tag) == forkNum &&
4806 bufHdr->tag.blockNum >= firstDelBlock)
4807 InvalidateBuffer(bufHdr); /* releases spinlock */
4808 else
4809 UnlockBufHdr(bufHdr, buf_state);
4810 }
4811}
4812
4813/* ---------------------------------------------------------------------
4814 * DropDatabaseBuffers
4815 *
4816 * This function removes all the buffers in the buffer cache for a
4817 * particular database. Dirty pages are simply dropped, without
4818 * bothering to write them out first. This is used when we destroy a
4819 * database, to avoid trying to flush data to disk when the directory
4820 * tree no longer exists. Implementation is pretty similar to
4821 * DropRelationBuffers() which is for destroying just one relation.
4822 * --------------------------------------------------------------------
4823 */
4824void
4826{
4827 int i;
4828
4829 /*
4830 * We needn't consider local buffers, since by assumption the target
4831 * database isn't our own.
4832 */
4833
4834 for (i = 0; i < NBuffers; i++)
4835 {
4836 BufferDesc *bufHdr = GetBufferDescriptor(i);
4837 uint32 buf_state;
4838
4839 /*
4840 * As in DropRelationBuffers, an unlocked precheck should be safe and
4841 * saves some cycles.
4842 */
4843 if (bufHdr->tag.dbOid != dbid)
4844 continue;
4845
4846 buf_state = LockBufHdr(bufHdr);
4847 if (bufHdr->tag.dbOid == dbid)
4848 InvalidateBuffer(bufHdr); /* releases spinlock */
4849 else
4850 UnlockBufHdr(bufHdr, buf_state);
4851 }
4852}
4853
4854/* ---------------------------------------------------------------------
4855 * FlushRelationBuffers
4856 *
4857 * This function writes all dirty pages of a relation out to disk
4858 * (or more accurately, out to kernel disk buffers), ensuring that the
4859 * kernel has an up-to-date view of the relation.
4860 *
4861 * Generally, the caller should be holding AccessExclusiveLock on the
4862 * target relation to ensure that no other backend is busy dirtying
4863 * more blocks of the relation; the effects can't be expected to last
4864 * after the lock is released.
4865 *
4866 * XXX currently it sequentially searches the buffer pool, should be
4867 * changed to more clever ways of searching. This routine is not
4868 * used in any performance-critical code paths, so it's not worth
4869 * adding additional overhead to normal paths to make it go faster.
4870 * --------------------------------------------------------------------
4871 */
4872void
4874{
4875 int i;
4876 BufferDesc *bufHdr;
4877 SMgrRelation srel = RelationGetSmgr(rel);
4878
4879 if (RelationUsesLocalBuffers(rel))
4880 {
4881 for (i = 0; i < NLocBuffer; i++)
4882 {
4883 uint32 buf_state;
4884
4885 bufHdr = GetLocalBufferDescriptor(i);
4886 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
4887 ((buf_state = pg_atomic_read_u32(&bufHdr->state)) &
4888 (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
4889 {
4890 ErrorContextCallback errcallback;
4891
4892 /* Setup error traceback support for ereport() */
4894 errcallback.arg = bufHdr;
4895 errcallback.previous = error_context_stack;
4896 error_context_stack = &errcallback;
4897
4898 /* Make sure we can handle the pin */
4901
4902 /*
4903 * Pin/unpin mostly to make valgrind work, but it also seems
4904 * like the right thing to do.
4905 */
4906 PinLocalBuffer(bufHdr, false);
4907
4908
4909 FlushLocalBuffer(bufHdr, srel);
4910
4912
4913 /* Pop the error context stack */
4914 error_context_stack = errcallback.previous;
4915 }
4916 }
4917
4918 return;
4919 }
4920
4921 for (i = 0; i < NBuffers; i++)
4922 {
4923 uint32 buf_state;
4924
4925 bufHdr = GetBufferDescriptor(i);
4926
4927 /*
4928 * As in DropRelationBuffers, an unlocked precheck should be safe and
4929 * saves some cycles.
4930 */
4931 if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator))
4932 continue;
4933
4934 /* Make sure we can handle the pin */
4937
4938 buf_state = LockBufHdr(bufHdr);
4939 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
4940 (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
4941 {
4942 PinBuffer_Locked(bufHdr);
4946 UnpinBuffer(bufHdr);
4947 }
4948 else
4949 UnlockBufHdr(bufHdr, buf_state);
4950 }
4951}
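
A hypothetical caller sketch (not part of this file; "relid" is arbitrary), taking the AccessExclusiveLock that the header comment recommends before flushing:

static void
flush_relation_to_kernel(Oid relid)
{
	Relation	rel = table_open(relid, AccessExclusiveLock);

	FlushRelationBuffers(rel);
	table_close(rel, AccessExclusiveLock);
}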
4952
4953/* ---------------------------------------------------------------------
4954 * FlushRelationsAllBuffers
4955 *
4956 * This function flushes out of the buffer pool all the pages of all
4957 * forks of the specified smgr relations. It's equivalent to calling
4958 * FlushRelationBuffers once per relation. The relations are assumed not
4959 * to use local buffers.
4960 * --------------------------------------------------------------------
4961 */
4962void
4964{
4965 int i;
4966 SMgrSortArray *srels;
4967 bool use_bsearch;
4968
4969 if (nrels == 0)
4970 return;
4971
4972 /* fill-in array for qsort */
4973 srels = palloc(sizeof(SMgrSortArray) * nrels);
4974
4975 for (i = 0; i < nrels; i++)
4976 {
4977 Assert(!RelFileLocatorBackendIsTemp(smgrs[i]->smgr_rlocator));
4978
4979 srels[i].rlocator = smgrs[i]->smgr_rlocator.locator;
4980 srels[i].srel = smgrs[i];
4981 }
4982
4983 /*
4984 * Save the bsearch overhead for a low number of relations to sync. See
4985 * DropRelationsAllBuffers for details.
4986 */
4987 use_bsearch = nrels > RELS_BSEARCH_THRESHOLD;
4988
4989 /* sort the list of SMgrRelations if necessary */
4990 if (use_bsearch)
4991 qsort(srels, nrels, sizeof(SMgrSortArray), rlocator_comparator);
4992
4993 for (i = 0; i < NBuffers; i++)
4994 {
4995 SMgrSortArray *srelent = NULL;
4996 BufferDesc *bufHdr = GetBufferDescriptor(i);
4997 uint32 buf_state;
4998
4999 /*
5000 * As in DropRelationBuffers, an unlocked precheck should be safe and
5001 * saves some cycles.
5002 */
5003
5004 if (!use_bsearch)
5005 {
5006 int j;
5007
5008 for (j = 0; j < nrels; j++)
5009 {
5010 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srels[j].rlocator))
5011 {
5012 srelent = &srels[j];
5013 break;
5014 }
5015 }
5016 }
5017 else
5018 {
5019 RelFileLocator rlocator;
5020
5021 rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
5022 srelent = bsearch(&rlocator,
5023 srels, nrels, sizeof(SMgrSortArray),
5025 }
5026
5027 /* buffer doesn't belong to any of the given relfilelocators; skip it */
5028 if (srelent == NULL)
5029 continue;
5030
5031 /* Make sure we can handle the pin */
5034
5035 buf_state = LockBufHdr(bufHdr);
5036 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srelent->rlocator) &&
5037 (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
5038 {
5039 PinBuffer_Locked(bufHdr);
5043 UnpinBuffer(bufHdr);
5044 }
5045 else
5046 UnlockBufHdr(bufHdr, buf_state);
5047 }
5048
5049 pfree(srels);
5050}
5051
5052/* ---------------------------------------------------------------------
5053 * RelationCopyStorageUsingBuffer
5054 *
5055 * Copy a fork's data using the buffer manager. Same as RelationCopyStorage,
5056 * but instead of using smgrread and smgrextend, this copies using bufmgr APIs.
5057 *
5058 * Refer to the comments atop CreateAndCopyRelationData() for details about
5059 * the 'permanent' parameter.
5060 * --------------------------------------------------------------------
5061 */
5062static void
5064 RelFileLocator dstlocator,
5065 ForkNumber forkNum, bool permanent)
5066{
5067 Buffer srcBuf;
5068 Buffer dstBuf;
5069 Page srcPage;
5070 Page dstPage;
5071 bool use_wal;
5072 BlockNumber nblocks;
5073 BlockNumber blkno;
5075 BufferAccessStrategy bstrategy_src;
5076 BufferAccessStrategy bstrategy_dst;
5078 ReadStream *src_stream;
5079 SMgrRelation src_smgr;
5080
5081 /*
5082 * In general, we want to write WAL whenever wal_level > 'minimal', but we
5083 * can skip it when copying any fork of an unlogged relation other than
5084 * the init fork.
5085 */
5086 use_wal = XLogIsNeeded() && (permanent || forkNum == INIT_FORKNUM);
5087
5088 /* Get number of blocks in the source relation. */
5089 nblocks = smgrnblocks(smgropen(srclocator, INVALID_PROC_NUMBER),
5090 forkNum);
5091
5092 /* Nothing to copy; just return. */
5093 if (nblocks == 0)
5094 return;
5095
5096 /*
5097 * Bulk extend the destination relation to the same size as the source
5098 * relation before starting to copy block by block.
5099 */
5100 memset(buf.data, 0, BLCKSZ);
5101 smgrextend(smgropen(dstlocator, INVALID_PROC_NUMBER), forkNum, nblocks - 1,
5102 buf.data, true);
5103
5104 /* This is a bulk operation, so use buffer access strategies. */
5105 bstrategy_src = GetAccessStrategy(BAS_BULKREAD);
5106 bstrategy_dst = GetAccessStrategy(BAS_BULKWRITE);
5107
5108 /* Initialize streaming read */
5109 p.current_blocknum = 0;
5110 p.last_exclusive = nblocks;
5111 src_smgr = smgropen(srclocator, INVALID_PROC_NUMBER);
5112
5113 /*
5114 * It is safe to use batchmode as block_range_read_stream_cb takes no
5115 * locks.
5116 */
5119 bstrategy_src,
5120 src_smgr,
5121 permanent ? RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED,
5122 forkNum,
5124 &p,
5125 0);
5126
5127 /* Iterate over each block of the source relation file. */
5128 for (blkno = 0; blkno < nblocks; blkno++)
5129 {
5131
5132 /* Read block from source relation. */
5133 srcBuf = read_stream_next_buffer(src_stream, NULL);
5135 srcPage = BufferGetPage(srcBuf);
5136
5137 dstBuf = ReadBufferWithoutRelcache(dstlocator, forkNum,
5138 BufferGetBlockNumber(srcBuf),
5139 RBM_ZERO_AND_LOCK, bstrategy_dst,
5140 permanent);
5141 dstPage = BufferGetPage(dstBuf);
5142
5144
5145 /* Copy page data from the source to the destination. */
5146 memcpy(dstPage, srcPage, BLCKSZ);
5147 MarkBufferDirty(dstBuf);
5148
5149 /* WAL-log the copied page. */
5150 if (use_wal)
5151 log_newpage_buffer(dstBuf, true);
5152
5154
5155 UnlockReleaseBuffer(dstBuf);
5156 UnlockReleaseBuffer(srcBuf);
5157 }
5158 Assert(read_stream_next_buffer(src_stream, NULL) == InvalidBuffer);
5159 read_stream_end(src_stream);
5160
5161 FreeAccessStrategy(bstrategy_src);
5162 FreeAccessStrategy(bstrategy_dst);
5163}
5164
5165/* ---------------------------------------------------------------------
5166 * CreateAndCopyRelationData
5167 *
5168 * Create destination relation storage and copy all forks from the
5169 * source relation to the destination.
5170 *
5171 * Pass permanent as true for permanent relations and false for
5172 * unlogged relations. Currently this API is not supported for
5173 * temporary relations.
5174 * --------------------------------------------------------------------
5175 */
5176void
5178 RelFileLocator dst_rlocator, bool permanent)
5179{
5180 char relpersistence;
5181 SMgrRelation src_rel;
5182 SMgrRelation dst_rel;
5183
5184 /* Set the relpersistence. */
5185 relpersistence = permanent ?
5186 RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED;
5187
5188 src_rel = smgropen(src_rlocator, INVALID_PROC_NUMBER);
5189 dst_rel = smgropen(dst_rlocator, INVALID_PROC_NUMBER);
5190
5191 /*
5192 * Create and copy all forks of the relation. During CREATE DATABASE we
5193 * have a separate cleanup mechanism that deletes the complete database
5194 * directory. Therefore, each individual relation doesn't need to be
5195 * registered for cleanup.
5196 */
5197 RelationCreateStorage(dst_rlocator, relpersistence, false);
5198
5199 /* copy main fork. */
5200 RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, MAIN_FORKNUM,
5201 permanent);
5202
5203 /* copy those extra forks that exist */
5204 for (ForkNumber forkNum = MAIN_FORKNUM + 1;
5205 forkNum <= MAX_FORKNUM; forkNum++)
5206 {
5207 if (smgrexists(src_rel, forkNum))
5208 {
5209 smgrcreate(dst_rel, forkNum, false);
5210
5211 /*
5212 * WAL log creation if the relation is persistent, or this is the
5213 * init fork of an unlogged relation.
5214 */
5215 if (permanent || forkNum == INIT_FORKNUM)
5216 log_smgrcreate(&dst_rlocator, forkNum);
5217
5218 /* Copy a fork's data, block by block. */
5219 RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, forkNum,
5220 permanent);
5221 }
5222 }
5223}
5224
5225/* ---------------------------------------------------------------------
5226 * FlushDatabaseBuffers
5227 *
5228 * This function writes all dirty pages of a database out to disk
5229 * (or more accurately, out to kernel disk buffers), ensuring that the
5230 * kernel has an up-to-date view of the database.
5231 *
5232 * Generally, the caller should be holding an appropriate lock to ensure
5233 * no other backend is active in the target database; otherwise more
5234 * pages could get dirtied.
5235 *
5236 * Note we don't worry about flushing any pages of temporary relations.
5237 * It's assumed these wouldn't be interesting.
5238 * --------------------------------------------------------------------
5239 */
5240void
5242{
5243 int i;
5244 BufferDesc *bufHdr;
5245
5246 for (i = 0; i < NBuffers; i++)
5247 {
5248 uint32 buf_state;
5249
5250 bufHdr = GetBufferDescriptor(i);
5251
5252 /*
5253 * As in DropRelationBuffers, an unlocked precheck should be safe and
5254 * saves some cycles.
5255 */
5256 if (bufHdr->tag.dbOid != dbid)
5257 continue;
5258
5259 /* Make sure we can handle the pin */
5262
5263 buf_state = LockBufHdr(bufHdr);
5264 if (bufHdr->tag.dbOid == dbid &&
5265 (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
5266 {
5267 PinBuffer_Locked(bufHdr);
5271 UnpinBuffer(bufHdr);
5272 }
5273 else
5274 UnlockBufHdr(bufHdr, buf_state);
5275 }
5276}
5277
5278/*
5279 * Flush a buffer that was previously pinned and locked (either shared or
5280 * exclusively) to the OS.
5281 */
5282void
5284{
5285 BufferDesc *bufHdr;
5286
5287 /* currently not needed, but no fundamental reason not to support */
5289
5291
5292 bufHdr = GetBufferDescriptor(buffer - 1);
5293
5295
5297}
5298
5299/*
5300 * ReleaseBuffer -- release the pin on a buffer
5301 */
5302void
5304{
5305 if (!BufferIsValid(buffer))
5306 elog(ERROR, "bad buffer ID: %d", buffer);
5307
5308 if (BufferIsLocal(buffer))
5310 else
5312}
5313
5314/*
5315 * UnlockReleaseBuffer -- release the content lock and pin on a buffer
5316 *
5317 * This is just a shorthand for a common combination.
5318 */
5319void
5321{
5324}
5325
5326/*
5327 * IncrBufferRefCount
5328 * Increment the pin count on a buffer that we have *already* pinned
5329 * at least once.
5330 *
5331 * This function cannot be used on a buffer we do not have pinned,
5332 * because it doesn't change the shared buffer state.
5333 */
5334void
5336{
5339 if (BufferIsLocal(buffer))
5340 LocalRefCount[-buffer - 1]++;
5341 else
5342 {
5344
5345 ref = GetPrivateRefCountEntry(buffer, true);
5346 Assert(ref != NULL);
5347 ref->refcount++;
5348 }
5350}
5351
5352/*
5353 * MarkBufferDirtyHint
5354 *
5355 * Mark a buffer dirty for non-critical changes.
5356 *
5357 * This is essentially the same as MarkBufferDirty, except:
5358 *
5359 * 1. The caller does not write WAL; so if checksums are enabled, we may need
5360 * to write an XLOG_FPI_FOR_HINT WAL record to protect against torn pages.
5361 * 2. The caller might have only share-lock instead of exclusive-lock on the
5362 * buffer's content lock.
5363 * 3. This function does not guarantee that the buffer is always marked dirty
5364 * (due to a race condition), so it cannot be used for important changes.
5365 */
5366void
5368{
5369 BufferDesc *bufHdr;
5370 Page page = BufferGetPage(buffer);
5371
5372 if (!BufferIsValid(buffer))
5373 elog(ERROR, "bad buffer ID: %d", buffer);
5374
5375 if (BufferIsLocal(buffer))
5376 {
5378 return;
5379 }
5380
5381 bufHdr = GetBufferDescriptor(buffer - 1);
5382
5384 /* here, either share or exclusive lock is OK */
5386
5387 /*
5388 * This routine might get called many times on the same page, if we are
5389 * making the first scan after commit of an xact that added/deleted many
5390 * tuples. So, be as quick as we can if the buffer is already dirty. We
5391 * do this by not acquiring spinlock if it looks like the status bits are
5392 * already set. Since we make this test unlocked, there's a chance we
5393 * might fail to notice that the flags have just been cleared, and fail
5394 * to reset them, due to memory-ordering issues. But since this function
5395 * is only intended to be used in cases where failing to write out the
5396 * data would be harmless anyway, it doesn't really matter.
5397 */
5398 if ((pg_atomic_read_u32(&bufHdr->state) & (BM_DIRTY | BM_JUST_DIRTIED)) !=
5400 {
5402 bool dirtied = false;
5403 bool delayChkptFlags = false;
5404 uint32 buf_state;
5405
5406 /*
5407 * If we need to protect hint bit updates from torn writes, WAL-log a
5408 * full page image of the page. This full page image is only necessary
5409 * if the hint bit update is the first change to the page since the
5410 * last checkpoint.
5411 *
5412 * We don't check full_page_writes here because that logic is included
5413 * when we call XLogInsert() since the value changes dynamically.
5414 */
5415 if (XLogHintBitIsNeeded() &&
5417 {
5418 /*
5419 * If we must not write WAL, due to a relfilelocator-specific
5420 * condition or being in recovery, don't dirty the page. We can
5421 * set the hint, just not dirty the page as a result, so the hint
5422 * is lost when we evict the page or shut down.
5423 *
5424 * See src/backend/storage/page/README for longer discussion.
5425 */
5426 if (RecoveryInProgress() ||
5428 return;
5429
5430 /*
5431 * If the block is already dirty because we either made a change
5432 * or set a hint already, then we don't need to write a full page
5433 * image. Note that aggressive cleaning of blocks dirtied by hint
5434 * bit setting would increase the call rate. Bulk setting of hint
5435 * bits would reduce the call rate...
5436 *
5437 * We must issue the WAL record before we mark the buffer dirty.
5438 * Otherwise we might write the page before we write the WAL. That
5439 * causes a race condition, since a checkpoint might occur between
5440 * writing the WAL record and marking the buffer dirty. We solve
5441 * that with a kluge, but one that is already in use during
5442 * transaction commit to prevent race conditions. Basically, we
5443 * simply prevent the checkpoint WAL record from being written
5444 * until we have marked the buffer dirty. We don't start the
5445 * checkpoint flush until we have marked dirty, so our checkpoint
5446 * must flush the change to disk successfully or the checkpoint
5447 * never gets written, in which case crash recovery will fix things up.
5448 *
5449 * It's possible we may enter here without an xid, so it is
5450 * essential that CreateCheckPoint waits for virtual transactions
5451 * rather than full transactionids.
5452 */
5455 delayChkptFlags = true;
5456 lsn = XLogSaveBufferForHint(buffer, buffer_std);
5457 }
5458
5459 buf_state = LockBufHdr(bufHdr);
5460
5461 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
5462
5463 if (!(buf_state & BM_DIRTY))
5464 {
5465 dirtied = true; /* Means "will be dirtied by this action" */
5466
5467 /*
5468 * Set the page LSN if we wrote a backup block. We aren't supposed
5469 * to set this when only holding a share lock but as long as we
5470 * serialise it somehow we're OK. We choose to set LSN while
5471 * holding the buffer header lock, which causes any reader of an
5472 * LSN who holds only a share lock to also obtain a buffer header
5473 * lock before using PageGetLSN(), which is enforced in
5474 * BufferGetLSNAtomic().
5475 *
5476 * If checksums are enabled, you might think we should reset the
5477 * checksum here. That will happen when the page is written
5478 * sometime later in this checkpoint cycle.
5479 */
5480 if (!XLogRecPtrIsInvalid(lsn))
5481 PageSetLSN(page, lsn);
5482 }
5483
5484 buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
5485 UnlockBufHdr(bufHdr, buf_state);
5486
5487 if (delayChkptFlags)
5488 MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
5489
5490 if (dirtied)
5491 {
5493 if (VacuumCostActive)
5495 }
5496 }
5497}
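
A sketch of the typical hint-setting pattern (hypothetical caller; set_hint_on_page() stands in for whatever non-critical page change is being made): the caller holds a pin and at least a share content lock, applies the hint, then calls MarkBufferDirtyHint() with buffer_std = true for a standard page layout.

static void
set_page_hint(Buffer buf)
{
	Page		page = BufferGetPage(buf);

	set_hint_on_page(page);			/* hypothetical non-critical change */
	MarkBufferDirtyHint(buf, true);	/* true: page has a standard layout */
}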
5498
5499/*
5500 * Release buffer content locks for shared buffers.
5501 *
5502 * Used to clean up after errors.
5503 *
5504 * Currently, we can expect that lwlock.c's LWLockReleaseAll() took care
5505 * of releasing buffer content locks per se; the only thing we need to deal
5506 * with here is clearing any PIN_COUNT request that was in progress.
5507 */
5508void
5510{
5512
5513 if (buf)
5514 {
5515 uint32 buf_state;
5516
5517 buf_state = LockBufHdr(buf);
5518
5519 /*
5520 * Don't complain if flag bit not set; it could have been reset but we
5521 * got a cancel/die interrupt before getting the signal.
5522 */
5523 if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
5524 buf->wait_backend_pgprocno == MyProcNumber)
5525 buf_state &= ~BM_PIN_COUNT_WAITER;
5526
5527 UnlockBufHdr(buf, buf_state);
5528
5529 PinCountWaitBuf = NULL;
5530 }
5531}
5532
5533/*
5534 * Acquire or release the content_lock for the buffer.
5535 */
5536void
5538{
5539 BufferDesc *buf;
5540
5542 if (BufferIsLocal(buffer))
5543 return; /* local buffers need no lock */
5544
5546
5547 if (mode == BUFFER_LOCK_UNLOCK)
5549 else if (mode == BUFFER_LOCK_SHARE)
5551 else if (mode == BUFFER_LOCK_EXCLUSIVE)
5553 else
5554 elog(ERROR, "unrecognized buffer lock mode: %d", mode);
5555}
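
For context, the canonical pin/lock/examine/unlock/unpin sequence built on these primitives looks roughly like this (hypothetical caller; "rel" and "blkno" are arbitrary):

static void
examine_block(Relation rel, BlockNumber blkno)
{
	Buffer		buf = ReadBuffer(rel, blkno);

	LockBuffer(buf, BUFFER_LOCK_SHARE);
	/* ... inspect BufferGetPage(buf) while the content lock is held ... */
	LockBuffer(buf, BUFFER_LOCK_UNLOCK);
	ReleaseBuffer(buf);
}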
5556
5557/*
5558 * Acquire the content_lock for the buffer, but only if we don't have to wait.
5559 *
5560 * This assumes the caller wants BUFFER_LOCK_EXCLUSIVE mode.
5561 */
5562bool
5564{
5565 BufferDesc *buf;
5566
5568 if (BufferIsLocal(buffer))
5569 return true; /* act as though we got it */
5570
5572
5574 LW_EXCLUSIVE);
5575}
5576
5577/*
5578 * Verify that this backend is pinning the buffer exactly once.
5579 *
5580 * NOTE: Like in BufferIsPinned(), what we check here is that *this* backend
5581 * holds a pin on the buffer. We do not care whether some other backend does.
5582 */
5583void
5585{
5586 if (BufferIsLocal(buffer))
5587 {
5588 if (LocalRefCount[-buffer - 1] != 1)
5589 elog(ERROR, "incorrect local pin count: %d",
5590 LocalRefCount[-buffer - 1]);
5591 }
5592 else
5593 {
5594 if (GetPrivateRefCount(buffer) != 1)
5595 elog(ERROR, "incorrect local pin count: %d",
5597 }
5598}
5599
5600/*
5601 * LockBufferForCleanup - lock a buffer in preparation for deleting items
5602 *
5603 * Items may be deleted from a disk page only when the caller (a) holds an
5604 * exclusive lock on the buffer and (b) has observed that no other backend
5605 * holds a pin on the buffer. If there is a pin, then the other backend
5606 * might have a pointer into the buffer (for example, a heapscan reference
5607 * to an item --- see README for more details). It's OK if a pin is added
5608 * after the cleanup starts, however; the newly-arrived backend will be
5609 * unable to look at the page until we release the exclusive lock.
5610 *
5611 * To implement this protocol, a would-be deleter must pin the buffer and
5612 * then call LockBufferForCleanup(). LockBufferForCleanup() is similar to
5613 * LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE), except that it loops until
5614 * it has successfully observed pin count = 1.
5615 */
5616void
5618{
5619 BufferDesc *bufHdr;
5620 TimestampTz waitStart = 0;
5621 bool waiting = false;
5622 bool logged_recovery_conflict = false;
5623
5625 Assert(PinCountWaitBuf == NULL);
5626
5628
5629 /*
5630 * We do not yet need to be worried about in-progress AIOs holding a pin,
5631 * as we, so far, only support doing reads via AIO and this function can
5632 * only be called once the buffer is valid (i.e. no read can be in
5633 * flight).
5634 */
5635
5636 /* Nobody else to wait for */
5637 if (BufferIsLocal(buffer))
5638 return;
5639
5640 bufHdr = GetBufferDescriptor(buffer - 1);
5641
5642 for (;;)
5643 {
5644 uint32 buf_state;
5645
5646 /* Try to acquire lock */
5648 buf_state = LockBufHdr(bufHdr);
5649
5650 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
5651 if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
5652 {
5653 /* Successfully acquired exclusive lock with pincount 1 */
5654 UnlockBufHdr(bufHdr, buf_state);
5655
5656 /*
5657 * Emit the log message if recovery conflict on buffer pin was
5658 * resolved but the startup process waited longer than
5659 * deadlock_timeout for it.
5660 */
5661 if (logged_recovery_conflict)
5663 waitStart, GetCurrentTimestamp(),
5664 NULL, false);
5665
5666 if (waiting)
5667 {
5668 /* reset ps display to remove the suffix if we added one */
5670 waiting = false;
5671 }
5672 return;
5673 }
5674 /* Failed, so mark myself as waiting for pincount 1 */
5675 if (buf_state & BM_PIN_COUNT_WAITER)
5676 {
5677 UnlockBufHdr(bufHdr, buf_state);
5679 elog(ERROR, "multiple backends attempting to wait for pincount 1");
5680 }
5682 PinCountWaitBuf = bufHdr;
5683 buf_state |= BM_PIN_COUNT_WAITER;
5684 UnlockBufHdr(bufHdr, buf_state);
5686
5687 /* Wait to be signaled by UnpinBuffer() */
5688 if (InHotStandby)
5689 {
5690 if (!waiting)
5691 {
5692 /* adjust the process title to indicate that it's waiting */
5693 set_ps_display_suffix("waiting");
5694 waiting = true;
5695 }
5696
5697 /*
5698 * Emit the log message if the startup process is waiting longer
5699 * than deadlock_timeout for recovery conflict on buffer pin.
5700 *
5701 * Skip this if first time through because the startup process has
5702 * not started waiting yet in this case. So, the wait start
5703 * timestamp is set after this logic.
5704 */
5705 if (waitStart != 0 && !logged_recovery_conflict)
5706 {
5708
5709 if (TimestampDifferenceExceeds(waitStart, now,
5711 {
5713 waitStart, now, NULL, true);
5714 logged_recovery_conflict = true;
5715 }
5716 }
5717
5718 /*
5719 * Set the wait start timestamp if logging is enabled and first
5720 * time through.
5721 */
5722 if (log_recovery_conflict_waits && waitStart == 0)
5723 waitStart = GetCurrentTimestamp();
5724
5725 /* Publish the bufid that Startup process waits on */
5726 SetStartupBufferPinWaitBufId(buffer - 1);
5727 /* Set alarm and then wait to be signaled by UnpinBuffer() */
5728 ResolveRecoveryConflictWithBufferPin();
5729 /* Reset the published bufid */
5730 SetStartupBufferPinWaitBufId(-1);
5731 }
5732 else
5733 ProcWaitForSignal(WAIT_EVENT_BUFFER_PIN);
5734
5735 /*
5736 * Remove flag marking us as waiter. Normally this will not be set
5737 * anymore, but ProcWaitForSignal() can return for other signals as
5738 * well. We take care to only reset the flag if we're the waiter, as
5739 * theoretically another backend could have started waiting. That's
5740 * impossible with the current usages due to table level locking, but
5741 * better be safe.
5742 */
5743 buf_state = LockBufHdr(bufHdr);
5744 if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
5745 bufHdr->wait_backend_pgprocno == MyProcNumber)
5746 buf_state &= ~BM_PIN_COUNT_WAITER;
5747 UnlockBufHdr(bufHdr, buf_state);
5748
5749 PinCountWaitBuf = NULL;
5750 /* Loop back and try again */
5751 }
5752}
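/*
 * Illustrative sketch (added for exposition, not part of bufmgr.c): a
 * would-be deleter typically applies the protocol described above like
 * this, assuming "rel" and "blkno" are already at hand:
 *
 *		Buffer		buf = ReadBuffer(rel, blkno);
 *
 *		LockBufferForCleanup(buf);
 *		... delete or prune items on the page ...
 *		UnlockReleaseBuffer(buf);
 *
 * LockBufferForCleanup() returns only once this backend holds the exclusive
 * content lock and is the sole pinner; UnlockReleaseBuffer() then drops both
 * the lock and the pin.
 */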
5753
5754/*
5755 * Check called from ProcessRecoveryConflictInterrupts() when Startup process
5756 * requests cancellation of all pin holders that are blocking it.
5757 */
5758bool
5759HoldingBufferPinThatDelaysRecovery(void)
5760{
5761 int bufid = GetStartupBufferPinWaitBufId();
5762
5763 /*
5764 * If we get woken slowly then it's possible that the Startup process was
5765 * already woken by other backends before we got here. Also possible that
5766 * we get here by multiple interrupts or interrupts at inappropriate
5767 * times, so make sure we do nothing if the bufid is not set.
5768 */
5769 if (bufid < 0)
5770 return false;
5771
5772 if (GetPrivateRefCount(bufid + 1) > 0)
5773 return true;
5774
5775 return false;
5776}
5777
5778/*
5779 * ConditionalLockBufferForCleanup - as above, but don't wait to get the lock
5780 *
5781 * We won't loop, but just check once to see if the pin count is OK. If
5782 * not, return false with no lock held.
5783 */
5784bool
5785ConditionalLockBufferForCleanup(Buffer buffer)
5786{
5787 BufferDesc *bufHdr;
5788 uint32 buf_state,
5789 refcount;
5790
5791 Assert(BufferIsValid(buffer));
5792
5793 /* see AIO related comment in LockBufferForCleanup() */
5794
5795 if (BufferIsLocal(buffer))
5796 {
5797 refcount = LocalRefCount[-buffer - 1];
5798 /* There should be exactly one pin */
5799 Assert(refcount > 0);
5800 if (refcount != 1)
5801 return false;
5802 /* Nobody else to wait for */
5803 return true;
5804 }
5805
5806 /* There should be exactly one local pin */
5807 refcount = GetPrivateRefCount(buffer);
5808 Assert(refcount);
5809 if (refcount != 1)
5810 return false;
5811
5812 /* Try to acquire lock */
5813 if (!ConditionalLockBuffer(buffer))
5814 return false;
5815
5816 bufHdr = GetBufferDescriptor(buffer - 1);
5817 buf_state = LockBufHdr(bufHdr);
5818 refcount = BUF_STATE_GET_REFCOUNT(buf_state);
5819
5820 Assert(refcount > 0);
5821 if (refcount == 1)
5822 {
5823 /* Successfully acquired exclusive lock with pincount 1 */
5824 UnlockBufHdr(bufHdr, buf_state);
5825 return true;
5826 }
5827
5828 /* Failed, so release the lock */
5829 UnlockBufHdr(bufHdr, buf_state);
5830 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
5831 return false;
5832}
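/*
 * Illustrative sketch (added for exposition, not part of bufmgr.c):
 * opportunistic callers that can simply skip a busy page use the
 * conditional variant and fall back gracefully:
 *
 *		if (ConditionalLockBufferForCleanup(buf))
 *		{
 *			... perform cleanup ...
 *			UnlockReleaseBuffer(buf);
 *		}
 *		else
 *		{
 *			ReleaseBuffer(buf);
 *			... skip the page, or retry later with LockBufferForCleanup() ...
 *		}
 */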
5833
5834/*
5835 * IsBufferCleanupOK - as above, but we already have the lock
5836 *
5837 * Check whether it's OK to perform cleanup on a buffer we've already
5838 * locked. If we observe that the pin count is 1, our exclusive lock
5839 * happens to be a cleanup lock, and we can proceed with anything that
5840 * would have been allowable had we sought a cleanup lock originally.
5841 */
5842bool
5843IsBufferCleanupOK(Buffer buffer)
5844{
5845 BufferDesc *bufHdr;
5846 uint32 buf_state;
5847
5848 Assert(BufferIsValid(buffer));
5849
5850 /* see AIO related comment in LockBufferForCleanup() */
5851
5852 if (BufferIsLocal(buffer))
5853 {
5854 /* There should be exactly one pin */
5855 if (LocalRefCount[-buffer - 1] != 1)
5856 return false;
5857 /* Nobody else to wait for */
5858 return true;
5859 }
5860
5861 /* There should be exactly one local pin */
5862 if (GetPrivateRefCount(buffer) != 1)
5863 return false;
5864
5865 bufHdr = GetBufferDescriptor(buffer - 1);
5866
5867 /* caller must hold exclusive lock on buffer */
5868 Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
5869 LW_EXCLUSIVE));
5870
5871 buf_state = LockBufHdr(bufHdr);
5872
5873 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
5874 if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
5875 {
5876 /* pincount is OK. */
5877 UnlockBufHdr(bufHdr, buf_state);
5878 return true;
5879 }
5880
5881 UnlockBufHdr(bufHdr, buf_state);
5882 return false;
5883}
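/*
 * Illustrative sketch (added for exposition, not part of bufmgr.c): when the
 * exclusive content lock is already held, a caller can check on the fly
 * whether cleanup-only work is permitted:
 *
 *		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
 *		if (IsBufferCleanupOK(buf))
 *			... safe to do work that requires a cleanup lock ...
 *		else
 *			... restrict ourselves to ordinary exclusive-lock work ...
 */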
5884
5885
5886/*
5887 * Functions for buffer I/O handling
5888 *
5889 * Also note that these are used only for shared buffers, not local ones.
5890 */
5891
5892/*
5893 * WaitIO -- Block until the IO_IN_PROGRESS flag on 'buf' is cleared.
5894 */
5895static void
5896WaitIO(BufferDesc *buf)
5897{
5898 ConditionVariable *cv = BufferDescriptorGetIOCV(buf);
5899
5900 ConditionVariablePrepareToSleep(cv);
5901 for (;;)
5902 {
5903 uint32 buf_state;
5904 PgAioWaitRef iow;
5905
5906 /*
5907 * It may not be necessary to acquire the spinlock to check the flag
5908 * here, but since this test is essential for correctness, we'd better
5909 * play it safe.
5910 */
5911 buf_state = LockBufHdr(buf);
5912
5913 /*
5914 * Copy the wait reference while holding the spinlock. This protects
5915 * against a concurrent TerminateBufferIO() in another backend from
5916 * clearing the wref while it's being read.
5917 */
5918 iow = buf->io_wref;
5919 UnlockBufHdr(buf, buf_state);
5920
5921 /* no IO in progress, we don't need to wait */
5922 if (!(buf_state & BM_IO_IN_PROGRESS))
5923 break;
5924
5925 /*
5926 * The buffer has asynchronous IO in progress, wait for it to
5927 * complete.
5928 */
5929 if (pgaio_wref_valid(&iow))
5930 {
5931 pgaio_wref_wait(&iow);
5932
5933 /*
5934 * The AIO subsystem internally uses condition variables and thus
5935 * might remove this backend from the BufferDesc's CV. While that
5936 * wouldn't cause a correctness issue (the first CV sleep just
5937 * immediately returns if not already registered), it seems worth
5938 * avoiding unnecessary loop iterations, given that we take care
5939 * to do so at the start of the function.
5940 */
5941 ConditionVariablePrepareToSleep(cv);
5942 continue;
5943 }
5944
5945 /* wait on BufferDesc->cv, e.g. for concurrent synchronous IO */
5946 ConditionVariableSleep(cv, WAIT_EVENT_BUFFER_IO);
5947 }
5949}
5950
5951/*
5952 * StartBufferIO: begin I/O on this buffer
5953 * (Assumptions)
5954 * My process is executing no IO on this buffer
5955 * The buffer is Pinned
5956 *
5957 * In some scenarios multiple backends could attempt the same I/O operation
5958 * concurrently. If someone else has already started I/O on this buffer then
5959 * we will wait for completion of the IO using WaitIO().
5960 *
5961 * Input operations are only attempted on buffers that are not BM_VALID,
5962 * and output operations only on buffers that are BM_VALID and BM_DIRTY,
5963 * so we can always tell if the work is already done.
5964 *
5965 * Returns true if we successfully marked the buffer as I/O busy,
5966 * false if someone else already did the work.
5967 *
5968 * If nowait is true, then we don't wait for an I/O to be finished by another
5969 * backend. In that case, false indicates either that the I/O was already
5970 * finished, or is still in progress. This is useful for callers that want to
5971 * find out if they can perform the I/O as part of a larger operation, without
5972 * waiting for the answer or distinguishing the reasons why not.
5973 */
5974bool
5975StartBufferIO(BufferDesc *buf, bool forInput, bool nowait)
5976{
5977 uint32 buf_state;
5978
5979 ResourceOwnerEnlarge(CurrentResourceOwner);
5980
5981 for (;;)
5982 {
5983 buf_state = LockBufHdr(buf);
5984
5985 if (!(buf_state & BM_IO_IN_PROGRESS))
5986 break;
5987 UnlockBufHdr(buf, buf_state);
5988 if (nowait)
5989 return false;
5990 WaitIO(buf);
5991 }
5992
5993 /* Once we get here, there is definitely no I/O active on this buffer */
5994
5995 /* Check if someone else already did the I/O */
5996 if (forInput ? (buf_state & BM_VALID) : !(buf_state & BM_DIRTY))
5997 {
5998 UnlockBufHdr(buf, buf_state);
5999 return false;
6000 }
6001
6002 buf_state |= BM_IO_IN_PROGRESS;
6003 UnlockBufHdr(buf, buf_state);
6004
6005 ResourceOwnerRememberBufferIO(CurrentResourceOwner,
6006 BufferDescriptorGetBuffer(buf));
6007
6008 return true;
6009}
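/*
 * Illustrative sketch (added for exposition, condensed from the synchronous
 * write path in FlushBuffer(); not part of bufmgr.c): a typical output cycle
 * pairs StartBufferIO() with TerminateBufferIO():
 *
 *		if (StartBufferIO(buf_hdr, false, false))
 *		{
 *			... write the page out via smgr ...
 *			TerminateBufferIO(buf_hdr, true, 0, true, false);
 *		}
 *
 * A successful read completion instead passes BM_VALID in set_flag_bits,
 * e.g. TerminateBufferIO(buf_hdr, false, BM_VALID, true, false).
 */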
6010
6011/*
6012 * TerminateBufferIO: release a buffer we were doing I/O on
6013 * (Assumptions)
6014 * My process is executing IO for the buffer
6015 * BM_IO_IN_PROGRESS bit is set for the buffer
6016 * The buffer is Pinned
6017 *
6018 * If clear_dirty is true and BM_JUST_DIRTIED is not set, we clear the
6019 * buffer's BM_DIRTY flag. This is appropriate when terminating a
6020 * successful write. The check on BM_JUST_DIRTIED is necessary to avoid
6021 * marking the buffer clean if it was re-dirtied while we were writing.
6022 *
6023 * set_flag_bits gets ORed into the buffer's flags. It must include
6024 * BM_IO_ERROR in a failure case. For successful completion it could
6025 * be 0, or BM_VALID if we just finished reading in the page.
6026 *
6027 * If forget_owner is true, we release the buffer I/O from the current
6028 * resource owner. (forget_owner=false is used when the resource owner itself
6029 * is being released)
6030 */
6031void
6032TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits,
6033 bool forget_owner, bool release_aio)
6034{
6035 uint32 buf_state;
6036
6037 buf_state = LockBufHdr(buf);
6038
6039 Assert(buf_state & BM_IO_IN_PROGRESS);
6040 buf_state &= ~BM_IO_IN_PROGRESS;
6041
6042 /* Clear earlier errors, if this IO failed, it'll be marked again */
6043 buf_state &= ~BM_IO_ERROR;
6044
6045 if (clear_dirty && !(buf_state & BM_JUST_DIRTIED))
6046 buf_state &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
6047
6048 if (release_aio)
6049 {
6050 /* release ownership by the AIO subsystem */
6051 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
6052 buf_state -= BUF_REFCOUNT_ONE;
6053 pgaio_wref_clear(&buf->io_wref);
6054 }
6055
6056 buf_state |= set_flag_bits;
6057 UnlockBufHdr(buf, buf_state);
6058
6059 if (forget_owner)
6060 ResourceOwnerForgetBufferIO(CurrentResourceOwner,
6061 BufferDescriptorGetBuffer(buf));
6062
6063 ConditionVariableBroadcast(BufferDescriptorGetIOCV(buf));
6064
6065 /*
6066 * Support LockBufferForCleanup()
6067 *
6068 * We may have just released the last pin other than the waiter's. In most
6069 * cases, this backend holds another pin on the buffer. But, if, for
6070 * example, this backend is completing an IO issued by another backend, it
6071 * may be time to wake the waiter.
6072 */
6073 if (release_aio && (buf_state & BM_PIN_COUNT_WAITER))
6074 WakePinCountWaiter(buf);
6075}
6076
6077/*
6078 * AbortBufferIO: Clean up active buffer I/O after an error.
6079 *
6080 * All LWLocks we might have held have been released,
6081 * but we haven't yet released buffer pins, so the buffer is still pinned.
6082 *
6083 * If I/O was in progress, we always set BM_IO_ERROR, even though it's
6084 * possible the error condition wasn't related to the I/O.
6085 *
6086 * Note: this does not remove the buffer I/O from the resource owner.
6087 * That's correct when we're releasing the whole resource owner, but
6088 * beware if you use this in other contexts.
6089 */
6090static void
6091AbortBufferIO(Buffer buffer)
6092{
6093 BufferDesc *buf_hdr = GetBufferDescriptor(buffer - 1);
6094 uint32 buf_state;
6095
6096 buf_state = LockBufHdr(buf_hdr);
6097 Assert(buf_state & (BM_IO_IN_PROGRESS | BM_TAG_VALID));
6098
6099 if (!(buf_state & BM_VALID))
6100 {
6101 Assert(!(buf_state & BM_DIRTY));
6102 UnlockBufHdr(buf_hdr, buf_state);
6103 }
6104 else
6105 {
6106 Assert(buf_state & BM_DIRTY);
6107 UnlockBufHdr(buf_hdr, buf_state);
6108
6109 /* Issue notice if this is not the first failure... */
6110 if (buf_state & BM_IO_ERROR)
6111 {
6112 /* Buffer is pinned, so we can read tag without spinlock */
6113 ereport(WARNING,
6114 (errcode(ERRCODE_IO_ERROR),
6115 errmsg("could not write block %u of %s",
6116 buf_hdr->tag.blockNum,
6117 relpathperm(BufTagGetRelFileLocator(&buf_hdr->tag),
6118 BufTagGetForkNum(&buf_hdr->tag)).str),
6119 errdetail("Multiple failures --- write error might be permanent.")));
6120 }
6121 }
6122
6123 TerminateBufferIO(buf_hdr, false, BM_IO_ERROR, false, false);
6124}
6125
6126/*
6127 * Error context callback for errors occurring during shared buffer writes.
6128 */
6129static void
6130shared_buffer_write_error_callback(void *arg)
6131{
6132 BufferDesc *bufHdr = (BufferDesc *) arg;
6133
6134 /* Buffer is pinned, so we can read the tag without locking the spinlock */
6135 if (bufHdr != NULL)
6136 errcontext("writing block %u of relation %s",
6137 bufHdr->tag.blockNum,
6138 relpathperm(BufTagGetRelFileLocator(&bufHdr->tag),
6139 BufTagGetForkNum(&bufHdr->tag)).str);
6140}
6141
6142/*
6143 * Error context callback for errors occurring during local buffer writes.
6144 */
6145static void
6146local_buffer_write_error_callback(void *arg)
6147{
6148 BufferDesc *bufHdr = (BufferDesc *) arg;
6149
6150 if (bufHdr != NULL)
6151 errcontext("writing block %u of relation %s",
6152 bufHdr->tag.blockNum,
6153 relpathbackend(BufTagGetRelFileLocator(&bufHdr->tag),
6154 MyProcNumber,
6155 BufTagGetForkNum(&bufHdr->tag)).str);
6156}
6157
6158/*
6159 * RelFileLocator qsort/bsearch comparator; see RelFileLocatorEquals.
6160 */
6161static int
6162rlocator_comparator(const void *p1, const void *p2)
6163{
6164 RelFileLocator n1 = *(const RelFileLocator *) p1;
6165 RelFileLocator n2 = *(const RelFileLocator *) p2;
6166
6167 if (n1.relNumber < n2.relNumber)
6168 return -1;
6169 else if (n1.relNumber > n2.relNumber)
6170 return 1;
6171
6172 if (n1.dbOid < n2.dbOid)
6173 return -1;
6174 else if (n1.dbOid > n2.dbOid)
6175 return 1;
6176
6177 if (n1.spcOid < n2.spcOid)
6178 return -1;
6179 else if (n1.spcOid > n2.spcOid)
6180 return 1;
6181 else
6182 return 0;
6183}
6184
6185/*
6186 * Lock buffer header - set BM_LOCKED in buffer state.
6187 */
6188uint32
6189LockBufHdr(BufferDesc *desc)
6190{
6191 SpinDelayStatus delayStatus;
6192 uint32 old_buf_state;
6193
6194 Assert(!BufferIsLocal(BufferDescriptorGetBuffer(desc)));
6195
6196 init_local_spin_delay(&delayStatus);
6197
6198 while (true)
6199 {
6200 /* set BM_LOCKED flag */
6201 old_buf_state = pg_atomic_fetch_or_u32(&desc->state, BM_LOCKED);
6202 /* if it wasn't set before we're OK */
6203 if (!(old_buf_state & BM_LOCKED))
6204 break;
6205 perform_spin_delay(&delayStatus);
6206 }
6207 finish_spin_delay(&delayStatus);
6208 return old_buf_state | BM_LOCKED;
6209}
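/*
 * Illustrative sketch (added for exposition, not part of bufmgr.c): the
 * usual pattern is to lock the header, adjust the returned state copy, and
 * publish it again through UnlockBufHdr(), which also clears BM_LOCKED:
 *
 *		buf_state = LockBufHdr(bufHdr);
 *		buf_state |= BM_CHECKPOINT_NEEDED;
 *		UnlockBufHdr(bufHdr, buf_state);
 */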
6210
6211/*
6212 * Wait until the BM_LOCKED flag isn't set anymore and return the buffer's
6213 * state at that point.
6214 *
6215 * Obviously the buffer could be locked by the time the value is returned, so
6216 * this is primarily useful in CAS style loops.
6217 */
6218static uint32
6219WaitBufHdrUnlocked(BufferDesc *buf)
6220{
6221 SpinDelayStatus delayStatus;
6222 uint32 buf_state;
6223
6224 init_local_spin_delay(&delayStatus);
6225
6226 buf_state = pg_atomic_read_u32(&buf->state);
6227
6228 while (buf_state & BM_LOCKED)
6229 {
6230 perform_spin_delay(&delayStatus);
6231 buf_state = pg_atomic_read_u32(&buf->state);
6232 }
6233
6234 finish_spin_delay(&delayStatus);
6235
6236 return buf_state;
6237}
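/*
 * Illustrative sketch (added for exposition, condensed from the pin path
 * earlier in this file; not part of bufmgr.c): a lock-free state update
 * retries with pg_atomic_compare_exchange_u32() and only waits when the
 * header spinlock is held:
 *
 *		uint32		old_buf_state = pg_atomic_read_u32(&buf->state);
 *		uint32		buf_state;
 *
 *		for (;;)
 *		{
 *			if (old_buf_state & BM_LOCKED)
 *				old_buf_state = WaitBufHdrUnlocked(buf);
 *
 *			buf_state = old_buf_state;
 *			buf_state += BUF_REFCOUNT_ONE;
 *
 *			if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
 *											   buf_state))
 *				break;
 *		}
 */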
6238
6239/*
6240 * BufferTag comparator.
6241 */
6242static inline int
6243buffertag_comparator(const BufferTag *ba, const BufferTag *bb)
6244{
6245 int ret;
6246 RelFileLocator rlocatora;
6247 RelFileLocator rlocatorb;
6248
6249 rlocatora = BufTagGetRelFileLocator(ba);
6250 rlocatorb = BufTagGetRelFileLocator(bb);
6251
6252 ret = rlocator_comparator(&rlocatora, &rlocatorb);
6253
6254 if (ret != 0)
6255 return ret;
6256
6257 if (BufTagGetForkNum(ba) < BufTagGetForkNum(bb))
6258 return -1;
6259 if (BufTagGetForkNum(ba) > BufTagGetForkNum(bb))
6260 return 1;
6261
6262 if (ba->blockNum < bb->blockNum)
6263 return -1;
6264 if (ba->blockNum > bb->blockNum)
6265 return 1;
6266
6267 return 0;
6268}
6269
6270/*
6271 * Comparator determining the writeout order in a checkpoint.
6272 *
6273 * It is important that tablespaces are compared first, the logic balancing
6274 * writes between tablespaces relies on it.
6275 */
6276static inline int
6277ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b)
6278{
6279 /* compare tablespace */
6280 if (a->tsId < b->tsId)
6281 return -1;
6282 else if (a->tsId > b->tsId)
6283 return 1;
6284 /* compare relation */
6285 if (a->relNumber < b->relNumber)
6286 return -1;
6287 else if (a->relNumber > b->relNumber)
6288 return 1;
6289 /* compare fork */
6290 else if (a->forkNum < b->forkNum)
6291 return -1;
6292 else if (a->forkNum > b->forkNum)
6293 return 1;
6294 /* compare block number */
6295 else if (a->blockNum < b->blockNum)
6296 return -1;
6297 else if (a->blockNum > b->blockNum)
6298 return 1;
6299 /* equal page IDs are unlikely, but not impossible */
6300 return 0;
6301}
6302
6303/*
6304 * Comparator for a Min-Heap over the per-tablespace checkpoint completion
6305 * progress.
6306 */
6307static int
6308ts_ckpt_progress_comparator(Datum a, Datum b, void *arg)
6309{
6310 CkptTsStatus *sa = (CkptTsStatus *) a;
6311 CkptTsStatus *sb = (CkptTsStatus *) b;
6312
6313 /* we want a min-heap, so return 1 for the a < b */
6314 if (sa->progress < sb->progress)
6315 return 1;
6316 else if (sa->progress == sb->progress)
6317 return 0;
6318 else
6319 return -1;
6320}
6321
6322/*
6323 * Initialize a writeback context, discarding potential previous state.
6324 *
6325 * *max_pending is a pointer instead of an immediate value, so the coalesce
6326 * limits can easily be changed by the GUC mechanism, and so calling code does
6327 * not have to check the current configuration. A value of 0 means that no
6328 * writeback control will be performed.
6329 */
6330void
6331WritebackContextInit(WritebackContext *context, int *max_pending)
6332{
6333 Assert(*max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
6334
6335 context->max_pending = max_pending;
6336 context->nr_pending = 0;
6337}
6338
6339/*
6340 * Add buffer to list of pending writeback requests.
6341 */
6342void
6343ScheduleBufferTagForWriteback(WritebackContext *wb_context, IOContext io_context,
6344 BufferTag *tag)
6345{
6346 PendingWriteback *pending;
6347
6348 /*
6349 * As pg_flush_data() doesn't do anything with fsync disabled, there's no
6350 * point in tracking in that case.
6351 */
6352 if (io_direct_flags & IO_DIRECT_DATA ||
6353 !enableFsync)
6354 return;
6355
6356 /*
6357 * Add buffer to the pending writeback array, unless writeback control is
6358 * disabled.
6359 */
6360 if (*wb_context->max_pending > 0)
6361 {
6363
6364 pending = &wb_context->pending_writebacks[wb_context->nr_pending++];
6365
6366 pending->tag = *tag;
6367 }
6368
6369 /*
6370 * Perform pending flushes if the writeback limit is exceeded. This
6371 * includes the case where previously an item has been added, but control
6372 * is now disabled.
6373 */
6374 if (wb_context->nr_pending >= *wb_context->max_pending)
6375 IssuePendingWritebacks(wb_context, io_context);
6376}
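/*
 * Illustrative sketch (added for exposition, condensed from the checkpoint
 * path in BufferSync(); not part of bufmgr.c): a writeback context is
 * initialized with a pointer to the relevant *_flush_after GUC, fed one tag
 * per completed write, and drained at the end:
 *
 *		WritebackContext wb_context;
 *
 *		WritebackContextInit(&wb_context, &checkpoint_flush_after);
 *		...
 *		ScheduleBufferTagForWriteback(&wb_context, IOCONTEXT_NORMAL, &tag);
 *		...
 *		IssuePendingWritebacks(&wb_context, IOCONTEXT_NORMAL);
 */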
6377
6378#define ST_SORT sort_pending_writebacks
6379#define ST_ELEMENT_TYPE PendingWriteback
6380#define ST_COMPARE(a, b) buffertag_comparator(&a->tag, &b->tag)
6381#define ST_SCOPE static
6382#define ST_DEFINE
6383#include "lib/sort_template.h"
6384
6385/*
6386 * Issue all pending writeback requests, previously scheduled with
6387 * ScheduleBufferTagForWriteback, to the OS.
6388 *
6389 * Because this is only used to improve the OS's IO scheduling we try to never
6390 * error out - it's just a hint.
6391 */
6392void
6393IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context)
6394{
6395 instr_time io_start;
6396 int i;
6397
6398 if (wb_context->nr_pending == 0)
6399 return;
6400
6401 /*
6402 * Executing the writes in-order can make them a lot faster, and allows us to
6403 * merge writeback requests to consecutive blocks into larger writebacks.
6404 */
6405 sort_pending_writebacks(wb_context->pending_writebacks,
6406 wb_context->nr_pending);
6407
6408 io_start = pgstat_prepare_io_time(track_io_timing);
6409
6410 /*
6411 * Coalesce neighbouring writes, but nothing else. For that we iterate
6412 * through the, now sorted, array of pending flushes, and look forward to
6413 * find all neighbouring (or identical) writes.
6414 */
6415 for (i = 0; i < wb_context->nr_pending; i++)
6416 {
6419 SMgrRelation reln;
6420 int ahead;
6421 BufferTag tag;
6422 RelFileLocator currlocator;
6423 Size nblocks = 1;
6424
6425 cur = &wb_context->pending_writebacks[i];
6426 tag = cur->tag;
6427 currlocator = BufTagGetRelFileLocator(&tag);
6428
6429 /*
6430 * Peek ahead, into following writeback requests, to see if they can
6431 * be combined with the current one.
6432 */
6433 for (ahead = 0; i + ahead + 1 < wb_context->nr_pending; ahead++)
6434 {
6435
6436 next = &wb_context->pending_writebacks[i + ahead + 1];
6437
6438 /* different file, stop */
6439 if (!RelFileLocatorEquals(currlocator,
6440 BufTagGetRelFileLocator(&next->tag)) ||
6441 BufTagGetForkNum(&cur->tag) != BufTagGetForkNum(&next->tag))
6442 break;
6443
6444 /* ok, block queued twice, skip */
6445 if (cur->tag.blockNum == next->tag.blockNum)
6446 continue;
6447
6448 /* only merge consecutive writes */
6449 if (cur->tag.blockNum + 1 != next->tag.blockNum)
6450 break;
6451
6452 nblocks++;
6453 cur = next;
6454 }
6455
6456 i += ahead;
6457
6458 /* and finally tell the kernel to write the data to storage */
6459 reln = smgropen(currlocator, INVALID_PROC_NUMBER);
6460 smgrwriteback(reln, BufTagGetForkNum(&tag), tag.blockNum, nblocks);
6461 }
6462
6463 /*
6464 * Assume that writeback requests are only issued for buffers containing
6465 * blocks of permanent relations.
6466 */
6467 pgstat_count_io_op_time(IOOBJECT_RELATION, io_context,
6468 IOOP_WRITEBACK, io_start, wb_context->nr_pending, 0);
6469
6470 wb_context->nr_pending = 0;
6471}
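/*
 * Worked example (added for exposition, not part of bufmgr.c): if the sorted
 * queue holds blocks 10, 11, 11, 12 and 40 of one fork, the loop above
 * coalesces them into two requests:
 *
 *		smgrwriteback(reln, fork, 10, 3);
 *		smgrwriteback(reln, fork, 40, 1);
 *
 * The duplicate block 11 is skipped rather than counted twice.
 */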
6472
6473/* ResourceOwner callbacks */
6474
6475static void
6476ResOwnerReleaseBufferIO(Datum res)
6477{
6478 Buffer buffer = DatumGetInt32(res);
6479
6480 AbortBufferIO(buffer);
6481}
6482
6483static char *
6484ResOwnerPrintBufferIO(Datum res)
6485{
6486 Buffer buffer = DatumGetInt32(res);
6487
6488 return psprintf("lost track of buffer IO on buffer %d", buffer);
6489}
6490
6491static void
6492ResOwnerReleaseBufferPin(Datum res)
6493{
6494 Buffer buffer = DatumGetInt32(res);
6495
6496 /* Like ReleaseBuffer, but don't call ResourceOwnerForgetBuffer */
6497 if (!BufferIsValid(buffer))
6498 elog(ERROR, "bad buffer ID: %d", buffer);
6499
6500 if (BufferIsLocal(buffer))
6501 UnpinLocalBufferNoOwner(buffer);
6502 else
6503 UnpinBufferNoOwner(GetBufferDescriptor(buffer - 1));
6504}
6505
6506static char *
6507ResOwnerPrintBufferPin(Datum res)
6508{
6509 return DebugPrintBufferRefcount(DatumGetInt32(res));
6510}
6511
6512/*
6513 * Helper function to evict unpinned buffer whose buffer header lock is
6514 * already acquired.
6515 */
6516static bool
6517EvictUnpinnedBufferInternal(BufferDesc *desc, bool *buffer_flushed)
6518{
6519 uint32 buf_state;
6520 bool result;
6521
6522 *buffer_flushed = false;
6523
6524 buf_state = pg_atomic_read_u32(&(desc->state));
6525 Assert(buf_state & BM_LOCKED);
6526
6527 if ((buf_state & BM_VALID) == 0)
6528 {
6529 UnlockBufHdr(desc, buf_state);
6530 return false;
6531 }
6532
6533 /* Check that it's not pinned already. */
6534 if (BUF_STATE_GET_REFCOUNT(buf_state) > 0)
6535 {
6536 UnlockBufHdr(desc, buf_state);
6537 return false;
6538 }
6539
6540 PinBuffer_Locked(desc); /* releases spinlock */
6541
6542 /* If it was dirty, try to clean it once. */
6543 if (buf_state & BM_DIRTY)
6544 {
6545 LWLockAcquire(BufferDescriptorGetContentLock(desc), LW_SHARED);
6546 FlushBuffer(desc, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
6547 *buffer_flushed = true;
6548 LWLockRelease(BufferDescriptorGetContentLock(desc));
6549 }
6550
6551 /* This will return false if it becomes dirty or someone else pins it. */
6552 result = InvalidateVictimBuffer(desc);
6553
6554 UnpinBuffer(desc);
6555
6556 return result;
6557}
6558
6559/*
6560 * Try to evict the current block in a shared buffer.
6561 *
6562 * This function is intended for testing/development use only!
6563 *
6564 * To succeed, the buffer must not be pinned on entry, so if the caller had a
6565 * particular block in mind, it might already have been replaced by some other
6566 * block by the time this function runs. It's also unpinned on return, so the
6567 * buffer might be occupied again by the time control is returned, potentially
6568 * even by the same block. This inherent raciness without other interlocking
6569 * makes the function unsuitable for non-testing usage.
6570 *
6571 * *buffer_flushed is set to true if the buffer was dirty and has been
6572 * flushed, false otherwise. However, *buffer_flushed=true does not
6573 * necessarily mean that we flushed the buffer, it could have been flushed by
6574 * someone else.
6575 *
6576 * Returns true if the buffer was valid and it has now been made invalid.
6577 * Returns false if it wasn't valid, if it couldn't be evicted due to a pin,
6578 * or if the buffer becomes dirty again while we're trying to write it out.
6579 */
6580bool
6581EvictUnpinnedBuffer(Buffer buf, bool *buffer_flushed)
6582{
6583 BufferDesc *desc;
6584
6585 Assert(BufferIsValid(buf) && !BufferIsLocal(buf));
6586
6587 /* Make sure we can pin the buffer. */
6588 ResourceOwnerEnlarge(CurrentResourceOwner);
6589 ReservePrivateRefCountEntry();
6590
6591 desc = GetBufferDescriptor(buf - 1);
6592 LockBufHdr(desc);
6593
6594 return EvictUnpinnedBufferInternal(desc, buffer_flushed);
6595}
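/*
 * Illustrative sketch (added for exposition, not part of bufmgr.c): test
 * code, such as the pg_buffercache extension's eviction functions, drives
 * this roughly as
 *
 *		bool		buffer_flushed;
 *
 *		if (EvictUnpinnedBuffer(buf, &buffer_flushed))
 *			elog(DEBUG1, "evicted buffer %d%s", buf,
 *				 buffer_flushed ? " (flushed)" : "");
 */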
6596
6597/*
6598 * Try to evict all the shared buffers.
6599 *
6600 * This function is intended for testing/development use only! See
6601 * EvictUnpinnedBuffer().
6602 *
6603 * The buffers_* parameters are mandatory and indicate the total count of
6604 * buffers that:
6605 * - buffers_evicted - were evicted
6606 * - buffers_flushed - were flushed
6607 * - buffers_skipped - could not be evicted
6608 */
6609void
6610EvictAllUnpinnedBuffers(int32 *buffers_evicted, int32 *buffers_flushed,
6611 int32 *buffers_skipped)
6612{
6613 *buffers_evicted = 0;
6614 *buffers_skipped = 0;
6615 *buffers_flushed = 0;
6616
6617 for (int buf = 1; buf <= NBuffers; buf++)
6618 {
6619 BufferDesc *desc = GetBufferDescriptor(buf - 1);
6620 uint32 buf_state;
6621 bool buffer_flushed;
6622
6623 buf_state = pg_atomic_read_u32(&desc->state);
6624 if (!(buf_state & BM_VALID))
6625 continue;
6626
6627 ResourceOwnerEnlarge(CurrentResourceOwner);
6628 ReservePrivateRefCountEntry();
6629
6630 LockBufHdr(desc);
6631
6632 if (EvictUnpinnedBufferInternal(desc, &buffer_flushed))
6633 (*buffers_evicted)++;
6634 else
6635 (*buffers_skipped)++;
6636
6637 if (buffer_flushed)
6638 (*buffers_flushed)++;
6639 }
6640}
6641
6642/*
6643 * Try to evict all the shared buffers containing provided relation's pages.
6644 *
6645 * This function is intended for testing/development use only! See
6646 * EvictUnpinnedBuffer().
6647 *
6648 * The caller must hold at least AccessShareLock on the relation to prevent
6649 * the relation from being dropped.
6650 *
6651 * The buffers_* parameters are mandatory and indicate the total count of
6652 * buffers that:
6653 * - buffers_evicted - were evicted
6654 * - buffers_flushed - were flushed
6655 * - buffers_skipped - could not be evicted
6656 */
6657void
6658EvictRelUnpinnedBuffers(Relation rel, int32 *buffers_evicted,
6659 int32 *buffers_flushed, int32 *buffers_skipped)
6660{
6661 Assert(!RelationUsesLocalBuffers(rel));
6662
6663 *buffers_skipped = 0;
6664 *buffers_evicted = 0;
6665 *buffers_flushed = 0;
6666
6667 for (int buf = 1; buf <= NBuffers; buf++)
6668 {
6669 BufferDesc *desc = GetBufferDescriptor(buf - 1);
6670 uint32 buf_state = pg_atomic_read_u32(&(desc->state));
6671 bool buffer_flushed;
6672
6673 /* An unlocked precheck should be safe and saves some cycles. */
6674 if ((buf_state & BM_VALID) == 0 ||
6675 !BufTagMatchesRelFileLocator(&desc->tag, &rel->rd_locator))
6676 continue;
6677
6678 /* Make sure we can pin the buffer. */
6679 ResourceOwnerEnlarge(CurrentResourceOwner);
6680 ReservePrivateRefCountEntry();
6681
6682 buf_state = LockBufHdr(desc);
6683
6684 /* recheck, could have changed without the lock */
6685 if ((buf_state & BM_VALID) == 0 ||
6686 !BufTagMatchesRelFileLocator(&desc->tag, &rel->rd_locator))
6687 {
6688 UnlockBufHdr(desc, buf_state);
6689 continue;
6690 }
6691
6692 if (EvictUnpinnedBufferInternal(desc, &buffer_flushed))
6693 (*buffers_evicted)++;
6694 else
6695 (*buffers_skipped)++;
6696
6697 if (buffer_flushed)
6698 (*buffers_flushed)++;
6699 }
6700}
6701
6702/*
6703 * Generic implementation of the AIO handle staging callback for readv/writev
6704 * on local/shared buffers.
6705 *
6706 * Each readv/writev can target multiple buffers. The buffers have already
6707 * been registered with the IO handle.
6708 *
6709 * To make the IO ready for execution ("staging"), we need to ensure that the
6710 * targeted buffers are in an appropriate state while the IO is ongoing. For
6711 * that the AIO subsystem needs to have its own buffer pin, otherwise an error
6712 * in this backend could lead to this backend's buffer pin being released as
6713 * part of error handling, which in turn could lead to the buffer being
6714 * replaced while IO is ongoing.
6715 */
6716static pg_attribute_always_inline void
6717buffer_stage_common(PgAioHandle *ioh, bool is_write, bool is_temp)
6718{
6719 uint64 *io_data;
6720 uint8 handle_data_len;
6721 PgAioWaitRef io_ref;
6722 BufferTag first PG_USED_FOR_ASSERTS_ONLY = {0};
6723
6724 io_data = pgaio_io_get_handle_data(ioh, &handle_data_len);
6725
6726 pgaio_io_get_wref(ioh, &io_ref);
6727
6728 /* iterate over all buffers affected by the vectored readv/writev */
6729 for (int i = 0; i < handle_data_len; i++)
6730 {
6731 Buffer buffer = (Buffer) io_data[i];
6732 BufferDesc *buf_hdr = is_temp ?
6733 GetLocalBufferDescriptor(-buffer - 1) :
6734 GetBufferDescriptor(buffer - 1);
6735 uint32 buf_state;
6736
6737 /*
6738 * Check that all the buffers are actually ones that could conceivably
6739 * be done in one IO, i.e. are sequential. This is the last
6740 * buffer-aware code before IO is actually executed and confusion
6741 * about which buffers are targeted by IO can be hard to debug, making
6742 * it worth doing extra-paranoid checks.
6743 */
6744 if (i == 0)
6745 first = buf_hdr->tag;
6746 else
6747 {
6748 Assert(buf_hdr->tag.relNumber == first.relNumber);
6749 Assert(buf_hdr->tag.blockNum == first.blockNum + i);
6750 }
6751
6752 if (is_temp)
6753 buf_state = pg_atomic_read_u32(&buf_hdr->state);
6754 else
6755 buf_state = LockBufHdr(buf_hdr);
6756
6757 /* verify the buffer is in the expected state */
6758 Assert(buf_state & BM_TAG_VALID);
6759 if (is_write)
6760 {
6761 Assert(buf_state & BM_VALID);
6762 Assert(buf_state & BM_DIRTY);
6763 }
6764 else
6765 {
6766 Assert(!(buf_state & BM_VALID));
6767 Assert(!(buf_state & BM_DIRTY));
6768 }
6769
6770 /* temp buffers don't use BM_IO_IN_PROGRESS */
6771 if (!is_temp)
6772 Assert(buf_state & BM_IO_IN_PROGRESS);
6773
6774 Assert(BUF_STATE_GET_REFCOUNT(buf_state) >= 1);
6775
6776 /*
6777 * Reflect that the buffer is now owned by the AIO subsystem.
6778 *
6779 * For local buffers: This can't be done just via LocalRefCount, as
6780 * one might initially think, as this backend could error out while
6781 * AIO is still in progress, releasing all the pins by the backend
6782 * itself.
6783 *
6784 * This pin is released again in TerminateBufferIO().
6785 */
6786 buf_state += BUF_REFCOUNT_ONE;
6787 buf_hdr->io_wref = io_ref;
6788
6789 if (is_temp)
6790 pg_atomic_unlocked_write_u32(&buf_hdr->state, buf_state);
6791 else
6792 UnlockBufHdr(buf_hdr, buf_state);
6793
6794 /*
6795 * Ensure the content lock that prevents buffer modifications while
6796 * the buffer is being written out is not released early due to an
6797 * error.
6798 */
6799 if (is_write && !is_temp)
6800 {
6801 LWLock *content_lock;
6802
6803 content_lock = BufferDescriptorGetContentLock(buf_hdr);
6804
6805 Assert(LWLockHeldByMe(content_lock));
6806
6807 /*
6808 * Lock is now owned by AIO subsystem.
6809 */
6810 LWLockDisown(content_lock);
6811 }
6812
6813 /*
6814 * Stop tracking this buffer via the resowner - the AIO system now
6815 * keeps track.
6816 */
6817 if (!is_temp)
6818 ResourceOwnerForgetBufferIO(CurrentResourceOwner, buffer);
6819 }
6820}
6821
6822/*
6823 * Decode readv errors as encoded by buffer_readv_encode_error().
6824 */
6825static inline void
6826buffer_readv_decode_error(PgAioResult result,
6827 bool *zeroed_any,
6828 bool *ignored_any,
6829 uint8 *zeroed_or_error_count,
6830 uint8 *checkfail_count,
6831 uint8 *first_off)
6832{
6833 uint32 rem_error = result.error_data;
6834
6835 /* see static asserts in buffer_readv_encode_error */
6836#define READV_COUNT_BITS 7
6837#define READV_COUNT_MASK ((1 << READV_COUNT_BITS) - 1)
6838
6839 *zeroed_any = rem_error & 1;
6840 rem_error >>= 1;
6841
6842 *ignored_any = rem_error & 1;
6843 rem_error >>= 1;
6844
6845 *zeroed_or_error_count = rem_error & READV_COUNT_MASK;
6846 rem_error >>= READV_COUNT_BITS;
6847
6848 *checkfail_count = rem_error & READV_COUNT_MASK;
6849 rem_error >>= READV_COUNT_BITS;
6850
6851 *first_off = rem_error & READV_COUNT_MASK;
6852 rem_error >>= READV_COUNT_BITS;
6853}
6854
6855/*
6856 * Helper to encode errors for buffer_readv_complete()
6857 *
6858 * Errors are encoded as follows:
6859 * - bit 0 indicates whether any page was zeroed (1) or not (0)
6860 * - bit 1 indicates whether any checksum failure was ignored (1) or not (0)
6861 * - next READV_COUNT_BITS bits indicate the number of errored or zeroed pages
6862 * - next READV_COUNT_BITS bits indicate the number of checksum failures
6863 * - next READV_COUNT_BITS bits indicate the first offset of the first page
6864 * that was errored or zeroed or, if no errors/zeroes, the first ignored
6865 * checksum
6866 */
6867static inline void
6868buffer_readv_encode_error(PgAioResult *result,
6869 bool is_temp,
6870 bool zeroed_any,
6871 bool ignored_any,
6872 uint8 error_count,
6873 uint8 zeroed_count,
6874 uint8 checkfail_count,
6875 uint8 first_error_off,
6876 uint8 first_zeroed_off,
6877 uint8 first_ignored_off)
6878{
6879
6880 uint8 shift = 0;
6881 uint8 zeroed_or_error_count =
6882 error_count > 0 ? error_count : zeroed_count;
6883 uint8 first_off;
6884
6886 "PG_IOV_MAX is bigger than reserved space for error data");
6888 "PGAIO_RESULT_ERROR_BITS is insufficient for buffer_readv");
6889
6890 /*
6891 * We only have space to encode one offset - but luckily that's good
6892 * enough. If there is an error, the error is the interesting offset, same
6893 * with a zeroed buffer vs an ignored buffer.
6894 */
6895 if (error_count > 0)
6896 first_off = first_error_off;
6897 else if (zeroed_count > 0)
6898 first_off = first_zeroed_off;
6899 else
6900 first_off = first_ignored_off;
6901
6902 Assert(!zeroed_any || error_count == 0);
6903
6904 result->error_data = 0;
6905
6906 result->error_data |= zeroed_any << shift;
6907 shift += 1;
6908
6909 result->error_data |= ignored_any << shift;
6910 shift += 1;
6911
6912 result->error_data |= ((uint32) zeroed_or_error_count) << shift;
6913 shift += READV_COUNT_BITS;
6914
6915 result->error_data |= ((uint32) checkfail_count) << shift;
6916 shift += READV_COUNT_BITS;
6917
6918 result->error_data |= ((uint32) first_off) << shift;
6919 shift += READV_COUNT_BITS;
6920
6921 result->id = is_temp ? PGAIO_HCB_LOCAL_BUFFER_READV :
6922 PGAIO_HCB_SHARED_BUFFER_READV;
6923
6924 if (error_count > 0)
6925 result->status = PGAIO_RS_ERROR;
6926 else
6927 result->status = PGAIO_RS_WARNING;
6928
6929 /*
6930 * The encoding is complicated enough to warrant cross-checking it against
6931 * the decode function.
6932 */
6933#ifdef USE_ASSERT_CHECKING
6934 {
6935 bool zeroed_any_2,
6936 ignored_any_2;
6937 uint8 zeroed_or_error_count_2,
6938 checkfail_count_2,
6939 first_off_2;
6940
6942 &zeroed_any_2, &ignored_any_2,
6943 &zeroed_or_error_count_2,
6944 &checkfail_count_2,
6945 &first_off_2);
6946 Assert(zeroed_any == zeroed_any_2);
6947 Assert(ignored_any == ignored_any_2);
6948 Assert(zeroed_or_error_count == zeroed_or_error_count_2);
6949 Assert(checkfail_count == checkfail_count_2);
6950 Assert(first_off == first_off_2);
6951 }
6952#endif
6953
6954#undef READV_COUNT_BITS
6955#undef READV_COUNT_MASK
6956}
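/*
 * Worked example (added for exposition, not part of bufmgr.c): with
 * READV_COUNT_BITS = 7, encoding zeroed_any = 1, ignored_any = 0,
 * zeroed_or_error_count = 2, checkfail_count = 1 and first_off = 3 yields
 *
 *		error_data = 1 | (0 << 1) | (2 << 2) | (1 << 9) | (3 << 16)
 *				   = 0x00030209
 *
 * which buffer_readv_decode_error() splits back into the same five fields.
 */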
6957
6958/*
6959 * Helper for AIO readv completion callbacks, supporting both shared and temp
6960 * buffers. Gets called once for each buffer in a multi-page read.
6961 */
6962static pg_attribute_always_inline void
6963buffer_readv_complete_one(PgAioTargetData *td, uint8 buf_off, Buffer buffer,
6964 uint8 flags, bool failed, bool is_temp,
6965 bool *buffer_invalid,
6966 bool *failed_checksum,
6967 bool *ignored_checksum,
6968 bool *zeroed_buffer)
6969{
6970 BufferDesc *buf_hdr = is_temp ?
6971 GetLocalBufferDescriptor(-buffer - 1) :
6972 GetBufferDescriptor(buffer - 1);
6973 BufferTag tag = buf_hdr->tag;
6974 char *bufdata = BufferGetBlock(buffer);
6975 uint32 set_flag_bits;
6976 int piv_flags;
6977
6978 /* check that the buffer is in the expected state for a read */
6979#ifdef USE_ASSERT_CHECKING
6980 {
6981 uint32 buf_state = pg_atomic_read_u32(&buf_hdr->state);
6982
6983 Assert(buf_state & BM_TAG_VALID);
6984 Assert(!(buf_state & BM_VALID));
6985 /* temp buffers don't use BM_IO_IN_PROGRESS */
6986 if (!is_temp)
6987 Assert(buf_state & BM_IO_IN_PROGRESS);
6988 Assert(!(buf_state & BM_DIRTY));
6989 }
6990#endif
6991
6992 *buffer_invalid = false;
6993 *failed_checksum = false;
6994 *ignored_checksum = false;
6995 *zeroed_buffer = false;
6996
6997 /*
6998 * We ask PageIsVerified() to only log the message about checksum errors,
6999 * as the completion might be run in any backend (or IO workers). We will
7000 * report checksum errors in buffer_readv_report().
7001 */
7002 piv_flags = PIV_LOG_LOG;
7003
7004 /* the local zero_damaged_pages may differ from the definer's */
7005 if (flags & READ_BUFFERS_IGNORE_CHECKSUM_FAILURES)
7006 piv_flags |= PIV_IGNORE_CHECKSUM_FAILURE;
7007
7008 /* Check for garbage data. */
7009 if (!failed)
7010 {
7011 /*
7012 * If the buffer is not currently pinned by this backend, e.g. because
7013 * we're completing this IO after an error, the buffer data will have
7014 * been marked as inaccessible when the buffer was unpinned. The AIO
7015 * subsystem holds a pin, but that doesn't prevent the buffer from
7016 * having been marked as inaccessible. The completion might also be
7017 * executed in a different process.
7018 */
7019#ifdef USE_VALGRIND
7020 if (!BufferIsPinned(buffer))
7021 VALGRIND_MAKE_MEM_DEFINED(bufdata, BLCKSZ);
7022#endif
7023
7024 if (!PageIsVerified((Page) bufdata, tag.blockNum, piv_flags,
7025 failed_checksum))
7026 {
7027 if (flags & READ_BUFFERS_ZERO_ON_ERROR)
7028 {
7029 memset(bufdata, 0, BLCKSZ);
7030 *zeroed_buffer = true;
7031 }
7032 else
7033 {
7034 *buffer_invalid = true;
7035 /* mark buffer as having failed */
7036 failed = true;
7037 }
7038 }
7039 else if (*failed_checksum)
7040 *ignored_checksum = true;
7041
7042 /* undo what we did above */
7043#ifdef USE_VALGRIND
7044 if (!BufferIsPinned(buffer))
7045 VALGRIND_MAKE_MEM_NOACCESS(bufdata, BLCKSZ);
7046#endif
7047
7048 /*
7049 * Immediately log a message about the invalid page, but only to the
7050 * server log. The reason to do so immediately is that this may be
7051 * executed in a different backend than the one that originated the
7052 * request. The reason to log only to the server log is that the originator
7053 * might not process the query result immediately (because it is busy
7054 * doing another part of query processing) or at all (e.g. if it was
7055 * cancelled or errored out due to another IO also failing). The
7056 * definer of the IO will emit an ERROR or WARNING when processing the
7057 * IO's results.
7058 *
7059 * To avoid duplicating the code to emit these log messages, we reuse
7060 * buffer_readv_report().
7061 */
7062 if (*buffer_invalid || *failed_checksum || *zeroed_buffer)
7063 {
7064 PgAioResult result_one = {0};
7065
7066 buffer_readv_encode_error(&result_one, is_temp,
7067 *zeroed_buffer,
7068 *ignored_checksum,
7069 *buffer_invalid,
7070 *zeroed_buffer ? 1 : 0,
7071 *failed_checksum ? 1 : 0,
7072 buf_off, buf_off, buf_off);
7073 pgaio_result_report(result_one, td, LOG_SERVER_ONLY);
7074 }
7075 }
7076
7077 /* Terminate I/O and set BM_VALID. */
7078 set_flag_bits = failed ? BM_IO_ERROR : BM_VALID;
7079 if (is_temp)
7080 TerminateLocalBufferIO(buf_hdr, false, set_flag_bits, true);
7081 else
7082 TerminateBufferIO(buf_hdr, false, set_flag_bits, false, true);
7083
7084 /*
7085 * Call the BUFFER_READ_DONE tracepoint in the callback, even though the
7086 * callback may not be executed in the same backend that called
7087 * BUFFER_READ_START. The alternative would be to defer calling the
7088 * tracepoint to a later point (e.g. the local completion callback for
7089 * shared buffer reads), which seems even less helpful.
7090 */
7091 TRACE_POSTGRESQL_BUFFER_READ_DONE(tag.forkNum,
7092 tag.blockNum,
7093 tag.spcOid,
7094 tag.dbOid,
7095 tag.relNumber,
7096 is_temp ? MyProcNumber : INVALID_PROC_NUMBER,
7097 false);
7098}
7099
7100/*
7101 * Perform completion handling of a single AIO read. This read may cover
7102 * multiple blocks / buffers.
7103 *
7104 * Shared between shared and local buffers, to reduce code duplication.
7105 */
7106static pg_attribute_always_inline PgAioResult
7107buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result,
7108 uint8 cb_data, bool is_temp)
7109{
7110 PgAioResult result = prior_result;
7111 PgAioTargetData *td = pgaio_io_get_target_data(ioh);
7112 uint8 first_error_off = 0;
7113 uint8 first_zeroed_off = 0;
7114 uint8 first_ignored_off = 0;
7115 uint8 error_count = 0;
7116 uint8 zeroed_count = 0;
7117 uint8 ignored_count = 0;
7118 uint8 checkfail_count = 0;
7119 uint64 *io_data;
7120 uint8 handle_data_len;
7121
7122 if (is_temp)
7123 {
7124 Assert(td->smgr.is_temp);
7125 Assert(pgaio_io_get_owner(ioh) == MyProcNumber);
7126 }
7127 else
7128 Assert(!td->smgr.is_temp);
7129
7130 /*
7131 * Iterate over all the buffers affected by this IO and call the
7132 * per-buffer completion function for each buffer.
7133 */
7134 io_data = pgaio_io_get_handle_data(ioh, &handle_data_len);
7135 for (uint8 buf_off = 0; buf_off < handle_data_len; buf_off++)
7136 {
7137 Buffer buf = io_data[buf_off];
7138 bool failed;
7139 bool failed_verification = false;
7140 bool failed_checksum = false;
7141 bool zeroed_buffer = false;
7142 bool ignored_checksum = false;
7143
7145
7146 /*
7147 * If the entire I/O failed on a lower-level, each buffer needs to be
7148 * marked as failed. In case of a partial read, the first few buffers
7149 * may be ok.
7150 */
7151 failed =
7152 prior_result.status == PGAIO_RS_ERROR
7153 || prior_result.result <= buf_off;
7154
7155 buffer_readv_complete_one(td, buf_off, buf, cb_data, failed, is_temp,
7156 &failed_verification,
7157 &failed_checksum,
7158 &ignored_checksum,
7159 &zeroed_buffer);
7160
7161 /*
7162 * Track information about the number of different kinds of error
7163 * conditions across all pages, as there can be multiple pages failing
7164 * verification as part of one IO.
7165 */
7166 if (failed_verification && !zeroed_buffer && error_count++ == 0)
7167 first_error_off = buf_off;
7168 if (zeroed_buffer && zeroed_count++ == 0)
7169 first_zeroed_off = buf_off;
7170 if (ignored_checksum && ignored_count++ == 0)
7171 first_ignored_off = buf_off;
7172 if (failed_checksum)
7173 checkfail_count++;
7174 }
7175
7176 /*
7177 * If the smgr read succeeded [partially] and page verification failed for
7178 * some of the pages, adjust the IO's result state appropriately.
7179 */
7180 if (prior_result.status != PGAIO_RS_ERROR &&
7181 (error_count > 0 || ignored_count > 0 || zeroed_count > 0))
7182 {
7183 buffer_readv_encode_error(&result, is_temp,
7184 zeroed_count > 0, ignored_count > 0,
7185 error_count, zeroed_count, checkfail_count,
7186 first_error_off, first_zeroed_off,
7187 first_ignored_off);
7188 pgaio_result_report(result, td, DEBUG1);
7189 }
7190
7191 /*
7192 * For shared relations this reporting is done in
7193 * shared_buffer_readv_complete_local().
7194 */
7195 if (is_temp && checkfail_count > 0)
7196 pgstat_report_checksum_failures_in_db(MyDatabaseId,
7197 checkfail_count);
7198
7199 return result;
7200}
7201
7202/*
7203 * AIO error reporting callback for aio_shared_buffer_readv_cb and
7204 * aio_local_buffer_readv_cb.
7205 *
7206 * The error is encoded / decoded in buffer_readv_encode_error() /
7207 * buffer_readv_decode_error().
7208 */
7209static void
7210buffer_readv_report(PgAioResult result, const PgAioTargetData *td,
7211 int elevel)
7212{
7213 int nblocks = td->smgr.nblocks;
7214 BlockNumber first = td->smgr.blockNum;
7215 BlockNumber last = first + nblocks - 1;
7216 ProcNumber errProc =
7217 td->smgr.is_temp ? MyProcNumber : INVALID_PROC_NUMBER;
7218 RelPathStr rpath =
7219 relpathbackend(td->smgr.rlocator, errProc, td->smgr.forkNum);
7220 bool zeroed_any,
7221 ignored_any;
7222 uint8 zeroed_or_error_count,
7223 checkfail_count,
7224 first_off;
7225 uint8 affected_count;
7226 const char *msg_one,
7227 *msg_mult,
7228 *det_mult,
7229 *hint_mult;
7230
7231 buffer_readv_decode_error(result, &zeroed_any, &ignored_any,
7232 &zeroed_or_error_count,
7233 &checkfail_count,
7234 &first_off);
7235
7236 /*
7237 * Treat a read that had both zeroed buffers *and* ignored checksums as a
7238 * special case, it's too irregular to be emitted the same way as the
7239 * other cases.
7240 */
7241 if (zeroed_any && ignored_any)
7242 {
7243 Assert(zeroed_any && ignored_any);
7244 Assert(nblocks > 1); /* same block can't be both zeroed and ignored */
7245 Assert(result.status != PGAIO_RS_ERROR);
7246 affected_count = zeroed_or_error_count;
7247
7248 ereport(elevel,
7249 errcode(ERRCODE_DATA_CORRUPTED),
7250 errmsg("zeroing %u page(s) and ignoring %u checksum failure(s) among blocks %u..%u of relation %s",
7251 affected_count, checkfail_count, first, last, rpath.str),
7252 affected_count > 1 ?
7253 errdetail("Block %u held first zeroed page.",
7254 first + first_off) : 0,
7255 errhint("See server log for details about the other %u invalid block(s).",
7256 affected_count + checkfail_count - 1));
7257 return;
7258 }
7259
7260 /*
7261 * The other messages are highly repetitive. To avoid duplicating a long
7262 * and complicated ereport(), gather the translated format strings
7263 * separately and then do one common ereport.
7264 */
7265 if (result.status == PGAIO_RS_ERROR)
7266 {
7267 Assert(!zeroed_any); /* can't have invalid pages when zeroing them */
7268 affected_count = zeroed_or_error_count;
7269 msg_one = _("invalid page in block %u of relation %s");
7270 msg_mult = _("%u invalid pages among blocks %u..%u of relation %s");
7271 det_mult = _("Block %u held first invalid page.");
7272 hint_mult = _("See server log for the other %u invalid block(s).");
7273 }
7274 else if (zeroed_any && !ignored_any)
7275 {
7276 affected_count = zeroed_or_error_count;
7277 msg_one = _("invalid page in block %u of relation %s; zeroing out page");
7278 msg_mult = _("zeroing out %u invalid pages among blocks %u..%u of relation %s");
7279 det_mult = _("Block %u held first zeroed page.");
7280 hint_mult = _("See server log for the other %u zeroed block(s).");
7281 }
7282 else if (!zeroed_any && ignored_any)
7283 {
7284 affected_count = checkfail_count;
7285 msg_one = _("ignoring checksum failure in block %u of relation %s");
7286 msg_mult = _("ignoring %u checksum failures among blocks %u..%u of relation %s");
7287 det_mult = _("Block %u held first ignored page.");
7288 hint_mult = _("See server log for the other %u ignored block(s).");
7289 }
7290 else
7291 pg_unreachable();
7292
7293 ereport(elevel,
7294 errcode(ERRCODE_DATA_CORRUPTED),
7295 affected_count == 1 ?
7296 errmsg_internal(msg_one, first + first_off, rpath.str) :
7297 errmsg_internal(msg_mult, affected_count, first, last, rpath.str),
7298 affected_count > 1 ? errdetail_internal(det_mult, first + first_off) : 0,
7299 affected_count > 1 ? errhint_internal(hint_mult, affected_count - 1) : 0);
7300}
7301
7302static void
7303shared_buffer_readv_stage(PgAioHandle *ioh, uint8 cb_data)
7304{
7305 buffer_stage_common(ioh, false, false);
7306}
7307
7308static PgAioResult
7309shared_buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result,
7310 uint8 cb_data)
7311{
7312 return buffer_readv_complete(ioh, prior_result, cb_data, false);
7313}
7314
7315/*
7316 * We need a backend-local completion callback for shared buffers, to be able
7317 * to report checksum errors correctly. Unfortunately that can only safely
7318 * happen if the reporting backend has previously called
7319 * pgstat_prepare_report_checksum_failure(), which we can only guarantee in
7320 * the backend that started the IO. Hence this callback.
7321 */
7322static PgAioResult
7323shared_buffer_readv_complete_local(PgAioHandle *ioh, PgAioResult prior_result,
7324 uint8 cb_data)
7325{
7326 bool zeroed_any,
7327 ignored_any;
7328 uint8 zeroed_or_error_count,
7329 checkfail_count,
7330 first_off;
7331
7332 if (prior_result.status == PGAIO_RS_OK)
7333 return prior_result;
7334
7335 buffer_readv_decode_error(prior_result,
7336 &zeroed_any,
7337 &ignored_any,
7338 &zeroed_or_error_count,
7339 &checkfail_count,
7340 &first_off);
7341
7342 if (checkfail_count)
7343 {
7344 PgAioTargetData *td = pgaio_io_get_target_data(ioh);
7345
7346 pgstat_report_checksum_failures_in_db(td->smgr.rlocator.dbOid,
7347 checkfail_count);
7348 }
7349
7350 return prior_result;
7351}
7352
7353static void
7354local_buffer_readv_stage(PgAioHandle *ioh, uint8 cb_data)
7355{
7356 buffer_stage_common(ioh, false, true);
7357}
7358
7359static PgAioResult
7360local_buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result,
7361 uint8 cb_data)
7362{
7363 return buffer_readv_complete(ioh, prior_result, cb_data, true);
7364}
7365
7366/* readv callback is passed READ_BUFFERS_* flags as callback data */
7367const PgAioHandleCallbacks aio_shared_buffer_readv_cb = {
7368 .stage = shared_buffer_readv_stage,
7369 .complete_shared = shared_buffer_readv_complete,
7370 /* need a local callback to report checksum failures */
7371 .complete_local = shared_buffer_readv_complete_local,
7372 .report = buffer_readv_report,
7373};
7374
7375/* readv callback is passed READ_BUFFERS_* flags as callback data */
7376const PgAioHandleCallbacks aio_local_buffer_readv_cb = {
7377 .stage = local_buffer_readv_stage,
7378
7379 /*
7380 * Note that this, in contrast to the shared_buffers case, uses
7381 * complete_local, as only the issuing backend has access to the required
7382 * datastructures. This is important in case the IO completion may be
7383 * consumed incidentally by another backend.
7384 */
7385 .complete_local = local_buffer_readv_complete,
7386 .report = buffer_readv_report,
7387};
int io_method
Definition: aio.c:77
bool pgaio_wref_valid(PgAioWaitRef *iow)
Definition: aio.c:873
PgAioHandle * pgaio_io_acquire(struct ResourceOwnerData *resowner, PgAioReturn *ret)
Definition: aio.c:173
void pgaio_wref_clear(PgAioWaitRef *iow)
Definition: aio.c:866
void pgaio_io_get_wref(PgAioHandle *ioh, PgAioWaitRef *iow)
Definition: aio.c:354
void pgaio_io_set_flag(PgAioHandle *ioh, PgAioHandleFlags flag)
Definition: aio.c:318
bool pgaio_have_staged(void)
Definition: aio.c:1004
bool pgaio_wref_check_done(PgAioWaitRef *iow)
Definition: aio.c:907
ProcNumber pgaio_io_get_owner(PgAioHandle *ioh)
Definition: aio.c:343
void pgaio_submit_staged(void)
Definition: aio.c:1020
void pgaio_wref_wait(PgAioWaitRef *iow)
Definition: aio.c:893
void pgaio_io_release(PgAioHandle *ioh)
Definition: aio.c:242
PgAioHandle * pgaio_io_acquire_nb(struct ResourceOwnerData *resowner, PgAioReturn *ret)
Definition: aio.c:199
@ PGAIO_HCB_LOCAL_BUFFER_READV
Definition: aio.h:200
@ PGAIO_HCB_SHARED_BUFFER_READV
Definition: aio.h:198
@ IOMETHOD_SYNC
Definition: aio.h:34
@ PGAIO_HF_SYNCHRONOUS
Definition: aio.h:70
@ PGAIO_HF_REFERENCES_LOCAL
Definition: aio.h:60
void pgaio_io_set_handle_data_32(PgAioHandle *ioh, uint32 *data, uint8 len)
Definition: aio_callback.c:139
void pgaio_io_register_callbacks(PgAioHandle *ioh, PgAioHandleCallbackID cb_id, uint8 cb_data)
Definition: aio_callback.c:86
uint64 * pgaio_io_get_handle_data(PgAioHandle *ioh, uint8 *len)
Definition: aio_callback.c:154
void pgaio_result_report(PgAioResult result, const PgAioTargetData *target_data, int elevel)
Definition: aio_callback.c:171
PgAioTargetData * pgaio_io_get_target_data(PgAioHandle *ioh)
Definition: aio_target.c:72
#define PGAIO_RESULT_ERROR_BITS
Definition: aio_types.h:98
PgAioResultStatus
Definition: aio_types.h:79
@ PGAIO_RS_OK
Definition: aio_types.h:81
@ PGAIO_RS_UNKNOWN
Definition: aio_types.h:80
@ PGAIO_RS_PARTIAL
Definition: aio_types.h:82
@ PGAIO_RS_ERROR
Definition: aio_types.h:84
@ PGAIO_RS_WARNING
Definition: aio_types.h:83
static bool pg_atomic_compare_exchange_u32(volatile pg_atomic_uint32 *ptr, uint32 *expected, uint32 newval)
Definition: atomics.h:349
static uint32 pg_atomic_fetch_or_u32(volatile pg_atomic_uint32 *ptr, uint32 or_)
Definition: atomics.h:410
static void pg_atomic_unlocked_write_u32(volatile pg_atomic_uint32 *ptr, uint32 val)
Definition: atomics.h:295
static uint32 pg_atomic_read_u32(volatile pg_atomic_uint32 *ptr)
Definition: atomics.h:239
bool TimestampDifferenceExceeds(TimestampTz start_time, TimestampTz stop_time, int msec)
Definition: timestamp.c:1781
TimestampTz GetCurrentTimestamp(void)
Definition: timestamp.c:1645
Datum now(PG_FUNCTION_ARGS)
Definition: timestamp.c:1609
int BgWriterDelay
Definition: bgwriter.c:58
void binaryheap_build(binaryheap *heap)
Definition: binaryheap.c:138
void binaryheap_replace_first(binaryheap *heap, bh_node_type d)
Definition: binaryheap.c:255
bh_node_type binaryheap_first(binaryheap *heap)
Definition: binaryheap.c:177
bh_node_type binaryheap_remove_first(binaryheap *heap)
Definition: binaryheap.c:192
void binaryheap_free(binaryheap *heap)
Definition: binaryheap.c:75
void binaryheap_add_unordered(binaryheap *heap, bh_node_type d)
Definition: binaryheap.c:116
binaryheap * binaryheap_allocate(int capacity, binaryheap_comparator compare, void *arg)
Definition: binaryheap.c:39
#define binaryheap_empty(h)
Definition: binaryheap.h:65
uint32 BlockNumber
Definition: block.h:31
#define InvalidBlockNumber
Definition: block.h:33
static bool BlockNumberIsValid(BlockNumber blockNumber)
Definition: block.h:71
#define MaxBlockNumber
Definition: block.h:35
static int32 next
Definition: blutils.c:224
int Buffer
Definition: buf.h:23
#define InvalidBuffer
Definition: buf.h:25
#define BufferIsLocal(buffer)
Definition: buf.h:37
CkptSortItem * CkptBufferIds
Definition: buf_init.c:25
WritebackContext BackendWritebackContext
Definition: buf_init.c:24
#define BM_MAX_USAGE_COUNT
Definition: buf_internals.h:86
static void InitBufferTag(BufferTag *tag, const RelFileLocator *rlocator, ForkNumber forkNum, BlockNumber blockNum)
#define BM_TAG_VALID
Definition: buf_internals.h:71
#define BM_PERMANENT
Definition: buf_internals.h:77
#define BUF_USAGECOUNT_MASK
Definition: buf_internals.h:53
static ForkNumber BufTagGetForkNum(const BufferTag *tag)
#define BUF_REFCOUNT_ONE
Definition: buf_internals.h:51
static ConditionVariable * BufferDescriptorGetIOCV(const BufferDesc *bdesc)
static void UnlockBufHdr(BufferDesc *desc, uint32 buf_state)
static bool BufferTagsEqual(const BufferTag *tag1, const BufferTag *tag2)
static RelFileNumber BufTagGetRelNumber(const BufferTag *tag)
static LWLock * BufferDescriptorGetContentLock(const BufferDesc *bdesc)
static bool BufTagMatchesRelFileLocator(const BufferTag *tag, const RelFileLocator *rlocator)
#define BUF_FLAG_MASK
Definition: buf_internals.h:56
#define BM_PIN_COUNT_WAITER
Definition: buf_internals.h:75
#define BM_DIRTY
Definition: buf_internals.h:69
static void ResourceOwnerRememberBufferIO(ResourceOwner owner, Buffer buffer)
#define BM_LOCKED
Definition: buf_internals.h:68
#define BM_JUST_DIRTIED
Definition: buf_internals.h:74
#define BUF_STATE_GET_USAGECOUNT(state)
Definition: buf_internals.h:60
static void ResourceOwnerForgetBufferIO(ResourceOwner owner, Buffer buffer)
#define BM_IO_IN_PROGRESS
Definition: buf_internals.h:72
static void ClearBufferTag(BufferTag *tag)
static void ResourceOwnerRememberBuffer(ResourceOwner owner, Buffer buffer)
static void ResourceOwnerForgetBuffer(ResourceOwner owner, Buffer buffer)
#define BUF_USAGECOUNT_ONE
Definition: buf_internals.h:54
#define BUF_STATE_GET_REFCOUNT(state)
Definition: buf_internals.h:59
static LWLock * BufMappingPartitionLock(uint32 hashcode)
static RelFileLocator BufTagGetRelFileLocator(const BufferTag *tag)
#define BM_VALID
Definition: buf_internals.h:70
#define BM_IO_ERROR
Definition: buf_internals.h:73
static BufferDesc * GetLocalBufferDescriptor(uint32 id)
static BufferDesc * GetBufferDescriptor(uint32 id)
static Buffer BufferDescriptorGetBuffer(const BufferDesc *bdesc)
#define BM_CHECKPOINT_NEEDED
Definition: buf_internals.h:76
void BufTableDelete(BufferTag *tagPtr, uint32 hashcode)
Definition: buf_table.c:148
int BufTableLookup(BufferTag *tagPtr, uint32 hashcode)
Definition: buf_table.c:90
uint32 BufTableHashCode(BufferTag *tagPtr)
Definition: buf_table.c:78
int BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id)
Definition: buf_table.c:118
bool track_io_timing
Definition: bufmgr.c:144
void CheckBufferIsPinnedOnce(Buffer buffer)
Definition: bufmgr.c:5584
void FlushRelationsAllBuffers(SMgrRelation *smgrs, int nrels)
Definition: bufmgr.c:4963
void IncrBufferRefCount(Buffer buffer)
Definition: bufmgr.c:5335
void DropDatabaseBuffers(Oid dbid)
Definition: bufmgr.c:4825
static int ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b)
Definition: bufmgr.c:6277
static pg_attribute_always_inline PgAioResult buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data, bool is_temp)
Definition: bufmgr.c:7107
bool BufferIsExclusiveLocked(Buffer buffer)
Definition: bufmgr.c:2884
const ResourceOwnerDesc buffer_pin_resowner_desc
Definition: bufmgr.c:241
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:4161
static PrivateRefCountEntry * NewPrivateRefCountEntry(Buffer buffer)
Definition: bufmgr.c:322
static bool ReadBuffersCanStartIO(Buffer buffer, bool nowait)
Definition: bufmgr.c:1562
void DropRelationBuffers(SMgrRelation smgr_reln, ForkNumber *forkNum, int nforks, BlockNumber *firstDelBlock)
Definition: bufmgr.c:4470
Buffer ReleaseAndReadBuffer(Buffer buffer, Relation relation, BlockNumber blockNum)
Definition: bufmgr.c:3007
static PgAioResult shared_buffer_readv_complete_local(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
Definition: bufmgr.c:7323
static pg_attribute_always_inline bool StartReadBuffersImpl(ReadBuffersOperation *operation, Buffer *buffers, BlockNumber blockNum, int *nblocks, int flags, bool allow_forwarding)
Definition: bufmgr.c:1255
static void CheckReadBuffersOperation(ReadBuffersOperation *operation, bool is_complete)
Definition: bufmgr.c:1525
PrefetchBufferResult PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
Definition: bufmgr.c:644
static uint32 PrivateRefCountClock
Definition: bufmgr.c:215
static void FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object, IOContext io_context)
Definition: bufmgr.c:4221
static void ResOwnerReleaseBufferIO(Datum res)
Definition: bufmgr.c:6476
static PgAioResult local_buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
Definition: bufmgr.c:7360
bool StartReadBuffers(ReadBuffersOperation *operation, Buffer *buffers, BlockNumber blockNum, int *nblocks, int flags)
Definition: bufmgr.c:1487
void EvictAllUnpinnedBuffers(int32 *buffers_evicted, int32 *buffers_flushed, int32 *buffers_skipped)
Definition: bufmgr.c:6610
int io_max_combine_limit
Definition: bufmgr.c:169
static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
Definition: bufmgr.c:3065
const ResourceOwnerDesc buffer_io_resowner_desc
Definition: bufmgr.c:232
bool zero_damaged_pages
Definition: bufmgr.c:141
#define BUF_DROP_FULL_SCAN_THRESHOLD
Definition: bufmgr.c:88
static void PinBuffer_Locked(BufferDesc *buf)
Definition: bufmgr.c:3176
void EvictRelUnpinnedBuffers(Relation rel, int32 *buffers_evicted, int32 *buffers_flushed, int32 *buffers_skipped)
Definition: bufmgr.c:6658
static pg_attribute_always_inline void buffer_readv_complete_one(PgAioTargetData *td, uint8 buf_off, Buffer buffer, uint8 flags, bool failed, bool is_temp, bool *buffer_invalid, bool *failed_checksum, bool *ignored_checksum, bool *zeroed_buffer)
Definition: bufmgr.c:6963
static uint32 WaitBufHdrUnlocked(BufferDesc *buf)
Definition: bufmgr.c:6219
static int buffertag_comparator(const BufferTag *ba, const BufferTag *bb)
Definition: bufmgr.c:6243
bool IsBufferCleanupOK(Buffer buffer)
Definition: bufmgr.c:5843
#define BufferGetLSN(bufHdr)
Definition: bufmgr.c:70
static char * ResOwnerPrintBufferIO(Datum res)
Definition: bufmgr.c:6484
Buffer ExtendBufferedRel(BufferManagerRelation bmr, ForkNumber forkNum, BufferAccessStrategy strategy, uint32 flags)
Definition: bufmgr.c:851
void AtEOXact_Buffers(bool isCommit)
Definition: bufmgr.c:3989
static void AbortBufferIO(Buffer buffer)
Definition: bufmgr.c:6091
const PgAioHandleCallbacks aio_shared_buffer_readv_cb
Definition: bufmgr.c:7367
BlockNumber ExtendBufferedRelBy(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, Buffer *buffers, uint32 *extended_by)
Definition: bufmgr.c:883
static Buffer ReadBuffer_common(Relation rel, SMgrRelation smgr, char smgr_persistence, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
Definition: bufmgr.c:1186
static void ProcessReadBuffersResult(ReadBuffersOperation *operation)
Definition: bufmgr.c:1591
static void ZeroAndLockBuffer(Buffer buffer, ReadBufferMode mode, bool already_valid)
Definition: bufmgr.c:1024
static BufferDesc * BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, bool *foundPtr, IOContext io_context)
Definition: bufmgr.c:1998
static void CheckForBufferLeaks(void)
Definition: bufmgr.c:4058
static bool ReadBuffersCanStartIOOnce(Buffer buffer, bool nowait)
Definition: bufmgr.c:1549
void CreateAndCopyRelationData(RelFileLocator src_rlocator, RelFileLocator dst_rlocator, bool permanent)
Definition: bufmgr.c:5177
void DropRelationsAllBuffers(SMgrRelation *smgr_reln, int nlocators)
Definition: bufmgr.c:4593
static int rlocator_comparator(const void *p1, const void *p2)
Definition: bufmgr.c:6162
Buffer ExtendBufferedRelTo(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, BlockNumber extend_to, ReadBufferMode mode)
Definition: bufmgr.c:915
struct SMgrSortArray SMgrSortArray
const PgAioHandleCallbacks aio_local_buffer_readv_cb
Definition: bufmgr.c:7376
static bool InvalidateVictimBuffer(BufferDesc *buf_hdr)
Definition: bufmgr.c:2275
static void AtProcExit_Buffers(int code, Datum arg)
Definition: bufmgr.c:4040
int io_combine_limit_guc
Definition: bufmgr.c:168
static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg)
Definition: bufmgr.c:6308
void BufferGetTag(Buffer buffer, RelFileLocator *rlocator, ForkNumber *forknum, BlockNumber *blknum)
Definition: bufmgr.c:4182
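Usage sketch (illustrative, not part of bufmgr.c): the hypothetical helper buffer_block_number() uses BufferGetTag() to recover which relation block a pinned buffer currently holds.

#include "postgres.h"
#include "storage/bufmgr.h"
#include "storage/relfilelocator.h"

static BlockNumber
buffer_block_number(Buffer buf)
{
    RelFileLocator rlocator;
    ForkNumber  forknum;
    BlockNumber blknum;

    /* Caller must hold a pin so the tag cannot change under us. */
    BufferGetTag(buf, &rlocator, &forknum, &blknum);
    return blknum;
}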
#define BufHdrGetBlock(bufHdr)
Definition: bufmgr.c:69
static pg_attribute_always_inline void buffer_stage_common(PgAioHandle *ioh, bool is_write, bool is_temp)
Definition: bufmgr.c:6717
#define BUF_REUSABLE
Definition: bufmgr.c:78
static void local_buffer_write_error_callback(void *arg)
Definition: bufmgr.c:6146
static void BufferSync(int flags)
Definition: bufmgr.c:3342
static bool AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress)
Definition: bufmgr.c:1762
static void local_buffer_readv_stage(PgAioHandle *ioh, uint8 cb_data)
Definition: bufmgr.c:7354
char * DebugPrintBufferRefcount(Buffer buffer)
Definition: bufmgr.c:4104
static char * ResOwnerPrintBufferPin(Datum res)
Definition: bufmgr.c:6507
void CheckPointBuffers(int flags)
Definition: bufmgr.c:4147
bool BufferIsDirty(Buffer buffer)
Definition: bufmgr.c:2912
static uint32 MaxProportionalPins
Definition: bufmgr.c:218
static BlockNumber ExtendBufferedRelShared(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
Definition: bufmgr.c:2603
bool BgBufferSync(WritebackContext *wb_context)
Definition: bufmgr.c:3618
static void WakePinCountWaiter(BufferDesc *buf)
Definition: bufmgr.c:3222
bool BufferIsPermanent(Buffer buffer)
Definition: bufmgr.c:4393
#define REFCOUNT_ARRAY_ENTRIES
Definition: bufmgr.c:97
static void shared_buffer_readv_stage(PgAioHandle *ioh, uint8 cb_data)
Definition: bufmgr.c:7303
void UnlockBuffers(void)
Definition: bufmgr.c:5509
PrefetchBufferResult PrefetchSharedBuffer(SMgrRelation smgr_reln, ForkNumber forkNum, BlockNumber blockNum)
Definition: bufmgr.c:554
static PgAioResult shared_buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
Definition: bufmgr.c:7309
static Buffer GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
Definition: bufmgr.c:2343
bool ConditionalLockBuffer(Buffer buffer)
Definition: bufmgr.c:5563
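Usage sketch (illustrative, not part of bufmgr.c): the hypothetical helper try_update_page() shows the non-blocking locking pattern; the caller is assumed to already hold a pin on buf, and the actual page modification and WAL logging are elided.

#include "postgres.h"
#include "storage/bufmgr.h"

static bool
try_update_page(Buffer buf)
{
    /* Take the exclusive content lock only if it is free right now. */
    if (!ConditionalLockBuffer(buf))
        return false;           /* contended; caller can skip or retry */

    /* ... modify the page, MarkBufferDirty(), emit WAL ... */

    LockBuffer(buf, BUFFER_LOCK_UNLOCK);
    return true;
}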
BlockNumber RelationGetNumberOfBlocksInFork(Relation relation, ForkNumber forkNum)
Definition: bufmgr.c:4361
int bgwriter_flush_after
Definition: bufmgr.c:176
void ReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:5303
static void FindAndDropRelationBuffers(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber nForkBlock, BlockNumber firstDelBlock)
Definition: bufmgr.c:4764
XLogRecPtr BufferGetLSNAtomic(Buffer buffer)
Definition: bufmgr.c:4423
bool HoldingBufferPinThatDelaysRecovery(void)
Definition: bufmgr.c:5759
int checkpoint_flush_after
Definition: bufmgr.c:175
void UnlockReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:5320
static pg_attribute_always_inline Buffer PinBufferForBlock(Relation rel, SMgrRelation smgr, char persistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, bool *foundPtr)
Definition: bufmgr.c:1103
void TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits, bool forget_owner, bool release_aio)
Definition: bufmgr.c:6032
static void UnpinBufferNoOwner(BufferDesc *buf)
Definition: bufmgr.c:3266
static void shared_buffer_write_error_callback(void *arg)
Definition: bufmgr.c:6130
void ScheduleBufferTagForWriteback(WritebackContext *wb_context, IOContext io_context, BufferTag *tag)
Definition: bufmgr.c:6343
void WaitReadBuffers(ReadBuffersOperation *operation)
Definition: bufmgr.c:1630
void WritebackContextInit(WritebackContext *context, int *max_pending)
Definition: bufmgr.c:6331
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:2945
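Usage sketch (illustrative, not part of bufmgr.c): the hypothetical helper touch_block() shows the usual modify-page discipline around MarkBufferDirty(): exclusive content lock, critical section, dirty the buffer, and (in real code) WAL-log the change and set the page LSN before leaving the critical section.

#include "postgres.h"
#include "miscadmin.h"
#include "storage/bufmgr.h"
#include "utils/rel.h"

static void
touch_block(Relation rel, BlockNumber blkno)
{
    Buffer      buf = ReadBuffer(rel, blkno);

    LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);     /* required to modify */

    START_CRIT_SECTION();
    /* ... change the page contents here ... */
    MarkBufferDirty(buf);       /* write is deferred to replacement/checkpoint */
    /* ... XLogInsert() the change and PageSetLSN() in real code ... */
    END_CRIT_SECTION();

    UnlockReleaseBuffer(buf);
}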
#define BufferIsPinned(bufnum)
Definition: bufmgr.c:480
double bgwriter_lru_multiplier
Definition: bufmgr.c:143
static bool EvictUnpinnedBufferInternal(BufferDesc *desc, bool *buffer_flushed)
Definition: bufmgr.c:6517
int backend_flush_after
Definition: bufmgr.c:177
void LimitAdditionalPins(uint32 *additional_pins)
Definition: bufmgr.c:2541
static void buffer_readv_report(PgAioResult result, const PgAioTargetData *td, int elevel)
Definition: bufmgr.c:7210
static void ReservePrivateRefCountEntry(void)
Definition: bufmgr.c:256
static BufferDesc * PinCountWaitBuf
Definition: bufmgr.c:180
static int32 GetPrivateRefCount(Buffer buffer)
Definition: bufmgr.c:422
static BlockNumber ExtendBufferedRelCommon(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
Definition: bufmgr.c:2559
void LockBufferForCleanup(Buffer buffer)
Definition: bufmgr.c:5617
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:5537
static PrivateRefCountEntry * ReservedRefCountEntry
Definition: bufmgr.c:216
void MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
Definition: bufmgr.c:5367
void FlushRelationBuffers(Relation rel)
Definition: bufmgr.c:4873
#define READV_COUNT_BITS
void IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context)
Definition: bufmgr.c:6393
static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref)
Definition: bufmgr.c:445
bool EvictUnpinnedBuffer(Buffer buf, bool *buffer_flushed)
Definition: bufmgr.c:6581
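Usage sketch (illustrative, not part of bufmgr.c): the hypothetical helper evict_one() tries to evict a single shared buffer; eviction succeeds only if no backend holds a pin, and *buffer_flushed reports whether the buffer was dirty and had to be written out first.

#include "postgres.h"
#include "storage/bufmgr.h"

static void
evict_one(Buffer buf)
{
    bool        flushed;

    if (EvictUnpinnedBuffer(buf, &flushed))
        elog(DEBUG1, "evicted buffer %d%s", buf,
             flushed ? " (flushed first)" : "");
}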
Buffer ReadBufferWithoutRelcache(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy, bool permanent)
Definition: bufmgr.c:835
bool ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum, Buffer recent_buffer)
Definition: bufmgr.c:675
#define RELS_BSEARCH_THRESHOLD
Definition: bufmgr.c:80
int maintenance_io_concurrency
Definition: bufmgr.c:159
static void UnpinBuffer(BufferDesc *buf)
Definition: bufmgr.c:3257
void FlushDatabaseBuffers(Oid dbid)
Definition: bufmgr.c:5241
static void InvalidateBuffer(BufferDesc *buf)
Definition: bufmgr.c:2176
static void RelationCopyStorageUsingBuffer(RelFileLocator srclocator, RelFileLocator dstlocator, ForkNumber forkNum, bool permanent)
Definition: bufmgr.c:5063
int effective_io_concurrency
Definition: bufmgr.c:152
static PrivateRefCountEntry * GetPrivateRefCountEntry(Buffer buffer, bool do_move)
Definition: bufmgr.c:348
bool StartBufferIO(BufferDesc *buf, bool forInput, bool nowait)
Definition: bufmgr.c:5975
struct PrivateRefCountEntry PrivateRefCountEntry
struct CkptTsStatus CkptTsStatus
bool StartReadBuffer(ReadBuffersOperation *operation, Buffer *buffer, BlockNumber blocknum, int flags)
Definition: bufmgr.c:1506
Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
Definition: bufmgr.c:798
uint32 LockBufHdr(BufferDesc *desc)
Definition: bufmgr.c:6189
static void ResOwnerReleaseBufferPin(Datum res)
Definition: bufmgr.c:6492
static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES]
Definition: bufmgr.c:212
static void buffer_readv_decode_error(PgAioResult result, bool *zeroed_any, bool *ignored_any, uint8 *zeroed_or_error_count, uint8 *checkfail_count, uint8 *first_off)
Definition: bufmgr.c:6826
#define READV_COUNT_MASK
int io_combine_limit
Definition: bufmgr.c:167
void InitBufferManagerAccess(void)
Definition: bufmgr.c:4006
static void buffer_readv_encode_error(PgAioResult *result, bool is_temp, bool zeroed_any, bool ignored_any, uint8 error_count, uint8 zeroed_count, uint8 checkfail_count, uint8 first_error_off, uint8 first_zeroed_off, uint8 first_ignored_off)
Definition: bufmgr.c:6868
static int SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
Definition: bufmgr.c:3916
uint32 GetAdditionalPinLimit(void)
Definition: bufmgr.c:2515
Buffer ReadBuffer(Relation reln, BlockNumber blockNum)
Definition: bufmgr.c:751
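Usage sketch (illustrative, not part of bufmgr.c): the hypothetical helper inspect_block() shows the common pin, lock, inspect, unlock-and-unpin pattern built from ReadBuffer(), LockBuffer() and UnlockReleaseBuffer(); rel, blkno and the page inspection are placeholders.

#include "postgres.h"
#include "storage/bufmgr.h"
#include "storage/bufpage.h"
#include "utils/rel.h"

static void
inspect_block(Relation rel, BlockNumber blkno)
{
    Buffer      buf;
    Page        page;

    buf = ReadBuffer(rel, blkno);           /* find or read the block, pinned */
    LockBuffer(buf, BUFFER_LOCK_SHARE);     /* shared content lock for reading */

    page = BufferGetPage(buf);
    if (PageIsNew(page))
        elog(DEBUG1, "block %u is uninitialized", blkno);

    UnlockReleaseBuffer(buf);               /* drop content lock and pin */
}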
static HTAB * PrivateRefCountHash
Definition: bufmgr.c:213
static int32 PrivateRefCountOverflowed
Definition: bufmgr.c:214
bool ConditionalLockBufferForCleanup(Buffer buffer)
Definition: bufmgr.c:5785
int bgwriter_lru_maxpages
Definition: bufmgr.c:142
uint32 GetPinLimit(void)
Definition: bufmgr.c:2503
static void WaitIO(BufferDesc *buf)
Definition: bufmgr.c:5896
#define BUF_WRITTEN
Definition: bufmgr.c:77
void FlushOneBuffer(Buffer buffer)
Definition: bufmgr.c:5283
@ BAS_BULKREAD
Definition: bufmgr.h:37
@ BAS_BULKWRITE
Definition: bufmgr.h:39
#define BUFFER_LOCK_UNLOCK
Definition: bufmgr.h:196
#define BUFFER_LOCK_SHARE
Definition: bufmgr.h:197
#define P_NEW
Definition: bufmgr.h:191
#define READ_BUFFERS_ZERO_ON_ERROR
Definition: bufmgr.h:112
static Page BufferGetPage(Buffer buffer)
Definition: bufmgr.h:414
#define DEFAULT_IO_COMBINE_LIMIT
Definition: bufmgr.h:167
static Block BufferGetBlock(Buffer buffer)
Definition: bufmgr.h:381
#define READ_BUFFERS_ISSUE_ADVICE
Definition: bufmgr.h:114
#define MAX_IO_COMBINE_LIMIT
Definition: bufmgr.h:166
#define DEFAULT_EFFECTIVE_IO_CONCURRENCY
Definition: bufmgr.h:161
#define READ_BUFFERS_IGNORE_CHECKSUM_FAILURES
Definition: bufmgr.h:116
#define DEFAULT_MAINTENANCE_IO_CONCURRENCY
Definition: bufmgr.h:162
void * Block
Definition: bufmgr.h:26
@ EB_LOCK_TARGET
Definition: bufmgr.h:93
@ EB_CLEAR_SIZE_CACHE
Definition: bufmgr.h:90
@ EB_PERFORMING_RECOVERY
Definition: bufmgr.h:78
@ EB_CREATE_FORK_IF_NEEDED
Definition: bufmgr.h:84
@ EB_SKIP_EXTENSION_LOCK
Definition: bufmgr.h:75
@ EB_LOCK_FIRST
Definition: bufmgr.h:87
#define READ_BUFFERS_SYNCHRONOUSLY
Definition: bufmgr.h:118
#define BUFFER_LOCK_EXCLUSIVE
Definition: bufmgr.h:198
ReadBufferMode
Definition: bufmgr.h:45
@ RBM_ZERO_ON_ERROR
Definition: bufmgr.h:51
@ RBM_ZERO_AND_CLEANUP_LOCK
Definition: bufmgr.h:49
@ RBM_ZERO_AND_LOCK
Definition: bufmgr.h:47
@ RBM_NORMAL
Definition: bufmgr.h:46
#define BMR_REL(p_rel)
Definition: bufmgr.h:108
static bool BufferIsValid(Buffer bufnum)
Definition: bufmgr.h:365
bool ignore_checksum_failure
Definition: bufpage.c:27
char * PageSetChecksumCopy(Page page, BlockNumber blkno)
Definition: bufpage.c:1509
bool PageIsVerified(PageData *page, BlockNumber blkno, int flags, bool *checksum_failure_p)
Definition: bufpage.c:94
#define PIV_LOG_LOG
Definition: bufpage.h:469
static bool PageIsNew(const PageData *page)
Definition: bufpage.h:234
static void PageSetLSN(Page page, XLogRecPtr lsn)
Definition: bufpage.h:391
PageData * Page
Definition: bufpage.h:82
static XLogRecPtr PageGetLSN(const PageData *page)
Definition: bufpage.h:386
#define PIV_IGNORE_CHECKSUM_FAILURE
Definition: bufpage.h:470
#define likely(x)
Definition: c.h:346
uint8_t uint8
Definition: c.h:500
#define PG_USED_FOR_ASSERTS_ONLY
Definition: c.h:224
#define Max(x, y)
Definition: c.h:969
double float8
Definition: c.h:601
#define pg_attribute_always_inline
Definition: c.h:270
int16_t int16
Definition: c.h:497
int32_t int32
Definition: c.h:498
uint64_t uint64
Definition: c.h:503
#define pg_unreachable()
Definition: c.h:332
#define unlikely(x)
Definition: c.h:347
uint32_t uint32
Definition: c.h:502
#define lengthof(array)
Definition: c.h:759
#define MemSet(start, val, len)
Definition: c.h:991
#define StaticAssertStmt(condition, errmessage)
Definition: c.h:909
size_t Size
Definition: c.h:576
void CheckpointWriteDelay(int flags, double progress)
Definition: checkpointer.c:773
bool ConditionVariableCancelSleep(void)
void ConditionVariableBroadcast(ConditionVariable *cv)
void ConditionVariablePrepareToSleep(ConditionVariable *cv)
void ConditionVariableSleep(ConditionVariable *cv, uint32 wait_event_info)
int64 TimestampTz
Definition: timestamp.h:39
void * hash_search(HTAB *hashp, const void *keyPtr, HASHACTION action, bool *foundPtr)
Definition: dynahash.c:955
void * hash_seq_search(HASH_SEQ_STATUS *status)
Definition: dynahash.c:1420
HTAB * hash_create(const char *tabname, long nelem, const HASHCTL *info, int flags)
Definition: dynahash.c:352
void hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
Definition: dynahash.c:1385
struct cursor * cur
Definition: ecpg.c:29
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1158
int errdetail_internal(const char *fmt,...)
Definition: elog.c:1231
int errdetail(const char *fmt,...)
Definition: elog.c:1204
ErrorContextCallback * error_context_stack
Definition: elog.c:95
int errhint_internal(const char *fmt,...)
Definition: elog.c:1340
int errhint(const char *fmt,...)
Definition: elog.c:1318
int errcode(int sqlerrcode)
Definition: elog.c:854
int errmsg(const char *fmt,...)
Definition: elog.c:1071
#define _(x)
Definition: elog.c:91
#define errcontext
Definition: elog.h:197
#define DEBUG3
Definition: elog.h:28
#define LOG_SERVER_ONLY
Definition: elog.h:32
#define WARNING
Definition: elog.h:36
#define DEBUG2
Definition: elog.h:29
#define DEBUG1
Definition: elog.h:30
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:226
#define ereport(elevel,...)
Definition: elog.h:149
int io_direct_flags
Definition: fd.c:168
#define IO_DIRECT_DATA
Definition: fd.h:54
int StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
Definition: freelist.c:394
BufferAccessStrategy GetAccessStrategy(BufferAccessStrategyType btype)
Definition: freelist.c:541
void FreeAccessStrategy(BufferAccessStrategy strategy)
Definition: freelist.c:723
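Usage sketch (illustrative, not part of bufmgr.c): the hypothetical helper bulk_scan() reads every block of rel through a small BAS_BULKREAD ring via GetAccessStrategy()/ReadBufferExtended() so the scan does not flood shared_buffers, then frees the strategy.

#include "postgres.h"
#include "common/relpath.h"
#include "storage/bufmgr.h"
#include "utils/rel.h"

static void
bulk_scan(Relation rel)
{
    BufferAccessStrategy strategy = GetAccessStrategy(BAS_BULKREAD);
    BlockNumber nblocks = RelationGetNumberOfBlocks(rel);

    for (BlockNumber blkno = 0; blkno < nblocks; blkno++)
    {
        Buffer      buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
                                             RBM_NORMAL, strategy);

        /* ... examine BufferGetPage(buf) under a content lock ... */
        ReleaseBuffer(buf);
    }

    FreeAccessStrategy(strategy);
}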
IOContext IOContextForStrategy(BufferAccessStrategy strategy)
Definition: freelist.c:800
BufferDesc * StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_ring)
Definition: freelist.c:196
void StrategyFreeBuffer(BufferDesc *buf)
Definition: freelist.c:363
bool StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_ring)
Definition: freelist.c:840
volatile sig_atomic_t ProcSignalBarrierPending
Definition: globals.c:40
int NBuffers
Definition: globals.c:143
bool enableFsync
Definition: globals.c:130
ProcNumber MyProcNumber
Definition: globals.c:91
int VacuumCostPageMiss
Definition: globals.c:153
bool VacuumCostActive
Definition: globals.c:159
int VacuumCostBalance
Definition: globals.c:158
int MaxBackends
Definition: globals.c:147
int VacuumCostPageDirty
Definition: globals.c:154
int VacuumCostPageHit
Definition: globals.c:152
Assert(PointerIsAligned(start, uint64))
#define free(a)
Definition: header.h:65
@ HASH_FIND
Definition: hsearch.h:113
@ HASH_REMOVE
Definition: hsearch.h:115
@ HASH_ENTER
Definition: hsearch.h:114
#define HASH_ELEM
Definition: hsearch.h:95
#define HASH_BLOBS
Definition: hsearch.h:97
BufferUsage pgBufferUsage
Definition: instrument.c:20
void on_shmem_exit(pg_on_exit_callback function, Datum arg)
Definition: ipc.c:365
int b
Definition: isn.c:74
int a
Definition: isn.c:73
int j
Definition: isn.c:78
int i
Definition: isn.c:77
void LockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition: lmgr.c:424
void UnlockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition: lmgr.c:474
int32 * LocalRefCount
Definition: localbuf.c:48
void FlushLocalBuffer(BufferDesc *bufHdr, SMgrRelation reln)
Definition: localbuf.c:182
void UnpinLocalBuffer(Buffer buffer)
Definition: localbuf.c:832
bool StartLocalBufferIO(BufferDesc *bufHdr, bool forInput, bool nowait)
Definition: localbuf.c:521
void AtEOXact_LocalBuffers(bool isCommit)
Definition: localbuf.c:993
void DropRelationLocalBuffers(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber firstDelBlock)
Definition: localbuf.c:663
void AtProcExit_LocalBuffers(void)
Definition: localbuf.c:1004
bool PinLocalBuffer(BufferDesc *buf_hdr, bool adjust_usagecount)
Definition: localbuf.c:796
void MarkLocalBufferDirty(Buffer buffer)
Definition: localbuf.c:489
void DropRelationAllLocalBuffers(RelFileLocator rlocator)
Definition: localbuf.c:693
void TerminateLocalBufferIO(BufferDesc *bufHdr, bool clear_dirty, uint32 set_flag_bits, bool release_aio)
Definition: localbuf.c:560
int NLocBuffer
Definition: localbuf.c:44
PrefetchBufferResult PrefetchLocalBuffer(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum)
Definition: localbuf.c:71
BlockNumber ExtendBufferedRelLocal(BufferManagerRelation bmr, ForkNumber fork, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
Definition: localbuf.c:345
void UnpinLocalBufferNoOwner(Buffer buffer)
Definition: localbuf.c:839
BufferDesc * LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, bool *foundPtr)
Definition: localbuf.c:118
#define ExclusiveLock
Definition: lockdefs.h:42
bool LWLockHeldByMe(LWLock *lock)
Definition: lwlock.c:1970
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1182
void LWLockDisown(LWLock *lock)
Definition: lwlock.c:1891
bool LWLockHeldByMeInMode(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:2014
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1902
bool LWLockConditionalAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1353
@ LW_SHARED
Definition: lwlock.h:115
@ LW_EXCLUSIVE
Definition: lwlock.h:114
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:2167
void pfree(void *pointer)
Definition: mcxt.c:2147
void * palloc(Size size)
Definition: mcxt.c:1940
#define VALGRIND_MAKE_MEM_DEFINED(addr, size)
Definition: memdebug.h:26
#define VALGRIND_MAKE_MEM_NOACCESS(addr, size)
Definition: memdebug.h:27
#define START_CRIT_SECTION()
Definition: miscadmin.h:150
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:123
#define END_CRIT_SECTION()
Definition: miscadmin.h:152
void * arg
#define ERRCODE_DATA_CORRUPTED
Definition: pg_basebackup.c:41
static PgChecksumMode mode
Definition: pg_checksums.c:55
static int64 current_size
Definition: pg_checksums.c:63
#define WRITEBACK_MAX_PENDING_FLUSHES
#define DEFAULT_BACKEND_FLUSH_AFTER
#define DEFAULT_CHECKPOINT_FLUSH_AFTER
#define DEFAULT_BGWRITER_FLUSH_AFTER
#define PG_IOV_MAX
Definition: pg_iovec.h:41
static char * buf
Definition: pg_test_fsync.c:72
IOObject
Definition: pgstat.h:273
@ IOOBJECT_RELATION
Definition: pgstat.h:274
@ IOOBJECT_TEMP_RELATION
Definition: pgstat.h:275
#define pgstat_count_buffer_read(rel)
Definition: pgstat.h:704
IOContext
Definition: pgstat.h:282
@ IOCONTEXT_NORMAL
Definition: pgstat.h:286
@ IOOP_EXTEND
Definition: pgstat.h:311
@ IOOP_READ
Definition: pgstat.h:312
@ IOOP_WRITEBACK
Definition: pgstat.h:308
@ IOOP_HIT
Definition: pgstat.h:306
@ IOOP_EVICT
Definition: pgstat.h:304
@ IOOP_REUSE
Definition: pgstat.h:307
@ IOOP_WRITE
Definition: pgstat.h:313
#define pgstat_count_buffer_hit(rel)
Definition: pgstat.h:709
PgStat_BgWriterStats PendingBgWriterStats
PgStat_CheckpointerStats PendingCheckpointerStats
void pgstat_prepare_report_checksum_failure(Oid dboid)
void pgstat_report_checksum_failures_in_db(Oid dboid, int failurecount)
instr_time pgstat_prepare_io_time(bool track_io_guc)
Definition: pgstat_io.c:90
void pgstat_count_io_op(IOObject io_object, IOContext io_context, IOOp io_op, uint32 cnt, uint64 bytes)
Definition: pgstat_io.c:68
void pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op, instr_time start_time, uint32 cnt, uint64 bytes)
Definition: pgstat_io.c:121
#define qsort(a, b, c, d)
Definition: port.h:479
static Datum PointerGetDatum(const void *X)
Definition: postgres.h:327
uintptr_t Datum
Definition: postgres.h:69
static Pointer DatumGetPointer(Datum X)
Definition: postgres.h:317
static int32 DatumGetInt32(Datum X)
Definition: postgres.h:207
#define InvalidOid
Definition: postgres_ext.h:35
unsigned int Oid
Definition: postgres_ext.h:30
#define NUM_AUXILIARY_PROCS
Definition: proc.h:447
#define DELAY_CHKPT_START
Definition: proc.h:120
#define INVALID_PROC_NUMBER
Definition: procnumber.h:26
int ProcNumber
Definition: procnumber.h:24
void ProcessProcSignalBarrier(void)
Definition: procsignal.c:498
@ PROCSIG_RECOVERY_CONFLICT_BUFFERPIN
Definition: procsignal.h:48
void set_ps_display_remove_suffix(void)
Definition: ps_status.c:423
void set_ps_display_suffix(const char *suffix)
Definition: ps_status.c:371
char * psprintf(const char *fmt,...)
Definition: psprintf.c:43
ReadStream * read_stream_begin_smgr_relation(int flags, BufferAccessStrategy strategy, SMgrRelation smgr, char smgr_persistence, ForkNumber forknum, ReadStreamBlockNumberCB callback, void *callback_private_data, size_t per_buffer_data_size)
Definition: read_stream.c:740
Buffer read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
Definition: read_stream.c:770
void read_stream_end(ReadStream *stream)
Definition: read_stream.c:1055
BlockNumber block_range_read_stream_cb(ReadStream *stream, void *callback_private_data, void *per_buffer_data)
Definition: read_stream.c:162
#define READ_STREAM_USE_BATCHING
Definition: read_stream.h:64
#define READ_STREAM_FULL
Definition: read_stream.h:43
static unsigned hash(unsigned *uv, int n)
Definition: rege_dfa.c:715
static SMgrRelation RelationGetSmgr(Relation rel)
Definition: rel.h:578
#define RelationUsesLocalBuffers(relation)
Definition: rel.h:648
#define RELATION_IS_OTHER_TEMP(relation)
Definition: rel.h:669
#define RelationIsValid(relation)
Definition: rel.h:489
#define RelFileLocatorBackendIsTemp(rlocator)
#define RelFileLocatorEquals(locator1, locator2)
ForkNumber
Definition: relpath.h:56
@ MAIN_FORKNUM
Definition: relpath.h:58
@ INIT_FORKNUM
Definition: relpath.h:61
#define MAX_FORKNUM
Definition: relpath.h:70
#define relpath(rlocator, forknum)
Definition: relpath.h:150
#define relpathbackend(rlocator, backend, forknum)
Definition: relpath.h:141
#define relpathperm(rlocator, forknum)
Definition: relpath.h:146
ResourceOwner CurrentResourceOwner
Definition: resowner.c:173
void ResourceOwnerEnlarge(ResourceOwner owner)
Definition: resowner.c:452
#define RELEASE_PRIO_BUFFER_IOS
Definition: resowner.h:62
@ RESOURCE_RELEASE_BEFORE_LOCKS
Definition: resowner.h:54
#define RELEASE_PRIO_BUFFER_PINS
Definition: resowner.h:63
void perform_spin_delay(SpinDelayStatus *status)
Definition: s_lock.c:126
void finish_spin_delay(SpinDelayStatus *status)
Definition: s_lock.c:186
#define init_local_spin_delay(status)
Definition: s_lock.h:751
BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:819
void smgrstartreadv(PgAioHandle *ioh, SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void **buffers, BlockNumber nblocks)
Definition: smgr.c:753
void smgrwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks)
Definition: smgr.c:805
SMgrRelation smgropen(RelFileLocator rlocator, ProcNumber backend)
Definition: smgr.c:240
void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
Definition: smgr.c:481
BlockNumber smgrnblocks_cached(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:847
uint32 smgrmaxcombine(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
Definition: smgr.c:697
void smgrzeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks, bool skipFsync)
Definition: smgr.c:649
void smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync)
Definition: smgr.c:620
bool smgrexists(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:462
bool smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks)
Definition: smgr.c:678
static void smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync)
Definition: smgr.h:131
void ProcSendSignal(ProcNumber procNumber)
Definition: proc.c:1987
PGPROC * MyProc
Definition: proc.c:67
int GetStartupBufferPinWaitBufId(void)
Definition: proc.c:767
int DeadlockTimeout
Definition: proc.c:58
void SetStartupBufferPinWaitBufId(int bufid)
Definition: proc.c:755
void ProcWaitForSignal(uint32 wait_event_info)
Definition: proc.c:1975
void ResolveRecoveryConflictWithBufferPin(void)
Definition: standby.c:793
bool log_recovery_conflict_waits
Definition: standby.c:42
void LogRecoveryConflict(ProcSignalReason reason, TimestampTz wait_start, TimestampTz now, VirtualTransactionId *wait_list, bool still_waiting)
Definition: standby.c:274
bool RelFileLocatorSkippingWAL(RelFileLocator rlocator)
Definition: storage.c:573
SMgrRelation RelationCreateStorage(RelFileLocator rlocator, char relpersistence, bool register_delete)
Definition: storage.c:122
void log_smgrcreate(const RelFileLocator *rlocator, ForkNumber forkNum)
Definition: storage.c:187
int wait_backend_pgprocno
BufferTag tag
pg_atomic_uint32 state
PgAioWaitRef io_wref
struct SMgrRelationData * smgr
Definition: bufmgr.h:104
int64 shared_blks_dirtied
Definition: instrument.h:28
int64 local_blks_hit
Definition: instrument.h:30
int64 shared_blks_read
Definition: instrument.h:27
int64 shared_blks_written
Definition: instrument.h:29
int64 local_blks_read
Definition: instrument.h:31
int64 shared_blks_hit
Definition: instrument.h:26
int ckpt_bufs_written
Definition: xlog.h:167
ForkNumber forkNum
RelFileNumber relNumber
BlockNumber blockNum
float8 progress_slice
Definition: bufmgr.c:116
int index
Definition: bufmgr.c:124
int num_scanned
Definition: bufmgr.c:121
float8 progress
Definition: bufmgr.c:115
int num_to_scan
Definition: bufmgr.c:119
Oid tsId
Definition: bufmgr.c:106
struct ErrorContextCallback * previous
Definition: elog.h:297
void(* callback)(void *arg)
Definition: elog.h:298
Size keysize
Definition: hsearch.h:75
Size entrysize
Definition: hsearch.h:76
Definition: dynahash.c:220
Definition: lwlock.h:42
int delayChkptFlags
Definition: proc.h:241
PgAioHandleCallbackStage stage
Definition: aio.h:219
uint32 status
Definition: aio_types.h:108
uint32 error_data
Definition: aio_types.h:111
int32 result
Definition: aio_types.h:113
uint32 id
Definition: aio_types.h:105
PgAioResult result
Definition: aio_types.h:132
PgAioTargetData target_data
Definition: aio_types.h:133
PgStat_Counter buf_written_clean
Definition: pgstat.h:239
PgStat_Counter maxwritten_clean
Definition: pgstat.h:240
PgStat_Counter buf_alloc
Definition: pgstat.h:241
PgStat_Counter buffers_written
Definition: pgstat.h:263
Buffer recent_buffer
Definition: bufmgr.h:61
ForkNumber forknum
Definition: bufmgr.h:127
PgAioWaitRef io_wref
Definition: bufmgr.h:140
Buffer * buffers
Definition: bufmgr.h:135
BufferAccessStrategy strategy
Definition: bufmgr.h:128
BlockNumber blocknum
Definition: bufmgr.h:136
PgAioReturn io_return
Definition: bufmgr.h:141
struct SMgrRelationData * smgr
Definition: bufmgr.h:125
RelFileLocator locator
RelFileNumber relNumber
char str[REL_PATH_STR_MAXLEN+1]
Definition: relpath.h:123
RelFileLocator rd_locator
Definition: rel.h:57
Form_pg_class rd_rel
Definition: rel.h:111
const char * name
Definition: resowner.h:93
BlockNumber smgr_cached_nblocks[MAX_FORKNUM+1]
Definition: smgr.h:47
RelFileLocatorBackend smgr_rlocator
Definition: smgr.h:38
SMgrRelation srel
Definition: bufmgr.c:137
RelFileLocator rlocator
Definition: bufmgr.c:136
PendingWriteback pending_writebacks[WRITEBACK_MAX_PENDING_FLUSHES]
BlockNumber blockNum
RelFileNumber relNumber
ForkNumber forkNum
Oid spcOid
static uint64 table_relation_size(Relation rel, ForkNumber forkNumber)
Definition: tableam.h:1828
BlockNumber blockNum
Definition: aio_types.h:66
RelFileLocator rlocator
Definition: aio_types.h:65
BlockNumber nblocks
Definition: aio_types.h:67
struct PgAioTargetData::@124 smgr
ForkNumber forkNum
Definition: aio_types.h:68
static volatile sig_atomic_t waiting
Definition: waiteventset.c:170
bool RecoveryInProgress(void)
Definition: xlog.c:6522
bool XLogNeedsFlush(XLogRecPtr record)
Definition: xlog.c:3254
CheckpointStatsData CheckpointStats
Definition: xlog.c:209
void XLogFlush(XLogRecPtr record)
Definition: xlog.c:2923
#define CHECKPOINT_END_OF_RECOVERY
Definition: xlog.h:140
#define CHECKPOINT_FLUSH_ALL
Definition: xlog.h:143
#define CHECKPOINT_IS_SHUTDOWN
Definition: xlog.h:139
#define XLogIsNeeded()
Definition: xlog.h:109
#define XLogHintBitIsNeeded()
Definition: xlog.h:120
#define XLogRecPtrIsInvalid(r)
Definition: xlogdefs.h:29
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define InvalidXLogRecPtr
Definition: xlogdefs.h:28
XLogRecPtr XLogSaveBufferForHint(Buffer buffer, bool buffer_std)
Definition: xloginsert.c:1065
XLogRecPtr log_newpage_buffer(Buffer buffer, bool page_std)
Definition: xloginsert.c:1237
#define InHotStandby
Definition: xlogutils.h:60