1 /*-------------------------------------------------------------------------
2  *
3  * bufmgr.c
4  * buffer manager interface routines
5  *
6  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/storage/buffer/bufmgr.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 /*
16  * Principal entry points:
17  *
18  * ReadBuffer() -- find or create a buffer holding the requested page,
19  * and pin it so that no one can destroy it while this process
20  * is using it.
21  *
22  * ReleaseBuffer() -- unpin a buffer
23  *
24  * MarkBufferDirty() -- mark a pinned buffer's contents as "dirty".
25  * The disk write is delayed until buffer replacement or checkpoint.
26  *
27  * See also these files:
28  * freelist.c -- chooses victim for buffer replacement
29  * buf_table.c -- manages the buffer lookup table
30  */
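/*
 * Editor's illustrative sketch (not part of bufmgr.c): a typical caller uses
 * these entry points roughly as follows (WAL logging and error handling
 * elided):
 *
 *		buf = ReadBuffer(rel, blkno);			-- find or create, and pin
 *		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);	-- content lock is separate
 *		... modify BufferGetPage(buf) ...
 *		MarkBufferDirty(buf);					-- write-back is deferred
 *		UnlockReleaseBuffer(buf);				-- drop the lock, then unpin
 */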
31 #include "postgres.h"
32 
33 #include <sys/file.h>
34 #include <unistd.h>
35 
36 #include "access/tableam.h"
37 #include "access/xloginsert.h"
38 #include "access/xlogutils.h"
39 #include "catalog/catalog.h"
40 #include "catalog/storage.h"
41 #include "catalog/storage_xlog.h"
42 #include "executor/instrument.h"
43 #include "lib/binaryheap.h"
44 #include "miscadmin.h"
45 #include "pg_trace.h"
46 #include "pgstat.h"
47 #include "postmaster/bgwriter.h"
48 #include "storage/buf_internals.h"
49 #include "storage/bufmgr.h"
50 #include "storage/fd.h"
51 #include "storage/ipc.h"
52 #include "storage/lmgr.h"
53 #include "storage/proc.h"
54 #include "storage/smgr.h"
55 #include "storage/standby.h"
56 #include "utils/memdebug.h"
57 #include "utils/ps_status.h"
58 #include "utils/rel.h"
59 #include "utils/resowner.h"
60 #include "utils/timestamp.h"
61 
62 
63 /* Note: these two macros only work on shared buffers, not local ones! */
64 #define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
65 #define BufferGetLSN(bufHdr) (PageGetLSN(BufHdrGetBlock(bufHdr)))
66 
67 /* Note: this macro only works on local buffers, not shared ones! */
68 #define LocalBufHdrGetBlock(bufHdr) \
69  LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
70 
71 /* Bits in SyncOneBuffer's return value */
72 #define BUF_WRITTEN 0x01
73 #define BUF_REUSABLE 0x02
74 
75 #define RELS_BSEARCH_THRESHOLD 20
76 
77 /*
78  * This is the size (in blocks) above which we scan the entire buffer pool
79  * to remove the buffers for all pages of a relation being dropped. For
80  * relations smaller than this threshold, we instead find the buffers by
81  * doing lookups in the BufMapping table.
82  */
83 #define BUF_DROP_FULL_SCAN_THRESHOLD (uint64) (NBuffers / 32)
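/*
 * Editor's worked example (not part of bufmgr.c): with NBuffers = 16384
 * (128MB of shared_buffers at the default 8kB BLCKSZ), the threshold is
 * 16384 / 32 = 512 blocks, i.e. 4MB.  Dropping relation forks whose combined
 * size reaches 512 blocks falls back to a full scan of shared buffers;
 * smaller drops are handled by per-block BufMapping lookups.
 */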
84 
85 typedef struct PrivateRefCountEntry
86 {
87  Buffer buffer;
88  int32 refcount;
89 } PrivateRefCountEntry;
90 
91 /* 64 bytes, about the size of a cache line on common systems */
92 #define REFCOUNT_ARRAY_ENTRIES 8
93 
94 /*
95  * Status of buffers to checkpoint for a particular tablespace, used
96  * internally in BufferSync.
97  */
98 typedef struct CkptTsStatus
99 {
100  /* oid of the tablespace */
101  Oid tsId;
102 
103  /*
104  * Checkpoint progress for this tablespace. To make progress comparable
105  * between tablespaces the progress is, for each tablespace, measured as a
106  * number between 0 and the total number of to-be-checkpointed pages. Each
107  * page checkpointed in this tablespace increments this space's progress
108  * by progress_slice.
109  */
110  float8 progress;
111  float8 progress_slice;
112 
113  /* number of to-be checkpointed pages in this tablespace */
114  int num_to_scan;
115  /* already processed pages in this tablespace */
116  int num_scanned;
117 
118  /* current offset in CkptBufferIds for this tablespace */
119  int index;
120 } CkptTsStatus;
121 
122 /*
123  * Type for array used to sort SMgrRelations
124  *
125  * FlushRelationsAllBuffers shares the same comparator function with
126  * DropRelationsAllBuffers. Pointer to this struct and RelFileLocator must be
127  * compatible.
128  */
129 typedef struct SMgrSortArray
130 {
131  RelFileLocator rlocator; /* This must be the first member */
132  SMgrRelation srel;
133 } SMgrSortArray;
134 
135 /* GUC variables */
136 bool zero_damaged_pages = false;
137 int bgwriter_lru_maxpages = 100;
138 double bgwriter_lru_multiplier = 2.0;
139 bool track_io_timing = false;
140 
141 /*
142  * How many buffers PrefetchBuffer callers should try to stay ahead of their
143  * ReadBuffer calls by. Zero means "never prefetch". This value is only used
144  * for buffers not belonging to tablespaces that have their
145  * effective_io_concurrency parameter set.
146  */
147 int effective_io_concurrency = DEFAULT_EFFECTIVE_IO_CONCURRENCY;
148 
149 /*
150  * Like effective_io_concurrency, but used by maintenance code paths that might
151  * benefit from a higher setting because they work on behalf of many sessions.
152  * Overridden by the tablespace setting of the same name.
153  */
154 int maintenance_io_concurrency = DEFAULT_MAINTENANCE_IO_CONCURRENCY;
155 
156 /*
157  * GUC variables about triggering kernel writeback for buffers written; OS
158  * dependent defaults are set via the GUC mechanism.
159  */
160 int checkpoint_flush_after = DEFAULT_CHECKPOINT_FLUSH_AFTER;
161 int bgwriter_flush_after = DEFAULT_BGWRITER_FLUSH_AFTER;
162 int backend_flush_after = DEFAULT_BACKEND_FLUSH_AFTER;
163 
164 /* local state for LockBufferForCleanup */
165 static BufferDesc *PinCountWaitBuf = NULL;
166 
167 /*
168  * Backend-Private refcount management:
169  *
170  * Each buffer also has a private refcount that keeps track of the number of
171  * times the buffer is pinned in the current process. This is so that the
172  * shared refcount needs to be modified only once if a buffer is pinned more
173  * than once by an individual backend. It's also used to check that no buffers
174  * are still pinned at the end of transactions and when exiting.
175  *
176  *
177  * To avoid - as we used to - requiring an array with NBuffers entries to keep
178  * track of local buffers, we use a small sequentially searched array
179  * (PrivateRefCountArray) and an overflow hash table (PrivateRefCountHash) to
180  * keep track of backend local pins.
181  *
182  * Until no more than REFCOUNT_ARRAY_ENTRIES buffers are pinned at once, all
183  * refcounts are kept track of in the array; after that, new array entries
184  * displace old ones into the hash table. That way a frequently used entry
185  * can't get "stuck" in the hashtable while infrequent ones clog the array.
186  *
187  * Note that in most scenarios the number of pinned buffers will not exceed
188  * REFCOUNT_ARRAY_ENTRIES.
189  *
190  *
191  * To enter a buffer into the refcount tracking mechanism first reserve a free
192  * entry using ReservePrivateRefCountEntry() and then later, if necessary,
193  * fill it with NewPrivateRefCountEntry(). That split lets us avoid doing
194  * memory allocations in NewPrivateRefCountEntry() which can be important
195  * because in some scenarios it's called with a spinlock held...
196  */
197 static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES];
198 static HTAB *PrivateRefCountHash = NULL;
199 static int32 PrivateRefCountOverflowed = 0;
200 static uint32 PrivateRefCountClock = 0;
201 static PrivateRefCountEntry *ReservedRefCountEntry = NULL;
202 
203 static void ReservePrivateRefCountEntry(void);
204 static PrivateRefCountEntry *NewPrivateRefCountEntry(Buffer buffer);
205 static PrivateRefCountEntry *GetPrivateRefCountEntry(Buffer buffer, bool do_move);
206 static inline int32 GetPrivateRefCount(Buffer buffer);
207 static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref);
208 
209 /* ResourceOwner callbacks to hold in-progress I/Os and buffer pins */
210 static void ResOwnerReleaseBufferIO(Datum res);
211 static char *ResOwnerPrintBufferIO(Datum res);
212 static void ResOwnerReleaseBufferPin(Datum res);
213 static char *ResOwnerPrintBufferPin(Datum res);
214 
215 const ResourceOwnerDesc buffer_io_resowner_desc =
216 {
217  .name = "buffer io",
218  .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
219  .release_priority = RELEASE_PRIO_BUFFER_IOS,
220  .ReleaseResource = ResOwnerReleaseBufferIO,
221  .DebugPrint = ResOwnerPrintBufferIO
222 };
223 
224 const ResourceOwnerDesc buffer_pin_resowner_desc =
225 {
226  .name = "buffer pin",
227  .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
228  .release_priority = RELEASE_PRIO_BUFFER_PINS,
229  .ReleaseResource = ResOwnerReleaseBufferPin,
230  .DebugPrint = ResOwnerPrintBufferPin
231 };
232 
233 /*
234  * Ensure that the PrivateRefCountArray has sufficient space to store one more
235  * entry. This has to be called before using NewPrivateRefCountEntry() to fill
236  * a new entry - but it's perfectly fine to not use a reserved entry.
237  */
238 static void
239 ReservePrivateRefCountEntry(void)
240 {
241  /* Already reserved (or freed), nothing to do */
242  if (ReservedRefCountEntry != NULL)
243  return;
244 
245  /*
246  * First search for a free entry in the array; that'll be sufficient in the
247  * majority of cases.
248  */
249  {
250  int i;
251 
252  for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
253  {
254  PrivateRefCountEntry *res;
255 
256  res = &PrivateRefCountArray[i];
257 
258  if (res->buffer == InvalidBuffer)
259  {
260  ReservedRefCountEntry = res;
261  return;
262  }
263  }
264  }
265 
266  /*
267  * No luck. All array entries are full. Move one array entry into the hash
268  * table.
269  */
270  {
271  /*
272  * Move entry from the current clock position in the array into the
273  * hashtable. Use that slot.
274  */
275  PrivateRefCountEntry *hashent;
276  bool found;
277 
278  /* select victim slot */
279  ReservedRefCountEntry =
280  &PrivateRefCountArray[PrivateRefCountClock++ % REFCOUNT_ARRAY_ENTRIES];
281 
282  /* Better be used, otherwise we shouldn't get here. */
283  Assert(ReservedRefCountEntry->buffer != InvalidBuffer);
284 
285  /* enter victim array entry into hashtable */
286  hashent = hash_search(PrivateRefCountHash,
287  &(ReservedRefCountEntry->buffer),
288  HASH_ENTER,
289  &found);
290  Assert(!found);
291  hashent->refcount = ReservedRefCountEntry->refcount;
292 
293  /* clear the now free array slot */
294  ReservedRefCountEntry->buffer = InvalidBuffer;
295  ReservedRefCountEntry->refcount = 0;
296 
297  PrivateRefCountOverflowed++;
298 
299 }
300 
301 /*
302  * Fill a previously reserved refcount entry.
303  */
304 static PrivateRefCountEntry *
305 NewPrivateRefCountEntry(Buffer buffer)
306 {
307  PrivateRefCountEntry *res;
308 
309  /* only allowed to be called when a reservation has been made */
310  Assert(ReservedRefCountEntry != NULL);
311 
312  /* use up the reserved entry */
313  res = ReservedRefCountEntry;
314  ReservedRefCountEntry = NULL;
315 
316  /* and fill it */
317  res->buffer = buffer;
318  res->refcount = 0;
319 
320  return res;
321 }
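/*
 * Editor's illustrative sketch (not part of bufmgr.c): the reserve-then-fill
 * protocol described in the comment above, as the pinning code applies it.
 * The helper name and the BUFMGR_USAGE_EXAMPLE guard are hypothetical.
 */
#ifdef BUFMGR_USAGE_EXAMPLE
static void
example_track_new_pin(BufferDesc *buf_hdr)
{
	PrivateRefCountEntry *ref;
	Buffer		b = BufferDescriptorGetBuffer(buf_hdr);

	/* may probe/resize the hash table, so do it before any spinlock */
	ReservePrivateRefCountEntry();

	/* ... buffer header spinlock may be taken and released here ... */

	/* fill the reserved entry afterwards; no allocation can happen now */
	ref = NewPrivateRefCountEntry(b);
	ref->refcount++;
}
#endif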
322 
323 /*
324  * Return the PrivateRefCount entry for the passed buffer.
325  *
326  * Returns NULL if a buffer doesn't have a refcount entry. Otherwise, if
327  * do_move is true, and the entry resides in the hashtable the entry is
328  * optimized for frequent access by moving it to the array.
329  */
330 static PrivateRefCountEntry *
331 GetPrivateRefCountEntry(Buffer buffer, bool do_move)
332 {
333  PrivateRefCountEntry *res;
334  int i;
335 
336  Assert(BufferIsValid(buffer));
337  Assert(!BufferIsLocal(buffer));
338 
339  /*
340  * First search for references in the array, that'll be sufficient in the
341  * majority of cases.
342  */
343  for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
344  {
345  res = &PrivateRefCountArray[i];
346 
347  if (res->buffer == buffer)
348  return res;
349  }
350 
351  /*
352  * By here we know that the buffer, if already pinned, isn't residing in
353  * the array.
354  *
355  * Only look up the buffer in the hashtable if we've previously overflowed
356  * into it.
357  */
358  if (PrivateRefCountOverflowed == 0)
359  return NULL;
360 
361  res = hash_search(PrivateRefCountHash, &buffer, HASH_FIND, NULL);
362 
363  if (res == NULL)
364  return NULL;
365  else if (!do_move)
366  {
367  /* caller doesn't want us to move the hash entry into the array */
368  return res;
369  }
370  else
371  {
372  /* move buffer from hashtable into the free array slot */
373  bool found;
374  PrivateRefCountEntry *free;
375 
376  /* Ensure there's a free array slot */
377  ReservePrivateRefCountEntry();
378 
379  /* Use up the reserved slot */
380  Assert(ReservedRefCountEntry != NULL);
381  free = ReservedRefCountEntry;
382  ReservedRefCountEntry = NULL;
383  Assert(free->buffer == InvalidBuffer);
384 
385  /* and fill it */
386  free->buffer = buffer;
387  free->refcount = res->refcount;
388 
389  /* delete from hashtable */
390  hash_search(PrivateRefCountHash, &buffer, HASH_REMOVE, &found);
391  Assert(found);
392  Assert(PrivateRefCountOverflowed > 0);
393  PrivateRefCountOverflowed--;
394 
395  return free;
396  }
397 }
398 
399 /*
400  * Returns how many times the passed buffer is pinned by this backend.
401  *
402  * Only works for shared memory buffers!
403  */
404 static inline int32
405 GetPrivateRefCount(Buffer buffer)
406 {
407  PrivateRefCountEntry *ref;
408 
409  Assert(BufferIsValid(buffer));
410  Assert(!BufferIsLocal(buffer));
411 
412  /*
413  * Not moving the entry - that's ok for the current users, but we might
414  * want to change this one day.
415  */
416  ref = GetPrivateRefCountEntry(buffer, false);
417 
418  if (ref == NULL)
419  return 0;
420  return ref->refcount;
421 }
422 
423 /*
424  * Release resources used to track the reference count of a buffer which we no
425  * longer have pinned and don't want to pin again immediately.
426  */
427 static void
428 ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref)
429 {
430  Assert(ref->refcount == 0);
431 
432  if (ref >= &PrivateRefCountArray[0] &&
433  ref < &PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES])
434  {
435  ref->buffer = InvalidBuffer;
436 
437  /*
438  * Mark the just used entry as reserved - in many scenarios that
439  * allows us to avoid ever having to search the array/hash for free
440  * entries.
441  */
442  ReservedRefCountEntry = ref;
443  }
444  else
445  {
446  bool found;
447  Buffer buffer = ref->buffer;
448 
449  hash_search(PrivateRefCountHash, &buffer, HASH_REMOVE, &found);
450  Assert(found);
451  Assert(PrivateRefCountOverflowed > 0);
452  PrivateRefCountOverflowed--;
453 
454 }
455 
456 /*
457  * BufferIsPinned
458  * True iff the buffer is pinned (also checks for valid buffer number).
459  *
460  * NOTE: what we check here is that *this* backend holds a pin on
461  * the buffer. We do not care whether some other backend does.
462  */
463 #define BufferIsPinned(bufnum) \
464 ( \
465  !BufferIsValid(bufnum) ? \
466  false \
467  : \
468  BufferIsLocal(bufnum) ? \
469  (LocalRefCount[-(bufnum) - 1] > 0) \
470  : \
471  (GetPrivateRefCount(bufnum) > 0) \
472 )
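/*
 * Editor's illustrative sketch (not part of bufmgr.c): BufferIsPinned() is
 * intended for backend-local sanity checks such as the one below.  The helper
 * name and the BUFMGR_USAGE_EXAMPLE guard are hypothetical.
 */
#ifdef BUFMGR_USAGE_EXAMPLE
static void
example_check_pin_before_modify(Buffer buffer)
{
	/* we must hold our own pin; another backend's pin would not count */
	Assert(BufferIsPinned(buffer));
}
#endif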
473 
474 
475 static Buffer ReadBuffer_common(SMgrRelation smgr, char relpersistence,
476  ForkNumber forkNum, BlockNumber blockNum,
477  ReadBufferMode mode, BufferAccessStrategy strategy,
478  bool *hit);
479 static BlockNumber ExtendBufferedRelCommon(BufferManagerRelation bmr,
480  ForkNumber fork,
481  BufferAccessStrategy strategy,
482  uint32 flags,
483  uint32 extend_by,
484  BlockNumber extend_upto,
485  Buffer *buffers,
486  uint32 *extended_by);
487 static BlockNumber ExtendBufferedRelShared(BufferManagerRelation bmr,
488  ForkNumber fork,
489  BufferAccessStrategy strategy,
490  uint32 flags,
491  uint32 extend_by,
492  BlockNumber extend_upto,
493  Buffer *buffers,
494  uint32 *extended_by);
495 static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy);
496 static void PinBuffer_Locked(BufferDesc *buf);
497 static void UnpinBuffer(BufferDesc *buf);
498 static void UnpinBufferNoOwner(BufferDesc *buf);
499 static void BufferSync(int flags);
500 static uint32 WaitBufHdrUnlocked(BufferDesc *buf);
501 static int SyncOneBuffer(int buf_id, bool skip_recently_used,
502  WritebackContext *wb_context);
503 static void WaitIO(BufferDesc *buf);
504 static bool StartBufferIO(BufferDesc *buf, bool forInput);
505 static void TerminateBufferIO(BufferDesc *buf, bool clear_dirty,
506  uint32 set_flag_bits, bool forget_owner);
507 static void AbortBufferIO(Buffer buffer);
508 static void shared_buffer_write_error_callback(void *arg);
509 static void local_buffer_write_error_callback(void *arg);
510 static BufferDesc *BufferAlloc(SMgrRelation smgr,
511  char relpersistence,
512  ForkNumber forkNum,
513  BlockNumber blockNum,
514  BufferAccessStrategy strategy,
515  bool *foundPtr, IOContext io_context);
516 static Buffer GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context);
517 static void FlushBuffer(BufferDesc *buf, SMgrRelation reln,
518  IOObject io_object, IOContext io_context);
519 static void FindAndDropRelationBuffers(RelFileLocator rlocator,
520  ForkNumber forkNum,
521  BlockNumber nForkBlock,
522  BlockNumber firstDelBlock);
523 static void RelationCopyStorageUsingBuffer(RelFileLocator srclocator,
524  RelFileLocator dstlocator,
525  ForkNumber forkNum, bool permanent);
526 static void AtProcExit_Buffers(int code, Datum arg);
527 static void CheckForBufferLeaks(void);
528 static int rlocator_comparator(const void *p1, const void *p2);
529 static inline int buffertag_comparator(const BufferTag *ba, const BufferTag *bb);
530 static inline int ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b);
531 static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg);
532 
533 
534 /*
535  * Implementation of PrefetchBuffer() for shared buffers.
536  */
537 PrefetchBufferResult
538 PrefetchSharedBuffer(SMgrRelation smgr_reln,
539  ForkNumber forkNum,
540  BlockNumber blockNum)
541 {
542  PrefetchBufferResult result = {InvalidBuffer, false};
543  BufferTag newTag; /* identity of requested block */
544  uint32 newHash; /* hash value for newTag */
545  LWLock *newPartitionLock; /* buffer partition lock for it */
546  int buf_id;
547 
548  Assert(BlockNumberIsValid(blockNum));
549 
550  /* create a tag so we can lookup the buffer */
551  InitBufferTag(&newTag, &smgr_reln->smgr_rlocator.locator,
552  forkNum, blockNum);
553 
554  /* determine its hash code and partition lock ID */
555  newHash = BufTableHashCode(&newTag);
556  newPartitionLock = BufMappingPartitionLock(newHash);
557 
558  /* see if the block is in the buffer pool already */
559  LWLockAcquire(newPartitionLock, LW_SHARED);
560  buf_id = BufTableLookup(&newTag, newHash);
561  LWLockRelease(newPartitionLock);
562 
563  /* If not in buffers, initiate prefetch */
564  if (buf_id < 0)
565  {
566 #ifdef USE_PREFETCH
567  /*
568  * Try to initiate an asynchronous read. This returns false in
569  * recovery if the relation file doesn't exist.
570  */
571  if ((io_direct_flags & IO_DIRECT_DATA) == 0 &&
572  smgrprefetch(smgr_reln, forkNum, blockNum, 1))
573  {
574  result.initiated_io = true;
575  }
576 #endif /* USE_PREFETCH */
577  }
578  else
579  {
580  /*
581  * Report the buffer it was in at that time. The caller may be able
582  * to avoid a buffer table lookup, but it's not pinned and it must be
583  * rechecked!
584  */
585  result.recent_buffer = buf_id + 1;
586  }
587 
588  /*
589  * If the block *is* in buffers, we do nothing. This is not really ideal:
590  * the block might be just about to be evicted, which would be stupid
591  * since we know we are going to need it soon. But the only easy answer
592  * is to bump the usage_count, which does not seem like a great solution:
593  * when the caller does ultimately touch the block, usage_count would get
594  * bumped again, resulting in too much favoritism for blocks that are
595  * involved in a prefetch sequence. A real fix would involve some
596  * additional per-buffer state, and it's not clear that there's enough of
597  * a problem to justify that.
598  */
599 
600  return result;
601 }
602 
603 /*
604  * PrefetchBuffer -- initiate asynchronous read of a block of a relation
605  *
606  * This is named by analogy to ReadBuffer but doesn't actually allocate a
607  * buffer. Instead it tries to ensure that a future ReadBuffer for the given
608  * block will not be delayed by the I/O. Prefetching is optional.
609  *
610  * There are three possible outcomes:
611  *
612  * 1. If the block is already cached, the result includes a valid buffer that
613  * could be used by the caller to avoid the need for a later buffer lookup, but
614  * it's not pinned, so the caller must recheck it.
615  *
616  * 2. If the kernel has been asked to initiate I/O, the initiated_io member is
617  * true. Currently there is no way to know if the data was already cached by
618  * the kernel and therefore didn't really initiate I/O, and no way to know when
619  * the I/O completes other than using synchronous ReadBuffer().
620  *
621  * 3. Otherwise, the buffer wasn't already cached by PostgreSQL, and
622  * USE_PREFETCH is not defined (this build doesn't support prefetching due to
623  * lack of a kernel facility), direct I/O is enabled, or the underlying
624  * relation file wasn't found and we are in recovery. (If the relation file
625  * wasn't found and we are not in recovery, an error is raised).
626  */
627 PrefetchBufferResult
628 PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
629 {
630  Assert(RelationIsValid(reln));
631  Assert(BlockNumberIsValid(blockNum));
632 
633  if (RelationUsesLocalBuffers(reln))
634  {
635  /* see comments in ReadBufferExtended */
636  if (RELATION_IS_OTHER_TEMP(reln))
637  ereport(ERROR,
638  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
639  errmsg("cannot access temporary tables of other sessions")));
640 
641  /* pass it off to localbuf.c */
642  return PrefetchLocalBuffer(RelationGetSmgr(reln), forkNum, blockNum);
643  }
644  else
645  {
646  /* pass it to the shared buffer version */
647  return PrefetchSharedBuffer(RelationGetSmgr(reln), forkNum, blockNum);
648  }
649 }
650 
651 /*
652  * ReadRecentBuffer -- try to pin a block in a recently observed buffer
653  *
654  * Compared to ReadBuffer(), this avoids a buffer mapping lookup when it's
655  * successful. Return true if the buffer is valid and still has the expected
656  * tag. In that case, the buffer is pinned and the usage count is bumped.
657  */
658 bool
659 ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum,
660  Buffer recent_buffer)
661 {
662  BufferDesc *bufHdr;
663  BufferTag tag;
664  uint32 buf_state;
665  bool have_private_ref;
666 
667  Assert(BufferIsValid(recent_buffer));
668 
669  ResourceOwnerEnlarge(CurrentResourceOwner);
670  ReservePrivateRefCountEntry();
671  InitBufferTag(&tag, &rlocator, forkNum, blockNum);
672 
673  if (BufferIsLocal(recent_buffer))
674  {
675  int b = -recent_buffer - 1;
676 
677  bufHdr = GetLocalBufferDescriptor(b);
678  buf_state = pg_atomic_read_u32(&bufHdr->state);
679 
680  /* Is it still valid and holding the right tag? */
681  if ((buf_state & BM_VALID) && BufferTagsEqual(&tag, &bufHdr->tag))
682  {
683  PinLocalBuffer(bufHdr, true);
684 
685  pgBufferUsage.local_blks_hit++;
686 
687  return true;
688  }
689  }
690  else
691  {
692  bufHdr = GetBufferDescriptor(recent_buffer - 1);
693  have_private_ref = GetPrivateRefCount(recent_buffer) > 0;
694 
695  /*
696  * Do we already have this buffer pinned with a private reference? If
697  * so, it must be valid and it is safe to check the tag without
698  * locking. If not, we have to lock the header first and then check.
699  */
700  if (have_private_ref)
701  buf_state = pg_atomic_read_u32(&bufHdr->state);
702  else
703  buf_state = LockBufHdr(bufHdr);
704 
705  if ((buf_state & BM_VALID) && BufferTagsEqual(&tag, &bufHdr->tag))
706  {
707  /*
708  * It's now safe to pin the buffer. We can't pin first and ask
709  * questions later, because it might confuse code paths like
710  * InvalidateBuffer() if we pinned a random non-matching buffer.
711  */
712  if (have_private_ref)
713  PinBuffer(bufHdr, NULL); /* bump pin count */
714  else
715  PinBuffer_Locked(bufHdr); /* pin for first time */
716 
717  pgBufferUsage.shared_blks_hit++;
718 
719  return true;
720  }
721 
722  /* If we locked the header above, now unlock. */
723  if (!have_private_ref)
724  UnlockBufHdr(bufHdr, buf_state);
725  }
726 
727  return false;
728 }
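/*
 * Editor's illustrative sketch (not part of bufmgr.c): combining
 * PrefetchBuffer() and ReadRecentBuffer() as described above.  The helper
 * name and the BUFMGR_USAGE_EXAMPLE guard are hypothetical; error handling
 * is elided.
 */
#ifdef BUFMGR_USAGE_EXAMPLE
static Buffer
example_prefetch_then_read(Relation rel, BlockNumber blkno)
{
	PrefetchBufferResult pf;

	/* hint that we will need the block soon; nothing gets pinned */
	pf = PrefetchBuffer(rel, MAIN_FORKNUM, blkno);

	/* ... do other useful work while the I/O (hopefully) completes ... */

	/* the remembered buffer is unpinned, so it must be re-verified */
	if (BufferIsValid(pf.recent_buffer) &&
		ReadRecentBuffer(rel->rd_locator, MAIN_FORKNUM, blkno,
						 pf.recent_buffer))
		return pf.recent_buffer;

	/* fall back to the normal buffer mapping lookup */
	return ReadBuffer(rel, blkno);
}
#endif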
729 
730 /*
731  * ReadBuffer -- a shorthand for ReadBufferExtended, for reading from main
732  * fork with RBM_NORMAL mode and default strategy.
733  */
734 Buffer
735 ReadBuffer(Relation reln, BlockNumber blockNum)
736 {
737  return ReadBufferExtended(reln, MAIN_FORKNUM, blockNum, RBM_NORMAL, NULL);
738 }
739 
740 /*
741  * ReadBufferExtended -- returns a buffer containing the requested
742  * block of the requested relation. If the blknum
743  * requested is P_NEW, extend the relation file and
744  * allocate a new block. (Caller is responsible for
745  * ensuring that only one backend tries to extend a
746  * relation at the same time!)
747  *
748  * Returns: the buffer number for the buffer containing
749  * the block read. The returned buffer has been pinned.
750  * Does not return on error --- elog's instead.
751  *
752  * Assume when this function is called, that reln has been opened already.
753  *
754  * In RBM_NORMAL mode, the page is read from disk, and the page header is
755  * validated. An error is thrown if the page header is not valid. (But
756  * note that an all-zero page is considered "valid"; see
757  * PageIsVerifiedExtended().)
758  *
759  * RBM_ZERO_ON_ERROR is like the normal mode, but if the page header is not
760  * valid, the page is zeroed instead of throwing an error. This is intended
761  * for non-critical data, where the caller is prepared to repair errors.
762  *
763  * In RBM_ZERO_AND_LOCK mode, if the page isn't in buffer cache already, it's
764  * filled with zeros instead of reading it from disk. Useful when the caller
765  * is going to fill the page from scratch, since this saves I/O and avoids
766  * unnecessary failure if the page-on-disk has corrupt page headers.
767  * The page is returned locked to ensure that the caller has a chance to
768  * initialize the page before it's made visible to others.
769  * Caution: do not use this mode to read a page that is beyond the relation's
770  * current physical EOF; that is likely to cause problems in md.c when
771  * the page is modified and written out. P_NEW is OK, though.
772  *
773  * RBM_ZERO_AND_CLEANUP_LOCK is the same as RBM_ZERO_AND_LOCK, but acquires
774  * a cleanup-strength lock on the page.
775  *
776  * RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here.
777  *
778  * If strategy is not NULL, a nondefault buffer access strategy is used.
779  * See buffer/README for details.
780  */
781 Buffer
782 ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum,
783  ReadBufferMode mode, BufferAccessStrategy strategy)
784 {
785  bool hit;
786  Buffer buf;
787 
788  /*
789  * Reject attempts to read non-local temporary relations; we would be
790  * likely to get wrong data since we have no visibility into the owning
791  * session's local buffers.
792  */
793  if (RELATION_IS_OTHER_TEMP(reln))
794  ereport(ERROR,
795  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
796  errmsg("cannot access temporary tables of other sessions")));
797 
798  /*
799  * Read the buffer, and update pgstat counters to reflect a cache hit or
800  * miss.
801  */
802  pgstat_count_buffer_read(reln);
803  buf = ReadBuffer_common(RelationGetSmgr(reln), reln->rd_rel->relpersistence,
804  forkNum, blockNum, mode, strategy, &hit);
805  if (hit)
806  pgstat_count_buffer_hit(reln);
807  return buf;
808 }
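/*
 * Editor's illustrative sketch (not part of bufmgr.c): a bulk-read caller
 * using a nondefault strategy with RBM_NORMAL, per the mode descriptions
 * above.  The helper name and the BUFMGR_USAGE_EXAMPLE guard are
 * hypothetical.
 */
#ifdef BUFMGR_USAGE_EXAMPLE
static void
example_scan_relation(Relation rel)
{
	BufferAccessStrategy strategy = GetAccessStrategy(BAS_BULKREAD);
	BlockNumber nblocks = RelationGetNumberOfBlocks(rel);

	for (BlockNumber blkno = 0; blkno < nblocks; blkno++)
	{
		Buffer		buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
											 RBM_NORMAL, strategy);

		LockBuffer(buf, BUFFER_LOCK_SHARE);
		/* ... inspect BufferGetPage(buf) ... */
		UnlockReleaseBuffer(buf);
	}

	FreeAccessStrategy(strategy);
}
#endif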
809 
810 
811 /*
812  * ReadBufferWithoutRelcache -- like ReadBufferExtended, but doesn't require
813  * a relcache entry for the relation.
814  *
815  * Pass permanent = true for a RELPERSISTENCE_PERMANENT relation, and
816  * permanent = false for a RELPERSISTENCE_UNLOGGED relation. This function
817  * cannot be used for temporary relations (and making that work might be
818  * difficult, unless we only want to read temporary relations for our own
819  * BackendId).
820  */
821 Buffer
822 ReadBufferWithoutRelcache(RelFileLocator rlocator, ForkNumber forkNum,
823  BlockNumber blockNum, ReadBufferMode mode,
824  BufferAccessStrategy strategy, bool permanent)
825 {
826  bool hit;
827 
828  SMgrRelation smgr = smgropen(rlocator, InvalidBackendId);
829 
830  return ReadBuffer_common(smgr, permanent ? RELPERSISTENCE_PERMANENT :
831  RELPERSISTENCE_UNLOGGED, forkNum, blockNum,
832  mode, strategy, &hit);
833 }
834 
835 /*
836  * Convenience wrapper around ExtendBufferedRelBy() extending by one block.
837  */
838 Buffer
839 ExtendBufferedRel(BufferManagerRelation bmr,
840  ForkNumber forkNum,
841  BufferAccessStrategy strategy,
842  uint32 flags)
843 {
844  Buffer buf;
845  uint32 extend_by = 1;
846 
847  ExtendBufferedRelBy(bmr, forkNum, strategy, flags, extend_by,
848  &buf, &extend_by);
849 
850  return buf;
851 }
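/*
 * Editor's illustrative sketch (not part of bufmgr.c): the modern replacement
 * for the old "ReadBuffer(rel, P_NEW)" idiom.  EB_LOCK_FIRST returns the new
 * block already exclusively locked.  The helper name and the
 * BUFMGR_USAGE_EXAMPLE guard are hypothetical; WAL logging and
 * MarkBufferDirty() are elided.
 */
#ifdef BUFMGR_USAGE_EXAMPLE
static Buffer
example_add_block(Relation rel)
{
	Buffer		buf = ExtendBufferedRel(BMR_REL(rel), MAIN_FORKNUM, NULL,
										EB_LOCK_FIRST);

	/* pinned and exclusively locked; safe to initialize before others see it */
	PageInit(BufferGetPage(buf), BufferGetPageSize(buf), 0);

	return buf;					/* caller must eventually unlock and release */
}
#endif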
852 
853 /*
854  * Extend relation by multiple blocks.
855  *
856  * Tries to extend the relation by extend_by blocks. Depending on the
857  * availability of resources the relation may end up being extended by a
858  * smaller number of pages (unless an error is thrown, always by at least one
859  * page). *extended_by is updated to the number of pages the relation has been
860  * extended by.
861  *
862  * buffers needs to be an array that is at least extend_by long. Upon
863  * completion, the first extend_by array elements will point to a pinned
864  * buffer.
865  *
866  * If EB_LOCK_FIRST is part of flags, the first returned buffer is
867  * locked. This is useful for callers that want a buffer that is guaranteed to
868  * be empty.
869  */
870 BlockNumber
871 ExtendBufferedRelBy(BufferManagerRelation bmr,
872  ForkNumber fork,
873  BufferAccessStrategy strategy,
874  uint32 flags,
875  uint32 extend_by,
876  Buffer *buffers,
877  uint32 *extended_by)
878 {
879  Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
880  Assert(bmr.smgr == NULL || bmr.relpersistence != 0);
881  Assert(extend_by > 0);
882 
883  if (bmr.smgr == NULL)
884  {
885  bmr.smgr = RelationGetSmgr(bmr.rel);
886  bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
887  }
888 
889  return ExtendBufferedRelCommon(bmr, fork, strategy, flags,
890  extend_by, InvalidBlockNumber,
891  buffers, extended_by);
892 }
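/*
 * Editor's illustrative sketch (not part of bufmgr.c): bulk extension with
 * ExtendBufferedRelBy().  Fewer than the requested pages may be added, so the
 * returned count must be consulted.  The helper name and the
 * BUFMGR_USAGE_EXAMPLE guard are hypothetical.
 */
#ifdef BUFMGR_USAGE_EXAMPLE
static void
example_bulk_extend(Relation rel)
{
	Buffer		buffers[16];
	uint32		extended_by = 0;

	(void) ExtendBufferedRelBy(BMR_REL(rel), MAIN_FORKNUM, NULL,
							   EB_LOCK_FIRST, lengthof(buffers),
							   buffers, &extended_by);

	/* with EB_LOCK_FIRST only the first returned buffer is locked */
	if (extended_by > 0)
		UnlockReleaseBuffer(buffers[0]);
	for (uint32 i = 1; i < extended_by; i++)
		ReleaseBuffer(buffers[i]);
}
#endif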
893 
894 /*
895  * Extend the relation so it is at least extend_to blocks large, return buffer
896  * (extend_to - 1).
897  *
898  * This is useful for callers that want to write a specific page, regardless
899  * of the current size of the relation (e.g. useful for visibilitymap and for
900  * crash recovery).
901  */
902 Buffer
903 ExtendBufferedRelTo(BufferManagerRelation bmr,
904  ForkNumber fork,
905  BufferAccessStrategy strategy,
906  uint32 flags,
907  BlockNumber extend_to,
908  ReadBufferMode mode)
909 {
910  BlockNumber current_size;
911  uint32 extended_by = 0;
912  Buffer buffer = InvalidBuffer;
913  Buffer buffers[64];
914 
915  Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
916  Assert(bmr.smgr == NULL || bmr.relpersistence != 0);
917  Assert(extend_to != InvalidBlockNumber && extend_to > 0);
918 
919  if (bmr.smgr == NULL)
920  {
921  bmr.smgr = RelationGetSmgr(bmr.rel);
922  bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
923  }
924 
925  /*
926  * If desired, create the file if it doesn't exist. If
927  * smgr_cached_nblocks[fork] is positive then it must exist, no need for
928  * an smgrexists call.
929  */
930  if ((flags & EB_CREATE_FORK_IF_NEEDED) &&
931  (bmr.smgr->smgr_cached_nblocks[fork] == 0 ||
932  bmr.smgr->smgr_cached_nblocks[fork] == InvalidBlockNumber) &&
933  !smgrexists(bmr.smgr, fork))
934  {
935  LockRelationForExtension(bmr.rel, ExclusiveLock);
936 
937  /* recheck, fork might have been created concurrently */
938  if (!smgrexists(bmr.smgr, fork))
939  smgrcreate(bmr.smgr, fork, flags & EB_PERFORMING_RECOVERY);
940 
942  }
943 
944  /*
945  * If requested, invalidate size cache, so that smgrnblocks asks the
946  * kernel.
947  */
948  if (flags & EB_CLEAR_SIZE_CACHE)
949  bmr.smgr->smgr_cached_nblocks[fork] = InvalidBlockNumber;
950 
951  /*
952  * Estimate how many pages we'll need to extend by. This avoids acquiring
953  * unnecessarily many victim buffers.
954  */
955  current_size = smgrnblocks(bmr.smgr, fork);
956 
957  /*
958  * Since no-one else can be looking at the page contents yet, there is no
959  * difference between an exclusive lock and a cleanup-strength lock. Note
960  * that we pass the original mode to ReadBuffer_common() below, when
961  * falling back to reading the buffer due to a concurrent relation extension.
962  */
963  if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
964  flags |= EB_LOCK_TARGET;
965 
966  while (current_size < extend_to)
967  {
968  uint32 num_pages = lengthof(buffers);
969  BlockNumber first_block;
970 
971  if ((uint64) current_size + num_pages > extend_to)
972  num_pages = extend_to - current_size;
973 
974  first_block = ExtendBufferedRelCommon(bmr, fork, strategy, flags,
975  num_pages, extend_to,
976  buffers, &extended_by);
977 
978  current_size = first_block + extended_by;
979  Assert(num_pages != 0 || current_size >= extend_to);
980 
981  for (uint32 i = 0; i < extended_by; i++)
982  {
983  if (first_block + i != extend_to - 1)
984  ReleaseBuffer(buffers[i]);
985  else
986  buffer = buffers[i];
987  }
988  }
989 
990  /*
991  * It's possible that another backend concurrently extended the relation.
992  * In that case read the buffer.
993  *
994  * XXX: Should we control this via a flag?
995  */
996  if (buffer == InvalidBuffer)
997  {
998  bool hit;
999 
1000  Assert(extended_by == 0);
1001  buffer = ReadBuffer_common(bmr.smgr, bmr.relpersistence,
1002  fork, extend_to - 1, mode, strategy,
1003  &hit);
1004  }
1005 
1006  return buffer;
1007 }
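/*
 * Editor's illustrative sketch (not part of bufmgr.c): ExtendBufferedRelTo()
 * for a caller that needs a specific block to exist regardless of the current
 * relation size, similar to what the visibility map does.  The helper name
 * and the BUFMGR_USAGE_EXAMPLE guard are hypothetical.
 */
#ifdef BUFMGR_USAGE_EXAMPLE
static Buffer
example_get_block_extending(Relation rel, BlockNumber target_block)
{
	return ExtendBufferedRelTo(BMR_REL(rel), MAIN_FORKNUM, NULL,
							   EB_CREATE_FORK_IF_NEEDED | EB_CLEAR_SIZE_CACHE,
							   target_block + 1, RBM_ZERO_ON_ERROR);
}
#endif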
1008 
1009 /*
1010  * ReadBuffer_common -- common logic for all ReadBuffer variants
1011  *
1012  * *hit is set to true if the request was satisfied from shared buffer cache.
1013  */
1014 static Buffer
1015 ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
1016  BlockNumber blockNum, ReadBufferMode mode,
1017  BufferAccessStrategy strategy, bool *hit)
1018 {
1019  BufferDesc *bufHdr;
1020  Block bufBlock;
1021  bool found;
1022  IOContext io_context;
1023  IOObject io_object;
1024  bool isLocalBuf = SmgrIsTemp(smgr);
1025 
1026  *hit = false;
1027 
1028  /*
1029  * Backward compatibility path, most code should use ExtendBufferedRel()
1030  * instead, as acquiring the extension lock inside ExtendBufferedRel()
1031  * scales a lot better.
1032  */
1033  if (unlikely(blockNum == P_NEW))
1034  {
1035  uint32 flags = EB_SKIP_EXTENSION_LOCK;
1036 
1037  /*
1038  * Since no-one else can be looking at the page contents yet, there is
1039  * no difference between an exclusive lock and a cleanup-strength
1040  * lock.
1041  */
1042  if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
1043  flags |= EB_LOCK_FIRST;
1044 
1045  return ExtendBufferedRel(BMR_SMGR(smgr, relpersistence),
1046  forkNum, strategy, flags);
1047  }
1048 
1049  TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
1050  smgr->smgr_rlocator.locator.spcOid,
1051  smgr->smgr_rlocator.locator.dbOid,
1052  smgr->smgr_rlocator.locator.relNumber,
1053  smgr->smgr_rlocator.backend);
1054 
1055  if (isLocalBuf)
1056  {
1057  /*
1058  * We do not use a BufferAccessStrategy for I/O of temporary tables.
1059  * However, in some cases, the "strategy" may not be NULL, so we can't
1060  * rely on IOContextForStrategy() to set the right IOContext for us.
1061  * This may happen in cases like CREATE TEMPORARY TABLE AS...
1062  */
1063  io_context = IOCONTEXT_NORMAL;
1064  io_object = IOOBJECT_TEMP_RELATION;
1065  bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, &found);
1066  if (found)
1067  pgBufferUsage.local_blks_hit++;
1068  else if (mode == RBM_NORMAL || mode == RBM_NORMAL_NO_LOG ||
1069  mode == RBM_ZERO_ON_ERROR)
1070  pgBufferUsage.local_blks_read++;
1071  }
1072  else
1073  {
1074  /*
1075  * lookup the buffer. IO_IN_PROGRESS is set if the requested block is
1076  * not currently in memory.
1077  */
1078  io_context = IOContextForStrategy(strategy);
1079  io_object = IOOBJECT_RELATION;
1080  bufHdr = BufferAlloc(smgr, relpersistence, forkNum, blockNum,
1081  strategy, &found, io_context);
1082  if (found)
1083  pgBufferUsage.shared_blks_hit++;
1084  else if (mode == RBM_NORMAL || mode == RBM_NORMAL_NO_LOG ||
1085  mode == RBM_ZERO_ON_ERROR)
1086  pgBufferUsage.shared_blks_read++;
1087  }
1088 
1089  /* At this point we do NOT hold any locks. */
1090 
1091  /* if it was already in the buffer pool, we're done */
1092  if (found)
1093  {
1094  /* Just need to update stats before we exit */
1095  *hit = true;
1096  VacuumPageHit++;
1097  pgstat_count_io_op(io_object, io_context, IOOP_HIT);
1098 
1099  if (VacuumCostActive)
1100  VacuumCostBalance += VacuumCostPageHit;
1101 
1102  TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
1103  smgr->smgr_rlocator.locator.spcOid,
1104  smgr->smgr_rlocator.locator.dbOid,
1105  smgr->smgr_rlocator.locator.relNumber,
1106  smgr->smgr_rlocator.backend,
1107  found);
1108 
1109  /*
1110  * In RBM_ZERO_AND_LOCK mode the caller expects the page to be locked
1111  * on return.
1112  */
1113  if (!isLocalBuf)
1114  {
1115  if (mode == RBM_ZERO_AND_LOCK)
1116  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr),
1117  LW_EXCLUSIVE);
1118  else if (mode == RBM_ZERO_AND_CLEANUP_LOCK)
1119  LockBufferForCleanup(BufferDescriptorGetBuffer(bufHdr));
1120  }
1121 
1122  return BufferDescriptorGetBuffer(bufHdr);
1123  }
1124 
1125  /*
1126  * if we have gotten to this point, we have allocated a buffer for the
1127  * page but its contents are not yet valid. IO_IN_PROGRESS is set for it,
1128  * if it's a shared buffer.
1129  */
1130  Assert(!(pg_atomic_read_u32(&bufHdr->state) & BM_VALID)); /* spinlock not needed */
1131 
1132  bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
1133 
1134  /*
1135  * Read in the page, unless the caller intends to overwrite it and just
1136  * wants us to allocate a buffer.
1137  */
1138  if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
1139  MemSet((char *) bufBlock, 0, BLCKSZ);
1140  else
1141  {
1142  instr_time io_start = pgstat_prepare_io_time(track_io_timing);
1143 
1144  smgrread(smgr, forkNum, blockNum, bufBlock);
1145 
1146  pgstat_count_io_op_time(io_object, io_context,
1147  IOOP_READ, io_start, 1);
1148 
1149  /* check for garbage data */
1150  if (!PageIsVerifiedExtended((Page) bufBlock, blockNum,
1151  PIV_LOG_WARNING | PIV_REPORT_STAT))
1152  {
1153  if (mode == RBM_ZERO_ON_ERROR || zero_damaged_pages)
1154  {
1155  ereport(WARNING,
1156  (errcode(ERRCODE_DATA_CORRUPTED),
1157  errmsg("invalid page in block %u of relation %s; zeroing out page",
1158  blockNum,
1159  relpath(smgr->smgr_rlocator, forkNum))));
1160  MemSet((char *) bufBlock, 0, BLCKSZ);
1161  }
1162  else
1163  ereport(ERROR,
1164  (errcode(ERRCODE_DATA_CORRUPTED),
1165  errmsg("invalid page in block %u of relation %s",
1166  blockNum,
1167  relpath(smgr->smgr_rlocator, forkNum))));
1168  }
1169  }
1170 
1171  /*
1172  * In RBM_ZERO_AND_LOCK / RBM_ZERO_AND_CLEANUP_LOCK mode, grab the buffer
1173  * content lock before marking the page as valid, to make sure that no
1174  * other backend sees the zeroed page before the caller has had a chance
1175  * to initialize it.
1176  *
1177  * Since no-one else can be looking at the page contents yet, there is no
1178  * difference between an exclusive lock and a cleanup-strength lock. (Note
1179  * that we cannot use LockBuffer() or LockBufferForCleanup() here, because
1180  * they assert that the buffer is already valid.)
1181  */
1182  if ((mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) &&
1183  !isLocalBuf)
1184  {
1185  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_EXCLUSIVE);
1186  }
1187 
1188  if (isLocalBuf)
1189  {
1190  /* Only need to adjust flags */
1191  uint32 buf_state = pg_atomic_read_u32(&bufHdr->state);
1192 
1193  buf_state |= BM_VALID;
1194  pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
1195  }
1196  else
1197  {
1198  /* Set BM_VALID, terminate IO, and wake up any waiters */
1199  TerminateBufferIO(bufHdr, false, BM_VALID, true);
1200  }
1201 
1202  VacuumPageMiss++;
1203  if (VacuumCostActive)
1204  VacuumCostBalance += VacuumCostPageMiss;
1205 
1206  TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
1207  smgr->smgr_rlocator.locator.spcOid,
1208  smgr->smgr_rlocator.locator.dbOid,
1209  smgr->smgr_rlocator.locator.relNumber,
1210  smgr->smgr_rlocator.backend,
1211  found);
1212 
1213  return BufferDescriptorGetBuffer(bufHdr);
1214 }
1215 
1216 /*
1217  * BufferAlloc -- subroutine for ReadBuffer. Handles lookup of a shared
1218  * buffer. If no buffer exists already, selects a replacement
1219  * victim and evicts the old page, but does NOT read in new page.
1220  *
1221  * "strategy" can be a buffer replacement strategy object, or NULL for
1222  * the default strategy. The selected buffer's usage_count is advanced when
1223  * using the default strategy, but otherwise possibly not (see PinBuffer).
1224  *
1225  * The returned buffer is pinned and is already marked as holding the
1226  * desired page. If it already did have the desired page, *foundPtr is
1227  * set true. Otherwise, *foundPtr is set false and the buffer is marked
1228  * as IO_IN_PROGRESS; ReadBuffer will now need to do I/O to fill it.
1229  *
1230  * *foundPtr is actually redundant with the buffer's BM_VALID flag, but
1231  * we keep it for simplicity in ReadBuffer.
1232  *
1233  * io_context is passed as an output parameter to avoid calling
1234  * IOContextForStrategy() when there is a shared buffers hit and no IO
1235  * statistics need be captured.
1236  *
1237  * No locks are held either at entry or exit.
1238  */
1239 static BufferDesc *
1240 BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
1241  BlockNumber blockNum,
1242  BufferAccessStrategy strategy,
1243  bool *foundPtr, IOContext io_context)
1244 {
1245  BufferTag newTag; /* identity of requested block */
1246  uint32 newHash; /* hash value for newTag */
1247  LWLock *newPartitionLock; /* buffer partition lock for it */
1248  int existing_buf_id;
1249  Buffer victim_buffer;
1250  BufferDesc *victim_buf_hdr;
1251  uint32 victim_buf_state;
1252 
1253  /* Make sure we will have room to remember the buffer pin */
1254  ResourceOwnerEnlarge(CurrentResourceOwner);
1255  ReservePrivateRefCountEntry();
1256 
1257  /* create a tag so we can lookup the buffer */
1258  InitBufferTag(&newTag, &smgr->smgr_rlocator.locator, forkNum, blockNum);
1259 
1260  /* determine its hash code and partition lock ID */
1261  newHash = BufTableHashCode(&newTag);
1262  newPartitionLock = BufMappingPartitionLock(newHash);
1263 
1264  /* see if the block is in the buffer pool already */
1265  LWLockAcquire(newPartitionLock, LW_SHARED);
1266  existing_buf_id = BufTableLookup(&newTag, newHash);
1267  if (existing_buf_id >= 0)
1268  {
1269  BufferDesc *buf;
1270  bool valid;
1271 
1272  /*
1273  * Found it. Now, pin the buffer so no one can steal it from the
1274  * buffer pool, and check to see if the correct data has been loaded
1275  * into the buffer.
1276  */
1277  buf = GetBufferDescriptor(existing_buf_id);
1278 
1279  valid = PinBuffer(buf, strategy);
1280 
1281  /* Can release the mapping lock as soon as we've pinned it */
1282  LWLockRelease(newPartitionLock);
1283 
1284  *foundPtr = true;
1285 
1286  if (!valid)
1287  {
1288  /*
1289  * We can only get here if (a) someone else is still reading in
1290  * the page, or (b) a previous read attempt failed. We have to
1291  * wait for any active read attempt to finish, and then set up our
1292  * own read attempt if the page is still not BM_VALID.
1293  * StartBufferIO does it all.
1294  */
1295  if (StartBufferIO(buf, true))
1296  {
1297  /*
1298  * If we get here, previous attempts to read the buffer must
1299  * have failed ... but we shall bravely try again.
1300  */
1301  *foundPtr = false;
1302  }
1303  }
1304 
1305  return buf;
1306  }
1307 
1308  /*
1309  * Didn't find it in the buffer pool. We'll have to initialize a new
1310  * buffer. Remember to unlock the mapping lock while doing the work.
1311  */
1312  LWLockRelease(newPartitionLock);
1313 
1314  /*
1315  * Acquire a victim buffer. Somebody else might try to do the same, we
1316  * don't hold any conflicting locks. If so we'll have to undo our work
1317  * later.
1318  */
1319  victim_buffer = GetVictimBuffer(strategy, io_context);
1320  victim_buf_hdr = GetBufferDescriptor(victim_buffer - 1);
1321 
1322  /*
1323  * Try to make a hashtable entry for the buffer under its new tag. If
1324  * somebody else inserted another buffer for the tag, we'll release the
1325  * victim buffer we acquired and use the already inserted one.
1326  */
1327  LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1328  existing_buf_id = BufTableInsert(&newTag, newHash, victim_buf_hdr->buf_id);
1329  if (existing_buf_id >= 0)
1330  {
1331  BufferDesc *existing_buf_hdr;
1332  bool valid;
1333 
1334  /*
1335  * Got a collision. Someone has already done what we were about to do.
1336  * We'll just handle this as if it were found in the buffer pool in
1337  * the first place. First, give up the buffer we were planning to
1338  * use.
1339  *
1340  * We could do this after releasing the partition lock, but then we'd
1341  * have to call ResourceOwnerEnlarge() & ReservePrivateRefCountEntry()
1342  * before acquiring the lock, for the rare case of such a collision.
1343  */
1344  UnpinBuffer(victim_buf_hdr);
1345 
1346  /*
1347  * The victim buffer we acquired previously is clean and unused, let
1348  * it be found again quickly
1349  */
1350  StrategyFreeBuffer(victim_buf_hdr);
1351 
1352  /* remaining code should match code at top of routine */
1353 
1354  existing_buf_hdr = GetBufferDescriptor(existing_buf_id);
1355 
1356  valid = PinBuffer(existing_buf_hdr, strategy);
1357 
1358  /* Can release the mapping lock as soon as we've pinned it */
1359  LWLockRelease(newPartitionLock);
1360 
1361  *foundPtr = true;
1362 
1363  if (!valid)
1364  {
1365  /*
1366  * We can only get here if (a) someone else is still reading in
1367  * the page, or (b) a previous read attempt failed. We have to
1368  * wait for any active read attempt to finish, and then set up our
1369  * own read attempt if the page is still not BM_VALID.
1370  * StartBufferIO does it all.
1371  */
1372  if (StartBufferIO(existing_buf_hdr, true))
1373  {
1374  /*
1375  * If we get here, previous attempts to read the buffer must
1376  * have failed ... but we shall bravely try again.
1377  */
1378  *foundPtr = false;
1379  }
1380  }
1381 
1382  return existing_buf_hdr;
1383  }
1384 
1385  /*
1386  * Need to lock the buffer header too in order to change its tag.
1387  */
1388  victim_buf_state = LockBufHdr(victim_buf_hdr);
1389 
1390  /* some sanity checks while we hold the buffer header lock */
1391  Assert(BUF_STATE_GET_REFCOUNT(victim_buf_state) == 1);
1392  Assert(!(victim_buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY | BM_IO_IN_PROGRESS)));
1393 
1394  victim_buf_hdr->tag = newTag;
1395 
1396  /*
1397  * Make sure BM_PERMANENT is set for buffers that must be written at every
1398  * checkpoint. Unlogged buffers only need to be written at shutdown
1399  * checkpoints, except for their "init" forks, which need to be treated
1400  * just like permanent relations.
1401  */
1402  victim_buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
1403  if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM)
1404  victim_buf_state |= BM_PERMANENT;
1405 
1406  UnlockBufHdr(victim_buf_hdr, victim_buf_state);
1407 
1408  LWLockRelease(newPartitionLock);
1409 
1410  /*
1411  * Buffer contents are currently invalid. Try to obtain the right to
1412  * start I/O. If StartBufferIO returns false, then someone else managed
1413  * to read it before we did, so there's nothing left for BufferAlloc() to
1414  * do.
1415  */
1416  if (StartBufferIO(victim_buf_hdr, true))
1417  *foundPtr = false;
1418  else
1419  *foundPtr = true;
1420 
1421  return victim_buf_hdr;
1422 }
1423 
1424 /*
1425  * InvalidateBuffer -- mark a shared buffer invalid and return it to the
1426  * freelist.
1427  *
1428  * The buffer header spinlock must be held at entry. We drop it before
1429  * returning. (This is sane because the caller must have locked the
1430  * buffer in order to be sure it should be dropped.)
1431  *
1432  * This is used only in contexts such as dropping a relation. We assume
1433  * that no other backend could possibly be interested in using the page,
1434  * so the only reason the buffer might be pinned is if someone else is
1435  * trying to write it out. We have to let them finish before we can
1436  * reclaim the buffer.
1437  *
1438  * The buffer could get reclaimed by someone else while we are waiting
1439  * to acquire the necessary locks; if so, don't mess it up.
1440  */
1441 static void
1442 InvalidateBuffer(BufferDesc *buf)
1443 {
1444  BufferTag oldTag;
1445  uint32 oldHash; /* hash value for oldTag */
1446  LWLock *oldPartitionLock; /* buffer partition lock for it */
1447  uint32 oldFlags;
1448  uint32 buf_state;
1449 
1450  /* Save the original buffer tag before dropping the spinlock */
1451  oldTag = buf->tag;
1452 
1453  buf_state = pg_atomic_read_u32(&buf->state);
1454  Assert(buf_state & BM_LOCKED);
1455  UnlockBufHdr(buf, buf_state);
1456 
1457  /*
1458  * Need to compute the old tag's hashcode and partition lock ID. XXX is it
1459  * worth storing the hashcode in BufferDesc so we need not recompute it
1460  * here? Probably not.
1461  */
1462  oldHash = BufTableHashCode(&oldTag);
1463  oldPartitionLock = BufMappingPartitionLock(oldHash);
1464 
1465 retry:
1466 
1467  /*
1468  * Acquire exclusive mapping lock in preparation for changing the buffer's
1469  * association.
1470  */
1471  LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
1472 
1473  /* Re-lock the buffer header */
1474  buf_state = LockBufHdr(buf);
1475 
1476  /* If it's changed while we were waiting for lock, do nothing */
1477  if (!BufferTagsEqual(&buf->tag, &oldTag))
1478  {
1479  UnlockBufHdr(buf, buf_state);
1480  LWLockRelease(oldPartitionLock);
1481  return;
1482  }
1483 
1484  /*
1485  * We assume the only reason for it to be pinned is that someone else is
1486  * flushing the page out. Wait for them to finish. (This could be an
1487  * infinite loop if the refcount is messed up... it would be nice to time
1488  * out after awhile, but there seems no way to be sure how many loops may
1489  * be needed. Note that if the other guy has pinned the buffer but not
1490  * yet done StartBufferIO, WaitIO will fall through and we'll effectively
1491  * be busy-looping here.)
1492  */
1493  if (BUF_STATE_GET_REFCOUNT(buf_state) != 0)
1494  {
1495  UnlockBufHdr(buf, buf_state);
1496  LWLockRelease(oldPartitionLock);
1497  /* safety check: should definitely not be our *own* pin */
1498  if (GetPrivateRefCount(BufferDescriptorGetBuffer(buf)) > 0)
1499  elog(ERROR, "buffer is pinned in InvalidateBuffer");
1500  WaitIO(buf);
1501  goto retry;
1502  }
1503 
1504  /*
1505  * Clear out the buffer's tag and flags. We must do this to ensure that
1506  * linear scans of the buffer array don't think the buffer is valid.
1507  */
1508  oldFlags = buf_state & BUF_FLAG_MASK;
1509  ClearBufferTag(&buf->tag);
1510  buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
1511  UnlockBufHdr(buf, buf_state);
1512 
1513  /*
1514  * Remove the buffer from the lookup hashtable, if it was in there.
1515  */
1516  if (oldFlags & BM_TAG_VALID)
1517  BufTableDelete(&oldTag, oldHash);
1518 
1519  /*
1520  * Done with mapping lock.
1521  */
1522  LWLockRelease(oldPartitionLock);
1523 
1524  /*
1525  * Insert the buffer at the head of the list of free buffers.
1526  */
1527  StrategyFreeBuffer(buf);
1528 }
1529 
1530 /*
1531  * Helper routine for GetVictimBuffer()
1532  *
1533  * Needs to be called on a buffer with a valid tag, pinned, but without the
1534  * buffer header spinlock held.
1535  *
1536  * Returns true if the buffer can be reused, in which case the buffer is only
1537  * pinned by this backend and marked as invalid, false otherwise.
1538  */
1539 static bool
1540 InvalidateVictimBuffer(BufferDesc *buf_hdr)
1541 {
1542  uint32 buf_state;
1543  uint32 hash;
1544  LWLock *partition_lock;
1545  BufferTag tag;
1546 
1547  Assert(GetPrivateRefCount(BufferDescriptorGetBuffer(buf_hdr)) == 1);
1548 
1549  /* have buffer pinned, so it's safe to read tag without lock */
1550  tag = buf_hdr->tag;
1551 
1552  hash = BufTableHashCode(&tag);
1553  partition_lock = BufMappingPartitionLock(hash);
1554 
1555  LWLockAcquire(partition_lock, LW_EXCLUSIVE);
1556 
1557  /* lock the buffer header */
1558  buf_state = LockBufHdr(buf_hdr);
1559 
1560  /*
1561  * We have the buffer pinned, so nobody else should have been able to unset
1562  * this concurrently.
1563  */
1564  Assert(buf_state & BM_TAG_VALID);
1565  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
1566  Assert(BufferTagsEqual(&buf_hdr->tag, &tag));
1567 
1568  /*
1569  * If somebody else pinned the buffer since, or even worse, dirtied it,
1570  * give up on this buffer: It's clearly in use.
1571  */
1572  if (BUF_STATE_GET_REFCOUNT(buf_state) != 1 || (buf_state & BM_DIRTY))
1573  {
1574  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
1575 
1576  UnlockBufHdr(buf_hdr, buf_state);
1577  LWLockRelease(partition_lock);
1578 
1579  return false;
1580  }
1581 
1582  /*
1583  * Clear out the buffer's tag and flags and usagecount. This is not
1584  * strictly required, as BM_TAG_VALID/BM_VALID needs to be checked before
1585  * doing anything with the buffer. But currently it's beneficial, as the
1586  * cheaper pre-check for several linear scans of shared buffers use the
1587  * tag (see e.g. FlushDatabaseBuffers()).
1588  */
1589  ClearBufferTag(&buf_hdr->tag);
1590  buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
1591  UnlockBufHdr(buf_hdr, buf_state);
1592 
1593  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
1594 
1595  /* finally delete buffer from the buffer mapping table */
1596  BufTableDelete(&tag, hash);
1597 
1598  LWLockRelease(partition_lock);
1599 
1600  Assert(!(buf_state & (BM_DIRTY | BM_VALID | BM_TAG_VALID)));
1601  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
1602  Assert(BUF_STATE_GET_REFCOUNT(pg_atomic_read_u32(&buf_hdr->state)) > 0);
1603 
1604  return true;
1605 }
1606 
1607 static Buffer
1608 GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
1609 {
1610  BufferDesc *buf_hdr;
1611  Buffer buf;
1612  uint32 buf_state;
1613  bool from_ring;
1614 
1615  /*
1616  * Ensure, while the spinlock's not yet held, that there's a free refcount
1617  * entry, and a resource owner slot for the pin.
1618  */
1619  ReservePrivateRefCountEntry();
1620  ResourceOwnerEnlarge(CurrentResourceOwner);
1621 
1622  /* we return here if a prospective victim buffer gets used concurrently */
1623 again:
1624 
1625  /*
1626  * Select a victim buffer. The buffer is returned with its header
1627  * spinlock still held!
1628  */
1629  buf_hdr = StrategyGetBuffer(strategy, &buf_state, &from_ring);
1630  buf = BufferDescriptorGetBuffer(buf_hdr);
1631 
1632  Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 0);
1633 
1634  /* Pin the buffer and then release the buffer spinlock */
1635  PinBuffer_Locked(buf_hdr);
1636 
1637  /*
1638  * We shouldn't have any other pins for this buffer.
1639  */
1640  CheckBufferIsPinnedOnce(buf);
1641 
1642  /*
1643  * If the buffer was dirty, try to write it out. There is a race
1644  * condition here, in that someone might dirty it after we released the
1645  * buffer header lock above, or even while we are writing it out (since
1646  * our share-lock won't prevent hint-bit updates). We will recheck the
1647  * dirty bit after re-locking the buffer header.
1648  */
1649  if (buf_state & BM_DIRTY)
1650  {
1651  LWLock *content_lock;
1652 
1653  Assert(buf_state & BM_TAG_VALID);
1654  Assert(buf_state & BM_VALID);
1655 
1656  /*
1657  * We need a share-lock on the buffer contents to write it out (else
1658  * we might write invalid data, eg because someone else is compacting
1659  * the page contents while we write). We must use a conditional lock
1660  * acquisition here to avoid deadlock. Even though the buffer was not
1661  * pinned (and therefore surely not locked) when StrategyGetBuffer
1662  * returned it, someone else could have pinned and exclusive-locked it
1663  * by the time we get here. If we try to get the lock unconditionally,
1664  * we'd block waiting for them; if they later block waiting for us,
1665  * deadlock ensues. (This has been observed to happen when two
1666  * backends are both trying to split btree index pages, and the second
1667  * one just happens to be trying to split the page the first one got
1668  * from StrategyGetBuffer.)
1669  */
1670  content_lock = BufferDescriptorGetContentLock(buf_hdr);
1671  if (!LWLockConditionalAcquire(content_lock, LW_SHARED))
1672  {
1673  /*
1674  * Someone else has locked the buffer, so give it up and loop back
1675  * to get another one.
1676  */
1677  UnpinBuffer(buf_hdr);
1678  goto again;
1679  }
1680 
1681  /*
1682  * If using a nondefault strategy, and writing the buffer would
1683  * require a WAL flush, let the strategy decide whether to go ahead
1684  * and write/reuse the buffer or to choose another victim. We need a
1685  * lock to inspect the page LSN, so this can't be done inside
1686  * StrategyGetBuffer.
1687  */
1688  if (strategy != NULL)
1689  {
1690  XLogRecPtr lsn;
1691 
1692  /* Read the LSN while holding buffer header lock */
1693  buf_state = LockBufHdr(buf_hdr);
1694  lsn = BufferGetLSN(buf_hdr);
1695  UnlockBufHdr(buf_hdr, buf_state);
1696 
1697  if (XLogNeedsFlush(lsn)
1698  && StrategyRejectBuffer(strategy, buf_hdr, from_ring))
1699  {
1700  LWLockRelease(content_lock);
1701  UnpinBuffer(buf_hdr);
1702  goto again;
1703  }
1704  }
1705 
1706  /* OK, do the I/O */
1707  FlushBuffer(buf_hdr, NULL, IOOBJECT_RELATION, io_context);
1708  LWLockRelease(content_lock);
1709 
1710  ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
1711  &buf_hdr->tag);
1712  }
1713 
1714 
1715  if (buf_state & BM_VALID)
1716  {
1717  /*
1718  * When a BufferAccessStrategy is in use, blocks evicted from shared
1719  * buffers are counted as IOOP_EVICT in the corresponding context
1720  * (e.g. IOCONTEXT_BULKWRITE). Shared buffers are evicted by a
1721  * strategy in two cases: 1) while initially claiming buffers for the
1722  * strategy ring 2) to replace an existing strategy ring buffer
1723  * because it is pinned or in use and cannot be reused.
1724  *
1725  * Blocks evicted from buffers already in the strategy ring are
1726  * counted as IOOP_REUSE in the corresponding strategy context.
1727  *
1728  * At this point, we can accurately count evictions and reuses,
1729  * because we have successfully claimed the valid buffer. Previously,
1730  * we may have been forced to release the buffer due to concurrent
1731  * pinners or erroring out.
1732  */
1733  pgstat_count_io_op(IOOBJECT_RELATION, io_context,
1734  from_ring ? IOOP_REUSE : IOOP_EVICT);
1735  }
1736 
1737  /*
1738  * If the buffer has an entry in the buffer mapping table, delete it. This
1739  * can fail because another backend could have pinned or dirtied the
1740  * buffer.
1741  */
1742  if ((buf_state & BM_TAG_VALID) && !InvalidateVictimBuffer(buf_hdr))
1743  {
1744  UnpinBuffer(buf_hdr);
1745  goto again;
1746  }
1747 
1748  /* a final set of sanity checks */
1749 #ifdef USE_ASSERT_CHECKING
1750  buf_state = pg_atomic_read_u32(&buf_hdr->state);
1751 
1752  Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1);
1753  Assert(!(buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY)));
1754 
1755  CheckBufferIsPinnedOnce(buf);
1756 #endif
1757 
1758  return buf;
1759 }
1760 
1761 /*
1762  * Limit the number of pins a batch operation may additionally acquire, to
1763  * avoid running out of pinnable buffers.
1764  *
1765  * One additional pin is always allowed, as otherwise the operation likely
1766  * cannot be performed at all.
1767  *
1768  * The number of allowed pins for a backend is computed based on
1769  * shared_buffers and the maximum number of connections possible. That's very
1770  * pessimistic, but outside of toy-sized shared_buffers it should allow
1771  * sufficient pins.
1772  */
1773 static void
1774 LimitAdditionalPins(uint32 *additional_pins)
1775 {
1776  uint32 max_backends;
1777  int max_proportional_pins;
1778 
1779  if (*additional_pins <= 1)
1780  return;
1781 
1782  max_backends = MaxBackends + NUM_AUXILIARY_PROCS;
1783  max_proportional_pins = NBuffers / max_backends;
1784 
1785  /*
1786  * Subtract the approximate number of buffers already pinned by this
1787  * backend. We get the number of "overflowed" pins for free, but don't
1788  * know the number of pins in PrivateRefCountArray. The cost of
1789  * calculating that exactly doesn't seem worth it, so just assume the max.
1790  */
1791  max_proportional_pins -= PrivateRefCountOverflowed + REFCOUNT_ARRAY_ENTRIES;
1792 
1793  if (max_proportional_pins <= 0)
1794  max_proportional_pins = 1;
1795 
1796  if (*additional_pins > max_proportional_pins)
1797  *additional_pins = max_proportional_pins;
1798 }
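/*
 * Editor's worked example (not part of bufmgr.c): with NBuffers = 16384 and
 * MaxBackends + NUM_AUXILIARY_PROCS = 128, the proportional share is
 * 16384 / 128 = 128 pins per backend.  After subtracting the assumed
 * REFCOUNT_ARRAY_ENTRIES (8) already-held pins, a batch asking for 64
 * additional pins gets all 64, while one asking for 200 is clamped to 120.
 */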
1799 
1800 /*
1801  * Logic shared between ExtendBufferedRelBy(), ExtendBufferedRelTo(). Just to
1802  * avoid duplicating the tracing and relpersistence related logic.
1803  */
1804 static BlockNumber
1805 ExtendBufferedRelCommon(BufferManagerRelation bmr,
1806  ForkNumber fork,
1807  BufferAccessStrategy strategy,
1808  uint32 flags,
1809  uint32 extend_by,
1810  BlockNumber extend_upto,
1811  Buffer *buffers,
1812  uint32 *extended_by)
1813 {
1814  BlockNumber first_block;
1815 
1816  TRACE_POSTGRESQL_BUFFER_EXTEND_START(fork,
1817  bmr.smgr->smgr_rlocator.locator.spcOid,
1818  bmr.smgr->smgr_rlocator.locator.dbOid,
1819  bmr.smgr->smgr_rlocator.locator.relNumber,
1820  bmr.smgr->smgr_rlocator.backend,
1821  extend_by);
1822 
1823  if (bmr.relpersistence == RELPERSISTENCE_TEMP)
1824  first_block = ExtendBufferedRelLocal(bmr, fork, flags,
1825  extend_by, extend_upto,
1826  buffers, &extend_by);
1827  else
1828  first_block = ExtendBufferedRelShared(bmr, fork, strategy, flags,
1829  extend_by, extend_upto,
1830  buffers, &extend_by);
1831  *extended_by = extend_by;
1832 
1833  TRACE_POSTGRESQL_BUFFER_EXTEND_DONE(fork,
1834  bmr.smgr->smgr_rlocator.locator.spcOid,
1835  bmr.smgr->smgr_rlocator.locator.dbOid,
1836  bmr.smgr->smgr_rlocator.locator.relNumber,
1837  bmr.smgr->smgr_rlocator.backend,
1838  *extended_by,
1839  first_block);
1840 
1841  return first_block;
1842 }
1843 
1844 /*
1845  * Implementation of ExtendBufferedRelBy() and ExtendBufferedRelTo() for
1846  * shared buffers.
1847  */
1848 static BlockNumber
1849 ExtendBufferedRelShared(BufferManagerRelation bmr,
1850  ForkNumber fork,
1851  BufferAccessStrategy strategy,
1852  uint32 flags,
1853  uint32 extend_by,
1854  BlockNumber extend_upto,
1855  Buffer *buffers,
1856  uint32 *extended_by)
1857 {
1858  BlockNumber first_block;
1859  IOContext io_context = IOContextForStrategy(strategy);
1860  instr_time io_start;
1861 
1862  LimitAdditionalPins(&extend_by);
1863 
1864  /*
1865  * Acquire victim buffers for extension without holding extension lock.
1866  * Writing out victim buffers is the most expensive part of extending the
1867  * relation, particularly when doing so requires WAL flushes. Zeroing out
1868  * the buffers is also quite expensive, so do that before holding the
1869  * extension lock as well.
1870  *
1871  * These pages are pinned by us and not valid. While we hold the pin they
1872  * can't be acquired as victim buffers by another backend.
1873  */
1874  for (uint32 i = 0; i < extend_by; i++)
1875  {
1876  Block buf_block;
1877 
1878  buffers[i] = GetVictimBuffer(strategy, io_context);
1879  buf_block = BufHdrGetBlock(GetBufferDescriptor(buffers[i] - 1));
1880 
1881  /* new buffers are zero-filled */
1882  MemSet((char *) buf_block, 0, BLCKSZ);
1883  }
1884 
1885  /*
1886  * Lock relation against concurrent extensions, unless requested not to.
1887  *
1888  * We use the same extension lock for all forks. That's unnecessarily
1889  * restrictive, but currently extensions for forks don't happen often
1890  * enough to make it worth locking more granularly.
1891  *
1892  * Note that another backend might have extended the relation by the time
1893  * we get the lock.
1894  */
1895  if (!(flags & EB_SKIP_EXTENSION_LOCK))
1896  LockRelationForExtension(bmr.rel, ExclusiveLock);
1897 
1898  /*
1899  * If requested, invalidate size cache, so that smgrnblocks asks the
1900  * kernel.
1901  */
1902  if (flags & EB_CLEAR_SIZE_CACHE)
1903  bmr.smgr->smgr_cached_nblocks[fork] = InvalidBlockNumber;
1904 
1905  first_block = smgrnblocks(bmr.smgr, fork);
1906 
1907  /*
1908  * Now that we have the accurate relation size, check if the caller wants
1909  * us to extend to only up to a specific size. If there were concurrent
1910  * extensions, we might have acquired too many buffers and need to release
1911  * them.
1912  */
1913  if (extend_upto != InvalidBlockNumber)
1914  {
1915  uint32 orig_extend_by = extend_by;
1916 
1917  if (first_block > extend_upto)
1918  extend_by = 0;
1919  else if ((uint64) first_block + extend_by > extend_upto)
1920  extend_by = extend_upto - first_block;
1921 
1922  for (uint32 i = extend_by; i < orig_extend_by; i++)
1923  {
1924  BufferDesc *buf_hdr = GetBufferDescriptor(buffers[i] - 1);
1925 
1926  /*
1927  * The victim buffer we acquired previously is clean and unused,
1928  * let it be found again quickly
1929  */
1930  StrategyFreeBuffer(buf_hdr);
1931  UnpinBuffer(buf_hdr);
1932  }
1933 
1934  if (extend_by == 0)
1935  {
1936  if (!(flags & EB_SKIP_EXTENSION_LOCK))
1937  UnlockRelationForExtension(bmr.rel, ExclusiveLock);
1938  *extended_by = extend_by;
1939  return first_block;
1940  }
1941  }
1942 
1943  /* Fail if relation is already at maximum possible length */
1944  if ((uint64) first_block + extend_by >= MaxBlockNumber)
1945  ereport(ERROR,
1946  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1947  errmsg("cannot extend relation %s beyond %u blocks",
1948  relpath(bmr.smgr->smgr_rlocator, fork),
1949  MaxBlockNumber)));
1950 
1951  /*
1952  * Insert buffers into buffer table, mark as IO_IN_PROGRESS.
1953  *
1954  * This needs to happen before we extend the relation, because as soon as
1955  * we do, other backends can start to read in those pages.
1956  */
1957  for (uint32 i = 0; i < extend_by; i++)
1958  {
1959  Buffer victim_buf = buffers[i];
1960  BufferDesc *victim_buf_hdr = GetBufferDescriptor(victim_buf - 1);
1961  BufferTag tag;
1962  uint32 hash;
1963  LWLock *partition_lock;
1964  int existing_id;
1965 
1966  /* in case we need to pin an existing buffer below */
1969 
1970  InitBufferTag(&tag, &bmr.smgr->smgr_rlocator.locator, fork, first_block + i);
1971  hash = BufTableHashCode(&tag);
1972  partition_lock = BufMappingPartitionLock(hash);
1973 
1974  LWLockAcquire(partition_lock, LW_EXCLUSIVE);
1975 
1976  existing_id = BufTableInsert(&tag, hash, victim_buf_hdr->buf_id);
1977 
1978  /*
1979  * We get here only in the corner case where we are trying to extend
1980  * the relation but we found a pre-existing buffer. This can happen
1981  * because a prior attempt at extending the relation failed, and
1982  * because mdread doesn't complain about reads beyond EOF (when
1983  * zero_damaged_pages is ON) and so a previous attempt to read a block
1984  * beyond EOF could have left a "valid" zero-filled buffer.
1985  * Unfortunately, we have also seen this case occurring because of
1986  * buggy Linux kernels that sometimes return an lseek(SEEK_END) result
1987  * that doesn't account for a recent write. In that situation, the
1988  * pre-existing buffer would contain valid data that we don't want to
1989  * overwrite. Since the legitimate cases should always have left a
1990  * zero-filled buffer, complain if not PageIsNew.
1991  */
1992  if (existing_id >= 0)
1993  {
1994  BufferDesc *existing_hdr = GetBufferDescriptor(existing_id);
1995  Block buf_block;
1996  bool valid;
1997 
1998  /*
1999  * Pin the existing buffer before releasing the partition lock,
2000  * preventing it from being evicted.
2001  */
2002  valid = PinBuffer(existing_hdr, strategy);
2003 
2004  LWLockRelease(partition_lock);
2005 
2006  /*
2007  * The victim buffer we acquired previously is clean and unused,
2008  * let it be found again quickly
2009  */
2010  StrategyFreeBuffer(victim_buf_hdr);
2011  UnpinBuffer(victim_buf_hdr);
2012 
2013  buffers[i] = BufferDescriptorGetBuffer(existing_hdr);
2014  buf_block = BufHdrGetBlock(existing_hdr);
2015 
2016  if (valid && !PageIsNew((Page) buf_block))
2017  ereport(ERROR,
2018  (errmsg("unexpected data beyond EOF in block %u of relation %s",
2019  existing_hdr->tag.blockNum, relpath(bmr.smgr->smgr_rlocator, fork)),
2020  errhint("This has been seen to occur with buggy kernels; consider updating your system.")));
2021 
2022  /*
2023  * We *must* do smgr[zero]extend before succeeding, else the page
2024  * will not be reserved by the kernel, and the next P_NEW call
2025  * will decide to return the same page. Clear the BM_VALID bit,
2026  * do StartBufferIO() and proceed.
2027  *
2028  * Loop to handle the very small possibility that someone re-sets
2029  * BM_VALID between our clearing it and StartBufferIO inspecting
2030  * it.
2031  */
2032  do
2033  {
2034  uint32 buf_state = LockBufHdr(existing_hdr);
2035 
2036  buf_state &= ~BM_VALID;
2037  UnlockBufHdr(existing_hdr, buf_state);
2038  } while (!StartBufferIO(existing_hdr, true));
2039  }
2040  else
2041  {
2042  uint32 buf_state;
2043 
2044  buf_state = LockBufHdr(victim_buf_hdr);
2045 
2046  /* some sanity checks while we hold the buffer header lock */
2047  Assert(!(buf_state & (BM_VALID | BM_TAG_VALID | BM_DIRTY | BM_JUST_DIRTIED)));
2048  Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1);
2049 
2050  victim_buf_hdr->tag = tag;
2051 
2052  buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
2053  if (bmr.relpersistence == RELPERSISTENCE_PERMANENT || fork == INIT_FORKNUM)
2054  buf_state |= BM_PERMANENT;
2055 
2056  UnlockBufHdr(victim_buf_hdr, buf_state);
2057 
2058  LWLockRelease(partition_lock);
2059 
2060  /* XXX: could combine the locked operations in it with the above */
2061  StartBufferIO(victim_buf_hdr, true);
2062  }
2063  }
2064 
2066 
2067  /*
2068  * Note: if smgrzeroextend fails, we will end up with buffers that are
2069  * allocated but not marked BM_VALID. The next relation extension will
2070  * still select the same block number (because the relation didn't get any
2071  * longer on disk) and so future attempts to extend the relation will find
2072  * the same buffers (if they have not been recycled) but come right back
2073  * here to try smgrzeroextend again.
2074  *
2075  * We don't need to set checksum for all-zero pages.
2076  */
2077  smgrzeroextend(bmr.smgr, fork, first_block, extend_by, false);
2078 
2079  /*
2080  * Release the file-extension lock; it's now OK for someone else to extend
2081  * the relation some more.
2082  *
2083  * We remove IO_IN_PROGRESS after this, as waking up waiting backends can
2084  * take noticeable time.
2085  */
2086  if (!(flags & EB_SKIP_EXTENSION_LOCK))
2088 
2090  io_start, extend_by);
2091 
2092  /* Set BM_VALID, terminate IO, and wake up any waiters */
2093  for (uint32 i = 0; i < extend_by; i++)
2094  {
2095  Buffer buf = buffers[i];
2096  BufferDesc *buf_hdr = GetBufferDescriptor(buf - 1);
2097  bool lock = false;
2098 
2099  if (flags & EB_LOCK_FIRST && i == 0)
2100  lock = true;
2101  else if (flags & EB_LOCK_TARGET)
2102  {
2103  Assert(extend_upto != InvalidBlockNumber);
2104  if (first_block + i + 1 == extend_upto)
2105  lock = true;
2106  }
2107 
2108  if (lock)
2109  LWLockAcquire(BufferDescriptorGetContentLock(buf_hdr), LW_EXCLUSIVE);
2110 
2111  TerminateBufferIO(buf_hdr, false, BM_VALID, true);
2112  }
2113 
2114  pgBufferUsage.shared_blks_written += extend_by;
2115 
2116  *extended_by = extend_by;
2117 
2118  return first_block;
2119 }
2120 
2121 /*
2122  * BufferIsExclusiveLocked
2123  *
2124  * Checks if buffer is exclusive-locked.
2125  *
2126  * Buffer must be pinned.
2127  */
2128 bool
2129 BufferIsExclusiveLocked(Buffer buffer)
2130 {
2131  BufferDesc *bufHdr;
2132 
2133  if (BufferIsLocal(buffer))
2134  {
2135  int bufid = -buffer - 1;
2136 
2137  bufHdr = GetLocalBufferDescriptor(bufid);
2138  }
2139  else
2140  {
2141  bufHdr = GetBufferDescriptor(buffer - 1);
2142  }
2143 
2146  LW_EXCLUSIVE);
2147 }
2148 
2149 /*
2150  * BufferIsDirty
2151  *
2152  * Checks if buffer is already dirty.
2153  *
2154  * Buffer must be pinned and exclusive-locked. (Without an exclusive lock,
2155  * the result may be stale before it's returned.)
2156  */
2157 bool
2158 BufferIsDirty(Buffer buffer)
2159 {
2160  BufferDesc *bufHdr;
2161 
2162  if (BufferIsLocal(buffer))
2163  {
2164  int bufid = -buffer - 1;
2165 
2166  bufHdr = GetLocalBufferDescriptor(bufid);
2167  }
2168  else
2169  {
2170  bufHdr = GetBufferDescriptor(buffer - 1);
2171  }
2172 
2175  LW_EXCLUSIVE));
2176 
2177  return pg_atomic_read_u32(&bufHdr->state) & BM_DIRTY;
2178 }
2179 
2180 /*
2181  * MarkBufferDirty
2182  *
2183  * Marks buffer contents as dirty (actual write happens later).
2184  *
2185  * Buffer must be pinned and exclusive-locked. (If caller does not hold
2186  * exclusive lock, then somebody could be in process of writing the buffer,
2187  * leading to risk of bad data written to disk.)
2188  */
2189 void
2190 MarkBufferDirty(Buffer buffer)
2191 {
2192  BufferDesc *bufHdr;
2193  uint32 buf_state;
2194  uint32 old_buf_state;
2195 
2196  if (!BufferIsValid(buffer))
2197  elog(ERROR, "bad buffer ID: %d", buffer);
2198 
2199  if (BufferIsLocal(buffer))
2200  {
2201  MarkLocalBufferDirty(buffer);
2202  return;
2203  }
2204 
2205  bufHdr = GetBufferDescriptor(buffer - 1);
2206 
2207  Assert(BufferIsPinned(buffer));
2208  Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
2209  LW_EXCLUSIVE));
2210 
2211  old_buf_state = pg_atomic_read_u32(&bufHdr->state);
2212  for (;;)
2213  {
2214  if (old_buf_state & BM_LOCKED)
2215  old_buf_state = WaitBufHdrUnlocked(bufHdr);
2216 
2217  buf_state = old_buf_state;
2218 
2219  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
2220  buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
2221 
2222  if (pg_atomic_compare_exchange_u32(&bufHdr->state, &old_buf_state,
2223  buf_state))
2224  break;
2225  }
2226 
2227  /*
2228  * If the buffer was not dirty already, do vacuum accounting.
2229  */
2230  if (!(old_buf_state & BM_DIRTY))
2231  {
2232  VacuumPageDirty++;
2233  pgBufferUsage.shared_blks_dirtied++;
2234  if (VacuumCostActive)
2235  VacuumCostBalance += VacuumCostPageDirty;
2236  }
2237 }
2238 
2239 /*
2240  * ReleaseAndReadBuffer -- combine ReleaseBuffer() and ReadBuffer()
2241  *
2242  * Formerly, this saved one cycle of acquiring/releasing the BufMgrLock
2243  * compared to calling the two routines separately. Now it's mainly just
2244  * a convenience function. However, if the passed buffer is valid and
2245  * already contains the desired block, we just return it as-is; and that
2246  * does save considerable work compared to a full release and reacquire.
2247  *
2248  * Note: it is OK to pass buffer == InvalidBuffer, indicating that no old
2249  * buffer actually needs to be released. This case is the same as ReadBuffer,
2250  * but can save some tests in the caller.
2251  */
2252 Buffer
2253 ReleaseAndReadBuffer(Buffer buffer,
2254  Relation relation,
2255  BlockNumber blockNum)
2256 {
2257  ForkNumber forkNum = MAIN_FORKNUM;
2258  BufferDesc *bufHdr;
2259 
2260  if (BufferIsValid(buffer))
2261  {
2263  if (BufferIsLocal(buffer))
2264  {
2265  bufHdr = GetLocalBufferDescriptor(-buffer - 1);
2266  if (bufHdr->tag.blockNum == blockNum &&
2267  BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
2268  BufTagGetForkNum(&bufHdr->tag) == forkNum)
2269  return buffer;
2270  UnpinLocalBuffer(buffer);
2271  }
2272  else
2273  {
2274  bufHdr = GetBufferDescriptor(buffer - 1);
2275  /* we have pin, so it's ok to examine tag without spinlock */
2276  if (bufHdr->tag.blockNum == blockNum &&
2277  BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
2278  BufTagGetForkNum(&bufHdr->tag) == forkNum)
2279  return buffer;
2280  UnpinBuffer(bufHdr);
2281  }
2282  }
2283 
2284  return ReadBuffer(relation, blockNum);
2285 }
2286 
2287 /*
2288  * PinBuffer -- make buffer unavailable for replacement.
2289  *
2290  * For the default access strategy, the buffer's usage_count is incremented
2291  * when we first pin it; for other strategies we just make sure the usage_count
2292  * isn't zero. (The idea of the latter is that we don't want synchronized
2293  * heap scans to inflate the count, but we need it to not be zero to discourage
2294  * other backends from stealing buffers from our ring. As long as we cycle
2295  * through the ring faster than the global clock-sweep cycles, buffers in
2296  * our ring won't be chosen as victims for replacement by other backends.)
2297  *
2298  * This should be applied only to shared buffers, never local ones.
2299  *
2300  * Since buffers are pinned/unpinned very frequently, pin buffers without
2301  * taking the buffer header lock; instead update the state variable in a
2302  * loop of CAS operations. Hopefully it's just a single CAS.
2303  *
2304  * Note that ResourceOwnerEnlarge() and ReservePrivateRefCountEntry()
2305  * must have been done already.
2306  *
2307  * Returns true if buffer is BM_VALID, else false. This provision allows
2308  * some callers to avoid an extra spinlock cycle.
2309  */
2310 static bool
2311 PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
2312 {
2313  Buffer b = BufferDescriptorGetBuffer(buf);
2314  bool result;
2315  PrivateRefCountEntry *ref;
2316 
2317  Assert(!BufferIsLocal(b));
2318  Assert(ReservedRefCountEntry != NULL);
2319 
2320  ref = GetPrivateRefCountEntry(b, true);
2321 
2322  if (ref == NULL)
2323  {
2324  uint32 buf_state;
2325  uint32 old_buf_state;
2326 
2327  ref = NewPrivateRefCountEntry(b);
2328 
2329  old_buf_state = pg_atomic_read_u32(&buf->state);
2330  for (;;)
2331  {
2332  if (old_buf_state & BM_LOCKED)
2333  old_buf_state = WaitBufHdrUnlocked(buf);
2334 
2335  buf_state = old_buf_state;
2336 
2337  /* increase refcount */
2338  buf_state += BUF_REFCOUNT_ONE;
2339 
2340  if (strategy == NULL)
2341  {
2342  /* Default case: increase usagecount unless already max. */
2344  buf_state += BUF_USAGECOUNT_ONE;
2345  }
2346  else
2347  {
2348  /*
2349  * Ring buffers shouldn't evict others from the pool. Thus we
2350  * don't raise the usagecount above 1.
2351  */
2352  if (BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
2353  buf_state += BUF_USAGECOUNT_ONE;
2354  }
2355 
2356  if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
2357  buf_state))
2358  {
2359  result = (buf_state & BM_VALID) != 0;
2360 
2361  /*
2362  * Assume that we acquired a buffer pin for the purposes of
2363  * Valgrind buffer client checks (even in !result case) to
2364  * keep things simple. Buffers that are unsafe to access are
2365  * not generally guaranteed to be marked undefined or
2366  * non-accessible in any case.
2367  */
2369  break;
2370  }
2371  }
2372  }
2373  else
2374  {
2375  /*
2376  * If we previously pinned the buffer, it must surely be valid.
2377  *
2378  * Note: We deliberately avoid a Valgrind client request here.
2379  * Individual access methods can optionally superimpose buffer page
2380  * client requests on top of our client requests to enforce that
2381  * buffers are only accessed while locked (and pinned). It's possible
2382  * that the buffer page is legitimately non-accessible here. We
2383  * cannot meddle with that.
2384  */
2385  result = true;
2386  }
2387 
2388  ref->refcount++;
2389  Assert(ref->refcount > 0);
2391  return result;
2392 }
2393 
2394 /*
2395  * PinBuffer_Locked -- as above, but caller already locked the buffer header.
2396  * The spinlock is released before return.
2397  *
2398  * As this function is called with the spinlock held, the caller has to
2399  * previously call ReservePrivateRefCountEntry() and
2400  * ResourceOwnerEnlarge(CurrentResourceOwner);
2401  *
2402  * Currently, no callers of this function want to modify the buffer's
2403  * usage_count at all, so there's no need for a strategy parameter.
2404  * Also we don't bother with a BM_VALID test (the caller could check that for
2405  * itself).
2406  *
2407  * Also all callers only ever use this function when it's known that the
2408  * buffer can't have a preexisting pin by this backend. That allows us to skip
2409  * searching the private refcount array & hash, which is a boon, because the
2410  * spinlock is still held.
2411  *
2412  * Note: use of this routine is frequently mandatory, not just an optimization
2413  * to save a spin lock/unlock cycle, because we need to pin a buffer before
2414  * its state can change under us.
2415  */
2416 static void
2417 PinBuffer_Locked(BufferDesc *buf)
2418 {
2419  Buffer b;
2420  PrivateRefCountEntry *ref;
2421  uint32 buf_state;
2422 
2423  /*
2424  * As explained, we don't expect any preexisting pins. That allows us to
2425  * manipulate the PrivateRefCount after releasing the spinlock.
2426  */
2428 
2429  /*
2430  * Buffer can't have a preexisting pin, so mark its page as defined to
2431  * Valgrind (this is similar to the PinBuffer() case where the backend
2432  * doesn't already have a buffer pin)
2433  */
2435 
2436  /*
2437  * Since we hold the buffer spinlock, we can update the buffer state and
2438  * release the lock in one operation.
2439  */
2440  buf_state = pg_atomic_read_u32(&buf->state);
2441  Assert(buf_state & BM_LOCKED);
2442  buf_state += BUF_REFCOUNT_ONE;
2443  UnlockBufHdr(buf, buf_state);
2444 
2446 
2447  ref = NewPrivateRefCountEntry(b);
2448  ref->refcount++;
2449 
2451 }
2452 
2453 /*
2454  * UnpinBuffer -- make buffer available for replacement.
2455  *
2456  * This should be applied only to shared buffers, never local ones. This
2457  * always adjusts CurrentResourceOwner.
2458  */
2459 static void
2460 UnpinBuffer(BufferDesc *buf)
2461 {
2462  Buffer b = BufferDescriptorGetBuffer(buf);
2463 
2464  ResourceOwnerForgetBuffer(CurrentResourceOwner, b);
2465  UnpinBufferNoOwner(buf);
2466 }
2467 
2468 static void
2469 UnpinBufferNoOwner(BufferDesc *buf)
2470 {
2471  PrivateRefCountEntry *ref;
2472  Buffer b = BufferDescriptorGetBuffer(buf);
2473 
2474  Assert(!BufferIsLocal(b));
2475 
2476  /* not moving as we're likely deleting it soon anyway */
2477  ref = GetPrivateRefCountEntry(b, false);
2478  Assert(ref != NULL);
2479  Assert(ref->refcount > 0);
2480  ref->refcount--;
2481  if (ref->refcount == 0)
2482  {
2483  uint32 buf_state;
2484  uint32 old_buf_state;
2485 
2486  /*
2487  * Mark buffer non-accessible to Valgrind.
2488  *
2489  * Note that the buffer may have already been marked non-accessible
2490  * within access method code that enforces that buffers are only
2491  * accessed while a buffer lock is held.
2492  */
2494 
2495  /* I'd better not still hold the buffer content lock */
2497 
2498  /*
2499  * Decrement the shared reference count.
2500  *
2501  * Since buffer spinlock holder can update status using just write,
2502  * it's not safe to use atomic decrement here; thus use a CAS loop.
2503  */
2504  old_buf_state = pg_atomic_read_u32(&buf->state);
2505  for (;;)
2506  {
2507  if (old_buf_state & BM_LOCKED)
2508  old_buf_state = WaitBufHdrUnlocked(buf);
2509 
2510  buf_state = old_buf_state;
2511 
2512  buf_state -= BUF_REFCOUNT_ONE;
2513 
2514  if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
2515  buf_state))
2516  break;
2517  }
2518 
2519  /* Support LockBufferForCleanup() */
2520  if (buf_state & BM_PIN_COUNT_WAITER)
2521  {
2522  /*
2523  * Acquire the buffer header lock, re-check that there's a waiter.
2524  * Another backend could have unpinned this buffer, and already
2525  * woken up the waiter. There's no danger of the buffer being
2526  * replaced after we unpinned it above, as it's pinned by the
2527  * waiter.
2528  */
2529  buf_state = LockBufHdr(buf);
2530 
2531  if ((buf_state & BM_PIN_COUNT_WAITER) &&
2532  BUF_STATE_GET_REFCOUNT(buf_state) == 1)
2533  {
2534  /* we just released the last pin other than the waiter's */
2535  int wait_backend_pgprocno = buf->wait_backend_pgprocno;
2536 
2537  buf_state &= ~BM_PIN_COUNT_WAITER;
2538  UnlockBufHdr(buf, buf_state);
2539  ProcSendSignal(wait_backend_pgprocno);
2540  }
2541  else
2542  UnlockBufHdr(buf, buf_state);
2543  }
2545  }
2546 }
2547 
2548 #define ST_SORT sort_checkpoint_bufferids
2549 #define ST_ELEMENT_TYPE CkptSortItem
2550 #define ST_COMPARE(a, b) ckpt_buforder_comparator(a, b)
2551 #define ST_SCOPE static
2552 #define ST_DEFINE
2553 #include <lib/sort_template.h>
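/*
 * Editor's note (not part of bufmgr.c): the ST_* macros above instantiate a
 * type-specialized sort from lib/sort_template.h; the include expands to
 * roughly "static void sort_checkpoint_bufferids(CkptSortItem *begin,
 * size_t n)", which BufferSync() calls below.  A hypothetical second
 * instantiation, for plain integers, would look like this (guarded so it is
 * never actually compiled):
 */
#ifdef EDITOR_EXAMPLE_ONLY
static int
example_cmp_int(const int *a, const int *b)
{
	return (*a > *b) - (*a < *b);
}

#define ST_SORT sort_example_ints
#define ST_ELEMENT_TYPE int
#define ST_COMPARE(a, b) example_cmp_int(a, b)
#define ST_SCOPE static
#define ST_DEFINE
#include "lib/sort_template.h"
#endif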
2554 
2555 /*
2556  * BufferSync -- Write out all dirty buffers in the pool.
2557  *
2558  * This is called at checkpoint time to write out all dirty shared buffers.
2559  * The checkpoint request flags should be passed in. If CHECKPOINT_IMMEDIATE
2560  * is set, we disable delays between writes; if CHECKPOINT_IS_SHUTDOWN,
2561  * CHECKPOINT_END_OF_RECOVERY or CHECKPOINT_FLUSH_ALL is set, we write even
2562  * unlogged buffers, which are otherwise skipped. The remaining flags
2563  * currently have no effect here.
2564  */
2565 static void
2566 BufferSync(int flags)
2567 {
2568  uint32 buf_state;
2569  int buf_id;
2570  int num_to_scan;
2571  int num_spaces;
2572  int num_processed;
2573  int num_written;
2574  CkptTsStatus *per_ts_stat = NULL;
2575  Oid last_tsid;
2576  binaryheap *ts_heap;
2577  int i;
2578  int mask = BM_DIRTY;
2579  WritebackContext wb_context;
2580 
2581  /*
2582  * Unless this is a shutdown checkpoint or we have been explicitly told,
2583  * we write only permanent, dirty buffers. But at shutdown or end of
2584  * recovery, we write all dirty buffers.
2585  */
2586  if (!((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
2587  CHECKPOINT_FLUSH_ALL))))
2588  mask |= BM_PERMANENT;
2589 
2590  /*
2591  * Loop over all buffers, and mark the ones that need to be written with
2592  * BM_CHECKPOINT_NEEDED. Count them as we go (num_to_scan), so that we
2593  * can estimate how much work needs to be done.
2594  *
2595  * This allows us to write only those pages that were dirty when the
2596  * checkpoint began, and not those that get dirtied while it proceeds.
2597  * Whenever a page with BM_CHECKPOINT_NEEDED is written out, either by us
2598  * later in this function, or by normal backends or the bgwriter cleaning
2599  * scan, the flag is cleared. Any buffer dirtied after this point won't
2600  * have the flag set.
2601  *
2602  * Note that if we fail to write some buffer, we may leave buffers with
2603  * BM_CHECKPOINT_NEEDED still set. This is OK since any such buffer would
2604  * certainly need to be written for the next checkpoint attempt, too.
2605  */
2606  num_to_scan = 0;
2607  for (buf_id = 0; buf_id < NBuffers; buf_id++)
2608  {
2609  BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
2610 
2611  /*
2612  * Header spinlock is enough to examine BM_DIRTY, see comment in
2613  * SyncOneBuffer.
2614  */
2615  buf_state = LockBufHdr(bufHdr);
2616 
2617  if ((buf_state & mask) == mask)
2618  {
2619  CkptSortItem *item;
2620 
2621  buf_state |= BM_CHECKPOINT_NEEDED;
2622 
2623  item = &CkptBufferIds[num_to_scan++];
2624  item->buf_id = buf_id;
2625  item->tsId = bufHdr->tag.spcOid;
2626  item->relNumber = BufTagGetRelNumber(&bufHdr->tag);
2627  item->forkNum = BufTagGetForkNum(&bufHdr->tag);
2628  item->blockNum = bufHdr->tag.blockNum;
2629  }
2630 
2631  UnlockBufHdr(bufHdr, buf_state);
2632 
2633  /* Check for barrier events in case NBuffers is large. */
2636  }
2637 
2638  if (num_to_scan == 0)
2639  return; /* nothing to do */
2640 
2642 
2643  TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_scan);
2644 
2645  /*
2646  * Sort buffers that need to be written to reduce the likelihood of random
2647  * IO. The sorting is also important for the implementation of balancing
2648  * writes between tablespaces. Without balancing writes we'd potentially
2649  * end up writing to the tablespaces one-by-one; possibly overloading the
2650  * underlying system.
2651  */
2652  sort_checkpoint_bufferids(CkptBufferIds, num_to_scan);
2653 
2654  num_spaces = 0;
2655 
2656  /*
2657  * Allocate progress status for each tablespace with buffers that need to
2658  * be flushed. This requires the to-be-flushed array to be sorted.
2659  */
2660  last_tsid = InvalidOid;
2661  for (i = 0; i < num_to_scan; i++)
2662  {
2663  CkptTsStatus *s;
2664  Oid cur_tsid;
2665 
2666  cur_tsid = CkptBufferIds[i].tsId;
2667 
2668  /*
2669  * Grow array of per-tablespace status structs, every time a new
2670  * tablespace is found.
2671  */
2672  if (last_tsid == InvalidOid || last_tsid != cur_tsid)
2673  {
2674  Size sz;
2675 
2676  num_spaces++;
2677 
2678  /*
2679  * Not worth adding grow-by-power-of-2 logic here - even with a
2680  * few hundred tablespaces this should be fine.
2681  */
2682  sz = sizeof(CkptTsStatus) * num_spaces;
2683 
2684  if (per_ts_stat == NULL)
2685  per_ts_stat = (CkptTsStatus *) palloc(sz);
2686  else
2687  per_ts_stat = (CkptTsStatus *) repalloc(per_ts_stat, sz);
2688 
2689  s = &per_ts_stat[num_spaces - 1];
2690  memset(s, 0, sizeof(*s));
2691  s->tsId = cur_tsid;
2692 
2693  /*
2694  * The first buffer in this tablespace. As CkptBufferIds is sorted
2695  * by tablespace all (s->num_to_scan) buffers in this tablespace
2696  * will follow afterwards.
2697  */
2698  s->index = i;
2699 
2700  /*
2701  * progress_slice will be determined once we know how many buffers
2702  * are in each tablespace, i.e. after this loop.
2703  */
2704 
2705  last_tsid = cur_tsid;
2706  }
2707  else
2708  {
2709  s = &per_ts_stat[num_spaces - 1];
2710  }
2711 
2712  s->num_to_scan++;
2713 
2714  /* Check for barrier events. */
2717  }
2718 
2719  Assert(num_spaces > 0);
2720 
2721  /*
2722  * Build a min-heap over the write-progress in the individual tablespaces,
2723  * and compute how large a portion of the total progress a single
2724  * processed buffer is.
2725  */
2726  ts_heap = binaryheap_allocate(num_spaces,
2727  ts_ckpt_progress_comparator,
2728  NULL);
2729 
2730  for (i = 0; i < num_spaces; i++)
2731  {
2732  CkptTsStatus *ts_stat = &per_ts_stat[i];
2733 
2734  ts_stat->progress_slice = (float8) num_to_scan / ts_stat->num_to_scan;
2735 
2736  binaryheap_add_unordered(ts_heap, PointerGetDatum(ts_stat));
2737  }
2738 
2739  binaryheap_build(ts_heap);
2740 
2741  /*
2742  * Iterate through to-be-checkpointed buffers and write the ones (still)
2743  * marked with BM_CHECKPOINT_NEEDED. The writes are balanced between
2744  * tablespaces; otherwise the sorting would lead to only one tablespace
2745  * receiving writes at a time, making inefficient use of the hardware.
2746  */
2747  num_processed = 0;
2748  num_written = 0;
2749  while (!binaryheap_empty(ts_heap))
2750  {
2751  BufferDesc *bufHdr = NULL;
2752  CkptTsStatus *ts_stat = (CkptTsStatus *)
2753  DatumGetPointer(binaryheap_first(ts_heap));
2754 
2755  buf_id = CkptBufferIds[ts_stat->index].buf_id;
2756  Assert(buf_id != -1);
2757 
2758  bufHdr = GetBufferDescriptor(buf_id);
2759 
2760  num_processed++;
2761 
2762  /*
2763  * We don't need to acquire the lock here, because we're only looking
2764  * at a single bit. It's possible that someone else writes the buffer
2765  * and clears the flag right after we check, but that doesn't matter
2766  * since SyncOneBuffer will then do nothing. However, there is a
2767  * further race condition: it's conceivable that between the time we
2768  * examine the bit here and the time SyncOneBuffer acquires the lock,
2769  * someone else not only wrote the buffer but replaced it with another
2770  * page and dirtied it. In that improbable case, SyncOneBuffer will
2771  * write the buffer though we didn't need to. It doesn't seem worth
2772  * guarding against this, though.
2773  */
2774  if (pg_atomic_read_u32(&bufHdr->state) & BM_CHECKPOINT_NEEDED)
2775  {
2776  if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
2777  {
2778  TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
2780  num_written++;
2781  }
2782  }
2783 
2784  /*
2785  * Measure progress independently of actually having to flush the buffer
2786  * - otherwise writes become unbalanced.
2787  */
2788  ts_stat->progress += ts_stat->progress_slice;
2789  ts_stat->num_scanned++;
2790  ts_stat->index++;
2791 
2792  /* Have all the buffers from the tablespace been processed? */
2793  if (ts_stat->num_scanned == ts_stat->num_to_scan)
2794  {
2795  binaryheap_remove_first(ts_heap);
2796  }
2797  else
2798  {
2799  /* update heap with the new progress */
2800  binaryheap_replace_first(ts_heap, PointerGetDatum(ts_stat));
2801  }
2802 
2803  /*
2804  * Sleep to throttle our I/O rate.
2805  *
2806  * (This will check for barrier events even if it doesn't sleep.)
2807  */
2808  CheckpointWriteDelay(flags, (double) num_processed / num_to_scan);
2809  }
2810 
2811  /*
2812  * Issue all pending flushes. Only checkpointer calls BufferSync(), so
2813  * IOContext will always be IOCONTEXT_NORMAL.
2814  */
2816 
2817  pfree(per_ts_stat);
2818  per_ts_stat = NULL;
2819  binaryheap_free(ts_heap);
2820 
2821  /*
2822  * Update checkpoint statistics. As noted above, this doesn't include
2823  * buffers written by other backends or bgwriter scan.
2824  */
2825  CheckpointStats.ckpt_bufs_written += num_written;
2826 
2827  TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_scan);
2828 }
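/*
 * Editor's worked example (not part of bufmgr.c) of the balancing math in
 * BufferSync(), with assumed numbers: num_to_scan = 1000 dirty buffers, 900
 * of them in tablespace A and 100 in tablespace B.  Then progress_slice(A) =
 * 1000/900, about 1.11, and progress_slice(B) = 1000/100 = 10, so both
 * tablespaces reach the common progress target of 1000 exactly when their
 * last buffer is written.  Because the min-heap always yields the tablespace
 * with the least progress, the writes interleave at roughly nine buffers of
 * A per buffer of B instead of draining A completely before touching B.
 */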
2829 
2830 /*
2831  * BgBufferSync -- Write out some dirty buffers in the pool.
2832  *
2833  * This is called periodically by the background writer process.
2834  *
2835  * Returns true if it's appropriate for the bgwriter process to go into
2836  * low-power hibernation mode. (This happens if the strategy clock sweep
2837  * has been "lapped" and no buffer allocations have occurred recently,
2838  * or if the bgwriter has been effectively disabled by setting
2839  * bgwriter_lru_maxpages to 0.)
2840  */
2841 bool
2842 BgBufferSync(WritebackContext *wb_context)
2843 {
2844  /* info obtained from freelist.c */
2845  int strategy_buf_id;
2846  uint32 strategy_passes;
2847  uint32 recent_alloc;
2848 
2849  /*
2850  * Information saved between calls so we can determine the strategy
2851  * point's advance rate and avoid scanning already-cleaned buffers.
2852  */
2853  static bool saved_info_valid = false;
2854  static int prev_strategy_buf_id;
2855  static uint32 prev_strategy_passes;
2856  static int next_to_clean;
2857  static uint32 next_passes;
2858 
2859  /* Moving averages of allocation rate and clean-buffer density */
2860  static float smoothed_alloc = 0;
2861  static float smoothed_density = 10.0;
2862 
2863  /* Potentially these could be tunables, but for now, not */
2864  float smoothing_samples = 16;
2865  float scan_whole_pool_milliseconds = 120000.0;
2866 
2867  /* Used to compute how far we scan ahead */
2868  long strategy_delta;
2869  int bufs_to_lap;
2870  int bufs_ahead;
2871  float scans_per_alloc;
2872  int reusable_buffers_est;
2873  int upcoming_alloc_est;
2874  int min_scan_buffers;
2875 
2876  /* Variables for the scanning loop proper */
2877  int num_to_scan;
2878  int num_written;
2879  int reusable_buffers;
2880 
2881  /* Variables for final smoothed_density update */
2882  long new_strategy_delta;
2883  uint32 new_recent_alloc;
2884 
2885  /*
2886  * Find out where the freelist clock sweep currently is, and how many
2887  * buffer allocations have happened since our last call.
2888  */
2889  strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc);
2890 
2891  /* Report buffer alloc counts to pgstat */
2892  PendingBgWriterStats.buf_alloc += recent_alloc;
2893 
2894  /*
2895  * If we're not running the LRU scan, just stop after doing the stats
2896  * stuff. We mark the saved state invalid so that we can recover sanely
2897  * if LRU scan is turned back on later.
2898  */
2899  if (bgwriter_lru_maxpages <= 0)
2900  {
2901  saved_info_valid = false;
2902  return true;
2903  }
2904 
2905  /*
2906  * Compute strategy_delta = how many buffers have been scanned by the
2907  * clock sweep since last time. If first time through, assume none. Then
2908  * see if we are still ahead of the clock sweep, and if so, how many
2909  * buffers we could scan before we'd catch up with it and "lap" it. Note:
2910  * weird-looking coding of xxx_passes comparisons are to avoid bogus
2911  * behavior when the passes counts wrap around.
2912  */
2913  if (saved_info_valid)
2914  {
2915  int32 passes_delta = strategy_passes - prev_strategy_passes;
2916 
2917  strategy_delta = strategy_buf_id - prev_strategy_buf_id;
2918  strategy_delta += (long) passes_delta * NBuffers;
2919 
2920  Assert(strategy_delta >= 0);
2921 
2922  if ((int32) (next_passes - strategy_passes) > 0)
2923  {
2924  /* we're one pass ahead of the strategy point */
2925  bufs_to_lap = strategy_buf_id - next_to_clean;
2926 #ifdef BGW_DEBUG
2927  elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
2928  next_passes, next_to_clean,
2929  strategy_passes, strategy_buf_id,
2930  strategy_delta, bufs_to_lap);
2931 #endif
2932  }
2933  else if (next_passes == strategy_passes &&
2934  next_to_clean >= strategy_buf_id)
2935  {
2936  /* on same pass, but ahead or at least not behind */
2937  bufs_to_lap = NBuffers - (next_to_clean - strategy_buf_id);
2938 #ifdef BGW_DEBUG
2939  elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
2940  next_passes, next_to_clean,
2941  strategy_passes, strategy_buf_id,
2942  strategy_delta, bufs_to_lap);
2943 #endif
2944  }
2945  else
2946  {
2947  /*
2948  * We're behind, so skip forward to the strategy point and start
2949  * cleaning from there.
2950  */
2951 #ifdef BGW_DEBUG
2952  elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
2953  next_passes, next_to_clean,
2954  strategy_passes, strategy_buf_id,
2955  strategy_delta);
2956 #endif
2957  next_to_clean = strategy_buf_id;
2958  next_passes = strategy_passes;
2959  bufs_to_lap = NBuffers;
2960  }
2961  }
2962  else
2963  {
2964  /*
2965  * Initializing at startup or after LRU scanning had been off. Always
2966  * start at the strategy point.
2967  */
2968 #ifdef BGW_DEBUG
2969  elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
2970  strategy_passes, strategy_buf_id);
2971 #endif
2972  strategy_delta = 0;
2973  next_to_clean = strategy_buf_id;
2974  next_passes = strategy_passes;
2975  bufs_to_lap = NBuffers;
2976  }
2977 
2978  /* Update saved info for next time */
2979  prev_strategy_buf_id = strategy_buf_id;
2980  prev_strategy_passes = strategy_passes;
2981  saved_info_valid = true;
2982 
2983  /*
2984  * Compute how many buffers had to be scanned for each new allocation, ie,
2985  * 1/density of reusable buffers, and track a moving average of that.
2986  *
2987  * If the strategy point didn't move, we don't update the density estimate
2988  */
2989  if (strategy_delta > 0 && recent_alloc > 0)
2990  {
2991  scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
2992  smoothed_density += (scans_per_alloc - smoothed_density) /
2993  smoothing_samples;
2994  }
2995 
2996  /*
2997  * Estimate how many reusable buffers there are between the current
2998  * strategy point and where we've scanned ahead to, based on the smoothed
2999  * density estimate.
3000  */
3001  bufs_ahead = NBuffers - bufs_to_lap;
3002  reusable_buffers_est = (float) bufs_ahead / smoothed_density;
3003 
3004  /*
3005  * Track a moving average of recent buffer allocations. Here, rather than
3006  * a true average we want a fast-attack, slow-decline behavior: we
3007  * immediately follow any increase.
3008  */
3009  if (smoothed_alloc <= (float) recent_alloc)
3010  smoothed_alloc = recent_alloc;
3011  else
3012  smoothed_alloc += ((float) recent_alloc - smoothed_alloc) /
3013  smoothing_samples;
3014 
3015  /* Scale the estimate by a GUC to allow more aggressive tuning. */
3016  upcoming_alloc_est = (int) (smoothed_alloc * bgwriter_lru_multiplier);
3017 
3018  /*
3019  * If recent_alloc remains at zero for many cycles, smoothed_alloc will
3020  * eventually underflow to zero, and the underflows produce annoying
3021  * kernel warnings on some platforms. Once upcoming_alloc_est has gone to
3022  * zero, there's no point in tracking smaller and smaller values of
3023  * smoothed_alloc, so just reset it to exactly zero to avoid this
3024  * syndrome. It will pop back up as soon as recent_alloc increases.
3025  */
3026  if (upcoming_alloc_est == 0)
3027  smoothed_alloc = 0;
3028 
3029  /*
3030  * Even in cases where there's been little or no buffer allocation
3031  * activity, we want to make a small amount of progress through the buffer
3032  * cache so that as many reusable buffers as possible are clean after an
3033  * idle period.
3034  *
3035  * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many times
3036  * the BGW will be called during the scan_whole_pool time; slice the
3037  * buffer pool into that many sections.
3038  */
3039  min_scan_buffers = (int) (NBuffers / (scan_whole_pool_milliseconds / BgWriterDelay));
3040 
3041  if (upcoming_alloc_est < (min_scan_buffers + reusable_buffers_est))
3042  {
3043 #ifdef BGW_DEBUG
3044  elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
3045  upcoming_alloc_est, min_scan_buffers, reusable_buffers_est);
3046 #endif
3047  upcoming_alloc_est = min_scan_buffers + reusable_buffers_est;
3048  }
3049 
3050  /*
3051  * Now write out dirty reusable buffers, working forward from the
3052  * next_to_clean point, until we have lapped the strategy scan, or cleaned
3053  * enough buffers to match our estimate of the next cycle's allocation
3054  * requirements, or hit the bgwriter_lru_maxpages limit.
3055  */
3056 
3057  num_to_scan = bufs_to_lap;
3058  num_written = 0;
3059  reusable_buffers = reusable_buffers_est;
3060 
3061  /* Execute the LRU scan */
3062  while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
3063  {
3064  int sync_state = SyncOneBuffer(next_to_clean, true,
3065  wb_context);
3066 
3067  if (++next_to_clean >= NBuffers)
3068  {
3069  next_to_clean = 0;
3070  next_passes++;
3071  }
3072  num_to_scan--;
3073 
3074  if (sync_state & BUF_WRITTEN)
3075  {
3076  reusable_buffers++;
3077  if (++num_written >= bgwriter_lru_maxpages)
3078  {
3080  break;
3081  }
3082  }
3083  else if (sync_state & BUF_REUSABLE)
3084  reusable_buffers++;
3085  }
3086 
3087  PendingBgWriterStats.buf_written_clean += num_written;
3088 
3089 #ifdef BGW_DEBUG
3090  elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
3091  recent_alloc, smoothed_alloc, strategy_delta, bufs_ahead,
3092  smoothed_density, reusable_buffers_est, upcoming_alloc_est,
3093  bufs_to_lap - num_to_scan,
3094  num_written,
3095  reusable_buffers - reusable_buffers_est);
3096 #endif
3097 
3098  /*
3099  * Consider the above scan as being like a new allocation scan.
3100  * Characterize its density and update the smoothed one based on it. This
3101  * effectively halves the moving average period in cases where both the
3102  * strategy and the background writer are doing some useful scanning,
3103  * which is helpful because a long memory isn't as desirable on the
3104  * density estimates.
3105  */
3106  new_strategy_delta = bufs_to_lap - num_to_scan;
3107  new_recent_alloc = reusable_buffers - reusable_buffers_est;
3108  if (new_strategy_delta > 0 && new_recent_alloc > 0)
3109  {
3110  scans_per_alloc = (float) new_strategy_delta / (float) new_recent_alloc;
3111  smoothed_density += (scans_per_alloc - smoothed_density) /
3112  smoothing_samples;
3113 
3114 #ifdef BGW_DEBUG
3115  elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f",
3116  new_recent_alloc, new_strategy_delta,
3117  scans_per_alloc, smoothed_density);
3118 #endif
3119  }
3120 
3121  /* Return true if OK to hibernate */
3122  return (bufs_to_lap == 0 && recent_alloc == 0);
3123 }
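/*
 * Editor's worked example (not part of bufmgr.c), with assumed settings:
 * NBuffers = 16384, bgwriter_delay = 200ms, bgwriter_lru_multiplier = 2.0.
 * Suppose the clock sweep advanced strategy_delta = 500 buffers while
 * recent_alloc = 100 buffers were allocated; scans_per_alloc = 5.0 feeds the
 * density average.  With smoothed_density around 5.0 and bufs_ahead = 1000,
 * reusable_buffers_est = 200; with smoothed_alloc around 100 the scaled
 * estimate is upcoming_alloc_est = 200; and min_scan_buffers =
 * 16384 / (120000 / 200) = 27, which bumps upcoming_alloc_est to 227.  The
 * scan then cleans forward until it accounts for 227 reusable buffers, laps
 * the strategy point, or hits bgwriter_lru_maxpages.
 */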
3124 
3125 /*
3126  * SyncOneBuffer -- process a single buffer during syncing.
3127  *
3128  * If skip_recently_used is true, we don't write currently-pinned buffers, nor
3129  * buffers marked recently used, as these are not replacement candidates.
3130  *
3131  * Returns a bitmask containing the following flag bits:
3132  * BUF_WRITTEN: we wrote the buffer.
3133  * BUF_REUSABLE: buffer is available for replacement, ie, it has
3134  * pin count 0 and usage count 0.
3135  *
3136  * (BUF_WRITTEN could be set in error if FlushBuffer finds the buffer clean
3137  * after locking it, but we don't care all that much.)
3138  */
3139 static int
3140 SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
3141 {
3142  BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
3143  int result = 0;
3144  uint32 buf_state;
3145  BufferTag tag;
3146 
3147  /* Make sure we can handle the pin */
3150 
3151  /*
3152  * Check whether buffer needs writing.
3153  *
3154  * We can make this check without taking the buffer content lock so long
3155  * as we mark pages dirty in access methods *before* logging changes with
3156  * XLogInsert(): if someone marks the buffer dirty just after our check we
3157  * don't worry because our checkpoint.redo points before the log record for
3158  * the upcoming changes, so we are not required to write such a dirty buffer.
3159  */
3160  buf_state = LockBufHdr(bufHdr);
3161 
3162  if (BUF_STATE_GET_REFCOUNT(buf_state) == 0 &&
3163  BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
3164  {
3165  result |= BUF_REUSABLE;
3166  }
3167  else if (skip_recently_used)
3168  {
3169  /* Caller told us not to write recently-used buffers */
3170  UnlockBufHdr(bufHdr, buf_state);
3171  return result;
3172  }
3173 
3174  if (!(buf_state & BM_VALID) || !(buf_state & BM_DIRTY))
3175  {
3176  /* It's clean, so nothing to do */
3177  UnlockBufHdr(bufHdr, buf_state);
3178  return result;
3179  }
3180 
3181  /*
3182  * Pin it, share-lock it, write it. (FlushBuffer will do nothing if the
3183  * buffer is clean by the time we've locked it.)
3184  */
3185  PinBuffer_Locked(bufHdr);
3186  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
3187 
3188  FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
3189 
3190  LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
3191 
3192  tag = bufHdr->tag;
3193 
3194  UnpinBuffer(bufHdr);
3195 
3196  /*
3197  * SyncOneBuffer() is only called by checkpointer and bgwriter, so
3198  * IOContext will always be IOCONTEXT_NORMAL.
3199  */
3201 
3202  return result | BUF_WRITTEN;
3203 }
3204 
3205 /*
3206  * AtEOXact_Buffers - clean up at end of transaction.
3207  *
3208  * As of PostgreSQL 8.0, buffer pins should get released by the
3209  * ResourceOwner mechanism. This routine is just a debugging
3210  * cross-check that no pins remain.
3211  */
3212 void
3213 AtEOXact_Buffers(bool isCommit)
3214 {
3215  CheckForBufferLeaks();
3216 
3217  AtEOXact_LocalBuffers(isCommit);
3218 
3219  Assert(PrivateRefCountOverflowed == 0);
3220 }
3221 
3222 /*
3223  * Initialize access to shared buffer pool
3224  *
3225  * This is called during backend startup (whether standalone or under the
3226  * postmaster). It sets up for this backend's access to the already-existing
3227  * buffer pool.
3228  */
3229 void
3230 InitBufferPoolAccess(void)
3231 {
3232  HASHCTL hash_ctl;
3233 
3234  memset(&PrivateRefCountArray, 0, sizeof(PrivateRefCountArray));
3235 
3236  hash_ctl.keysize = sizeof(int32);
3237  hash_ctl.entrysize = sizeof(PrivateRefCountEntry);
3238 
3239  PrivateRefCountHash = hash_create("PrivateRefCount", 100, &hash_ctl,
3240  HASH_ELEM | HASH_BLOBS);
3241 
3242  /*
3243  * AtProcExit_Buffers needs LWLock access, and thereby has to be called at
3244  * the corresponding phase of backend shutdown.
3245  */
3246  Assert(MyProc != NULL);
3248 }
3249 
3250 /*
3251  * During backend exit, ensure that we released all shared-buffer locks and
3252  * assert that we have no remaining pins.
3253  */
3254 static void
3255 AtProcExit_Buffers(int code, Datum arg)
3256 {
3257  UnlockBuffers();
3258 
3260 
3261  /* localbuf.c needs a chance too */
3263 }
3264 
3265 /*
3266  * CheckForBufferLeaks - ensure this backend holds no buffer pins
3267  *
3268  * As of PostgreSQL 8.0, buffer pins should get released by the
3269  * ResourceOwner mechanism. This routine is just a debugging
3270  * cross-check that no pins remain.
3271  */
3272 static void
3273 CheckForBufferLeaks(void)
3274 {
3275 #ifdef USE_ASSERT_CHECKING
3276  int RefCountErrors = 0;
3278  int i;
3279  char *s;
3280 
3281  /* check the array */
3282  for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
3283  {
3285 
3286  if (res->buffer != InvalidBuffer)
3287  {
3288  s = DebugPrintBufferRefcount(res->buffer);
3289  elog(WARNING, "buffer refcount leak: %s", s);
3290  pfree(s);
3291 
3292  RefCountErrors++;
3293  }
3294  }
3295 
3296  /* if necessary search the hash */
3298  {
3299  HASH_SEQ_STATUS hstat;
3300 
3302  while ((res = (PrivateRefCountEntry *) hash_seq_search(&hstat)) != NULL)
3303  {
3304  s = DebugPrintBufferRefcount(res->buffer);
3305  elog(WARNING, "buffer refcount leak: %s", s);
3306  pfree(s);
3307  RefCountErrors++;
3308  }
3309  }
3310 
3311  Assert(RefCountErrors == 0);
3312 #endif
3313 }
3314 
3315 /*
3316  * Helper routine to issue warnings when a buffer is unexpectedly pinned
3317  */
3318 char *
3319 DebugPrintBufferRefcount(Buffer buffer)
3320 {
3321  BufferDesc *buf;
3322  int32 loccount;
3323  char *path;
3324  char *result;
3325  BackendId backend;
3326  uint32 buf_state;
3327 
3329  if (BufferIsLocal(buffer))
3330  {
3332  loccount = LocalRefCount[-buffer - 1];
3333  backend = MyBackendId;
3334  }
3335  else
3336  {
3338  loccount = GetPrivateRefCount(buffer);
3339  backend = InvalidBackendId;
3340  }
3341 
3342  /* theoretically we should lock the bufhdr here */
3343  path = relpathbackend(BufTagGetRelFileLocator(&buf->tag), backend,
3344  BufTagGetForkNum(&buf->tag));
3345  buf_state = pg_atomic_read_u32(&buf->state);
3346 
3347  result = psprintf("[%03d] (rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
3348  buffer, path,
3349  buf->tag.blockNum, buf_state & BUF_FLAG_MASK,
3350  BUF_STATE_GET_REFCOUNT(buf_state), loccount);
3351  pfree(path);
3352  return result;
3353 }
3354 
3355 /*
3356  * CheckPointBuffers
3357  *
3358  * Flush all dirty blocks in buffer pool to disk at checkpoint time.
3359  *
3360  * Note: temporary relations do not participate in checkpoints, so they don't
3361  * need to be flushed.
3362  */
3363 void
3364 CheckPointBuffers(int flags)
3365 {
3366  BufferSync(flags);
3367 }
3368 
3369 /*
3370  * BufferGetBlockNumber
3371  * Returns the block number associated with a buffer.
3372  *
3373  * Note:
3374  * Assumes that the buffer is valid and pinned, else the
3375  * value may be obsolete immediately...
3376  */
3377 BlockNumber
3378 BufferGetBlockNumber(Buffer buffer)
3379 {
3380  BufferDesc *bufHdr;
3381 
3383 
3384  if (BufferIsLocal(buffer))
3385  bufHdr = GetLocalBufferDescriptor(-buffer - 1);
3386  else
3387  bufHdr = GetBufferDescriptor(buffer - 1);
3388 
3389  /* pinned, so OK to read tag without spinlock */
3390  return bufHdr->tag.blockNum;
3391 }
3392 
3393 /*
3394  * BufferGetTag
3395  * Returns the relfilelocator, fork number and block number associated with
3396  * a buffer.
3397  */
3398 void
3399 BufferGetTag(Buffer buffer, RelFileLocator *rlocator, ForkNumber *forknum,
3400  BlockNumber *blknum)
3401 {
3402  BufferDesc *bufHdr;
3403 
3404  /* Do the same checks as BufferGetBlockNumber. */
3406 
3407  if (BufferIsLocal(buffer))
3408  bufHdr = GetLocalBufferDescriptor(-buffer - 1);
3409  else
3410  bufHdr = GetBufferDescriptor(buffer - 1);
3411 
3412  /* pinned, so OK to read tag without spinlock */
3413  *rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
3414  *forknum = BufTagGetForkNum(&bufHdr->tag);
3415  *blknum = bufHdr->tag.blockNum;
3416 }
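/*
 * Editor's sketch (not part of bufmgr.c): decomposing a pinned buffer into
 * its on-disk coordinates, e.g. for debug output.  The function name is
 * hypothetical.
 */
static void
example_report_buffer(Buffer buf)
{
	RelFileLocator rlocator;
	ForkNumber	forknum;
	BlockNumber blkno;

	BufferGetTag(buf, &rlocator, &forknum, &blkno);
	elog(DEBUG1, "buffer %d holds block %u of fork %d of relation %u",
		 buf, blkno, forknum, rlocator.relNumber);
}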
3417 
3418 /*
3419  * FlushBuffer
3420  * Physically write out a shared buffer.
3421  *
3422  * NOTE: this actually just passes the buffer contents to the kernel; the
3423  * real write to disk won't happen until the kernel feels like it. This
3424  * is okay from our point of view since we can redo the changes from WAL.
3425  * However, we will need to force the changes to disk via fsync before
3426  * we can checkpoint WAL.
3427  *
3428  * The caller must hold a pin on the buffer and have share-locked the
3429  * buffer contents. (Note: a share-lock does not prevent updates of
3430  * hint bits in the buffer, so the page could change while the write
3431  * is in progress, but we assume that that will not invalidate the data
3432  * written.)
3433  *
3434  * If the caller has an smgr reference for the buffer's relation, pass it
3435  * as the second parameter. If not, pass NULL.
3436  */
3437 static void
3438 FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
3439  IOContext io_context)
3440 {
3441  XLogRecPtr recptr;
3442  ErrorContextCallback errcallback;
3443  instr_time io_start;
3444  Block bufBlock;
3445  char *bufToWrite;
3446  uint32 buf_state;
3447 
3448  /*
3449  * Try to start an I/O operation. If StartBufferIO returns false, then
3450  * someone else flushed the buffer before we could, so we need not do
3451  * anything.
3452  */
3453  if (!StartBufferIO(buf, false))
3454  return;
3455 
3456  /* Setup error traceback support for ereport() */
3458  errcallback.arg = (void *) buf;
3459  errcallback.previous = error_context_stack;
3460  error_context_stack = &errcallback;
3461 
3462  /* Find smgr relation for buffer */
3463  if (reln == NULL)
3465 
3466  TRACE_POSTGRESQL_BUFFER_FLUSH_START(BufTagGetForkNum(&buf->tag),
3467  buf->tag.blockNum,
3469  reln->smgr_rlocator.locator.dbOid,
3471 
3472  buf_state = LockBufHdr(buf);
3473 
3474  /*
3475  * Run PageGetLSN while holding header lock, since we don't have the
3476  * buffer locked exclusively in all cases.
3477  */
3478  recptr = BufferGetLSN(buf);
3479 
3480  /* To check if block content changes while flushing. - vadim 01/17/97 */
3481  buf_state &= ~BM_JUST_DIRTIED;
3482  UnlockBufHdr(buf, buf_state);
3483 
3484  /*
3485  * Force XLOG flush up to buffer's LSN. This implements the basic WAL
3486  * rule that log updates must hit disk before any of the data-file changes
3487  * they describe do.
3488  *
3489  * However, this rule does not apply to unlogged relations, which will be
3490  * lost after a crash anyway. Most unlogged relation pages do not bear
3491  * LSNs since we never emit WAL records for them, and therefore flushing
3492  * up through the buffer LSN would be useless, but harmless. However,
3493  * GiST indexes use LSNs internally to track page-splits, and therefore
3494  * unlogged GiST pages bear "fake" LSNs generated by
3495  * GetFakeLSNForUnloggedRel. It is unlikely but possible that the fake
3496  * LSN counter could advance past the WAL insertion point; and if it did
3497  * happen, attempting to flush WAL through that location would fail, with
3498  * disastrous system-wide consequences. To make sure that can't happen,
3499  * skip the flush if the buffer isn't permanent.
3500  */
3501  if (buf_state & BM_PERMANENT)
3502  XLogFlush(recptr);
3503 
3504  /*
3505  * Now it's safe to write buffer to disk. Note that no one else should
3506  * have been able to write it while we were busy with log flushing because
3507  * only one process at a time can set the BM_IO_IN_PROGRESS bit.
3508  */
3509  bufBlock = BufHdrGetBlock(buf);
3510 
3511  /*
3512  * Update page checksum if desired. Since we have only shared lock on the
3513  * buffer, other processes might be updating hint bits in it, so we must
3514  * copy the page to private storage if we do checksumming.
3515  */
3516  bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
3517 
3519 
3520  /*
3521  * bufToWrite is either the shared buffer or a copy, as appropriate.
3522  */
3523  smgrwrite(reln,
3524  BufTagGetForkNum(&buf->tag),
3525  buf->tag.blockNum,
3526  bufToWrite,
3527  false);
3528 
3529  /*
3530  * When a strategy is in use, only flushes of dirty buffers already in the
3531  * strategy ring are counted as strategy writes (IOCONTEXT
3532  * [BULKREAD|BULKWRITE|VACUUM] IOOP_WRITE) for the purpose of IO
3533  * statistics tracking.
3534  *
3535  * If a shared buffer initially added to the ring must be flushed before
3536  * being used, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE.
3537  *
3538  * If a shared buffer which was added to the ring later because the
3539  * current strategy buffer is pinned or in use or because all strategy
3540  * buffers were dirty and rejected (for BAS_BULKREAD operations only)
3541  * requires flushing, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE
3542  * (from_ring will be false).
3543  *
3544  * When a strategy is not in use, the write can only be a "regular" write
3545  * of a dirty shared buffer (IOCONTEXT_NORMAL IOOP_WRITE).
3546  */
3548  IOOP_WRITE, io_start, 1);
3549 
3551 
3552  /*
3553  * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
3554  * end the BM_IO_IN_PROGRESS state.
3555  */
3556  TerminateBufferIO(buf, true, 0, true);
3557 
3558  TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(BufTagGetForkNum(&buf->tag),
3559  buf->tag.blockNum,
3561  reln->smgr_rlocator.locator.dbOid,
3563 
3564  /* Pop the error context stack */
3565  error_context_stack = errcallback.previous;
3566 }
3567 
3568 /*
3569  * RelationGetNumberOfBlocksInFork
3570  * Determines the current number of pages in the specified relation fork.
3571  *
3572  * Note that the accuracy of the result will depend on the details of the
3573  * relation's storage. For builtin AMs it'll be accurate, but for external AMs
3574  * it might not be.
3575  */
3576 BlockNumber
3577 RelationGetNumberOfBlocksInFork(Relation relation, ForkNumber forkNum)
3578 {
3579  if (RELKIND_HAS_TABLE_AM(relation->rd_rel->relkind))
3580  {
3581  /*
3582  * Not every table AM uses BLCKSZ wide fixed size blocks. Therefore
3583  * tableam returns the size in bytes - but for the purpose of this
3584  * routine, we want the number of blocks. Therefore divide, rounding
3585  * up.
3586  */
3587  uint64 szbytes;
3588 
3589  szbytes = table_relation_size(relation, forkNum);
3590 
3591  return (szbytes + (BLCKSZ - 1)) / BLCKSZ;
3592  }
3593  else if (RELKIND_HAS_STORAGE(relation->rd_rel->relkind))
3594  {
3595  return smgrnblocks(RelationGetSmgr(relation), forkNum);
3596  }
3597  else
3598  Assert(false);
3599 
3600  return 0; /* keep compiler quiet */
3601 }
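/*
 * Editor's note (not part of bufmgr.c): most callers reach this function via
 * the RelationGetNumberOfBlocks() macro in bufmgr.h, which simply passes
 * MAIN_FORKNUM.  A minimal sketch:
 */
static BlockNumber
example_main_fork_size(Relation rel)
{
	/* same as RelationGetNumberOfBlocksInFork(rel, MAIN_FORKNUM) */
	return RelationGetNumberOfBlocks(rel);
}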
3602 
3603 /*
3604  * BufferIsPermanent
3605  * Determines whether a buffer will potentially still be around after
3606  * a crash. Caller must hold a buffer pin.
3607  */
3608 bool
3609 BufferIsPermanent(Buffer buffer)
3610 {
3611  BufferDesc *bufHdr;
3612 
3613  /* Local buffers are used only for temp relations. */
3614  if (BufferIsLocal(buffer))
3615  return false;
3616 
3617  /* Make sure we've got a real buffer, and that we hold a pin on it. */
3620 
3621  /*
3622  * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
3623  * need not bother with the buffer header spinlock. Even if someone else
3624  * changes the buffer header state while we're doing this, the state is
3625  * changed atomically, so we'll read the old value or the new value, but
3626  * not random garbage.
3627  */
3628  bufHdr = GetBufferDescriptor(buffer - 1);
3629  return (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT) != 0;
3630 }
3631 
3632 /*
3633  * BufferGetLSNAtomic
3634  * Retrieves the LSN of the buffer atomically using a buffer header lock.
3635  * This is necessary for some callers who may not have an exclusive lock
3636  * on the buffer.
3637  */
3638 XLogRecPtr
3639 BufferGetLSNAtomic(Buffer buffer)
3640 {
3641  BufferDesc *bufHdr = GetBufferDescriptor(buffer - 1);
3642  char *page = BufferGetPage(buffer);
3643  XLogRecPtr lsn;
3644  uint32 buf_state;
3645 
3646  /*
3647  * If we don't need locking for correctness, fastpath out.
3648  */
3650  return PageGetLSN(page);
3651 
3652  /* Make sure we've got a real buffer, and that we hold a pin on it. */
3655 
3656  buf_state = LockBufHdr(bufHdr);
3657  lsn = PageGetLSN(page);
3658  UnlockBufHdr(bufHdr, buf_state);
3659 
3660  return lsn;
3661 }
3662 
3663 /* ---------------------------------------------------------------------
3664  * DropRelationBuffers
3665  *
3666  * This function removes from the buffer pool all the pages of the
3667  * specified relation forks that have block numbers >= firstDelBlock.
3668  * (In particular, with firstDelBlock = 0, all pages are removed.)
3669  * Dirty pages are simply dropped, without bothering to write them
3670  * out first. Therefore, this is NOT rollback-able, and so should be
3671  * used only with extreme caution!
3672  *
3673  * Currently, this is called only from smgr.c when the underlying file
3674  * is about to be deleted or truncated (firstDelBlock is needed for
3675  * the truncation case). The data in the affected pages would therefore
3676  * be deleted momentarily anyway, and there is no point in writing it.
3677  * It is the responsibility of higher-level code to ensure that the
3678  * deletion or truncation does not lose any data that could be needed
3679  * later. It is also the responsibility of higher-level code to ensure
3680  * that no other process could be trying to load more pages of the
3681  * relation into buffers.
3682  * --------------------------------------------------------------------
3683  */
3684 void
3685 DropRelationBuffers(SMgrRelation smgr_reln, ForkNumber *forkNum,
3686  int nforks, BlockNumber *firstDelBlock)
3687 {
3688  int i;
3689  int j;
3690  RelFileLocatorBackend rlocator;
3691  BlockNumber nForkBlock[MAX_FORKNUM];
3692  uint64 nBlocksToInvalidate = 0;
3693 
3694  rlocator = smgr_reln->smgr_rlocator;
3695 
3696  /* If it's a local relation, it's localbuf.c's problem. */
3697  if (RelFileLocatorBackendIsTemp(rlocator))
3698  {
3699  if (rlocator.backend == MyBackendId)
3700  {
3701  for (j = 0; j < nforks; j++)
3702  DropRelationLocalBuffers(rlocator.locator, forkNum[j],
3703  firstDelBlock[j]);
3704  }
3705  return;
3706  }
3707 
3708  /*
3709  * To remove all the pages of the specified relation forks from the buffer
3710  * pool, we need to scan the entire buffer pool, but we can optimize this
3711  * by finding the buffers from the BufMapping table, provided we know the
3712  * exact size of each fork of the relation. The exact size is required to
3713  * ensure that we don't leave behind any buffers for the relation being
3714  * dropped, as otherwise the background writer or checkpointer could PANIC
3715  * while flushing buffers corresponding to files that don't exist.
3716  *
3717  * To know the exact size, we rely on the size cached for each fork by us
3718  * during recovery, which limits the optimization to recovery and to
3719  * standbys, but we can easily extend it once we have a shared cache for
3720  * relation sizes.
3721  *
3722  * In recovery, we cache the value returned by the first lseek(SEEK_END)
3723  * and future writes keep the cached value up-to-date. See
3724  * smgrextend. It is possible that the value of the first lseek is smaller
3725  * than the actual number of existing blocks in the file due to buggy
3726  * Linux kernels that might not have accounted for the recent write. But
3727  * that should be fine because there must not be any buffers after that
3728  * file size.
3729  */
3730  for (i = 0; i < nforks; i++)
3731  {
3732  /* Get the number of blocks for a relation's fork */
3733  nForkBlock[i] = smgrnblocks_cached(smgr_reln, forkNum[i]);
3734 
3735  if (nForkBlock[i] == InvalidBlockNumber)
3736  {
3737  nBlocksToInvalidate = InvalidBlockNumber;
3738  break;
3739  }
3740 
3741  /* calculate the number of blocks to be invalidated */
3742  nBlocksToInvalidate += (nForkBlock[i] - firstDelBlock[i]);
3743  }
3744 
3745  /*
3746  * We apply the optimization iff the total number of blocks to invalidate
3747  * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
3748  */
3749  if (BlockNumberIsValid(nBlocksToInvalidate) &&
3750  nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
3751  {
3752  for (j = 0; j < nforks; j++)
3753  FindAndDropRelationBuffers(rlocator.locator, forkNum[j],
3754  nForkBlock[j], firstDelBlock[j]);
3755  return;
3756  }
3757 
3758  for (i = 0; i < NBuffers; i++)
3759  {
3760  BufferDesc *bufHdr = GetBufferDescriptor(i);
3761  uint32 buf_state;
3762 
3763  /*
3764  * We can make this a tad faster by prechecking the buffer tag before
3765  * we attempt to lock the buffer; this saves a lot of lock
3766  * acquisitions in typical cases. It should be safe because the
3767  * caller must have AccessExclusiveLock on the relation, or some other
3768  * reason to be certain that no one is loading new pages of the rel
3769  * into the buffer pool. (Otherwise we might well miss such pages
3770  * entirely.) Therefore, while the tag might be changing while we
3771  * look at it, it can't be changing *to* a value we care about, only
3772  * *away* from such a value. So false negatives are impossible, and
3773  * false positives are safe because we'll recheck after getting the
3774  * buffer lock.
3775  *
3776  * We could check forkNum and blockNum as well as the rlocator, but
3777  * the incremental win from doing so seems small.
3778  */
3779  if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator))
3780  continue;
3781 
3782  buf_state = LockBufHdr(bufHdr);
3783 
3784  for (j = 0; j < nforks; j++)
3785  {
3786  if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator) &&
3787  BufTagGetForkNum(&bufHdr->tag) == forkNum[j] &&
3788  bufHdr->tag.blockNum >= firstDelBlock[j])
3789  {
3790  InvalidateBuffer(bufHdr); /* releases spinlock */
3791  break;
3792  }
3793  }
3794  if (j >= nforks)
3795  UnlockBufHdr(bufHdr, buf_state);
3796  }
3797 }
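/*
 * Editor's illustrative note (not part of bufmgr.c): a worked example of the
 * BUF_DROP_FULL_SCAN_THRESHOLD cutoff used above.  The threshold is
 * NBuffers / 32, so with shared_buffers = 128MB (NBuffers = 16384 at the
 * default 8kB BLCKSZ) it is 512 blocks, i.e. 4MB.  If the cached fork sizes
 * are all known and the to-be-invalidated block counts total fewer than 512,
 * the buffers are dropped via per-block BufMapping lookups in
 * FindAndDropRelationBuffers(); otherwise we fall through to the full scan
 * of all 16384 buffer headers.
 */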
3798 
3799 /* ---------------------------------------------------------------------
3800  * DropRelationsAllBuffers
3801  *
3802  * This function removes from the buffer pool all the pages of all
3803  * forks of the specified relations. It's equivalent to calling
3804  * DropRelationBuffers once per fork per relation with firstDelBlock = 0.
3805  * --------------------------------------------------------------------
3806  */
3807 void
3808 DropRelationsAllBuffers(SMgrRelation *smgr_reln, int nlocators)
3809 {
3810  int i;
3811  int n = 0;
3812  SMgrRelation *rels;
3813  BlockNumber (*block)[MAX_FORKNUM + 1];
3814  uint64 nBlocksToInvalidate = 0;
3815  RelFileLocator *locators;
3816  bool cached = true;
3817  bool use_bsearch;
3818 
3819  if (nlocators == 0)
3820  return;
3821 
3822  rels = palloc(sizeof(SMgrRelation) * nlocators); /* non-local relations */
3823 
3824  /* If it's a local relation, it's localbuf.c's problem. */
3825  for (i = 0; i < nlocators; i++)
3826  {
3827  if (RelFileLocatorBackendIsTemp(smgr_reln[i]->smgr_rlocator))
3828  {
3829  if (smgr_reln[i]->smgr_rlocator.backend == MyBackendId)
3830  DropRelationAllLocalBuffers(smgr_reln[i]->smgr_rlocator.locator);
3831  }
3832  else
3833  rels[n++] = smgr_reln[i];
3834  }
3835 
3836  /*
3837  * If there are no non-local relations, then we're done. Release the
3838  * memory and return.
3839  */
3840  if (n == 0)
3841  {
3842  pfree(rels);
3843  return;
3844  }
3845 
3846  /*
3847  * This is used to remember the number of blocks for all the relation
3848  * forks.
3849  */
3850  block = (BlockNumber (*)[MAX_FORKNUM + 1])
3851  palloc(sizeof(BlockNumber) * n * (MAX_FORKNUM + 1));
3852 
3853  /*
3854  * We can avoid scanning the entire buffer pool if we know the exact size
3855  * of each of the given relation forks. See DropRelationBuffers.
3856  */
3857  for (i = 0; i < n && cached; i++)
3858  {
3859  for (int j = 0; j <= MAX_FORKNUM; j++)
3860  {
3861  /* Get the number of blocks for a relation's fork. */
3862  block[i][j] = smgrnblocks_cached(rels[i], j);
3863 
3864  /* We only need to consider the relation forks that exist. */
3865  if (block[i][j] == InvalidBlockNumber)
3866  {
3867  if (!smgrexists(rels[i], j))
3868  continue;
3869  cached = false;
3870  break;
3871  }
3872 
3873  /* calculate the total number of blocks to be invalidated */
3874  nBlocksToInvalidate += block[i][j];
3875  }
3876  }
3877 
3878  /*
3879  * We apply the optimization iff the total number of blocks to invalidate
3880  * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
3881  */
3882  if (cached && nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
3883  {
3884  for (i = 0; i < n; i++)
3885  {
3886  for (int j = 0; j <= MAX_FORKNUM; j++)
3887  {
3888  /* ignore relation forks that don't exist */
3889  if (!BlockNumberIsValid(block[i][j]))
3890  continue;
3891 
3892  /* drop all the buffers for a particular relation fork */
3893  FindAndDropRelationBuffers(rels[i]->smgr_rlocator.locator,
3894  j, block[i][j], 0);
3895  }
3896  }
3897 
3898  pfree(block);
3899  pfree(rels);
3900  return;
3901  }
3902 
3903  pfree(block);
3904  locators = palloc(sizeof(RelFileLocator) * n); /* non-local relations */
3905  for (i = 0; i < n; i++)
3906  locators[i] = rels[i]->smgr_rlocator.locator;
3907 
3908  /*
3909  * For a small number of relations to drop, just use a simple walk-through
3910  * to save the bsearch overhead. The threshold is more a guess than an
3911  * exactly determined value, as it depends on many factors (CPU and RAM
3912  * speeds, amount of shared buffers, etc.).
3913  */
3914  use_bsearch = n > RELS_BSEARCH_THRESHOLD;
3915 
3916  /* sort the list of rlocators if necessary */
3917  if (use_bsearch)
3918  qsort(locators, n, sizeof(RelFileLocator), rlocator_comparator);
3919 
3920  for (i = 0; i < NBuffers; i++)
3921  {
3922  RelFileLocator *rlocator = NULL;
3923  BufferDesc *bufHdr = GetBufferDescriptor(i);
3924  uint32 buf_state;
3925 
3926  /*
3927  * As in DropRelationBuffers, an unlocked precheck should be safe and
3928  * saves some cycles.
3929  */
3930 
3931  if (!use_bsearch)
3932  {
3933  int j;
3934 
3935  for (j = 0; j < n; j++)
3936  {
3937  if (BufTagMatchesRelFileLocator(&bufHdr->tag, &locators[j]))
3938  {
3939  rlocator = &locators[j];
3940  break;
3941  }
3942  }
3943  }
3944  else
3945  {
3946  RelFileLocator locator;
3947 
3948  locator = BufTagGetRelFileLocator(&bufHdr->tag);
3949  rlocator = bsearch((const void *) &(locator),
3950  locators, n, sizeof(RelFileLocator),
3952  }
3953 
3954  /* buffer doesn't belong to any of the given relfilelocators; skip it */
3955  if (rlocator == NULL)
3956  continue;
3957 
3958  buf_state = LockBufHdr(bufHdr);
3959  if (BufTagMatchesRelFileLocator(&bufHdr->tag, rlocator))
3960  InvalidateBuffer(bufHdr); /* releases spinlock */
3961  else
3962  UnlockBufHdr(bufHdr, buf_state);
3963  }
3964 
3965  pfree(locators);
3966  pfree(rels);
3967 }
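/*
 * Editor's illustrative note (not part of bufmgr.c): the linear-vs-bsearch
 * choice above is a per-buffer cost tradeoff.  With RELS_BSEARCH_THRESHOLD
 * at 20, dropping e.g. 5 relations costs at most 5 tag comparisons per
 * buffer in the linear path, while dropping 100 relations sorts the locator
 * array once and then does about log2(100) ~= 7 comparisons per buffer via
 * bsearch().
 */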
3968 
3969 /* ---------------------------------------------------------------------
3970  * FindAndDropRelationBuffers
3971  *
3972  * This function performs lookups in the BufMapping table and removes from
3973  * the buffer pool all the pages of the specified relation fork that have
3974  * block numbers >= firstDelBlock. (In particular, with firstDelBlock = 0, all
3975  * pages are removed.)
3976  * --------------------------------------------------------------------
3977  */
3978 static void
3979 FindAndDropRelationBuffers(RelFileLocator rlocator, ForkNumber forkNum,
3980  BlockNumber nForkBlock,
3981  BlockNumber firstDelBlock)
3982 {
3983  BlockNumber curBlock;
3984 
3985  for (curBlock = firstDelBlock; curBlock < nForkBlock; curBlock++)
3986  {
3987  uint32 bufHash; /* hash value for tag */
3988  BufferTag bufTag; /* identity of requested block */
3989  LWLock *bufPartitionLock; /* buffer partition lock for it */
3990  int buf_id;
3991  BufferDesc *bufHdr;
3992  uint32 buf_state;
3993 
3994  /* create a tag so we can lookup the buffer */
3995  InitBufferTag(&bufTag, &rlocator, forkNum, curBlock);
3996 
3997  /* determine its hash code and partition lock ID */
3998  bufHash = BufTableHashCode(&bufTag);
3999  bufPartitionLock = BufMappingPartitionLock(bufHash);
4000 
4001  /* Check that it is in the buffer pool. If not, do nothing. */
4002  LWLockAcquire(bufPartitionLock, LW_SHARED);
4003  buf_id = BufTableLookup(&bufTag, bufHash);
4004  LWLockRelease(bufPartitionLock);
4005 
4006  if (buf_id < 0)
4007  continue;
4008 
4009  bufHdr = GetBufferDescriptor(buf_id);
4010 
4011  /*
4012  * We need to lock the buffer header and recheck if the buffer is
4013  * still associated with the same block because the buffer could be
4014  * evicted by some other backend loading blocks for a different
4015  * relation after we release lock on the BufMapping table.
4016  */
4017  buf_state = LockBufHdr(bufHdr);
4018 
4019  if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator) &&
4020  BufTagGetForkNum(&bufHdr->tag) == forkNum &&
4021  bufHdr->tag.blockNum >= firstDelBlock)
4022  InvalidateBuffer(bufHdr); /* releases spinlock */
4023  else
4024  UnlockBufHdr(bufHdr, buf_state);
4025  }
4026 }
4027 
4028 /* ---------------------------------------------------------------------
4029  * DropDatabaseBuffers
4030  *
4031  * This function removes all the buffers in the buffer cache for a
4032  * particular database. Dirty pages are simply dropped, without
4033  * bothering to write them out first. This is used when we destroy a
4034  * database, to avoid trying to flush data to disk when the directory
4035  * tree no longer exists. Implementation is pretty similar to
4036  * DropRelationBuffers() which is for destroying just one relation.
4037  * --------------------------------------------------------------------
4038  */
4039 void
4040 DropDatabaseBuffers(Oid dbid)
4041 {
4042  int i;
4043 
4044  /*
4045  * We needn't consider local buffers, since by assumption the target
4046  * database isn't our own.
4047  */
4048 
4049  for (i = 0; i < NBuffers; i++)
4050  {
4051  BufferDesc *bufHdr = GetBufferDescriptor(i);
4052  uint32 buf_state;
4053 
4054  /*
4055  * As in DropRelationBuffers, an unlocked precheck should be safe and
4056  * saves some cycles.
4057  */
4058  if (bufHdr->tag.dbOid != dbid)
4059  continue;
4060 
4061  buf_state = LockBufHdr(bufHdr);
4062  if (bufHdr->tag.dbOid == dbid)
4063  InvalidateBuffer(bufHdr); /* releases spinlock */
4064  else
4065  UnlockBufHdr(bufHdr, buf_state);
4066  }
4067 }
4068 
4069 /* -----------------------------------------------------------------
4070  * PrintBufferDescs
4071  *
4072  * this function prints all the buffer descriptors, for debugging
4073  * use only.
4074  * -----------------------------------------------------------------
4075  */
4076 #ifdef NOT_USED
4077 void
4078 PrintBufferDescs(void)
4079 {
4080  int i;
4081 
4082  for (i = 0; i < NBuffers; ++i)
4083  {
4086 
4087  /* theoretically we should lock the bufhdr here */
4088  elog(LOG,
4089  "[%02d] (freeNext=%d, rel=%s, "
4090  "blockNum=%u, flags=0x%x, refcount=%u %d)",
4091  i, buf->freeNext,
4094  buf->tag.blockNum, buf->flags,
4095  buf->refcount, GetPrivateRefCount(b));
4096  }
4097 }
4098 #endif
4099 
4100 #ifdef NOT_USED
4101 void
4102 PrintPinnedBufs(void)
4103 {
4104  int i;
4105 
4106  for (i = 0; i < NBuffers; ++i)
4107  {
4110 
4111  if (GetPrivateRefCount(b) > 0)
4112  {
4113  /* theoretically we should lock the bufhdr here */
4114  elog(LOG,
4115  "[%02d] (freeNext=%d, rel=%s, "
4116  "blockNum=%u, flags=0x%x, refcount=%u %d)",
4117  i, buf->freeNext,
4119  BufTagGetForkNum(&buf->tag)),
4120  buf->tag.blockNum, buf->flags,
4121  buf->refcount, GetPrivateRefCount(b));
4122  }
4123  }
4124 }
4125 #endif
4126 
4127 /* ---------------------------------------------------------------------
4128  * FlushRelationBuffers
4129  *
4130  * This function writes all dirty pages of a relation out to disk
4131  * (or more accurately, out to kernel disk buffers), ensuring that the
4132  * kernel has an up-to-date view of the relation.
4133  *
4134  * Generally, the caller should be holding AccessExclusiveLock on the
4135  * target relation to ensure that no other backend is busy dirtying
4136  * more blocks of the relation; the effects can't be expected to last
4137  * after the lock is released.
4138  *
4139  * XXX currently it sequentially searches the buffer pool; this should be
4140  * changed to a more clever way of searching. This routine is not
4141  * used in any performance-critical code paths, so it's not worth
4142  * adding additional overhead to normal paths to make it go faster.
4143  * --------------------------------------------------------------------
4144  */
4145 void
4146 FlushRelationBuffers(Relation rel)
4147 {
4148  int i;
4149  BufferDesc *bufHdr;
4150  SMgrRelation srel = RelationGetSmgr(rel);
4151 
4152  if (RelationUsesLocalBuffers(rel))
4153  {
4154  for (i = 0; i < NLocBuffer; i++)
4155  {
4156  uint32 buf_state;
4157  instr_time io_start;
4158 
4159  bufHdr = GetLocalBufferDescriptor(i);
4160  if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
4161  ((buf_state = pg_atomic_read_u32(&bufHdr->state)) &
4162  (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
4163  {
4164  ErrorContextCallback errcallback;
4165  Page localpage;
4166 
4167  localpage = (char *) LocalBufHdrGetBlock(bufHdr);
4168 
4169  /* Setup error traceback support for ereport() */
4171  errcallback.arg = (void *) bufHdr;
4172  errcallback.previous = error_context_stack;
4173  error_context_stack = &errcallback;
4174 
4175  PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
4176 
4178 
4179  smgrwrite(srel,
4180  BufTagGetForkNum(&bufHdr->tag),
4181  bufHdr->tag.blockNum,
4182  localpage,
4183  false);
4184 
4187  io_start, 1);
4188 
4189  buf_state &= ~(BM_DIRTY | BM_JUST_DIRTIED);
4190  pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
4191 
4193 
4194  /* Pop the error context stack */
4195  error_context_stack = errcallback.previous;
4196  }
4197  }
4198 
4199  return;
4200  }
4201 
4202  for (i = 0; i < NBuffers; i++)
4203  {
4204  uint32 buf_state;
4205 
4206  bufHdr = GetBufferDescriptor(i);
4207 
4208  /*
4209  * As in DropRelationBuffers, an unlocked precheck should be safe and
4210  * saves some cycles.
4211  */
4212  if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator))
4213  continue;
4214 
4215  /* Make sure we can handle the pin */
4218 
4219  buf_state = LockBufHdr(bufHdr);
4220  if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
4221  (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
4222  {
4223  PinBuffer_Locked(bufHdr);
4227  UnpinBuffer(bufHdr);
4228  }
4229  else
4230  UnlockBufHdr(bufHdr, buf_state);
4231  }
4232 }
4233 
4234 /* ---------------------------------------------------------------------
4235  * FlushRelationsAllBuffers
4236  *
4237  * This function flushes out of the buffer pool all the pages of all
4238  * forks of the specified smgr relations. It's equivalent to calling
4239  * FlushRelationBuffers once per relation. The relations are assumed not
4240  * to use local buffers.
4241  * --------------------------------------------------------------------
4242  */
4243 void
4244 FlushRelationsAllBuffers(SMgrRelation *smgrs, int nrels)
4245 {
4246  int i;
4247  SMgrSortArray *srels;
4248  bool use_bsearch;
4249 
4250  if (nrels == 0)
4251  return;
4252 
4253  /* fill-in array for qsort */
4254  srels = palloc(sizeof(SMgrSortArray) * nrels);
4255 
4256  for (i = 0; i < nrels; i++)
4257  {
4258  Assert(!RelFileLocatorBackendIsTemp(smgrs[i]->smgr_rlocator));
4259 
4260  srels[i].rlocator = smgrs[i]->smgr_rlocator.locator;
4261  srels[i].srel = smgrs[i];
4262  }
4263 
4264  /*
4265  * Save the bsearch overhead for a small number of relations to sync. See
4266  * DropRelationsAllBuffers for details.
4267  */
4268  use_bsearch = nrels > RELS_BSEARCH_THRESHOLD;
4269 
4270  /* sort the list of SMgrRelations if necessary */
4271  if (use_bsearch)
4272  qsort(srels, nrels, sizeof(SMgrSortArray), rlocator_comparator);
4273 
4274  for (i = 0; i < NBuffers; i++)
4275  {
4276  SMgrSortArray *srelent = NULL;
4277  BufferDesc *bufHdr = GetBufferDescriptor(i);
4278  uint32 buf_state;
4279 
4280  /*
4281  * As in DropRelationBuffers, an unlocked precheck should be safe and
4282  * saves some cycles.
4283  */
4284 
4285  if (!use_bsearch)
4286  {
4287  int j;
4288 
4289  for (j = 0; j < nrels; j++)
4290  {
4291  if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srels[j].rlocator))
4292  {
4293  srelent = &srels[j];
4294  break;
4295  }
4296  }
4297  }
4298  else
4299  {
4300  RelFileLocator rlocator;
4301 
4302  rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
4303  srelent = bsearch((const void *) &(rlocator),
4304  srels, nrels, sizeof(SMgrSortArray),
4306  }
4307 
4308  /* buffer doesn't belong to any of the given relfilelocators; skip it */
4309  if (srelent == NULL)
4310  continue;
4311 
4312  /* Make sure we can handle the pin */
4315 
4316  buf_state = LockBufHdr(bufHdr);
4317  if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srelent->rlocator) &&
4318  (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
4319  {
4320  PinBuffer_Locked(bufHdr);
4322  FlushBuffer(bufHdr, srelent->srel, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
4324  UnpinBuffer(bufHdr);
4325  }
4326  else
4327  UnlockBufHdr(bufHdr, buf_state);
4328  }
4329 
4330  pfree(srels);
4331 }
4332 
4333 /* ---------------------------------------------------------------------
4334  * RelationCopyStorageUsingBuffer
4335  *
4336  * Copy fork's data using bufmgr. Same as RelationCopyStorage but instead
4337  * of using smgrread and smgrextend this will copy using bufmgr APIs.
4338  *
4339  * Refer comments atop CreateAndCopyRelationData() for details about
4340  * 'permanent' parameter.
4341  * --------------------------------------------------------------------
4342  */
4343 static void
4344 RelationCopyStorageUsingBuffer(RelFileLocator srclocator,
4345  RelFileLocator dstlocator,
4346  ForkNumber forkNum, bool permanent)
4347 {
4348  Buffer srcBuf;
4349  Buffer dstBuf;
4350  Page srcPage;
4351  Page dstPage;
4352  bool use_wal;
4353  BlockNumber nblocks;
4354  BlockNumber blkno;
4356  BufferAccessStrategy bstrategy_src;
4357  BufferAccessStrategy bstrategy_dst;
4358 
4359  /*
4360  * In general, we want to write WAL whenever wal_level > 'minimal', but we
4361  * can skip it when copying any fork of an unlogged relation other than
4362  * the init fork.
4363  */
4364  use_wal = XLogIsNeeded() && (permanent || forkNum == INIT_FORKNUM);
4365 
4366  /* Get number of blocks in the source relation. */
4367  nblocks = smgrnblocks(smgropen(srclocator, InvalidBackendId),
4368  forkNum);
4369 
4370  /* Nothing to copy; just return. */
4371  if (nblocks == 0)
4372  return;
4373 
4374  /*
4375  * Bulk-extend the destination relation to the same size as the source
4376  * relation before starting to copy block by block.
4377  */
4378  memset(buf.data, 0, BLCKSZ);
4379  smgrextend(smgropen(dstlocator, InvalidBackendId), forkNum, nblocks - 1,
4380  buf.data, true);
4381 
4382  /* This is a bulk operation, so use buffer access strategies. */
4383  bstrategy_src = GetAccessStrategy(BAS_BULKREAD);
4384  bstrategy_dst = GetAccessStrategy(BAS_BULKWRITE);
4385 
4386  /* Iterate over each block of the source relation file. */
4387  for (blkno = 0; blkno < nblocks; blkno++)
4388  {
4390 
4391  /* Read block from source relation. */
4392  srcBuf = ReadBufferWithoutRelcache(srclocator, forkNum, blkno,
4393  RBM_NORMAL, bstrategy_src,
4394  permanent);
4395  LockBuffer(srcBuf, BUFFER_LOCK_SHARE);
4396  srcPage = BufferGetPage(srcBuf);
4397 
4398  dstBuf = ReadBufferWithoutRelcache(dstlocator, forkNum, blkno,
4399  RBM_ZERO_AND_LOCK, bstrategy_dst,
4400  permanent);
4401  dstPage = BufferGetPage(dstBuf);
4402 
4404 
4405  /* Copy page data from the source to the destination. */
4406  memcpy(dstPage, srcPage, BLCKSZ);
4407  MarkBufferDirty(dstBuf);
4408 
4409  /* WAL-log the copied page. */
4410  if (use_wal)
4411  log_newpage_buffer(dstBuf, true);
4412 
4413  END_CRIT_SECTION();
4414 
4415  UnlockReleaseBuffer(dstBuf);
4416  UnlockReleaseBuffer(srcBuf);
4417  }
4418 
4419  FreeAccessStrategy(bstrategy_src);
4420  FreeAccessStrategy(bstrategy_dst);
4421 }
4422 
4423 /* ---------------------------------------------------------------------
4424  * CreateAndCopyRelationData
4425  *
4426  * Create destination relation storage and copy all forks from the
4427  * source relation to the destination.
4428  *
4429  * Pass permanent as true for permanent relations and false for
4430  * unlogged relations. Currently this API is not supported for
4431  * temporary relations.
4432  * --------------------------------------------------------------------
4433  */
4434 void
4435 CreateAndCopyRelationData(RelFileLocator src_rlocator,
4436  RelFileLocator dst_rlocator, bool permanent)
4437 {
4438  char relpersistence;
4439  SMgrRelation src_rel;
4440  SMgrRelation dst_rel;
4441 
4442  /* Set the relpersistence. */
4443  relpersistence = permanent ?
4444  RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED;
4445 
4446  src_rel = smgropen(src_rlocator, InvalidBackendId);
4447  dst_rel = smgropen(dst_rlocator, InvalidBackendId);
4448 
4449  /*
4450  * Create and copy all forks of the relation. During CREATE DATABASE we
4451  * have a separate cleanup mechanism that deletes the complete database
4452  * directory, so each individual relation doesn't need to be registered
4453  * for cleanup.
4454  */
4455  RelationCreateStorage(dst_rlocator, relpersistence, false);
4456 
4457  /* copy main fork. */
4458  RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, MAIN_FORKNUM,
4459  permanent);
4460 
4461  /* copy those extra forks that exist */
4462  for (ForkNumber forkNum = MAIN_FORKNUM + 1;
4463  forkNum <= MAX_FORKNUM; forkNum++)
4464  {
4465  if (smgrexists(src_rel, forkNum))
4466  {
4467  smgrcreate(dst_rel, forkNum, false);
4468 
4469  /*
4470  * WAL log creation if the relation is persistent, or this is the
4471  * init fork of an unlogged relation.
4472  */
4473  if (permanent || forkNum == INIT_FORKNUM)
4474  log_smgrcreate(&dst_rlocator, forkNum);
4475 
4476  /* Copy a fork's data, block by block. */
4477  RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, forkNum,
4478  permanent);
4479  }
4480  }
4481 }
4482 
4483 /* ---------------------------------------------------------------------
4484  * FlushDatabaseBuffers
4485  *
4486  * This function writes all dirty pages of a database out to disk
4487  * (or more accurately, out to kernel disk buffers), ensuring that the
4488  * kernel has an up-to-date view of the database.
4489  *
4490  * Generally, the caller should be holding an appropriate lock to ensure
4491  * no other backend is active in the target database; otherwise more
4492  * pages could get dirtied.
4493  *
4494  * Note we don't worry about flushing any pages of temporary relations.
4495  * It's assumed these wouldn't be interesting.
4496  * --------------------------------------------------------------------
4497  */
4498 void
4499 FlushDatabaseBuffers(Oid dbid)
4500 {
4501  int i;
4502  BufferDesc *bufHdr;
4503 
4504  for (i = 0; i < NBuffers; i++)
4505  {
4506  uint32 buf_state;
4507 
4508  bufHdr = GetBufferDescriptor(i);
4509 
4510  /*
4511  * As in DropRelationBuffers, an unlocked precheck should be safe and
4512  * saves some cycles.
4513  */
4514  if (bufHdr->tag.dbOid != dbid)
4515  continue;
4516 
4517  /* Make sure we can handle the pin */
4520 
4521  buf_state = LockBufHdr(bufHdr);
4522  if (bufHdr->tag.dbOid == dbid &&
4523  (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
4524  {
4525  PinBuffer_Locked(bufHdr);
4529  UnpinBuffer(bufHdr);
4530  }
4531  else
4532  UnlockBufHdr(bufHdr, buf_state);
4533  }
4534 }
4535 
4536 /*
4537  * Flush a buffer that was previously locked (in either share or exclusive
4538  * mode) and pinned, out to the OS.
4539  */
4540 void
4541 FlushOneBuffer(Buffer buffer)
4542 {
4543  BufferDesc *bufHdr;
4544 
4545  /* currently not needed, but no fundamental reason not to support */
4547 
4549 
4550  bufHdr = GetBufferDescriptor(buffer - 1);
4551 
4553 
4555 }
4556 
4557 /*
4558  * ReleaseBuffer -- release the pin on a buffer
4559  */
4560 void
4561 ReleaseBuffer(Buffer buffer)
4562 {
4563  if (!BufferIsValid(buffer))
4564  elog(ERROR, "bad buffer ID: %d", buffer);
4565 
4566  if (BufferIsLocal(buffer))
4568  else
4570 }
4571 
4572 /*
4573  * UnlockReleaseBuffer -- release the content lock and pin on a buffer
4574  *
4575  * This is just a shorthand for a common combination.
4576  */
4577 void
4578 UnlockReleaseBuffer(Buffer buffer)
4579 {
4582 }
4583 
4584 /*
4585  * IncrBufferRefCount
4586  * Increment the pin count on a buffer that we have *already* pinned
4587  * at least once.
4588  *
4589  * This function cannot be used on a buffer we do not have pinned,
4590  * because it doesn't change the shared buffer state.
4591  */
4592 void
4593 IncrBufferRefCount(Buffer buffer)
4594 {
4597  if (BufferIsLocal(buffer))
4598  LocalRefCount[-buffer - 1]++;
4599  else
4600  {
4601  PrivateRefCountEntry *ref;
4602 
4603  ref = GetPrivateRefCountEntry(buffer, true);
4604  Assert(ref != NULL);
4605  ref->refcount++;
4606  }
4608 }
4609 
4610 /*
4611  * MarkBufferDirtyHint
4612  *
4613  * Mark a buffer dirty for non-critical changes.
4614  *
4615  * This is essentially the same as MarkBufferDirty, except:
4616  *
4617  * 1. The caller does not write WAL; so if checksums are enabled, we may need
4618  * to write an XLOG_FPI_FOR_HINT WAL record to protect against torn pages.
4619  * 2. The caller might have only share-lock instead of exclusive-lock on the
4620  * buffer's content lock.
4621  * 3. This function does not guarantee that the buffer is always marked dirty
4622  * (due to a race condition), so it cannot be used for important changes.
4623  */
4624 void
4625 MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
4626 {
4627  BufferDesc *bufHdr;
4628  Page page = BufferGetPage(buffer);
4629 
4630  if (!BufferIsValid(buffer))
4631  elog(ERROR, "bad buffer ID: %d", buffer);
4632 
4633  if (BufferIsLocal(buffer))
4634  {
4636  return;
4637  }
4638 
4639  bufHdr = GetBufferDescriptor(buffer - 1);
4640 
4642  /* here, either share or exclusive lock is OK */
4644 
4645  /*
4646  * This routine might get called many times on the same page, if we are
4647  * making the first scan after commit of an xact that added/deleted many
4648  * tuples. So, be as quick as we can if the buffer is already dirty. We
4649  * do this by not acquiring spinlock if it looks like the status bits are
4650  * already set. Since we make this test unlocked, there's a chance we
4651  * might fail to notice that the flags have just been cleared, and failed
4652  * to reset them, due to memory-ordering issues. But since this function
4653  * is only intended to be used in cases where failing to write out the
4654  * data would be harmless anyway, it doesn't really matter.
4655  */
4656  if ((pg_atomic_read_u32(&bufHdr->state) & (BM_DIRTY | BM_JUST_DIRTIED)) !=
4658  {
4660  bool dirtied = false;
4661  bool delayChkptFlags = false;
4662  uint32 buf_state;
4663 
4664  /*
4665  * If we need to protect hint bit updates from torn writes, WAL-log a
4666  * full page image of the page. This full page image is only necessary
4667  * if the hint bit update is the first change to the page since the
4668  * last checkpoint.
4669  *
4670  * We don't check full_page_writes here because that logic is included
4671  * when we call XLogInsert() since the value changes dynamically.
4672  */
4673  if (XLogHintBitIsNeeded() &&
4674  (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT))
4675  {
4676  /*
4677  * If we must not write WAL, due to a relfilelocator-specific
4678  * condition or being in recovery, don't dirty the page. We can
4679  * set the hint, just not dirty the page as a result so the hint
4680  * is lost when we evict the page or shutdown.
4681  *
4682  * See src/backend/storage/page/README for longer discussion.
4683  */
4684  if (RecoveryInProgress() ||
4685  RelFileLocatorSkippingWAL(BufTagGetRelFileLocator(&bufHdr->tag)))
4686  return;
4687 
4688  /*
4689  * If the block is already dirty because we either made a change
4690  * or set a hint already, then we don't need to write a full page
4691  * image. Note that aggressive cleaning of blocks dirtied by hint
4692  * bit setting would increase the call rate. Bulk setting of hint
4693  * bits would reduce the call rate...
4694  *
4695  * We must issue the WAL record before we mark the buffer dirty.
4696  * Otherwise we might write the page before we write the WAL. That
4697  * causes a race condition, since a checkpoint might occur between
4698  * writing the WAL record and marking the buffer dirty. We solve
4699  * that with a kluge, but one that is already in use during
4700  * transaction commit to prevent race conditions. Basically, we
4701  * simply prevent the checkpoint WAL record from being written
4702  * until we have marked the buffer dirty. We don't start the
4703  * checkpoint flush until we have marked dirty, so our checkpoint
4704  * must flush the change to disk successfully or the checkpoint
4705  * never gets written, so crash recovery will fix things.
4706  *
4707  * It's possible we may enter here without an xid, so it is
4708  * essential that CreateCheckPoint waits for virtual transactions
4709  * rather than full transactionids.
4710  */
4713  delayChkptFlags = true;
4714  lsn = XLogSaveBufferForHint(buffer, buffer_std);
4715  }
4716 
4717  buf_state = LockBufHdr(bufHdr);
4718 
4719  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
4720 
4721  if (!(buf_state & BM_DIRTY))
4722  {
4723  dirtied = true; /* Means "will be dirtied by this action" */
4724 
4725  /*
4726  * Set the page LSN if we wrote a backup block. We aren't supposed
4727  * to set this when only holding a share lock but as long as we
4728  * serialise it somehow we're OK. We choose to set LSN while
4729  * holding the buffer header lock, which causes any reader of an
4730  * LSN who holds only a share lock to also obtain a buffer header
4731  * lock before using PageGetLSN(), which is enforced in
4732  * BufferGetLSNAtomic().
4733  *
4734  * If checksums are enabled, you might think we should reset the
4735  * checksum here. That will happen when the page is written
4736  * sometime later in this checkpoint cycle.
4737  */
4738  if (!XLogRecPtrIsInvalid(lsn))
4739  PageSetLSN(page, lsn);
4740  }
4741 
4742  buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
4743  UnlockBufHdr(bufHdr, buf_state);
4744 
4745  if (delayChkptFlags)
4747 
4748  if (dirtied)
4749  {
4750  VacuumPageDirty++;
4752  if (VacuumCostActive)
4754  }
4755  }
4756 }
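#ifdef NOT_USED
/*
 * Editor's illustrative sketch (not part of bufmgr.c): the typical calling
 * pattern for MarkBufferDirtyHint().  "update_hint_on_page" is a
 * hypothetical stand-in for whatever non-critical, non-WAL-logged change the
 * caller makes (e.g. setting a hint bit).  The caller holds a pin and at
 * least a share lock on the buffer's content lock.
 */
static void
set_page_hint(Buffer buffer)
{
	update_hint_on_page(BufferGetPage(buffer)); /* hypothetical helper */
	MarkBufferDirtyHint(buffer, true);	/* true: page has standard layout */
}
#endif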
4757 
4758 /*
4759  * Release buffer content locks for shared buffers.
4760  *
4761  * Used to clean up after errors.
4762  *
4763  * Currently, we can expect that lwlock.c's LWLockReleaseAll() took care
4764  * of releasing buffer content locks per se; the only thing we need to deal
4765  * with here is clearing any PIN_COUNT request that was in progress.
4766  */
4767 void
4768 UnlockBuffers(void)
4769 {
4771 
4772  if (buf)
4773  {
4774  uint32 buf_state;
4775 
4776  buf_state = LockBufHdr(buf);
4777 
4778  /*
4779  * Don't complain if flag bit not set; it could have been reset but we
4780  * got a cancel/die interrupt before getting the signal.
4781  */
4782  if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
4783  buf->wait_backend_pgprocno == MyProc->pgprocno)
4784  buf_state &= ~BM_PIN_COUNT_WAITER;
4785 
4786  UnlockBufHdr(buf, buf_state);
4787 
4788  PinCountWaitBuf = NULL;
4789  }
4790 }
4791 
4792 /*
4793  * Acquire or release the content_lock for the buffer.
4794  */
4795 void
4796 LockBuffer(Buffer buffer, int mode)
4797 {
4798  BufferDesc *buf;
4799 
4801  if (BufferIsLocal(buffer))
4802  return; /* local buffers need no lock */
4803 
4805 
4806  if (mode == BUFFER_LOCK_UNLOCK)
4808  else if (mode == BUFFER_LOCK_SHARE)
4810  else if (mode == BUFFER_LOCK_EXCLUSIVE)
4812  else
4813  elog(ERROR, "unrecognized buffer lock mode: %d", mode);
4814 }
4815 
4816 /*
4817  * Acquire the content_lock for the buffer, but only if we don't have to wait.
4818  *
4819  * This assumes the caller wants BUFFER_LOCK_EXCLUSIVE mode.
4820  */
4821 bool
4822 ConditionalLockBuffer(Buffer buffer)
4823 {
4824  BufferDesc *buf;
4825 
4827  if (BufferIsLocal(buffer))
4828  return true; /* act as though we got it */
4829 
4831 
4833  LW_EXCLUSIVE);
4834 }
4835 
4836 /*
4837  * Verify that this backend is pinning the buffer exactly once.
4838  *
4839  * NOTE: Like in BufferIsPinned(), what we check here is that *this* backend
4840  * holds a pin on the buffer. We do not care whether some other backend does.
4841  */
4842 void
4843 CheckBufferIsPinnedOnce(Buffer buffer)
4844 {
4845  if (BufferIsLocal(buffer))
4846  {
4847  if (LocalRefCount[-buffer - 1] != 1)
4848  elog(ERROR, "incorrect local pin count: %d",
4849  LocalRefCount[-buffer - 1]);
4850  }
4851  else
4852  {
4853  if (GetPrivateRefCount(buffer) != 1)
4854  elog(ERROR, "incorrect local pin count: %d",
4856  }
4857 }
4858 
4859 /*
4860  * LockBufferForCleanup - lock a buffer in preparation for deleting items
4861  *
4862  * Items may be deleted from a disk page only when the caller (a) holds an
4863  * exclusive lock on the buffer and (b) has observed that no other backend
4864  * holds a pin on the buffer. If there is a pin, then the other backend
4865  * might have a pointer into the buffer (for example, a heapscan reference
4866  * to an item --- see README for more details). It's OK if a pin is added
4867  * after the cleanup starts, however; the newly-arrived backend will be
4868  * unable to look at the page until we release the exclusive lock.
4869  *
4870  * To implement this protocol, a would-be deleter must pin the buffer and
4871  * then call LockBufferForCleanup(). LockBufferForCleanup() is similar to
4872  * LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE), except that it loops until
4873  * it has successfully observed pin count = 1.
4874  */
4875 void
4876 LockBufferForCleanup(Buffer buffer)
4877 {
4878  BufferDesc *bufHdr;
4879  TimestampTz waitStart = 0;
4880  bool waiting = false;
4881  bool logged_recovery_conflict = false;
4882 
4884  Assert(PinCountWaitBuf == NULL);
4885 
4887 
4888  /* Nobody else to wait for */
4889  if (BufferIsLocal(buffer))
4890  return;
4891 
4892  bufHdr = GetBufferDescriptor(buffer - 1);
4893 
4894  for (;;)
4895  {
4896  uint32 buf_state;
4897 
4898  /* Try to acquire lock */
4900  buf_state = LockBufHdr(bufHdr);
4901 
4902  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
4903  if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
4904  {
4905  /* Successfully acquired exclusive lock with pincount 1 */
4906  UnlockBufHdr(bufHdr, buf_state);
4907 
4908  /*
4909  * Emit the log message if recovery conflict on buffer pin was
4910  * resolved but the startup process waited longer than
4911  * deadlock_timeout for it.
4912  */
4913  if (logged_recovery_conflict)
4915  waitStart, GetCurrentTimestamp(),
4916  NULL, false);
4917 
4918  if (waiting)
4919  {
4920  /* reset ps display to remove the suffix if we added one */
4922  waiting = false;
4923  }
4924  return;
4925  }
4926  /* Failed, so mark myself as waiting for pincount 1 */
4927  if (buf_state & BM_PIN_COUNT_WAITER)
4928  {
4929  UnlockBufHdr(bufHdr, buf_state);
4931  elog(ERROR, "multiple backends attempting to wait for pincount 1");
4932  }
4934  PinCountWaitBuf = bufHdr;
4935  buf_state |= BM_PIN_COUNT_WAITER;
4936  UnlockBufHdr(bufHdr, buf_state);
4938 
4939  /* Wait to be signaled by UnpinBuffer() */
4940  if (InHotStandby)
4941  {
4942  if (!waiting)
4943  {
4944  /* adjust the process title to indicate that it's waiting */
4945  set_ps_display_suffix("waiting");
4946  waiting = true;
4947  }
4948 
4949  /*
4950  * Emit the log message if the startup process is waiting longer
4951  * than deadlock_timeout for recovery conflict on buffer pin.
4952  *
4953  * Skip this the first time through, because the startup process has
4954  * not started waiting yet in this case. So, the wait start
4955  * timestamp is set after this logic.
4956  */
4957  if (waitStart != 0 && !logged_recovery_conflict)
4958  {
4960 
4961  if (TimestampDifferenceExceeds(waitStart, now,
4962  DeadlockTimeout))
4963  {
4965  waitStart, now, NULL, true);
4966  logged_recovery_conflict = true;
4967  }
4968  }
4969 
4970  /*
4971  * Set the wait start timestamp if logging is enabled and first
4972  * time through.
4973  */
4974  if (log_recovery_conflict_waits && waitStart == 0)
4975  waitStart = GetCurrentTimestamp();
4976 
4977  /* Publish the bufid that Startup process waits on */
4979  /* Set alarm and then wait to be signaled by UnpinBuffer() */
4981  /* Reset the published bufid */
4983  }
4984  else
4985  ProcWaitForSignal(WAIT_EVENT_BUFFER_PIN);
4986 
4987  /*
4988  * Remove flag marking us as waiter. Normally this will not be set
4989  * anymore, but ProcWaitForSignal() can return for other signals as
4990  * well. We take care to only reset the flag if we're the waiter, as
4991  * theoretically another backend could have started waiting. That's
4992  * impossible with the current usages due to table level locking, but
4993  * better be safe.
4994  */
4995  buf_state = LockBufHdr(bufHdr);
4996  if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
4998  buf_state &= ~BM_PIN_COUNT_WAITER;
4999  UnlockBufHdr(bufHdr, buf_state);
5000 
5001  PinCountWaitBuf = NULL;
5002  /* Loop back and try again */
5003  }
5004 }
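#ifdef NOT_USED
/*
 * Editor's illustrative sketch (not part of bufmgr.c): the cleanup-lock
 * protocol described above.  The would-be deleter pins the buffer first and
 * then waits until it is the sole pinner; "rel" and "blkno" are placeholders
 * supplied by the caller.
 */
static void
cleanup_block(Relation rel, BlockNumber blkno)
{
	Buffer		buf = ReadBuffer(rel, blkno);

	LockBufferForCleanup(buf);	/* exclusive lock, observed pin count == 1 */
	/* ... prune/delete items on the page, WAL-logging the change ... */
	UnlockReleaseBuffer(buf);
}
#endif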
5005 
5006 /*
5007  * Check called from ProcessRecoveryConflictInterrupts() when Startup process
5008  * requests cancellation of all pin holders that are blocking it.
5009  */
5010 bool
5011 HoldingBufferPinThatDelaysRecovery(void)
5012 {
5013  int bufid = GetStartupBufferPinWaitBufId();
5014 
5015  /*
5016  * If we get woken slowly then it's possible that the Startup process was
5017  * already woken by other backends before we got here. It's also possible
5018  * that we got here via multiple interrupts or interrupts at inappropriate
5019  * times, so make sure we do nothing if the bufid is not set.
5020  */
5021  if (bufid < 0)
5022  return false;
5023 
5024  if (GetPrivateRefCount(bufid + 1) > 0)
5025  return true;
5026 
5027  return false;
5028 }
5029 
5030 /*
5031  * ConditionalLockBufferForCleanup - as above, but don't wait to get the lock
5032  *
5033  * We won't loop, but just check once to see if the pin count is OK. If
5034  * not, return false with no lock held.
5035  */
5036 bool
5037 ConditionalLockBufferForCleanup(Buffer buffer)
5038 {
5039  BufferDesc *bufHdr;
5040  uint32 buf_state,
5041  refcount;
5042 
5044 
5045  if (BufferIsLocal(buffer))
5046  {
5047  refcount = LocalRefCount[-buffer - 1];
5048  /* There should be exactly one pin */
5049  Assert(refcount > 0);
5050  if (refcount != 1)
5051  return false;
5052  /* Nobody else to wait for */
5053  return true;
5054  }
5055 
5056  /* There should be exactly one local pin */
5058  Assert(refcount);
5059  if (refcount != 1)
5060  return false;
5061 
5062  /* Try to acquire lock */
5064  return false;
5065 
5066  bufHdr = GetBufferDescriptor(buffer - 1);
5067  buf_state = LockBufHdr(bufHdr);
5068  refcount = BUF_STATE_GET_REFCOUNT(buf_state);
5069 
5070  Assert(refcount > 0);
5071  if (refcount == 1)
5072  {
5073  /* Successfully acquired exclusive lock with pincount 1 */
5074  UnlockBufHdr(bufHdr, buf_state);
5075  return true;
5076  }
5077 
5078  /* Failed, so release the lock */
5079  UnlockBufHdr(bufHdr, buf_state);
5081  return false;
5082 }
5083 
5084 /*
5085  * IsBufferCleanupOK - as above, but we already have the lock
5086  *
5087  * Check whether it's OK to perform cleanup on a buffer we've already
5088  * locked. If we observe that the pin count is 1, our exclusive lock
5089  * happens to be a cleanup lock, and we can proceed with anything that
5090  * would have been allowable had we sought a cleanup lock originally.
5091  */
5092 bool
5093 IsBufferCleanupOK(Buffer buffer)
5094 {
5095  BufferDesc *bufHdr;
5096  uint32 buf_state;
5097 
5099 
5100  if (BufferIsLocal(buffer))
5101  {
5102  /* There should be exactly one pin */
5103  if (LocalRefCount[-buffer - 1] != 1)
5104  return false;
5105  /* Nobody else to wait for */
5106  return true;
5107  }
5108 
5109  /* There should be exactly one local pin */
5110  if (GetPrivateRefCount(buffer) != 1)
5111  return false;
5112 
5113  bufHdr = GetBufferDescriptor(buffer - 1);
5114 
5115  /* caller must hold exclusive lock on buffer */
5117  LW_EXCLUSIVE));
5118 
5119  buf_state = LockBufHdr(bufHdr);
5120 
5121  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
5122  if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
5123  {
5124  /* pincount is OK. */
5125  UnlockBufHdr(bufHdr, buf_state);
5126  return true;
5127  }
5128 
5129  UnlockBufHdr(bufHdr, buf_state);
5130  return false;
5131 }
5132 
5133 
5134 /*
5135  * Functions for buffer I/O handling
5136  *
5137  * Note: We assume that nested buffer I/O never occurs.
5138  * i.e. at most one BM_IO_IN_PROGRESS bit is set per proc.
5139  *
5140  * Also note that these are used only for shared buffers, not local ones.
5141  */
5142 
5143 /*
5144  * WaitIO -- Block until the IO_IN_PROGRESS flag on 'buf' is cleared.
5145  */
5146 static void
5147 WaitIO(BufferDesc *buf)
5148 {
5150 
5152  for (;;)
5153  {
5154  uint32 buf_state;
5155 
5156  /*
5157  * It may not be necessary to acquire the spinlock to check the flag
5158  * here, but since this test is essential for correctness, we'd better
5159  * play it safe.
5160  */
5161  buf_state = LockBufHdr(buf);
5162  UnlockBufHdr(buf, buf_state);
5163 
5164  if (!(buf_state & BM_IO_IN_PROGRESS))
5165  break;
5166  ConditionVariableSleep(cv, WAIT_EVENT_BUFFER_IO);
5167  }
5169 }
5170 
5171 /*
5172  * StartBufferIO: begin I/O on this buffer
5173  * (Assumptions)
5174  * My process is executing no IO
5175  * The buffer is Pinned
5176  *
5177  * In some scenarios there are race conditions in which multiple backends
5178  * could attempt the same I/O operation concurrently. If someone else
5179  * has already started I/O on this buffer then we will block on the
5180  * I/O condition variable until he's done.
5181  *
5182  * Input operations are only attempted on buffers that are not BM_VALID,
5183  * and output operations only on buffers that are BM_VALID and BM_DIRTY,
5184  * so we can always tell if the work is already done.
5185  *
5186  * Returns true if we successfully marked the buffer as I/O busy,
5187  * false if someone else already did the work.
5188  */
5189 static bool
5190 StartBufferIO(BufferDesc *buf, bool forInput)
5191 {
5192  uint32 buf_state;
5193 
5195 
5196  for (;;)
5197  {
5198  buf_state = LockBufHdr(buf);
5199 
5200  if (!(buf_state & BM_IO_IN_PROGRESS))
5201  break;
5202  UnlockBufHdr(buf, buf_state);
5203  WaitIO(buf);
5204  }
5205 
5206  /* Once we get here, there is definitely no I/O active on this buffer */
5207 
5208  if (forInput ? (buf_state & BM_VALID) : !(buf_state & BM_DIRTY))
5209  {
5210  /* someone else already did the I/O */
5211  UnlockBufHdr(buf, buf_state);
5212  return false;
5213  }
5214 
5215  buf_state |= BM_IO_IN_PROGRESS;
5216  UnlockBufHdr(buf, buf_state);
5217 
5220 
5221  return true;
5222 }
5223 
5224 /*
5225  * TerminateBufferIO: release a buffer we were doing I/O on
5226  * (Assumptions)
5227  * My process is executing IO for the buffer
5228  * BM_IO_IN_PROGRESS bit is set for the buffer
5229  * The buffer is Pinned
5230  *
5231  * If clear_dirty is true and BM_JUST_DIRTIED is not set, we clear the
5232  * buffer's BM_DIRTY flag. This is appropriate when terminating a
5233  * successful write. The check on BM_JUST_DIRTIED is necessary to avoid
5234  * marking the buffer clean if it was re-dirtied while we were writing.
5235  *
5236  * set_flag_bits gets ORed into the buffer's flags. It must include
5237  * BM_IO_ERROR in a failure case. For successful completion it could
5238  * be 0, or BM_VALID if we just finished reading in the page.
5239  *
5240  * If forget_owner is true, we release the buffer I/O from the current
5241  * resource owner. (forget_owner=false is used when the resource owner itself
5242  * is being released)
5243  */
5244 static void
5245 TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits,
5246  bool forget_owner)
5247 {
5248  uint32 buf_state;
5249 
5250  buf_state = LockBufHdr(buf);
5251 
5252  Assert(buf_state & BM_IO_IN_PROGRESS);
5253 
5254  buf_state &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR);
5255  if (clear_dirty && !(buf_state & BM_JUST_DIRTIED))
5256  buf_state &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
5257 
5258  buf_state |= set_flag_bits;
5259  UnlockBufHdr(buf, buf_state);
5260 
5261  if (forget_owner)
5264 
5266 }
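#ifdef NOT_USED
/*
 * Editor's illustrative sketch (not part of bufmgr.c): how StartBufferIO()
 * and TerminateBufferIO() above pair up when reading a page in.  The actual
 * smgrread() step is elided; "read_block_from_disk" is a hypothetical
 * placeholder.
 */
static void
read_page_with_io_protocol(BufferDesc *bufHdr)
{
	if (StartBufferIO(bufHdr, true))
	{
		/* we won the right to perform the input */
		read_block_from_disk(bufHdr);	/* hypothetical helper */
		TerminateBufferIO(bufHdr, false, BM_VALID, true);
	}
	/* else some other backend completed the I/O; the buffer is valid */
}
#endif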
5267 
5268 /*
5269  * AbortBufferIO: Clean up active buffer I/O after an error.
5270  *
5271  * All LWLocks we might have held have been released,
5272  * but we haven't yet released buffer pins, so the buffer is still pinned.
5273  *
5274  * If I/O was in progress, we always set BM_IO_ERROR, even though it's
5275  * possible the error condition wasn't related to the I/O.
5276  *
5277  * Note: this does not remove the buffer I/O from the resource owner.
5278  * That's correct when we're releasing the whole resource owner, but
5279  * beware if you use this in other contexts.
5280  */
5281 static void
5282 AbortBufferIO(Buffer buffer)
5283 {
5284  BufferDesc *buf_hdr = GetBufferDescriptor(buffer - 1);
5285  uint32 buf_state;
5286 
5287  buf_state = LockBufHdr(buf_hdr);
5288  Assert(buf_state & (BM_IO_IN_PROGRESS | BM_TAG_VALID));
5289 
5290  if (!(buf_state & BM_VALID))
5291  {
5292  Assert(!(buf_state & BM_DIRTY));
5293  UnlockBufHdr(buf_hdr, buf_state);
5294  }
5295  else
5296  {
5297  Assert(buf_state & BM_DIRTY);
5298  UnlockBufHdr(buf_hdr, buf_state);
5299 
5300  /* Issue notice if this is not the first failure... */
5301  if (buf_state & BM_IO_ERROR)
5302  {
5303  /* Buffer is pinned, so we can read tag without spinlock */
5304  char *path;
5305 
5306  path = relpathperm(BufTagGetRelFileLocator(&buf_hdr->tag),
5307  BufTagGetForkNum(&buf_hdr->tag));
5308  ereport(WARNING,
5309  (errcode(ERRCODE_IO_ERROR),
5310  errmsg("could not write block %u of %s",
5311  buf_hdr->tag.blockNum, path),
5312  errdetail("Multiple failures --- write error might be permanent.")));
5313  pfree(path);
5314  }
5315  }
5316 
5317  TerminateBufferIO(buf_hdr, false, BM_IO_ERROR, false);
5318 }
5319 
5320 /*
5321  * Error context callback for errors occurring during shared buffer writes.
5322  */
5323 static void
5324 shared_buffer_write_error_callback(void *arg)
5325 {
5326  BufferDesc *bufHdr = (BufferDesc *) arg;
5327 
5328  /* Buffer is pinned, so we can read the tag without locking the spinlock */
5329  if (bufHdr != NULL)
5330  {
5331  char *path = relpathperm(BufTagGetRelFileLocator(&bufHdr->tag),
5332  BufTagGetForkNum(&bufHdr->tag));
5333 
5334  errcontext("writing block %u of relation %s",
5335  bufHdr->tag.blockNum, path);
5336  pfree(path);
5337  }
5338 }
5339 
5340 /*
5341  * Error context callback for errors occurring during local buffer writes.
5342  */
5343 static void
5344 local_buffer_write_error_callback(void *arg)
5345 {
5346  BufferDesc *bufHdr = (BufferDesc *) arg;
5347 
5348  if (bufHdr != NULL)
5349  {
5350  char *path = relpathbackend(BufTagGetRelFileLocator(&bufHdr->tag),
5351  MyBackendId,
5352  BufTagGetForkNum(&bufHdr->tag));
5353 
5354  errcontext("writing block %u of relation %s",
5355  bufHdr->tag.blockNum, path);
5356  pfree(path);
5357  }
5358 }
5359 
5360 /*
5361  * RelFileLocator qsort/bsearch comparator; see RelFileLocatorEquals.
5362  */
5363 static int
5364 rlocator_comparator(const void *p1, const void *p2)
5365 {
5366  RelFileLocator n1 = *(const RelFileLocator *) p1;
5367  RelFileLocator n2 = *(const RelFileLocator *) p2;
5368 
5369  if (n1.relNumber < n2.relNumber)
5370  return -1;
5371  else if (n1.relNumber > n2.relNumber)
5372  return 1;
5373 
5374  if (n1.dbOid < n2.dbOid)
5375  return -1;
5376  else if (n1.dbOid > n2.dbOid)
5377  return 1;
5378 
5379  if (n1.spcOid < n2.spcOid)
5380  return -1;
5381  else if (n1.spcOid > n2.spcOid)
5382  return 1;
5383  else
5384  return 0;
5385 }
5386 
5387 /*
5388  * Lock buffer header - set BM_LOCKED in buffer state.
5389  */
5390 uint32
5391 LockBufHdr(BufferDesc *desc)
5392 {
5393  SpinDelayStatus delayStatus;
5394  uint32 old_buf_state;
5395 
5397 
5398  init_local_spin_delay(&delayStatus);
5399 
5400  while (true)
5401  {
5402  /* set BM_LOCKED flag */
5403  old_buf_state = pg_atomic_fetch_or_u32(&desc->state, BM_LOCKED);
5404  /* if it wasn't set before we're OK */
5405  if (!(old_buf_state & BM_LOCKED))
5406  break;
5407  perform_spin_delay(&delayStatus);
5408  }
5409  finish_spin_delay(&delayStatus);
5410  return old_buf_state | BM_LOCKED;
5411 }
5412 
5413 /*
5414  * Wait until the BM_LOCKED flag isn't set anymore and return the buffer's
5415  * state at that point.
5416  *
5417  * Obviously the buffer could be locked by the time the value is returned, so
5418  * this is primarily useful in CAS style loops.
5419  */
5420 static uint32
5421 WaitBufHdrUnlocked(BufferDesc *buf)
5422 {
5423  SpinDelayStatus delayStatus;
5424  uint32 buf_state;
5425 
5426  init_local_spin_delay(&delayStatus);
5427 
5428  buf_state = pg_atomic_read_u32(&buf->state);
5429 
5430  while (buf_state & BM_LOCKED)
5431  {
5432  perform_spin_delay(&delayStatus);
5433  buf_state = pg_atomic_read_u32(&buf->state);
5434  }
5435 
5436  finish_spin_delay(&delayStatus);
5437 
5438  return buf_state;
5439 }
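#ifdef NOT_USED
/*
 * Editor's illustrative sketch (not part of bufmgr.c): the kind of CAS-style
 * loop the comment above refers to.  Instead of taking the header spinlock,
 * the caller retries a compare-and-exchange, using WaitBufHdrUnlocked() to
 * avoid spinning on a header someone else holds locked.  Incrementing the
 * refcount is shown purely as the example modification.
 */
static void
cas_style_refcount_bump(BufferDesc *buf)
{
	uint32		old_buf_state = pg_atomic_read_u32(&buf->state);
	uint32		buf_state;

	for (;;)
	{
		if (old_buf_state & BM_LOCKED)
			old_buf_state = WaitBufHdrUnlocked(buf);

		buf_state = old_buf_state + BUF_REFCOUNT_ONE;

		if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
										   buf_state))
			break;				/* success; old_buf_state was still current */
	}
}
#endif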
5440 
5441 /*
5442  * BufferTag comparator.
5443  */
5444 static inline int
5445 buffertag_comparator(const BufferTag *ba, const BufferTag *bb)
5446 {
5447  int ret;
5448  RelFileLocator rlocatora;
5449  RelFileLocator rlocatorb;
5450 
5451  rlocatora = BufTagGetRelFileLocator(ba);
5452  rlocatorb = BufTagGetRelFileLocator(bb);
5453 
5454  ret = rlocator_comparator(&rlocatora, &rlocatorb);
5455 
5456  if (ret != 0)
5457  return ret;
5458 
5459  if (BufTagGetForkNum(ba) < BufTagGetForkNum(bb))
5460  return -1;
5461  if (BufTagGetForkNum(ba) > BufTagGetForkNum(bb))
5462  return 1;
5463 
5464  if (ba->blockNum < bb->blockNum)
5465  return -1;
5466  if (ba->blockNum > bb->blockNum)
5467  return 1;
5468 
5469  return 0;
5470 }
5471 
5472 /*
5473  * Comparator determining the writeout order in a checkpoint.
5474  *
5475  * It is important that tablespaces are compared first; the logic balancing
5476  * writes between tablespaces relies on it.
5477  */
5478 static inline int
5479 ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b)
5480 {
5481  /* compare tablespace */
5482  if (a->tsId < b->tsId)
5483  return -1;
5484  else if (a->tsId > b->tsId)
5485  return 1;
5486  /* compare relation */
5487  if (a->relNumber < b->relNumber)
5488  return -1;
5489  else if (a->relNumber > b->relNumber)
5490  return 1;
5491  /* compare fork */
5492  else if (a->forkNum < b->forkNum)
5493  return -1;
5494  else if (a->forkNum > b->forkNum)
5495  return 1;
5496  /* compare block number */
5497  else if (a->blockNum < b->blockNum)
5498  return -1;
5499  else if (a->blockNum > b->blockNum)
5500  return 1;
5501  /* equal page IDs are unlikely, but not impossible */
5502  return 0;
5503 }
5504 
5505 /*
5506  * Comparator for a Min-Heap over the per-tablespace checkpoint completion
5507  * progress.
5508  */
5509 static int
5510 ts_ckpt_progress_comparator(Datum a, Datum b, void *arg)
5511 {
5512  CkptTsStatus *sa = (CkptTsStatus *) a;
5513  CkptTsStatus *sb = (CkptTsStatus *) b;
5514 
5515  /* we want a min-heap, so return 1 if a < b */
5516  if (sa->progress < sb->progress)
5517  return 1;
5518  else if (sa->progress == sb->progress)
5519  return 0;
5520  else
5521  return -1;
5522 }
5523 
5524 /*
5525  * Initialize a writeback context, discarding potential previous state.
5526  *
5527  * *max_pending is a pointer instead of an immediate value, so the coalesce
5528  * limits can easily be changed by the GUC mechanism, and so calling code does
5529  * not have to check the current configuration. A value of 0 means that no
5530  * writeback control will be performed.
5531  */
5532 void
5533 WritebackContextInit(WritebackContext *context, int *max_pending)
5534 {
5535  Assert(*max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
5536 
5537  context->max_pending = max_pending;
5538  context->nr_pending = 0;
5539 }
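/*
 * Editor's illustrative sketch (not part of bufmgr.c): typical
 * initialization, assuming the checkpoint_flush_after GUC variable.  Because
 * the context stores the pointer, later changes to the GUC take effect
 * without re-initializing the context.
 *
 *     WritebackContext wb_context;
 *
 *     WritebackContextInit(&wb_context, &checkpoint_flush_after);
 */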
5540 
5541 /*
5542  * Add buffer to list of pending writeback requests.
5543  */
5544 void
5545 ScheduleBufferTagForWriteback(WritebackContext *wb_context, IOContext io_context,
5546  BufferTag *tag)
5547 {
5548  PendingWriteback *pending;
5549 
5551  return;
5552 
5553  /*
5554  * Add buffer to the pending writeback array, unless writeback control is
5555  * disabled.
5556  */
5557  if (*wb_context->max_pending > 0)
5558  {
5560 
5561  pending = &wb_context->pending_writebacks[wb_context->nr_pending++];
5562 
5563  pending->tag = *tag;
5564  }
5565 
5566  /*
5567  * Perform pending flushes if the writeback limit is exceeded. This
5568  * includes the case where previously an item has been added, but control
5569  * is now disabled.
5570  */
5571  if (wb_context->nr_pending >= *wb_context->max_pending)
5572  IssuePendingWritebacks(wb_context, io_context);
5573 }
5574 
5575 #define ST_SORT sort_pending_writebacks
5576 #define ST_ELEMENT_TYPE PendingWriteback
5577 #define ST_COMPARE(a, b) buffertag_comparator(&a->tag, &b->tag)
5578 #define ST_SCOPE static
5579 #define ST_DEFINE
5580 #include <lib/sort_template.h>
5581 
5582 /*
5583  * Issue all pending writeback requests, previously scheduled with
5584  * ScheduleBufferTagForWriteback, to the OS.
5585  *
5586  * Because this is only used to improve the OS's I/O scheduling, we try to
5587  * never error out - it's just a hint.
5588  */
5589 void
5590 IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context)
5591 {
5592  instr_time io_start;
5593  int i;
5594 
5595  if (wb_context->nr_pending == 0)
5596  return;
5597 
5598  /*
5599  * Executing the writes in-order can make them a lot faster, and allows us
5600  * to merge writeback requests for consecutive blocks into larger writebacks.
5601  */
5602  sort_pending_writebacks(wb_context->pending_writebacks,
5603  wb_context->nr_pending);
5604 
5606 
5607  /*
5608  * Coalesce neighbouring writes, but nothing else. For that we iterate
5609  * through the now-sorted array of pending flushes, and look forward to
5610  * find all neighbouring (or identical) writes.
5611  */
5612  for (i = 0; i < wb_context->nr_pending; i++)
5613  {
5616  SMgrRelation reln;
5617  int ahead;
5618  BufferTag tag;
5619  RelFileLocator currlocator;
5620  Size nblocks = 1;
5621 
5622  cur = &wb_context->pending_writebacks[i];
5623  tag = cur->tag;
5624  currlocator = BufTagGetRelFileLocator(&tag);
5625 
5626  /*
5627  * Peek ahead, into following writeback requests, to see if they can
5628  * be combined with the current one.
5629  */
5630  for (ahead = 0; i + ahead + 1 < wb_context->nr_pending; ahead++)
5631  {
5632 
5633  next = &wb_context->pending_writebacks[i + ahead + 1];
5634 
5635  /* different file, stop */
5636  if (!RelFileLocatorEquals(currlocator,
5637  BufTagGetRelFileLocator(&next->tag)) ||
5638  BufTagGetForkNum(&cur->tag) != BufTagGetForkNum(&next->tag))
5639  break;
5640 
5641  /* ok, block queued twice, skip */
5642  if (cur->tag.blockNum == next->tag.blockNum)
5643  continue;
5644 
5645  /* only merge consecutive writes */
5646  if (cur->tag.blockNum + 1 != next->tag.blockNum)
5647  break;
5648 
5649  nblocks++;
5650  cur = next;
5651  }
5652 
5653  i += ahead;
5654 
5655  /* and finally tell the kernel to write the data to storage */
5656  reln = smgropen(currlocator, InvalidBackendId);
5657  smgrwriteback(reln, BufTagGetForkNum(&tag), tag.blockNum, nblocks);
5658  }
5659 
5660  /*
5661  * Assume that writeback requests are only issued for buffers containing
5662  * blocks of permanent relations.
5663  */
5665  IOOP_WRITEBACK, io_start, wb_context->nr_pending);
5666 
5667  wb_context->nr_pending = 0;
5668 }
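/*
 * Editor's illustrative note (not part of bufmgr.c): a worked example of the
 * coalescing loop above.  If the sorted pending array contains blocks 17,
 * 18, 18 and 19 of the same relation fork, the duplicate 18 is skipped and
 * the run is merged into a single smgrwriteback() request for 3 blocks
 * starting at block 17.
 */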
5669 
5670 /* ResourceOwner callbacks */
5671 
5672 static void
5673 ResOwnerReleaseBufferIO(Datum res)
5674 {
5676 
5678 }
5679 
5680 static char *
5681 ResOwnerPrintBufferIO(Datum res)
5682 {
5684 
5685  return psprintf("lost track of buffer IO on buffer %d", buffer);
5686 }
5687 
5688 static void
5689 ResOwnerReleaseBufferPin(Datum res)
5690 {
5692 
5693  /* Like ReleaseBuffer, but don't call ResourceOwnerForgetBuffer */
5694  if (!BufferIsValid(buffer))
5695  elog(ERROR, "bad buffer ID: %d", buffer);
5696 
5697  if (BufferIsLocal(buffer))
5699  else
5701 }
5702 
5703 static char *
5704 ResOwnerPrintBufferPin(Datum res)
5705 {
5707 }
static bool pg_atomic_compare_exchange_u32(volatile pg_atomic_uint32 *ptr, uint32 *expected, uint32 newval)
Definition: atomics.h:306
static uint32 pg_atomic_fetch_or_u32(volatile pg_atomic_uint32 *ptr, uint32 or_)
Definition: atomics.h:367
static void pg_atomic_unlocked_write_u32(volatile pg_atomic_uint32 *ptr, uint32 val)
Definition: atomics.h:272
static uint32 pg_atomic_read_u32(volatile pg_atomic_uint32 *ptr)
Definition: atomics.h:236
bool TimestampDifferenceExceeds(TimestampTz start_time, TimestampTz stop_time, int msec)
Definition: timestamp.c:1791
TimestampTz GetCurrentTimestamp(void)
Definition: timestamp.c:1655
Datum now(PG_FUNCTION_ARGS)
Definition: timestamp.c:1619
int BackendId
Definition: backendid.h:21
#define InvalidBackendId
Definition: backendid.h:23
int BgWriterDelay
Definition: bgwriter.c:61
void binaryheap_build(binaryheap *heap)
Definition: binaryheap.c:138
void binaryheap_replace_first(binaryheap *heap, bh_node_type d)
Definition: binaryheap.c:255
bh_node_type binaryheap_first(binaryheap *heap)
Definition: binaryheap.c:177
bh_node_type binaryheap_remove_first(binaryheap *heap)
Definition: binaryheap.c:192
binaryheap * binaryheap_allocate(int capacity, binaryheap_comparator compare, void *arg)
Definition: binaryheap.c:39
void binaryheap_free(binaryheap *heap)
Definition: binaryheap.c:75
void binaryheap_add_unordered(binaryheap *heap, bh_node_type d)
Definition: binaryheap.c:116
#define binaryheap_empty(h)
Definition: binaryheap.h:65
uint32 BlockNumber
Definition: block.h:31
#define InvalidBlockNumber
Definition: block.h:33
static bool BlockNumberIsValid(BlockNumber blockNumber)
Definition: block.h:71
#define MaxBlockNumber
Definition: block.h:35
static int32 next
Definition: blutils.c:221
int Buffer
Definition: buf.h:23
#define InvalidBuffer
Definition: buf.h:25
#define BufferIsLocal(buffer)
Definition: buf.h:37
CkptSortItem * CkptBufferIds
Definition: buf_init.c:25
WritebackContext BackendWritebackContext
Definition: buf_init.c:24
#define BM_MAX_USAGE_COUNT
Definition: buf_internals.h:78
static void InitBufferTag(BufferTag *tag, const RelFileLocator *rlocator, ForkNumber forkNum, BlockNumber blockNum)