1 /*-------------------------------------------------------------------------
2  *
3  * bufmgr.c
4  * buffer manager interface routines
5  *
6  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/storage/buffer/bufmgr.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 /*
16  * Principal entry points:
17  *
18  * ReadBuffer() -- find or create a buffer holding the requested page,
19  * and pin it so that no one can destroy it while this process
20  * is using it.
21  *
22  * ReleaseBuffer() -- unpin a buffer
23  *
24  * MarkBufferDirty() -- mark a pinned buffer's contents as "dirty".
25  * The disk write is delayed until buffer replacement or checkpoint.
26  *
27  * See also these files:
28  * freelist.c -- chooses victim for buffer replacement
29  * buf_table.c -- manages the buffer lookup table
30  */
31 #include "postgres.h"
32 
33 #include <sys/file.h>
34 #include <unistd.h>
35 
36 #include "access/tableam.h"
37 #include "access/xloginsert.h"
38 #include "access/xlogutils.h"
39 #include "catalog/storage.h"
40 #include "catalog/storage_xlog.h"
41 #include "executor/instrument.h"
42 #include "lib/binaryheap.h"
43 #include "miscadmin.h"
44 #include "pg_trace.h"
45 #include "pgstat.h"
46 #include "postmaster/bgwriter.h"
47 #include "storage/buf_internals.h"
48 #include "storage/bufmgr.h"
49 #include "storage/fd.h"
50 #include "storage/ipc.h"
51 #include "storage/lmgr.h"
52 #include "storage/proc.h"
53 #include "storage/smgr.h"
54 #include "storage/standby.h"
55 #include "utils/memdebug.h"
56 #include "utils/ps_status.h"
57 #include "utils/rel.h"
58 #include "utils/resowner.h"
59 #include "utils/timestamp.h"
60 
61 
62 /* Note: these two macros only work on shared buffers, not local ones! */
63 #define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
64 #define BufferGetLSN(bufHdr) (PageGetLSN(BufHdrGetBlock(bufHdr)))
65 
66 /* Note: this macro only works on local buffers, not shared ones! */
67 #define LocalBufHdrGetBlock(bufHdr) \
68  LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
69 
70 /* Bits in SyncOneBuffer's return value */
71 #define BUF_WRITTEN 0x01
72 #define BUF_REUSABLE 0x02
73 
74 #define RELS_BSEARCH_THRESHOLD 20
75 
76 /*
77  * This is the size (in the number of blocks) above which we scan the
78  * entire buffer pool to remove the buffers for all the pages of the relation
79  * being dropped. For relations with a size below this threshold, we find
80  * the buffers by doing lookups in the BufMapping table.
81  */
82 #define BUF_DROP_FULL_SCAN_THRESHOLD (uint64) (NBuffers / 32)
83 
84 typedef struct PrivateRefCountEntry
85 {
86  Buffer buffer;
87  int32 refcount;
88 } PrivateRefCountEntry;
89 
90 /* 64 bytes, about the size of a cache line on common systems */
91 #define REFCOUNT_ARRAY_ENTRIES 8
92 
93 /*
94  * Status of buffers to checkpoint for a particular tablespace, used
95  * internally in BufferSync.
96  */
97 typedef struct CkptTsStatus
98 {
99  /* oid of the tablespace */
100  Oid tsId;
101 
102  /*
103  * Checkpoint progress for this tablespace. To make progress comparable
104  * between tablespaces the progress is, for each tablespace, measured as a
105  * number between 0 and the total number of to-be-checkpointed pages. Each
106  * page checkpointed in this tablespace increments this space's progress
107  * by progress_slice.
108  */
109  float8 progress;
110  float8 progress_slice;
111 
112  /* number of to-be checkpointed pages in this tablespace */
113  int num_to_scan;
114  /* already processed pages in this tablespace */
115  int num_scanned;
116 
117  /* current offset in CkptBufferIds for this tablespace */
118  int index;
119 } CkptTsStatus;
120 
121 /*
122  * Type for array used to sort SMgrRelations
123  *
124  * FlushRelationsAllBuffers shares the same comparator function with
125  * DropRelationsAllBuffers. Pointer to this struct and RelFileLocator must be
126  * compatible.
127  */
128 typedef struct SMgrSortArray
129 {
130  RelFileLocator rlocator; /* This must be the first member */
131  SMgrRelation srel;
132 } SMgrSortArray;
133 
134 /* GUC variables */
135 bool zero_damaged_pages = false;
136 int bgwriter_lru_maxpages = 100;
137 double bgwriter_lru_multiplier = 2.0;
138 bool track_io_timing = false;
139 
140 /*
141  * How many buffers PrefetchBuffer callers should try to stay ahead of their
142  * ReadBuffer calls by. Zero means "never prefetch". This value is only used
143  * for buffers not belonging to tablespaces that have their
144  * effective_io_concurrency parameter set.
145  */
146 int effective_io_concurrency = DEFAULT_EFFECTIVE_IO_CONCURRENCY;
147 
148 /*
149  * Like effective_io_concurrency, but used by maintenance code paths that might
150  * benefit from a higher setting because they work on behalf of many sessions.
151  * Overridden by the tablespace setting of the same name.
152  */
153 int maintenance_io_concurrency = DEFAULT_MAINTENANCE_IO_CONCURRENCY;
154 
155 /*
156  * GUC variables about triggering kernel writeback for buffers written; OS
157  * dependent defaults are set via the GUC mechanism.
158  */
159 int checkpoint_flush_after = DEFAULT_CHECKPOINT_FLUSH_AFTER;
160 int bgwriter_flush_after = DEFAULT_BGWRITER_FLUSH_AFTER;
161 int backend_flush_after = DEFAULT_BACKEND_FLUSH_AFTER;
162 
163 /* local state for LockBufferForCleanup */
164 static BufferDesc *PinCountWaitBuf = NULL;
165 
166 /*
167  * Backend-Private refcount management:
168  *
169  * Each buffer also has a private refcount that keeps track of the number of
170  * times the buffer is pinned in the current process. This is so that the
171  * shared refcount needs to be modified only once if a buffer is pinned more
172  * than once by an individual backend. It's also used to check that no buffers
173  * are still pinned at the end of transactions and when exiting.
174  *
175  *
176  * To avoid - as we used to - requiring an array with NBuffers entries to keep
177  * track of local buffers, we use a small sequentially searched array
178  * (PrivateRefCountArray) and an overflow hash table (PrivateRefCountHash) to
179  * keep track of backend local pins.
180  *
181  * Until no more than REFCOUNT_ARRAY_ENTRIES buffers are pinned at once, all
182  * refcounts are kept track of in the array; after that, new array entries
183  * displace old ones into the hash table. That way a frequently used entry
184  * can't get "stuck" in the hashtable while infrequent ones clog the array.
185  *
186  * Note that in most scenarios the number of pinned buffers will not exceed
187  * REFCOUNT_ARRAY_ENTRIES.
188  *
189  *
190  * To enter a buffer into the refcount tracking mechanism first reserve a free
191  * entry using ReservePrivateRefCountEntry() and then later, if necessary,
192  * fill it with NewPrivateRefCountEntry(). That split lets us avoid doing
193  * memory allocations in NewPrivateRefCountEntry() which can be important
194  * because in some scenarios it's called with a spinlock held...
195  */
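A condensed, hypothetical sketch of that reserve-then-fill split, modeled on what the pin helpers in this file do (resource-owner bookkeeping and most assertions omitted; example_pin_once is not a real function here):

    static void
    example_pin_once(BufferDesc *buf)
    {
        uint32      buf_state;
        PrivateRefCountEntry *ref;

        /* reserve before taking the header spinlock; this step may search/allocate */
        ReservePrivateRefCountEntry();

        buf_state = LockBufHdr(buf);
        buf_state += BUF_REFCOUNT_ONE;      /* bump the shared refcount */
        UnlockBufHdr(buf, buf_state);

        /* fill after the spinlock is released; guaranteed not to allocate */
        ref = NewPrivateRefCountEntry(BufferDescriptorGetBuffer(buf));
        ref->refcount++;
    }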
196 static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES];
197 static HTAB *PrivateRefCountHash = NULL;
198 static int32 PrivateRefCountOverflowed = 0;
199 static uint32 PrivateRefCountClock = 0;
200 static PrivateRefCountEntry *ReservedRefCountEntry = NULL;
201 
202 static void ReservePrivateRefCountEntry(void);
203 static PrivateRefCountEntry *NewPrivateRefCountEntry(Buffer buffer);
204 static PrivateRefCountEntry *GetPrivateRefCountEntry(Buffer buffer, bool do_move);
205 static inline int32 GetPrivateRefCount(Buffer buffer);
206 static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref);
207 
208 /* ResourceOwner callbacks to hold in-progress I/Os and buffer pins */
209 static void ResOwnerReleaseBufferIO(Datum res);
210 static char *ResOwnerPrintBufferIO(Datum res);
211 static void ResOwnerReleaseBufferPin(Datum res);
212 static char *ResOwnerPrintBufferPin(Datum res);
213 
214 static const ResourceOwnerDesc buffer_io_resowner_desc =
215 {
216  .name = "buffer io",
217  .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
218  .release_priority = RELEASE_PRIO_BUFFER_IOS,
219  .ReleaseResource = ResOwnerReleaseBufferIO,
220  .DebugPrint = ResOwnerPrintBufferIO
221 };
222 
223 static const ResourceOwnerDesc buffer_pin_resowner_desc =
224 {
225  .name = "buffer pin",
226  .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
227  .release_priority = RELEASE_PRIO_BUFFER_PINS,
228  .ReleaseResource = ResOwnerReleaseBufferPin,
229  .DebugPrint = ResOwnerPrintBufferPin
230 };
231 
232 /*
233  * Ensure that the PrivateRefCountArray has sufficient space to store one more
234  * entry. This has to be called before using NewPrivateRefCountEntry() to fill
235  * a new entry - but it's perfectly fine to not use a reserved entry.
236  */
237 static void
238 ReservePrivateRefCountEntry(void)
239 {
240  /* Already reserved (or freed), nothing to do */
241  if (ReservedRefCountEntry != NULL)
242  return;
243 
244  /*
245  * First search for a free entry in the array; that'll be sufficient in the
246  * majority of cases.
247  */
248  {
249  int i;
250 
251  for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
252  {
253  PrivateRefCountEntry *res;
254 
255  res = &PrivateRefCountArray[i];
256 
257  if (res->buffer == InvalidBuffer)
258  {
259  ReservedRefCountEntry = res;
260  return;
261  }
262  }
263  }
264 
265  /*
266  * No luck. All array entries are full. Move one array entry into the hash
267  * table.
268  */
269  {
270  /*
271  * Move entry from the current clock position in the array into the
272  * hashtable. Use that slot.
273  */
274  PrivateRefCountEntry *hashent;
275  bool found;
276 
277  /* select victim slot */
280 
281  /* Better be used, otherwise we shouldn't get here. */
283 
284  /* enter victim array entry into hashtable */
287  HASH_ENTER,
288  &found);
289  Assert(!found);
291 
292  /* clear the now free array slot */
295 
297  }
298 }
299 
300 /*
301  * Fill a previously reserved refcount entry.
302  */
303 static PrivateRefCountEntry *
304 NewPrivateRefCountEntry(Buffer buffer)
305 {
306  PrivateRefCountEntry *res;
307 
308  /* only allowed to be called when a reservation has been made */
309  Assert(ReservedRefCountEntry != NULL);
310 
311  /* use up the reserved entry */
312  res = ReservedRefCountEntry;
313  ReservedRefCountEntry = NULL;
314 
315  /* and fill it */
316  res->buffer = buffer;
317  res->refcount = 0;
318 
319  return res;
320 }
321 
322 /*
323  * Return the PrivateRefCount entry for the passed buffer.
324  *
325  * Returns NULL if a buffer doesn't have a refcount entry. Otherwise, if
326  * do_move is true, and the entry resides in the hashtable the entry is
327  * optimized for frequent access by moving it to the array.
328  */
329 static PrivateRefCountEntry *
330 GetPrivateRefCountEntry(Buffer buffer, bool do_move)
331 {
332  PrivateRefCountEntry *res;
333  int i;
334 
337 
338  /*
339  * First search for references in the array, that'll be sufficient in the
340  * majority of cases.
341  */
342  for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
343  {
344  res = &PrivateRefCountArray[i];
345 
346  if (res->buffer == buffer)
347  return res;
348  }
349 
350  /*
351  * By here we know that the buffer, if already pinned, isn't residing in
352  * the array.
353  *
354  * Only look up the buffer in the hashtable if we've previously overflowed
355  * into it.
356  */
357  if (PrivateRefCountOverflowed == 0)
358  return NULL;
359 
360  res = hash_search(PrivateRefCountHash, &buffer, HASH_FIND, NULL);
361 
362  if (res == NULL)
363  return NULL;
364  else if (!do_move)
365  {
366  /* caller doesn't want us to move the hash entry into the array */
367  return res;
368  }
369  else
370  {
371  /* move buffer from hashtable into the free array slot */
372  bool found;
373  PrivateRefCountEntry *free;
374 
375  /* Ensure there's a free array slot */
376  ReservePrivateRefCountEntry();
377 
378  /* Use up the reserved slot */
379  Assert(ReservedRefCountEntry != NULL);
380  free = ReservedRefCountEntry;
381  ReservedRefCountEntry = NULL;
382  Assert(free->buffer == InvalidBuffer);
383 
384  /* and fill it */
385  free->buffer = buffer;
386  free->refcount = res->refcount;
387 
388  /* delete from hashtable */
389  hash_search(PrivateRefCountHash, &buffer, HASH_REMOVE, &found);
390  Assert(found);
391  Assert(PrivateRefCountOverflowed > 0);
392  PrivateRefCountOverflowed--;
393 
394  return free;
395  }
396 }
397 
398 /*
399  * Returns how many times the passed buffer is pinned by this backend.
400  *
401  * Only works for shared memory buffers!
402  */
403 static inline int32
404 GetPrivateRefCount(Buffer buffer)
405 {
406  PrivateRefCountEntry *ref;
407 
410 
411  /*
412  * Not moving the entry - that's ok for the current users, but we might
413  * want to change this one day.
414  */
415  ref = GetPrivateRefCountEntry(buffer, false);
416 
417  if (ref == NULL)
418  return 0;
419  return ref->refcount;
420 }
421 
422 /*
423  * Release resources used to track the reference count of a buffer which we no
424  * longer have pinned and don't want to pin again immediately.
425  */
426 static void
427 ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref)
428 {
429  Assert(ref->refcount == 0);
430 
431  if (ref >= &PrivateRefCountArray[0] &&
433  {
434  ref->buffer = InvalidBuffer;
435 
436  /*
437  * Mark the just used entry as reserved - in many scenarios that
438  * allows us to avoid ever having to search the array/hash for free
439  * entries.
440  */
441  ReservedRefCountEntry = ref;
442  }
443  else
444  {
445  bool found;
446  Buffer buffer = ref->buffer;
447 
448  hash_search(PrivateRefCountHash, &buffer, HASH_REMOVE, &found);
449  Assert(found);
450  Assert(PrivateRefCountOverflowed > 0);
451  PrivateRefCountOverflowed--;
452 
453 }
454 
455 /*
456  * BufferIsPinned
457  * True iff the buffer is pinned (also checks for valid buffer number).
458  *
459  * NOTE: what we check here is that *this* backend holds a pin on
460  * the buffer. We do not care whether some other backend does.
461  */
462 #define BufferIsPinned(bufnum) \
463 ( \
464  !BufferIsValid(bufnum) ? \
465  false \
466  : \
467  BufferIsLocal(bufnum) ? \
468  (LocalRefCount[-(bufnum) - 1] > 0) \
469  : \
470  (GetPrivateRefCount(bufnum) > 0) \
471 )
472 
473 
474 static Buffer ReadBuffer_common(SMgrRelation smgr, char relpersistence,
475  ForkNumber forkNum, BlockNumber blockNum,
476  ReadBufferMode mode, BufferAccessStrategy strategy,
477  bool *hit);
478 static BlockNumber ExtendBufferedRelCommon(BufferManagerRelation bmr,
479  ForkNumber fork,
480  BufferAccessStrategy strategy,
481  uint32 flags,
482  uint32 extend_by,
483  BlockNumber extend_upto,
484  Buffer *buffers,
485  uint32 *extended_by);
486 static BlockNumber ExtendBufferedRelShared(BufferManagerRelation bmr,
487  ForkNumber fork,
488  BufferAccessStrategy strategy,
489  uint32 flags,
490  uint32 extend_by,
491  BlockNumber extend_upto,
492  Buffer *buffers,
493  uint32 *extended_by);
494 static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy);
495 static void PinBuffer_Locked(BufferDesc *buf);
496 static void UnpinBuffer(BufferDesc *buf);
497 static void UnpinBufferNoOwner(BufferDesc *buf);
498 static void BufferSync(int flags);
500 static int SyncOneBuffer(int buf_id, bool skip_recently_used,
501  WritebackContext *wb_context);
502 static void WaitIO(BufferDesc *buf);
503 static bool StartBufferIO(BufferDesc *buf, bool forInput);
504 static void TerminateBufferIO(BufferDesc *buf, bool clear_dirty,
505  uint32 set_flag_bits, bool forget_owner);
506 static void AbortBufferIO(Buffer buffer);
507 static void shared_buffer_write_error_callback(void *arg);
508 static void local_buffer_write_error_callback(void *arg);
509 static BufferDesc *BufferAlloc(SMgrRelation smgr,
510  char relpersistence,
511  ForkNumber forkNum,
512  BlockNumber blockNum,
513  BufferAccessStrategy strategy,
514  bool *foundPtr, IOContext io_context);
515 static Buffer GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context);
516 static void FlushBuffer(BufferDesc *buf, SMgrRelation reln,
517  IOObject io_object, IOContext io_context);
518 static void FindAndDropRelationBuffers(RelFileLocator rlocator,
519  ForkNumber forkNum,
520  BlockNumber nForkBlock,
521  BlockNumber firstDelBlock);
522 static void RelationCopyStorageUsingBuffer(RelFileLocator srclocator,
523  RelFileLocator dstlocator,
524  ForkNumber forkNum, bool permanent);
525 static void AtProcExit_Buffers(int code, Datum arg);
526 static void CheckForBufferLeaks(void);
527 static int rlocator_comparator(const void *p1, const void *p2);
528 static inline int buffertag_comparator(const BufferTag *ba, const BufferTag *bb);
529 static inline int ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b);
530 static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg);
531 
532 
533 /*
534  * Implementation of PrefetchBuffer() for shared buffers.
535  */
536 PrefetchBufferResult
537 PrefetchSharedBuffer(SMgrRelation smgr_reln,
538  ForkNumber forkNum,
539  BlockNumber blockNum)
540 {
541  PrefetchBufferResult result = {InvalidBuffer, false};
542  BufferTag newTag; /* identity of requested block */
543  uint32 newHash; /* hash value for newTag */
544  LWLock *newPartitionLock; /* buffer partition lock for it */
545  int buf_id;
546 
547  Assert(BlockNumberIsValid(blockNum));
548 
549  /* create a tag so we can lookup the buffer */
550  InitBufferTag(&newTag, &smgr_reln->smgr_rlocator.locator,
551  forkNum, blockNum);
552 
553  /* determine its hash code and partition lock ID */
554  newHash = BufTableHashCode(&newTag);
555  newPartitionLock = BufMappingPartitionLock(newHash);
556 
557  /* see if the block is in the buffer pool already */
558  LWLockAcquire(newPartitionLock, LW_SHARED);
559  buf_id = BufTableLookup(&newTag, newHash);
560  LWLockRelease(newPartitionLock);
561 
562  /* If not in buffers, initiate prefetch */
563  if (buf_id < 0)
564  {
565 #ifdef USE_PREFETCH
566  /*
567  * Try to initiate an asynchronous read. This returns false in
568  * recovery if the relation file doesn't exist.
569  */
570  if ((io_direct_flags & IO_DIRECT_DATA) == 0 &&
571  smgrprefetch(smgr_reln, forkNum, blockNum, 1))
572  {
573  result.initiated_io = true;
574  }
575 #endif /* USE_PREFETCH */
576  }
577  else
578  {
579  /*
580  * Report the buffer it was in at that time. The caller may be able
581  * to avoid a buffer table lookup, but it's not pinned and it must be
582  * rechecked!
583  */
584  result.recent_buffer = buf_id + 1;
585  }
586 
587  /*
588  * If the block *is* in buffers, we do nothing. This is not really ideal:
589  * the block might be just about to be evicted, which would be stupid
590  * since we know we are going to need it soon. But the only easy answer
591  * is to bump the usage_count, which does not seem like a great solution:
592  * when the caller does ultimately touch the block, usage_count would get
593  * bumped again, resulting in too much favoritism for blocks that are
594  * involved in a prefetch sequence. A real fix would involve some
595  * additional per-buffer state, and it's not clear that there's enough of
596  * a problem to justify that.
597  */
598 
599  return result;
600 }
601 
602 /*
603  * PrefetchBuffer -- initiate asynchronous read of a block of a relation
604  *
605  * This is named by analogy to ReadBuffer but doesn't actually allocate a
606  * buffer. Instead it tries to ensure that a future ReadBuffer for the given
607  * block will not be delayed by the I/O. Prefetching is optional.
608  *
609  * There are three possible outcomes:
610  *
611  * 1. If the block is already cached, the result includes a valid buffer that
612  * could be used by the caller to avoid the need for a later buffer lookup, but
613  * it's not pinned, so the caller must recheck it.
614  *
615  * 2. If the kernel has been asked to initiate I/O, the initiated_io member is
616  * true. Currently there is no way to know if the data was already cached by
617  * the kernel and therefore didn't really initiate I/O, and no way to know when
618  * the I/O completes other than using synchronous ReadBuffer().
619  *
620  * 3. Otherwise, the buffer wasn't already cached by PostgreSQL, and either
621  * USE_PREFETCH is not defined (this build doesn't support prefetching due to
622  * lack of a kernel facility), direct I/O is enabled, or the underlying
623  * relation file wasn't found and we are in recovery. (If the relation file
624  * wasn't found and we are not in recovery, an error is raised).
625  */
625  */
626 PrefetchBufferResult
627 PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
628 {
629  Assert(RelationIsValid(reln));
630  Assert(BlockNumberIsValid(blockNum));
631 
632  if (RelationUsesLocalBuffers(reln))
633  {
634  /* see comments in ReadBufferExtended */
635  if (RELATION_IS_OTHER_TEMP(reln))
636  ereport(ERROR,
637  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
638  errmsg("cannot access temporary tables of other sessions")));
639 
640  /* pass it off to localbuf.c */
641  return PrefetchLocalBuffer(RelationGetSmgr(reln), forkNum, blockNum);
642  }
643  else
644  {
645  /* pass it to the shared buffer version */
646  return PrefetchSharedBuffer(RelationGetSmgr(reln), forkNum, blockNum);
647  }
648 }
649 
650 /*
651  * ReadRecentBuffer -- try to pin a block in a recently observed buffer
652  *
653  * Compared to ReadBuffer(), this avoids a buffer mapping lookup when it's
654  * successful. Return true if the buffer is valid and still has the expected
655  * tag. In that case, the buffer is pinned and the usage count is bumped.
656  */
657 bool
658 ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum,
659  Buffer recent_buffer)
660 {
661  BufferDesc *bufHdr;
662  BufferTag tag;
663  uint32 buf_state;
664  bool have_private_ref;
665 
666  Assert(BufferIsValid(recent_buffer));
667 
670  InitBufferTag(&tag, &rlocator, forkNum, blockNum);
671 
672  if (BufferIsLocal(recent_buffer))
673  {
674  int b = -recent_buffer - 1;
675 
676  bufHdr = GetLocalBufferDescriptor(b);
677  buf_state = pg_atomic_read_u32(&bufHdr->state);
678 
679  /* Is it still valid and holding the right tag? */
680  if ((buf_state & BM_VALID) && BufferTagsEqual(&tag, &bufHdr->tag))
681  {
682  PinLocalBuffer(bufHdr, true);
683 
685 
686  return true;
687  }
688  }
689  else
690  {
691  bufHdr = GetBufferDescriptor(recent_buffer - 1);
692  have_private_ref = GetPrivateRefCount(recent_buffer) > 0;
693 
694  /*
695  * Do we already have this buffer pinned with a private reference? If
696  * so, it must be valid and it is safe to check the tag without
697  * locking. If not, we have to lock the header first and then check.
698  */
699  if (have_private_ref)
700  buf_state = pg_atomic_read_u32(&bufHdr->state);
701  else
702  buf_state = LockBufHdr(bufHdr);
703 
704  if ((buf_state & BM_VALID) && BufferTagsEqual(&tag, &bufHdr->tag))
705  {
706  /*
707  * It's now safe to pin the buffer. We can't pin first and ask
708  * questions later, because it might confuse code paths like
709  * InvalidateBuffer() if we pinned a random non-matching buffer.
710  */
711  if (have_private_ref)
712  PinBuffer(bufHdr, NULL); /* bump pin count */
713  else
714  PinBuffer_Locked(bufHdr); /* pin for first time */
715 
717 
718  return true;
719  }
720 
721  /* If we locked the header above, now unlock. */
722  if (!have_private_ref)
723  UnlockBufHdr(bufHdr, buf_state);
724  }
725 
726  return false;
727 }
728 
729 /*
730  * ReadBuffer -- a shorthand for ReadBufferExtended, for reading from main
731  * fork with RBM_NORMAL mode and default strategy.
732  */
733 Buffer
734 ReadBuffer(Relation reln, BlockNumber blockNum)
735 {
736  return ReadBufferExtended(reln, MAIN_FORKNUM, blockNum, RBM_NORMAL, NULL);
737 }
738 
739 /*
740  * ReadBufferExtended -- returns a buffer containing the requested
741  * block of the requested relation. If the blknum
742  * requested is P_NEW, extend the relation file and
743  * allocate a new block. (Caller is responsible for
744  * ensuring that only one backend tries to extend a
745  * relation at the same time!)
746  *
747  * Returns: the buffer number for the buffer containing
748  * the block read. The returned buffer has been pinned.
749  * Does not return on error --- elog's instead.
750  *
751  * Assume when this function is called, that reln has been opened already.
752  *
753  * In RBM_NORMAL mode, the page is read from disk, and the page header is
754  * validated. An error is thrown if the page header is not valid. (But
755  * note that an all-zero page is considered "valid"; see
756  * PageIsVerifiedExtended().)
757  *
758  * RBM_ZERO_ON_ERROR is like the normal mode, but if the page header is not
759  * valid, the page is zeroed instead of throwing an error. This is intended
760  * for non-critical data, where the caller is prepared to repair errors.
761  *
762  * In RBM_ZERO_AND_LOCK mode, if the page isn't in buffer cache already, it's
763  * filled with zeros instead of reading it from disk. Useful when the caller
764  * is going to fill the page from scratch, since this saves I/O and avoids
765  * unnecessary failure if the page-on-disk has corrupt page headers.
766  * The page is returned locked to ensure that the caller has a chance to
767  * initialize the page before it's made visible to others.
768  * Caution: do not use this mode to read a page that is beyond the relation's
769  * current physical EOF; that is likely to cause problems in md.c when
770  * the page is modified and written out. P_NEW is OK, though.
771  *
772  * RBM_ZERO_AND_CLEANUP_LOCK is the same as RBM_ZERO_AND_LOCK, but acquires
773  * a cleanup-strength lock on the page.
774  *
775  * RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here.
776  *
777  * If strategy is not NULL, a nondefault buffer access strategy is used.
778  * See buffer/README for details.
779  */
780 Buffer
781 ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum,
782  ReadBufferMode mode, BufferAccessStrategy strategy)
783 {
784  bool hit;
785  Buffer buf;
786 
787  /*
788  * Reject attempts to read non-local temporary relations; we would be
789  * likely to get wrong data since we have no visibility into the owning
790  * session's local buffers.
791  */
792  if (RELATION_IS_OTHER_TEMP(reln))
793  ereport(ERROR,
794  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
795  errmsg("cannot access temporary tables of other sessions")));
796 
797  /*
798  * Read the buffer, and update pgstat counters to reflect a cache hit or
799  * miss.
800  */
801  pgstat_count_buffer_read(reln);
802  buf = ReadBuffer_common(RelationGetSmgr(reln), reln->rd_rel->relpersistence,
803  forkNum, blockNum, mode, strategy, &hit);
804  if (hit)
805  pgstat_count_buffer_hit(reln);
806  return buf;
807 }
808 
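As an illustration of the mode/strategy parameters described above, a hedged sketch of a bulk read through a nondefault strategy ring (example_strategy_read, rel, and blkno are assumed names):

    static void
    example_strategy_read(Relation rel, BlockNumber blkno)
    {
        BufferAccessStrategy strategy = GetAccessStrategy(BAS_BULKREAD);
        Buffer      buf;

        /* RBM_NORMAL reads and validates the page; the ring limits cache pollution */
        buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, strategy);

        LockBuffer(buf, BUFFER_LOCK_SHARE);
        /* ... examine BufferGetPage(buf) ... */
        UnlockReleaseBuffer(buf);

        FreeAccessStrategy(strategy);
    }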
809 
810 /*
811  * ReadBufferWithoutRelcache -- like ReadBufferExtended, but doesn't require
812  * a relcache entry for the relation.
813  *
814  * Pass permanent = true for a RELPERSISTENCE_PERMANENT relation, and
815  * permanent = false for a RELPERSISTENCE_UNLOGGED relation. This function
816  * cannot be used for temporary relations (and making that work might be
817  * difficult, unless we only want to read temporary relations for our own
818  * ProcNumber).
819  */
820 Buffer
821 ReadBufferWithoutRelcache(RelFileLocator rlocator, ForkNumber forkNum,
822  BlockNumber blockNum, ReadBufferMode mode,
823  BufferAccessStrategy strategy, bool permanent)
824 {
825  bool hit;
826 
827  SMgrRelation smgr = smgropen(rlocator, INVALID_PROC_NUMBER);
828 
829  return ReadBuffer_common(smgr, permanent ? RELPERSISTENCE_PERMANENT :
830  RELPERSISTENCE_UNLOGGED, forkNum, blockNum,
831  mode, strategy, &hit);
832 }
833 
834 /*
835  * Convenience wrapper around ExtendBufferedRelBy() extending by one block.
836  */
837 Buffer
838 ExtendBufferedRel(BufferManagerRelation bmr,
839  ForkNumber forkNum,
840  BufferAccessStrategy strategy,
841  uint32 flags)
842 {
843  Buffer buf;
844  uint32 extend_by = 1;
845 
846  ExtendBufferedRelBy(bmr, forkNum, strategy, flags, extend_by,
847  &buf, &extend_by);
848 
849  return buf;
850 }
851 
852 /*
853  * Extend relation by multiple blocks.
854  *
855  * Tries to extend the relation by extend_by blocks. Depending on the
856  * availability of resources, the relation may end up being extended by a
857  * smaller number of pages, but always by at least one page (unless an error
858  * is thrown). *extended_by is updated to the number of pages by which the
859  * relation actually has been extended.
860  *
861  * buffers needs to be an array that is at least extend_by long. Upon
862  * completion, the first extend_by array elements will point to a pinned
863  * buffer.
864  *
865  * If EB_LOCK_FIRST is part of flags, the first returned buffer is
866  * locked. This is useful for callers that want a buffer that is guaranteed to
867  * be empty.
868  */
869 BlockNumber
870 ExtendBufferedRelBy(BufferManagerRelation bmr,
871  ForkNumber fork,
872  BufferAccessStrategy strategy,
873  uint32 flags,
874  uint32 extend_by,
875  Buffer *buffers,
876  uint32 *extended_by)
877 {
878  Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
879  Assert(bmr.smgr == NULL || bmr.relpersistence != 0);
880  Assert(extend_by > 0);
881 
882  if (bmr.smgr == NULL)
883  {
884  bmr.smgr = RelationGetSmgr(bmr.rel);
885  bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
886  }
887 
888  return ExtendBufferedRelCommon(bmr, fork, strategy, flags,
889  extend_by, InvalidBlockNumber,
890  buffers, extended_by);
891 }
892 
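A hedged sketch of a multi-block extension as described above (the function name example_extend_by and the batch size of 8 are arbitrary; the caller is assumed to hold an appropriate lock on rel):

    static void
    example_extend_by(Relation rel)
    {
        Buffer      buffers[8];
        uint32      extended_by = 0;
        BlockNumber first_block;

        /* may extend by fewer than 8 blocks, but always by at least one */
        first_block = ExtendBufferedRelBy(BMR_REL(rel), MAIN_FORKNUM,
                                          NULL /* default strategy */ ,
                                          EB_LOCK_FIRST, 8,
                                          buffers, &extended_by);
        (void) first_block;

        /* buffers[0] comes back exclusively locked because of EB_LOCK_FIRST */
        for (uint32 i = 0; i < extended_by; i++)
        {
            if (i == 0)
                UnlockReleaseBuffer(buffers[i]);
            else
                ReleaseBuffer(buffers[i]);
        }
    }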
893 /*
894  * Extend the relation so it is at least extend_to blocks large, return buffer
895  * (extend_to - 1).
896  *
897  * This is useful for callers that want to write a specific page, regardless
898  * of the current size of the relation (e.g. useful for visibilitymap and for
899  * crash recovery).
900  */
901 Buffer
902 ExtendBufferedRelTo(BufferManagerRelation bmr,
903  ForkNumber fork,
904  BufferAccessStrategy strategy,
905  uint32 flags,
906  BlockNumber extend_to,
907  ReadBufferMode mode)
908 {
909  BlockNumber current_size;
910  uint32 extended_by = 0;
911  Buffer buffer = InvalidBuffer;
912  Buffer buffers[64];
913 
914  Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
915  Assert(bmr.smgr == NULL || bmr.relpersistence != 0);
916  Assert(extend_to != InvalidBlockNumber && extend_to > 0);
917 
918  if (bmr.smgr == NULL)
919  {
920  bmr.smgr = RelationGetSmgr(bmr.rel);
921  bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
922  }
923 
924  /*
925  * If desired, create the file if it doesn't exist. If
926  * smgr_cached_nblocks[fork] is positive then it must exist, no need for
927  * an smgrexists call.
928  */
929  if ((flags & EB_CREATE_FORK_IF_NEEDED) &&
930  (bmr.smgr->smgr_cached_nblocks[fork] == 0 ||
932  !smgrexists(bmr.smgr, fork))
933  {
935 
936  /* recheck, fork might have been created concurrently */
937  if (!smgrexists(bmr.smgr, fork))
938  smgrcreate(bmr.smgr, fork, flags & EB_PERFORMING_RECOVERY);
939 
941  }
942 
943  /*
944  * If requested, invalidate size cache, so that smgrnblocks asks the
945  * kernel.
946  */
947  if (flags & EB_CLEAR_SIZE_CACHE)
949 
950  /*
951  * Estimate how many pages we'll need to extend by. This avoids acquiring
952  * unnecessarily many victim buffers.
953  */
954  current_size = smgrnblocks(bmr.smgr, fork);
955 
956  /*
957  * Since no-one else can be looking at the page contents yet, there is no
958  * difference between an exclusive lock and a cleanup-strength lock. Note
959  * that we pass the original mode to ReadBuffer_common() below, when
960  * falling back to reading the buffer due to a concurrent relation extension.
961  */
962  if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
963  flags |= EB_LOCK_TARGET;
964 
965  while (current_size < extend_to)
966  {
967  uint32 num_pages = lengthof(buffers);
968  BlockNumber first_block;
969 
970  if ((uint64) current_size + num_pages > extend_to)
971  num_pages = extend_to - current_size;
972 
973  first_block = ExtendBufferedRelCommon(bmr, fork, strategy, flags,
974  num_pages, extend_to,
975  buffers, &extended_by);
976 
977  current_size = first_block + extended_by;
978  Assert(num_pages != 0 || current_size >= extend_to);
979 
980  for (uint32 i = 0; i < extended_by; i++)
981  {
982  if (first_block + i != extend_to - 1)
983  ReleaseBuffer(buffers[i]);
984  else
985  buffer = buffers[i];
986  }
987  }
988 
989  /*
990  * It's possible that another backend concurrently extended the relation.
991  * In that case read the buffer.
992  *
993  * XXX: Should we control this via a flag?
994  */
995  if (buffer == InvalidBuffer)
996  {
997  bool hit;
998 
999  Assert(extended_by == 0);
1001  fork, extend_to - 1, mode, strategy,
1002  &hit);
1003  }
1004 
1005  return buffer;
1006 }
1007 
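For the extend-to-a-target-size variant just defined, a hedged sketch (example_extend_to is hypothetical; the flag combination mirrors the kind of use described in the comment above, e.g. growing an auxiliary fork to hold a specific page):

    static Buffer
    example_extend_to(Relation rel, ForkNumber fork, BlockNumber target_block)
    {
        /* make sure block "target_block" exists in the fork; returns it pinned */
        return ExtendBufferedRelTo(BMR_REL(rel), fork,
                                   NULL /* default strategy */ ,
                                   EB_CREATE_FORK_IF_NEEDED | EB_CLEAR_SIZE_CACHE,
                                   target_block + 1,
                                   RBM_ZERO_ON_ERROR);
    }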
1008 /*
1009  * ReadBuffer_common -- common logic for all ReadBuffer variants
1010  *
1011  * *hit is set to true if the request was satisfied from shared buffer cache.
1012  */
1013 static Buffer
1014 ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
1015  BlockNumber blockNum, ReadBufferMode mode,
1016  BufferAccessStrategy strategy, bool *hit)
1017 {
1018  BufferDesc *bufHdr;
1019  Block bufBlock;
1020  bool found;
1021  IOContext io_context;
1022  IOObject io_object;
1023  bool isLocalBuf = SmgrIsTemp(smgr);
1024 
1025  *hit = false;
1026 
1027  /*
1028  * Backward compatibility path, most code should use ExtendBufferedRel()
1029  * instead, as acquiring the extension lock inside ExtendBufferedRel()
1030  * scales a lot better.
1031  */
1032  if (unlikely(blockNum == P_NEW))
1033  {
1034  uint32 flags = EB_SKIP_EXTENSION_LOCK;
1035 
1036  /*
1037  * Since no-one else can be looking at the page contents yet, there is
1038  * no difference between an exclusive lock and a cleanup-strength
1039  * lock.
1040  */
1041  if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
1042  flags |= EB_LOCK_FIRST;
1043 
1044  return ExtendBufferedRel(BMR_SMGR(smgr, relpersistence),
1045  forkNum, strategy, flags);
1046  }
1047 
1048  TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
1050  smgr->smgr_rlocator.locator.dbOid,
1052  smgr->smgr_rlocator.backend);
1053 
1054  if (isLocalBuf)
1055  {
1056  /*
1057  * We do not use a BufferAccessStrategy for I/O of temporary tables.
1058  * However, in some cases, the "strategy" may not be NULL, so we can't
1059  * rely on IOContextForStrategy() to set the right IOContext for us.
1060  * This may happen in cases like CREATE TEMPORARY TABLE AS...
1061  */
1062  io_context = IOCONTEXT_NORMAL;
1063  io_object = IOOBJECT_TEMP_RELATION;
1064  bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, &found);
1065  if (found)
1067  else if (mode == RBM_NORMAL || mode == RBM_NORMAL_NO_LOG ||
1070  }
1071  else
1072  {
1073  /*
1074  * lookup the buffer. IO_IN_PROGRESS is set if the requested block is
1075  * not currently in memory.
1076  */
1077  io_context = IOContextForStrategy(strategy);
1078  io_object = IOOBJECT_RELATION;
1079  bufHdr = BufferAlloc(smgr, relpersistence, forkNum, blockNum,
1080  strategy, &found, io_context);
1081  if (found)
1083  else if (mode == RBM_NORMAL || mode == RBM_NORMAL_NO_LOG ||
1086  }
1087 
1088  /* At this point we do NOT hold any locks. */
1089 
1090  /* if it was already in the buffer pool, we're done */
1091  if (found)
1092  {
1093  /* Just need to update stats before we exit */
1094  *hit = true;
1095  VacuumPageHit++;
1096  pgstat_count_io_op(io_object, io_context, IOOP_HIT);
1097 
1098  if (VacuumCostActive)
1100 
1101  TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
1103  smgr->smgr_rlocator.locator.dbOid,
1105  smgr->smgr_rlocator.backend,
1106  found);
1107 
1108  /*
1109  * In RBM_ZERO_AND_LOCK mode the caller expects the page to be locked
1110  * on return.
1111  */
1112  if (!isLocalBuf)
1113  {
1114  if (mode == RBM_ZERO_AND_LOCK)
1116  LW_EXCLUSIVE);
1117  else if (mode == RBM_ZERO_AND_CLEANUP_LOCK)
1119  }
1120 
1121  return BufferDescriptorGetBuffer(bufHdr);
1122  }
1123 
1124  /*
1125  * if we have gotten to this point, we have allocated a buffer for the
1126  * page but its contents are not yet valid. IO_IN_PROGRESS is set for it,
1127  * if it's a shared buffer.
1128  */
1129  Assert(!(pg_atomic_read_u32(&bufHdr->state) & BM_VALID)); /* spinlock not needed */
1130 
1131  bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
1132 
1133  /*
1134  * Read in the page, unless the caller intends to overwrite it and just
1135  * wants us to allocate a buffer.
1136  */
1138  MemSet((char *) bufBlock, 0, BLCKSZ);
1139  else
1140  {
1142 
1143  smgrread(smgr, forkNum, blockNum, bufBlock);
1144 
1145  pgstat_count_io_op_time(io_object, io_context,
1146  IOOP_READ, io_start, 1);
1147 
1148  /* check for garbage data */
1149  if (!PageIsVerifiedExtended((Page) bufBlock, blockNum,
1150  PIV_LOG_WARNING | PIV_REPORT_STAT))
1151  {
1152  if (mode == RBM_ZERO_ON_ERROR || zero_damaged_pages)
1153  {
1154  ereport(WARNING,
1156  errmsg("invalid page in block %u of relation %s; zeroing out page",
1157  blockNum,
1158  relpath(smgr->smgr_rlocator, forkNum))));
1159  MemSet((char *) bufBlock, 0, BLCKSZ);
1160  }
1161  else
1162  ereport(ERROR,
1164  errmsg("invalid page in block %u of relation %s",
1165  blockNum,
1166  relpath(smgr->smgr_rlocator, forkNum))));
1167  }
1168  }
1169 
1170  /*
1171  * In RBM_ZERO_AND_LOCK / RBM_ZERO_AND_CLEANUP_LOCK mode, grab the buffer
1172  * content lock before marking the page as valid, to make sure that no
1173  * other backend sees the zeroed page before the caller has had a chance
1174  * to initialize it.
1175  *
1176  * Since no-one else can be looking at the page contents yet, there is no
1177  * difference between an exclusive lock and a cleanup-strength lock. (Note
1178  * that we cannot use LockBuffer() or LockBufferForCleanup() here, because
1179  * they assert that the buffer is already valid.)
1180  */
1181  if ((mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) &&
1182  !isLocalBuf)
1183  {
1184  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_EXCLUSIVE);
1185  }
1186 
1187  if (isLocalBuf)
1188  {
1189  /* Only need to adjust flags */
1190  uint32 buf_state = pg_atomic_read_u32(&bufHdr->state);
1191 
1192  buf_state |= BM_VALID;
1193  pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
1194  }
1195  else
1196  {
1197  /* Set BM_VALID, terminate IO, and wake up any waiters */
1198  TerminateBufferIO(bufHdr, false, BM_VALID, true);
1199  }
1200 
1201  VacuumPageMiss++;
1202  if (VacuumCostActive)
1204 
1205  TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
1207  smgr->smgr_rlocator.locator.dbOid,
1209  smgr->smgr_rlocator.backend,
1210  found);
1211 
1212  return BufferDescriptorGetBuffer(bufHdr);
1213 }
1214 
1215 /*
1216  * BufferAlloc -- subroutine for ReadBuffer. Handles lookup of a shared
1217  * buffer. If no buffer exists already, selects a replacement
1218  * victim and evicts the old page, but does NOT read in new page.
1219  *
1220  * "strategy" can be a buffer replacement strategy object, or NULL for
1221  * the default strategy. The selected buffer's usage_count is advanced when
1222  * using the default strategy, but otherwise possibly not (see PinBuffer).
1223  *
1224  * The returned buffer is pinned and is already marked as holding the
1225  * desired page. If it already did have the desired page, *foundPtr is
1226  * set true. Otherwise, *foundPtr is set false and the buffer is marked
1227  * as IO_IN_PROGRESS; ReadBuffer will now need to do I/O to fill it.
1228  *
1229  * *foundPtr is actually redundant with the buffer's BM_VALID flag, but
1230  * we keep it for simplicity in ReadBuffer.
1231  *
1232  * io_context is passed as an output parameter to avoid calling
1233  * IOContextForStrategy() when there is a shared buffers hit and no IO
1234  * statistics need be captured.
1235  *
1236  * No locks are held either at entry or exit.
1237  */
1238 static BufferDesc *
1239 BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
1240  BlockNumber blockNum,
1241  BufferAccessStrategy strategy,
1242  bool *foundPtr, IOContext io_context)
1243 {
1244  BufferTag newTag; /* identity of requested block */
1245  uint32 newHash; /* hash value for newTag */
1246  LWLock *newPartitionLock; /* buffer partition lock for it */
1247  int existing_buf_id;
1248  Buffer victim_buffer;
1249  BufferDesc *victim_buf_hdr;
1250  uint32 victim_buf_state;
1251 
1252  /* Make sure we will have room to remember the buffer pin */
1255 
1256  /* create a tag so we can lookup the buffer */
1257  InitBufferTag(&newTag, &smgr->smgr_rlocator.locator, forkNum, blockNum);
1258 
1259  /* determine its hash code and partition lock ID */
1260  newHash = BufTableHashCode(&newTag);
1261  newPartitionLock = BufMappingPartitionLock(newHash);
1262 
1263  /* see if the block is in the buffer pool already */
1264  LWLockAcquire(newPartitionLock, LW_SHARED);
1265  existing_buf_id = BufTableLookup(&newTag, newHash);
1266  if (existing_buf_id >= 0)
1267  {
1268  BufferDesc *buf;
1269  bool valid;
1270 
1271  /*
1272  * Found it. Now, pin the buffer so no one can steal it from the
1273  * buffer pool, and check to see if the correct data has been loaded
1274  * into the buffer.
1275  */
1276  buf = GetBufferDescriptor(existing_buf_id);
1277 
1278  valid = PinBuffer(buf, strategy);
1279 
1280  /* Can release the mapping lock as soon as we've pinned it */
1281  LWLockRelease(newPartitionLock);
1282 
1283  *foundPtr = true;
1284 
1285  if (!valid)
1286  {
1287  /*
1288  * We can only get here if (a) someone else is still reading in
1289  * the page, or (b) a previous read attempt failed. We have to
1290  * wait for any active read attempt to finish, and then set up our
1291  * own read attempt if the page is still not BM_VALID.
1292  * StartBufferIO does it all.
1293  */
1294  if (StartBufferIO(buf, true))
1295  {
1296  /*
1297  * If we get here, previous attempts to read the buffer must
1298  * have failed ... but we shall bravely try again.
1299  */
1300  *foundPtr = false;
1301  }
1302  }
1303 
1304  return buf;
1305  }
1306 
1307  /*
1308  * Didn't find it in the buffer pool. We'll have to initialize a new
1309  * buffer. Remember to unlock the mapping lock while doing the work.
1310  */
1311  LWLockRelease(newPartitionLock);
1312 
1313  /*
1314  * Acquire a victim buffer. Somebody else might try to do the same, we
1315  * don't hold any conflicting locks. If so we'll have to undo our work
1316  * later.
1317  */
1318  victim_buffer = GetVictimBuffer(strategy, io_context);
1319  victim_buf_hdr = GetBufferDescriptor(victim_buffer - 1);
1320 
1321  /*
1322  * Try to make a hashtable entry for the buffer under its new tag. If
1323  * somebody else inserted another buffer for the tag, we'll release the
1324  * victim buffer we acquired and use the already inserted one.
1325  */
1326  LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1327  existing_buf_id = BufTableInsert(&newTag, newHash, victim_buf_hdr->buf_id);
1328  if (existing_buf_id >= 0)
1329  {
1330  BufferDesc *existing_buf_hdr;
1331  bool valid;
1332 
1333  /*
1334  * Got a collision. Someone has already done what we were about to do.
1335  * We'll just handle this as if it were found in the buffer pool in
1336  * the first place. First, give up the buffer we were planning to
1337  * use.
1338  *
1339  * We could do this after releasing the partition lock, but then we'd
1340  * have to call ResourceOwnerEnlarge() & ReservePrivateRefCountEntry()
1341  * before acquiring the lock, for the rare case of such a collision.
1342  */
1343  UnpinBuffer(victim_buf_hdr);
1344 
1345  /*
1346  * The victim buffer we acquired previously is clean and unused, let
1347  * it be found again quickly
1348  */
1349  StrategyFreeBuffer(victim_buf_hdr);
1350 
1351  /* remaining code should match code at top of routine */
1352 
1353  existing_buf_hdr = GetBufferDescriptor(existing_buf_id);
1354 
1355  valid = PinBuffer(existing_buf_hdr, strategy);
1356 
1357  /* Can release the mapping lock as soon as we've pinned it */
1358  LWLockRelease(newPartitionLock);
1359 
1360  *foundPtr = true;
1361 
1362  if (!valid)
1363  {
1364  /*
1365  * We can only get here if (a) someone else is still reading in
1366  * the page, or (b) a previous read attempt failed. We have to
1367  * wait for any active read attempt to finish, and then set up our
1368  * own read attempt if the page is still not BM_VALID.
1369  * StartBufferIO does it all.
1370  */
1371  if (StartBufferIO(existing_buf_hdr, true))
1372  {
1373  /*
1374  * If we get here, previous attempts to read the buffer must
1375  * have failed ... but we shall bravely try again.
1376  */
1377  *foundPtr = false;
1378  }
1379  }
1380 
1381  return existing_buf_hdr;
1382  }
1383 
1384  /*
1385  * Need to lock the buffer header too in order to change its tag.
1386  */
1387  victim_buf_state = LockBufHdr(victim_buf_hdr);
1388 
1389  /* some sanity checks while we hold the buffer header lock */
1390  Assert(BUF_STATE_GET_REFCOUNT(victim_buf_state) == 1);
1391  Assert(!(victim_buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY | BM_IO_IN_PROGRESS)));
1392 
1393  victim_buf_hdr->tag = newTag;
1394 
1395  /*
1396  * Make sure BM_PERMANENT is set for buffers that must be written at every
1397  * checkpoint. Unlogged buffers only need to be written at shutdown
1398  * checkpoints, except for their "init" forks, which need to be treated
1399  * just like permanent relations.
1400  */
1401  victim_buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
1402  if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM)
1403  victim_buf_state |= BM_PERMANENT;
1404 
1405  UnlockBufHdr(victim_buf_hdr, victim_buf_state);
1406 
1407  LWLockRelease(newPartitionLock);
1408 
1409  /*
1410  * Buffer contents are currently invalid. Try to obtain the right to
1411  * start I/O. If StartBufferIO returns false, then someone else managed
1412  * to read it before we did, so there's nothing left for BufferAlloc() to
1413  * do.
1414  */
1415  if (StartBufferIO(victim_buf_hdr, true))
1416  *foundPtr = false;
1417  else
1418  *foundPtr = true;
1419 
1420  return victim_buf_hdr;
1421 }
1422 
1423 /*
1424  * InvalidateBuffer -- mark a shared buffer invalid and return it to the
1425  * freelist.
1426  *
1427  * The buffer header spinlock must be held at entry. We drop it before
1428  * returning. (This is sane because the caller must have locked the
1429  * buffer in order to be sure it should be dropped.)
1430  *
1431  * This is used only in contexts such as dropping a relation. We assume
1432  * that no other backend could possibly be interested in using the page,
1433  * so the only reason the buffer might be pinned is if someone else is
1434  * trying to write it out. We have to let them finish before we can
1435  * reclaim the buffer.
1436  *
1437  * The buffer could get reclaimed by someone else while we are waiting
1438  * to acquire the necessary locks; if so, don't mess it up.
1439  */
1440 static void
1441 InvalidateBuffer(BufferDesc *buf)
1442 {
1443  BufferTag oldTag;
1444  uint32 oldHash; /* hash value for oldTag */
1445  LWLock *oldPartitionLock; /* buffer partition lock for it */
1446  uint32 oldFlags;
1447  uint32 buf_state;
1448 
1449  /* Save the original buffer tag before dropping the spinlock */
1450  oldTag = buf->tag;
1451 
1452  buf_state = pg_atomic_read_u32(&buf->state);
1453  Assert(buf_state & BM_LOCKED);
1454  UnlockBufHdr(buf, buf_state);
1455 
1456  /*
1457  * Need to compute the old tag's hashcode and partition lock ID. XXX is it
1458  * worth storing the hashcode in BufferDesc so we need not recompute it
1459  * here? Probably not.
1460  */
1461  oldHash = BufTableHashCode(&oldTag);
1462  oldPartitionLock = BufMappingPartitionLock(oldHash);
1463 
1464 retry:
1465 
1466  /*
1467  * Acquire exclusive mapping lock in preparation for changing the buffer's
1468  * association.
1469  */
1470  LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
1471 
1472  /* Re-lock the buffer header */
1473  buf_state = LockBufHdr(buf);
1474 
1475  /* If it's changed while we were waiting for lock, do nothing */
1476  if (!BufferTagsEqual(&buf->tag, &oldTag))
1477  {
1478  UnlockBufHdr(buf, buf_state);
1479  LWLockRelease(oldPartitionLock);
1480  return;
1481  }
1482 
1483  /*
1484  * We assume the only reason for it to be pinned is that someone else is
1485  * flushing the page out. Wait for them to finish. (This could be an
1486  * infinite loop if the refcount is messed up... it would be nice to time
1487  * out after awhile, but there seems no way to be sure how many loops may
1488  * be needed. Note that if the other guy has pinned the buffer but not
1489  * yet done StartBufferIO, WaitIO will fall through and we'll effectively
1490  * be busy-looping here.)
1491  */
1492  if (BUF_STATE_GET_REFCOUNT(buf_state) != 0)
1493  {
1494  UnlockBufHdr(buf, buf_state);
1495  LWLockRelease(oldPartitionLock);
1496  /* safety check: should definitely not be our *own* pin */
1498  elog(ERROR, "buffer is pinned in InvalidateBuffer");
1499  WaitIO(buf);
1500  goto retry;
1501  }
1502 
1503  /*
1504  * Clear out the buffer's tag and flags. We must do this to ensure that
1505  * linear scans of the buffer array don't think the buffer is valid.
1506  */
1507  oldFlags = buf_state & BUF_FLAG_MASK;
1508  ClearBufferTag(&buf->tag);
1509  buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
1510  UnlockBufHdr(buf, buf_state);
1511 
1512  /*
1513  * Remove the buffer from the lookup hashtable, if it was in there.
1514  */
1515  if (oldFlags & BM_TAG_VALID)
1516  BufTableDelete(&oldTag, oldHash);
1517 
1518  /*
1519  * Done with mapping lock.
1520  */
1521  LWLockRelease(oldPartitionLock);
1522 
1523  /*
1524  * Insert the buffer at the head of the list of free buffers.
1525  */
1527 }
1528 
1529 /*
1530  * Helper routine for GetVictimBuffer()
1531  *
1532  * Needs to be called on a buffer with a valid tag, pinned, but without the
1533  * buffer header spinlock held.
1534  *
1535  * Returns true if the buffer can be reused, in which case the buffer is only
1536  * pinned by this backend and marked as invalid, false otherwise.
1537  */
1538 static bool
1539 InvalidateVictimBuffer(BufferDesc *buf_hdr)
1540 {
1541  uint32 buf_state;
1542  uint32 hash;
1543  LWLock *partition_lock;
1544  BufferTag tag;
1545 
1547 
1548  /* have buffer pinned, so it's safe to read tag without lock */
1549  tag = buf_hdr->tag;
1550 
1551  hash = BufTableHashCode(&tag);
1552  partition_lock = BufMappingPartitionLock(hash);
1553 
1554  LWLockAcquire(partition_lock, LW_EXCLUSIVE);
1555 
1556  /* lock the buffer header */
1557  buf_state = LockBufHdr(buf_hdr);
1558 
1559  /*
1560  * We have the buffer pinned, so nobody else should have been able to unset
1561  * this concurrently.
1562  */
1563  Assert(buf_state & BM_TAG_VALID);
1564  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
1565  Assert(BufferTagsEqual(&buf_hdr->tag, &tag));
1566 
1567  /*
1568  * If somebody else pinned the buffer since, or even worse, dirtied it,
1569  * give up on this buffer: It's clearly in use.
1570  */
1571  if (BUF_STATE_GET_REFCOUNT(buf_state) != 1 || (buf_state & BM_DIRTY))
1572  {
1573  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
1574 
1575  UnlockBufHdr(buf_hdr, buf_state);
1576  LWLockRelease(partition_lock);
1577 
1578  return false;
1579  }
1580 
1581  /*
1582  * Clear out the buffer's tag and flags and usagecount. This is not
1583  * strictly required, as BM_TAG_VALID/BM_VALID needs to be checked before
1584  * doing anything with the buffer. But currently it's beneficial, as the
1585  * cheaper pre-check for several linear scans of shared buffers uses the
1586  * tag (see e.g. FlushDatabaseBuffers()).
1587  */
1588  ClearBufferTag(&buf_hdr->tag);
1589  buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
1590  UnlockBufHdr(buf_hdr, buf_state);
1591 
1592  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
1593 
1594  /* finally delete buffer from the buffer mapping table */
1595  BufTableDelete(&tag, hash);
1596 
1597  LWLockRelease(partition_lock);
1598 
1599  Assert(!(buf_state & (BM_DIRTY | BM_VALID | BM_TAG_VALID)));
1600  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
1602 
1603  return true;
1604 }
1605 
1606 static Buffer
1607 GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
1608 {
1609  BufferDesc *buf_hdr;
1610  Buffer buf;
1611  uint32 buf_state;
1612  bool from_ring;
1613 
1614  /*
1615  * Ensure, while the spinlock's not yet held, that there's a free refcount
1616  * entry, and a resource owner slot for the pin.
1617  */
1620 
1621  /* we return here if a prospective victim buffer gets used concurrently */
1622 again:
1623 
1624  /*
1625  * Select a victim buffer. The buffer is returned with its header
1626  * spinlock still held!
1627  */
1628  buf_hdr = StrategyGetBuffer(strategy, &buf_state, &from_ring);
1629  buf = BufferDescriptorGetBuffer(buf_hdr);
1630 
1631  Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 0);
1632 
1633  /* Pin the buffer and then release the buffer spinlock */
1634  PinBuffer_Locked(buf_hdr);
1635 
1636  /*
1637  * We shouldn't have any other pins for this buffer.
1638  */
1640 
1641  /*
1642  * If the buffer was dirty, try to write it out. There is a race
1643  * condition here, in that someone might dirty it after we released the
1644  * buffer header lock above, or even while we are writing it out (since
1645  * our share-lock won't prevent hint-bit updates). We will recheck the
1646  * dirty bit after re-locking the buffer header.
1647  */
1648  if (buf_state & BM_DIRTY)
1649  {
1650  LWLock *content_lock;
1651 
1652  Assert(buf_state & BM_TAG_VALID);
1653  Assert(buf_state & BM_VALID);
1654 
1655  /*
1656  * We need a share-lock on the buffer contents to write it out (else
1657  * we might write invalid data, eg because someone else is compacting
1658  * the page contents while we write). We must use a conditional lock
1659  * acquisition here to avoid deadlock. Even though the buffer was not
1660  * pinned (and therefore surely not locked) when StrategyGetBuffer
1661  * returned it, someone else could have pinned and exclusive-locked it
1662  * by the time we get here. If we try to get the lock unconditionally,
1663  * we'd block waiting for them; if they later block waiting for us,
1664  * deadlock ensues. (This has been observed to happen when two
1665  * backends are both trying to split btree index pages, and the second
1666  * one just happens to be trying to split the page the first one got
1667  * from StrategyGetBuffer.)
1668  */
1669  content_lock = BufferDescriptorGetContentLock(buf_hdr);
1670  if (!LWLockConditionalAcquire(content_lock, LW_SHARED))
1671  {
1672  /*
1673  * Someone else has locked the buffer, so give it up and loop back
1674  * to get another one.
1675  */
1676  UnpinBuffer(buf_hdr);
1677  goto again;
1678  }
1679 
1680  /*
1681  * If using a nondefault strategy, and writing the buffer would
1682  * require a WAL flush, let the strategy decide whether to go ahead
1683  * and write/reuse the buffer or to choose another victim. We need a
1684  * lock to inspect the page LSN, so this can't be done inside
1685  * StrategyGetBuffer.
1686  */
1687  if (strategy != NULL)
1688  {
1689  XLogRecPtr lsn;
1690 
1691  /* Read the LSN while holding buffer header lock */
1692  buf_state = LockBufHdr(buf_hdr);
1693  lsn = BufferGetLSN(buf_hdr);
1694  UnlockBufHdr(buf_hdr, buf_state);
1695 
1696  if (XLogNeedsFlush(lsn)
1697  && StrategyRejectBuffer(strategy, buf_hdr, from_ring))
1698  {
1699  LWLockRelease(content_lock);
1700  UnpinBuffer(buf_hdr);
1701  goto again;
1702  }
1703  }
1704 
1705  /* OK, do the I/O */
1706  FlushBuffer(buf_hdr, NULL, IOOBJECT_RELATION, io_context);
1707  LWLockRelease(content_lock);
1708 
1710  &buf_hdr->tag);
1711  }
1712 
1713 
1714  if (buf_state & BM_VALID)
1715  {
1716  /*
1717  * When a BufferAccessStrategy is in use, blocks evicted from shared
1718  * buffers are counted as IOOP_EVICT in the corresponding context
1719  * (e.g. IOCONTEXT_BULKWRITE). Shared buffers are evicted by a
1720  * strategy in two cases: 1) while initially claiming buffers for the
1721  * strategy ring 2) to replace an existing strategy ring buffer
1722  * because it is pinned or in use and cannot be reused.
1723  *
1724  * Blocks evicted from buffers already in the strategy ring are
1725  * counted as IOOP_REUSE in the corresponding strategy context.
1726  *
1727  * At this point, we can accurately count evictions and reuses,
1728  * because we have successfully claimed the valid buffer. Previously,
1729  * we may have been forced to release the buffer due to concurrent
1730  * pinners or erroring out.
1731  */
1733  from_ring ? IOOP_REUSE : IOOP_EVICT);
1734  }
1735 
1736  /*
1737  * If the buffer has an entry in the buffer mapping table, delete it. This
1738  * can fail because another backend could have pinned or dirtied the
1739  * buffer.
1740  */
1741  if ((buf_state & BM_TAG_VALID) && !InvalidateVictimBuffer(buf_hdr))
1742  {
1743  UnpinBuffer(buf_hdr);
1744  goto again;
1745  }
1746 
1747  /* a final set of sanity checks */
1748 #ifdef USE_ASSERT_CHECKING
1749  buf_state = pg_atomic_read_u32(&buf_hdr->state);
1750 
1751  Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1);
1752  Assert(!(buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY)));
1753 
1755 #endif
1756 
1757  return buf;
1758 }
1759 
1760 /*
1761  * Limit the number of pins a batch operation may additionally acquire, to
1762  * avoid running out of pinnable buffers.
1763  *
1764  * One additional pin is always allowed, as otherwise the operation likely
1765  * cannot be performed at all.
1766  *
1767  * The number of allowed pins for a backend is computed based on
1768  * shared_buffers and the maximum number of connections possible. That's very
1769  * pessimistic, but outside of toy-sized shared_buffers it should allow
1770  * sufficient pins.
1771  */
1772 static void
1773 LimitAdditionalPins(uint32 *additional_pins)
1774 {
1775  uint32 max_backends;
1776  int max_proportional_pins;
1777 
1778  if (*additional_pins <= 1)
1779  return;
1780 
1781  max_backends = MaxBackends + NUM_AUXILIARY_PROCS;
1782  max_proportional_pins = NBuffers / max_backends;
1783 
1784  /*
1785  * Subtract the approximate number of buffers already pinned by this
1786  * backend. We get the number of "overflowed" pins for free, but don't
1787  * know the number of pins in PrivateRefCountArray. The cost of
1788  * calculating that exactly doesn't seem worth it, so just assume the max.
1789  */
1790  max_proportional_pins -= PrivateRefCountOverflowed + REFCOUNT_ARRAY_ENTRIES;
1791 
1792  if (max_proportional_pins <= 0)
1793  max_proportional_pins = 1;
1794 
1795  if (*additional_pins > max_proportional_pins)
1796  *additional_pins = max_proportional_pins;
1797 }
1798 
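A worked example with made-up numbers: assume NBuffers = 16384 (128MB of shared_buffers), MaxBackends + NUM_AUXILIARY_PROCS = 128, and no overflowed private refcounts; example_limit_pins is hypothetical.

    static void
    example_limit_pins(void)
    {
        uint32      additional_pins = 200;  /* what a batch operation asks for */

        /*
         * With the numbers above: max_proportional_pins = 16384 / 128 = 128,
         * minus (PrivateRefCountOverflowed = 0) + REFCOUNT_ARRAY_ENTRIES = 8,
         * leaving 120.
         */
        LimitAdditionalPins(&additional_pins);
        Assert(additional_pins == 120);
    }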
1799 /*
1800  * Logic shared between ExtendBufferedRelBy(), ExtendBufferedRelTo(). Just to
1801  * avoid duplicating the tracing and relpersistence related logic.
1802  */
1803 static BlockNumber
1804 ExtendBufferedRelCommon(BufferManagerRelation bmr,
1805  ForkNumber fork,
1806  BufferAccessStrategy strategy,
1807  uint32 flags,
1808  uint32 extend_by,
1809  BlockNumber extend_upto,
1810  Buffer *buffers,
1811  uint32 *extended_by)
1812 {
1813  BlockNumber first_block;
1814 
1815  TRACE_POSTGRESQL_BUFFER_EXTEND_START(fork,
1819  bmr.smgr->smgr_rlocator.backend,
1820  extend_by);
1821 
1822  if (bmr.relpersistence == RELPERSISTENCE_TEMP)
1823  first_block = ExtendBufferedRelLocal(bmr, fork, flags,
1824  extend_by, extend_upto,
1825  buffers, &extend_by);
1826  else
1827  first_block = ExtendBufferedRelShared(bmr, fork, strategy, flags,
1828  extend_by, extend_upto,
1829  buffers, &extend_by);
1830  *extended_by = extend_by;
1831 
1832  TRACE_POSTGRESQL_BUFFER_EXTEND_DONE(fork,
1836  bmr.smgr->smgr_rlocator.backend,
1837  *extended_by,
1838  first_block);
1839 
1840  return first_block;
1841 }
1842 
1843 /*
1844  * Implementation of ExtendBufferedRelBy() and ExtendBufferedRelTo() for
1845  * shared buffers.
1846  */
1847 static BlockNumber
1848 ExtendBufferedRelShared(BufferManagerRelation bmr,
1849  ForkNumber fork,
1850  BufferAccessStrategy strategy,
1851  uint32 flags,
1852  uint32 extend_by,
1853  BlockNumber extend_upto,
1854  Buffer *buffers,
1855  uint32 *extended_by)
1856 {
1857  BlockNumber first_block;
1858  IOContext io_context = IOContextForStrategy(strategy);
1859  instr_time io_start;
1860 
1861  LimitAdditionalPins(&extend_by);
1862 
1863  /*
1864  * Acquire victim buffers for extension without holding extension lock.
1865  * Writing out victim buffers is the most expensive part of extending the
1866  * relation, particularly when doing so requires WAL flushes. Zeroing out
1867  * the buffers is also quite expensive, so do that before holding the
1868  * extension lock as well.
1869  *
1870  * These pages are pinned by us and not valid. While we hold the pin they
1871  * can't be acquired as victim buffers by another backend.
1872  */
1873  for (uint32 i = 0; i < extend_by; i++)
1874  {
1875  Block buf_block;
1876 
1877  buffers[i] = GetVictimBuffer(strategy, io_context);
1878  buf_block = BufHdrGetBlock(GetBufferDescriptor(buffers[i] - 1));
1879 
1880  /* new buffers are zero-filled */
1881  MemSet((char *) buf_block, 0, BLCKSZ);
1882  }
1883 
1884  /*
1885  * Lock relation against concurrent extensions, unless requested not to.
1886  *
1887  * We use the same extension lock for all forks. That's unnecessarily
1888  * restrictive, but currently extensions for forks don't happen often
1889  * enough to make it worth locking more granularly.
1890  *
1891  * Note that another backend might have extended the relation by the time
1892  * we get the lock.
1893  */
1894  if (!(flags & EB_SKIP_EXTENSION_LOCK))
1895  LockRelationForExtension(bmr.rel, ExclusiveLock);
1896 
1897  /*
1898  * If requested, invalidate size cache, so that smgrnblocks asks the
1899  * kernel.
1900  */
1901  if (flags & EB_CLEAR_SIZE_CACHE)
1902  bmr.smgr->smgr_cached_nblocks[fork] = InvalidBlockNumber;
1903 
1904  first_block = smgrnblocks(bmr.smgr, fork);
1905 
1906  /*
1907  * Now that we have the accurate relation size, check if the caller wants
1908  * us to extend to only up to a specific size. If there were concurrent
1909  * extensions, we might have acquired too many buffers and need to release
1910  * them.
1911  */
1912  if (extend_upto != InvalidBlockNumber)
1913  {
1914  uint32 orig_extend_by = extend_by;
1915 
1916  if (first_block > extend_upto)
1917  extend_by = 0;
1918  else if ((uint64) first_block + extend_by > extend_upto)
1919  extend_by = extend_upto - first_block;
1920 
1921  for (uint32 i = extend_by; i < orig_extend_by; i++)
1922  {
1923  BufferDesc *buf_hdr = GetBufferDescriptor(buffers[i] - 1);
1924 
1925  /*
1926  * The victim buffer we acquired previously is clean and unused,
1927  * let it be found again quickly
1928  */
1929  StrategyFreeBuffer(buf_hdr);
1930  UnpinBuffer(buf_hdr);
1931  }
1932 
1933  if (extend_by == 0)
1934  {
1935  if (!(flags & EB_SKIP_EXTENSION_LOCK))
1936  UnlockRelationForExtension(bmr.rel, ExclusiveLock);
1937  *extended_by = extend_by;
1938  return first_block;
1939  }
1940  }
1941 
1942  /* Fail if relation is already at maximum possible length */
1943  if ((uint64) first_block + extend_by >= MaxBlockNumber)
1944  ereport(ERROR,
1945  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1946  errmsg("cannot extend relation %s beyond %u blocks",
1947  relpath(bmr.smgr->smgr_rlocator, fork),
1948  MaxBlockNumber)));
1949 
1950  /*
1951  * Insert buffers into buffer table, mark as IO_IN_PROGRESS.
1952  *
1953  * This needs to happen before we extend the relation, because as soon as
1954  * we do, other backends can start to read in those pages.
1955  */
1956  for (uint32 i = 0; i < extend_by; i++)
1957  {
1958  Buffer victim_buf = buffers[i];
1959  BufferDesc *victim_buf_hdr = GetBufferDescriptor(victim_buf - 1);
1960  BufferTag tag;
1961  uint32 hash;
1962  LWLock *partition_lock;
1963  int existing_id;
1964 
1965  /* in case we need to pin an existing buffer below */
1966  ResourceOwnerEnlarge(CurrentResourceOwner);
1967  ReservePrivateRefCountEntry();
1968 
1969  InitBufferTag(&tag, &bmr.smgr->smgr_rlocator.locator, fork, first_block + i);
1970  hash = BufTableHashCode(&tag);
1971  partition_lock = BufMappingPartitionLock(hash);
1972 
1973  LWLockAcquire(partition_lock, LW_EXCLUSIVE);
1974 
1975  existing_id = BufTableInsert(&tag, hash, victim_buf_hdr->buf_id);
1976 
1977  /*
1978  * We get here only in the corner case where we are trying to extend
1979  * the relation but we found a pre-existing buffer. This can happen
1980  * because a prior attempt at extending the relation failed, and
1981  * because mdread doesn't complain about reads beyond EOF (when
1982  * zero_damaged_pages is ON) and so a previous attempt to read a block
1983  * beyond EOF could have left a "valid" zero-filled buffer.
1984  * Unfortunately, we have also seen this case occurring because of
1985  * buggy Linux kernels that sometimes return an lseek(SEEK_END) result
1986  * that doesn't account for a recent write. In that situation, the
1987  * pre-existing buffer would contain valid data that we don't want to
1988  * overwrite. Since the legitimate cases should always have left a
1989  * zero-filled buffer, complain if not PageIsNew.
1990  */
1991  if (existing_id >= 0)
1992  {
1993  BufferDesc *existing_hdr = GetBufferDescriptor(existing_id);
1994  Block buf_block;
1995  bool valid;
1996 
1997  /*
1998  * Pin the existing buffer before releasing the partition lock,
1999  * preventing it from being evicted.
2000  */
2001  valid = PinBuffer(existing_hdr, strategy);
2002 
2003  LWLockRelease(partition_lock);
2004 
2005  /*
2006  * The victim buffer we acquired previously is clean and unused,
2007  * let it be found again quickly
2008  */
2009  StrategyFreeBuffer(victim_buf_hdr);
2010  UnpinBuffer(victim_buf_hdr);
2011 
2012  buffers[i] = BufferDescriptorGetBuffer(existing_hdr);
2013  buf_block = BufHdrGetBlock(existing_hdr);
2014 
2015  if (valid && !PageIsNew((Page) buf_block))
2016  ereport(ERROR,
2017  (errmsg("unexpected data beyond EOF in block %u of relation %s",
2018  existing_hdr->tag.blockNum, relpath(bmr.smgr->smgr_rlocator, fork)),
2019  errhint("This has been seen to occur with buggy kernels; consider updating your system.")));
2020 
2021  /*
2022  * We *must* do smgr[zero]extend before succeeding, else the page
2023  * will not be reserved by the kernel, and the next P_NEW call
2024  * will decide to return the same page. Clear the BM_VALID bit,
2025  * do StartBufferIO() and proceed.
2026  *
2027  * Loop to handle the very small possibility that someone re-sets
2028  * BM_VALID between our clearing it and StartBufferIO inspecting
2029  * it.
2030  */
2031  do
2032  {
2033  uint32 buf_state = LockBufHdr(existing_hdr);
2034 
2035  buf_state &= ~BM_VALID;
2036  UnlockBufHdr(existing_hdr, buf_state);
2037  } while (!StartBufferIO(existing_hdr, true));
2038  }
2039  else
2040  {
2041  uint32 buf_state;
2042 
2043  buf_state = LockBufHdr(victim_buf_hdr);
2044 
2045  /* some sanity checks while we hold the buffer header lock */
2046  Assert(!(buf_state & (BM_VALID | BM_TAG_VALID | BM_DIRTY | BM_JUST_DIRTIED)));
2047  Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1);
2048 
2049  victim_buf_hdr->tag = tag;
2050 
2051  buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
2052  if (bmr.relpersistence == RELPERSISTENCE_PERMANENT || fork == INIT_FORKNUM)
2053  buf_state |= BM_PERMANENT;
2054 
2055  UnlockBufHdr(victim_buf_hdr, buf_state);
2056 
2057  LWLockRelease(partition_lock);
2058 
2059  /* XXX: could combine the locked operations in it with the above */
2060  StartBufferIO(victim_buf_hdr, true);
2061  }
2062  }
2063 
2065 
2066  /*
2067  * Note: if smgrzeroextend fails, we will end up with buffers that are
2068  * allocated but not marked BM_VALID. The next relation extension will
2069  * still select the same block number (because the relation didn't get any
2070  * longer on disk) and so future attempts to extend the relation will find
2071  * the same buffers (if they have not been recycled) but come right back
2072  * here to try smgrzeroextend again.
2073  *
2074  * We don't need to set checksum for all-zero pages.
2075  */
2076  smgrzeroextend(bmr.smgr, fork, first_block, extend_by, false);
2077 
2078  /*
2079  * Release the file-extension lock; it's now OK for someone else to extend
2080  * the relation some more.
2081  *
2082  * We remove IO_IN_PROGRESS after this, as waking up waiting backends can
2083  * take noticeable time.
2084  */
2085  if (!(flags & EB_SKIP_EXTENSION_LOCK))
2086  UnlockRelationForExtension(bmr.rel, ExclusiveLock);
2087 
2088  pgstat_count_io_op_time(IOOBJECT_RELATION, io_context, IOOP_EXTEND,
2089  io_start, extend_by);
2090 
2091  /* Set BM_VALID, terminate IO, and wake up any waiters */
2092  for (uint32 i = 0; i < extend_by; i++)
2093  {
2094  Buffer buf = buffers[i];
2095  BufferDesc *buf_hdr = GetBufferDescriptor(buf - 1);
2096  bool lock = false;
2097 
2098  if (flags & EB_LOCK_FIRST && i == 0)
2099  lock = true;
2100  else if (flags & EB_LOCK_TARGET)
2101  {
2102  Assert(extend_upto != InvalidBlockNumber);
2103  if (first_block + i + 1 == extend_upto)
2104  lock = true;
2105  }
2106 
2107  if (lock)
2108  LWLockAcquire(BufferDescriptorGetContentLock(buf_hdr), LW_EXCLUSIVE);
2109 
2110  TerminateBufferIO(buf_hdr, false, BM_VALID, true);
2111  }
2112 
2113  pgBufferUsage.shared_blks_written += extend_by;
2114 
2115  *extended_by = extend_by;
2116 
2117  return first_block;
2118 }
2119 
2120 /*
2121  * BufferIsExclusiveLocked
2122  *
2123  * Checks if buffer is exclusive-locked.
2124  *
2125  * Buffer must be pinned.
2126  */
2127 bool
2128 BufferIsExclusiveLocked(Buffer buffer)
2129 {
2130  BufferDesc *bufHdr;
2131 
2132  if (BufferIsLocal(buffer))
2133  {
2134  int bufid = -buffer - 1;
2135 
2136  bufHdr = GetLocalBufferDescriptor(bufid);
2137  }
2138  else
2139  {
2140  bufHdr = GetBufferDescriptor(buffer - 1);
2141  }
2142 
2143  Assert(BufferIsPinned(buffer));
2144  return LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
2145  LW_EXCLUSIVE);
2146 }
2147 
2148 /*
2149  * BufferIsDirty
2150  *
2151  * Checks if buffer is already dirty.
2152  *
2153  * Buffer must be pinned and exclusive-locked. (Without an exclusive lock,
2154  * the result may be stale before it's returned.)
2155  */
2156 bool
2157 BufferIsDirty(Buffer buffer)
2158 {
2159  BufferDesc *bufHdr;
2160 
2161  if (BufferIsLocal(buffer))
2162  {
2163  int bufid = -buffer - 1;
2164 
2165  bufHdr = GetLocalBufferDescriptor(bufid);
2166  }
2167  else
2168  {
2169  bufHdr = GetBufferDescriptor(buffer - 1);
2170  }
2171 
2172  Assert(BufferIsPinned(buffer));
2173  Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
2174  LW_EXCLUSIVE));
2175 
2176  return pg_atomic_read_u32(&bufHdr->state) & BM_DIRTY;
2177 }
2178 
2179 /*
2180  * MarkBufferDirty
2181  *
2182  * Marks buffer contents as dirty (actual write happens later).
2183  *
2184  * Buffer must be pinned and exclusive-locked. (If caller does not hold
2185  * exclusive lock, then somebody could be in process of writing the buffer,
2186  * leading to risk of bad data written to disk.)
2187  */
2188 void
2189 MarkBufferDirty(Buffer buffer)
2190 {
2191  BufferDesc *bufHdr;
2192  uint32 buf_state;
2193  uint32 old_buf_state;
2194 
2195  if (!BufferIsValid(buffer))
2196  elog(ERROR, "bad buffer ID: %d", buffer);
2197 
2198  if (BufferIsLocal(buffer))
2199  {
2200  MarkLocalBufferDirty(buffer);
2201  return;
2202  }
2203 
2204  bufHdr = GetBufferDescriptor(buffer - 1);
2205 
2206  Assert(BufferIsPinned(buffer));
2207  Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
2208  LW_EXCLUSIVE));
2209 
2210  old_buf_state = pg_atomic_read_u32(&bufHdr->state);
2211  for (;;)
2212  {
2213  if (old_buf_state & BM_LOCKED)
2214  old_buf_state = WaitBufHdrUnlocked(bufHdr);
2215 
2216  buf_state = old_buf_state;
2217 
2218  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
2219  buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
2220 
2221  if (pg_atomic_compare_exchange_u32(&bufHdr->state, &old_buf_state,
2222  buf_state))
2223  break;
2224  }
2225 
2226  /*
2227  * If the buffer was not dirty already, do vacuum accounting.
2228  */
2229  if (!(old_buf_state & BM_DIRTY))
2230  {
2231  VacuumPageDirty++;
2232  pgBufferUsage.shared_blks_dirtied++;
2233  if (VacuumCostActive)
2234  VacuumCostBalance += VacuumCostPageDirty;
2235  }
2236 }
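
A hedged sketch of the pin/lock/dirty discipline the comment above requires. It is not part of bufmgr.c, assumes a valid rel/blkno pair, and assumes WAL-logging of the page change is handled elsewhere by the caller.

static void
touch_page_example(Relation rel, BlockNumber blkno)
{
	Buffer		buf = ReadBuffer(rel, blkno);	/* pins the buffer */

	LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);		/* exclusive lock before dirtying */
	/* ... modify BufferGetPage(buf) here ... */
	MarkBufferDirty(buf);						/* actual disk write happens later */
	UnlockReleaseBuffer(buf);					/* drop the lock, then the pin */
}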
2237 
2238 /*
2239  * ReleaseAndReadBuffer -- combine ReleaseBuffer() and ReadBuffer()
2240  *
2241  * Formerly, this saved one cycle of acquiring/releasing the BufMgrLock
2242  * compared to calling the two routines separately. Now it's mainly just
2243  * a convenience function. However, if the passed buffer is valid and
2244  * already contains the desired block, we just return it as-is; and that
2245  * does save considerable work compared to a full release and reacquire.
2246  *
2247  * Note: it is OK to pass buffer == InvalidBuffer, indicating that no old
2248  * buffer actually needs to be released. This case is the same as ReadBuffer,
2249  * but can save some tests in the caller.
2250  */
2251 Buffer
2253  Relation relation,
2254  BlockNumber blockNum)
2255 {
2256  ForkNumber forkNum = MAIN_FORKNUM;
2257  BufferDesc *bufHdr;
2258 
2259  if (BufferIsValid(buffer))
2260  {
2261  Assert(BufferIsPinned(buffer));
2262  if (BufferIsLocal(buffer))
2263  {
2264  bufHdr = GetLocalBufferDescriptor(-buffer - 1);
2265  if (bufHdr->tag.blockNum == blockNum &&
2266  BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
2267  BufTagGetForkNum(&bufHdr->tag) == forkNum)
2268  return buffer;
2269  UnpinLocalBuffer(buffer);
2270  }
2271  else
2272  {
2273  bufHdr = GetBufferDescriptor(buffer - 1);
2274  /* we have pin, so it's ok to examine tag without spinlock */
2275  if (bufHdr->tag.blockNum == blockNum &&
2276  BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
2277  BufTagGetForkNum(&bufHdr->tag) == forkNum)
2278  return buffer;
2279  UnpinBuffer(bufHdr);
2280  }
2281  }
2282 
2283  return ReadBuffer(relation, blockNum);
2284 }
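
A hedged sketch of how a block-by-block reader benefits from the fast path above. Not part of bufmgr.c; it assumes the relation's length stays stable while we walk it.

static void
walk_main_fork_example(Relation rel)
{
	BlockNumber nblocks = RelationGetNumberOfBlocks(rel);
	Buffer		buf = InvalidBuffer;

	for (BlockNumber blkno = 0; blkno < nblocks; blkno++)
	{
		/* drops the previous pin unless it already holds the wanted block */
		buf = ReleaseAndReadBuffer(buf, rel, blkno);
		LockBuffer(buf, BUFFER_LOCK_SHARE);
		/* ... examine BufferGetPage(buf) ... */
		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
	}
	if (BufferIsValid(buf))
		ReleaseBuffer(buf);
}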
2285 
2286 /*
2287  * PinBuffer -- make buffer unavailable for replacement.
2288  *
2289  * For the default access strategy, the buffer's usage_count is incremented
2290  * when we first pin it; for other strategies we just make sure the usage_count
2291  * isn't zero. (The idea of the latter is that we don't want synchronized
2292  * heap scans to inflate the count, but we need it to not be zero to discourage
2293  * other backends from stealing buffers from our ring. As long as we cycle
2294  * through the ring faster than the global clock-sweep cycles, buffers in
2295  * our ring won't be chosen as victims for replacement by other backends.)
2296  *
2297  * This should be applied only to shared buffers, never local ones.
2298  *
2299  * Since buffers are pinned/unpinned very frequently, pin buffers without
2300  * taking the buffer header lock; instead update the state variable in a loop of
2301  * CAS operations. Hopefully it's just a single CAS.
2302  *
2303  * Note that ResourceOwnerEnlarge() and ReservePrivateRefCountEntry()
2304  * must have been done already.
2305  *
2306  * Returns true if buffer is BM_VALID, else false. This provision allows
2307  * some callers to avoid an extra spinlock cycle.
2308  */
2309 static bool
2310 PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
2311 {
2312  Buffer b = BufferDescriptorGetBuffer(buf);
2313  bool result;
2314  PrivateRefCountEntry *ref;
2315 
2316  Assert(!BufferIsLocal(b));
2317  Assert(ReservedRefCountEntry != NULL);
2318 
2319  ref = GetPrivateRefCountEntry(b, true);
2320 
2321  if (ref == NULL)
2322  {
2323  uint32 buf_state;
2324  uint32 old_buf_state;
2325 
2326  ref = NewPrivateRefCountEntry(b);
2327 
2328  old_buf_state = pg_atomic_read_u32(&buf->state);
2329  for (;;)
2330  {
2331  if (old_buf_state & BM_LOCKED)
2332  old_buf_state = WaitBufHdrUnlocked(buf);
2333 
2334  buf_state = old_buf_state;
2335 
2336  /* increase refcount */
2337  buf_state += BUF_REFCOUNT_ONE;
2338 
2339  if (strategy == NULL)
2340  {
2341  /* Default case: increase usagecount unless already max. */
2342  if (BUF_STATE_GET_USAGECOUNT(buf_state) != BM_MAX_USAGE_COUNT)
2343  buf_state += BUF_USAGECOUNT_ONE;
2344  }
2345  else
2346  {
2347  /*
2348  * Ring buffers shouldn't evict others from pool. Thus we
2349  * don't make usagecount more than 1.
2350  */
2351  if (BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
2352  buf_state += BUF_USAGECOUNT_ONE;
2353  }
2354 
2355  if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
2356  buf_state))
2357  {
2358  result = (buf_state & BM_VALID) != 0;
2359 
2360  /*
2361  * Assume that we acquired a buffer pin for the purposes of
2362  * Valgrind buffer client checks (even in !result case) to
2363  * keep things simple. Buffers that are unsafe to access are
2364  * not generally guaranteed to be marked undefined or
2365  * non-accessible in any case.
2366  */
2368  break;
2369  }
2370  }
2371  }
2372  else
2373  {
2374  /*
2375  * If we previously pinned the buffer, it must surely be valid.
2376  *
2377  * Note: We deliberately avoid a Valgrind client request here.
2378  * Individual access methods can optionally superimpose buffer page
2379  * client requests on top of our client requests to enforce that
2380  * buffers are only accessed while locked (and pinned). It's possible
2381  * that the buffer page is legitimately non-accessible here. We
2382  * cannot meddle with that.
2383  */
2384  result = true;
2385  }
2386 
2387  ref->refcount++;
2388  Assert(ref->refcount > 0);
2389  ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
2390  return result;
2391 }
2392 
2393 /*
2394  * PinBuffer_Locked -- as above, but caller already locked the buffer header.
2395  * The spinlock is released before return.
2396  *
2397  * As this function is called with the spinlock held, the caller has to
2398  * previously call ReservePrivateRefCountEntry() and
2399  * ResourceOwnerEnlarge(CurrentResourceOwner);
2400  *
2401  * Currently, no callers of this function want to modify the buffer's
2402  * usage_count at all, so there's no need for a strategy parameter.
2403  * Also we don't bother with a BM_VALID test (the caller could check that for
2404  * itself).
2405  *
2406  * Also all callers only ever use this function when it's known that the
2407  * buffer can't have a preexisting pin by this backend. That allows us to skip
2408  * searching the private refcount array & hash, which is a boon, because the
2409  * spinlock is still held.
2410  *
2411  * Note: use of this routine is frequently mandatory, not just an optimization
2412  * to save a spin lock/unlock cycle, because we need to pin a buffer before
2413  * its state can change under us.
2414  */
2415 static void
2416 PinBuffer_Locked(BufferDesc *buf)
2417 {
2418  Buffer b;
2419  PrivateRefCountEntry *ref;
2420  uint32 buf_state;
2421 
2422  /*
2423  * As explained, we don't expect any preexisting pins. That allows us to
2424  * manipulate the PrivateRefCount after releasing the spinlock
2425  */
2427 
2428  /*
2429  * Buffer can't have a preexisting pin, so mark its page as defined to
2430  * Valgrind (this is similar to the PinBuffer() case where the backend
2431  * doesn't already have a buffer pin)
2432  */
2434 
2435  /*
2436  * Since we hold the buffer spinlock, we can update the buffer state and
2437  * release the lock in one operation.
2438  */
2439  buf_state = pg_atomic_read_u32(&buf->state);
2440  Assert(buf_state & BM_LOCKED);
2441  buf_state += BUF_REFCOUNT_ONE;
2442  UnlockBufHdr(buf, buf_state);
2443 
2444  b = BufferDescriptorGetBuffer(buf);
2445 
2446  ref = NewPrivateRefCountEntry(b);
2447  ref->refcount++;
2448 
2449  ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
2450 }
2451 
2452 /*
2453  * UnpinBuffer -- make buffer available for replacement.
2454  *
2455  * This should be applied only to shared buffers, never local ones. This
2456  * always adjusts CurrentResourceOwner.
2457  */
2458 static void
2459 UnpinBuffer(BufferDesc *buf)
2460 {
2461  Buffer b = BufferDescriptorGetBuffer(buf);
2462 
2463  ResourceOwnerForgetBuffer(CurrentResourceOwner, b);
2464  UnpinBufferNoOwner(buf);
2465 }
2466 
2467 static void
2468 UnpinBufferNoOwner(BufferDesc *buf)
2469 {
2470  PrivateRefCountEntry *ref;
2471  Buffer b = BufferDescriptorGetBuffer(buf);
2472 
2473  Assert(!BufferIsLocal(b));
2474 
2475  /* not moving as we're likely deleting it soon anyway */
2476  ref = GetPrivateRefCountEntry(b, false);
2477  Assert(ref != NULL);
2478  Assert(ref->refcount > 0);
2479  ref->refcount--;
2480  if (ref->refcount == 0)
2481  {
2482  uint32 buf_state;
2483  uint32 old_buf_state;
2484 
2485  /*
2486  * Mark buffer non-accessible to Valgrind.
2487  *
2488  * Note that the buffer may have already been marked non-accessible
2489  * within access method code that enforces that buffers are only
2490  * accessed while a buffer lock is held.
2491  */
2493 
2494  /* I'd better not still hold the buffer content lock */
2495  Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf)));
2496 
2497  /*
2498  * Decrement the shared reference count.
2499  *
2500  * Since buffer spinlock holder can update status using just write,
2501  * it's not safe to use atomic decrement here; thus use a CAS loop.
2502  */
2503  old_buf_state = pg_atomic_read_u32(&buf->state);
2504  for (;;)
2505  {
2506  if (old_buf_state & BM_LOCKED)
2507  old_buf_state = WaitBufHdrUnlocked(buf);
2508 
2509  buf_state = old_buf_state;
2510 
2511  buf_state -= BUF_REFCOUNT_ONE;
2512 
2513  if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
2514  buf_state))
2515  break;
2516  }
2517 
2518  /* Support LockBufferForCleanup() */
2519  if (buf_state & BM_PIN_COUNT_WAITER)
2520  {
2521  /*
2522  * Acquire the buffer header lock, re-check that there's a waiter.
2523  * Another backend could have unpinned this buffer, and already
2524  * woken up the waiter. There's no danger of the buffer being
2525  * replaced after we unpinned it above, as it's pinned by the
2526  * waiter.
2527  */
2528  buf_state = LockBufHdr(buf);
2529 
2530  if ((buf_state & BM_PIN_COUNT_WAITER) &&
2531  BUF_STATE_GET_REFCOUNT(buf_state) == 1)
2532  {
2533  /* we just released the last pin other than the waiter's */
2534  int wait_backend_pgprocno = buf->wait_backend_pgprocno;
2535 
2536  buf_state &= ~BM_PIN_COUNT_WAITER;
2537  UnlockBufHdr(buf, buf_state);
2538  ProcSendSignal(wait_backend_pgprocno);
2539  }
2540  else
2541  UnlockBufHdr(buf, buf_state);
2542  }
2543  ForgetPrivateRefCountEntry(ref);
2544  }
2545 }
2546 
2547 #define ST_SORT sort_checkpoint_bufferids
2548 #define ST_ELEMENT_TYPE CkptSortItem
2549 #define ST_COMPARE(a, b) ckpt_buforder_comparator(a, b)
2550 #define ST_SCOPE static
2551 #define ST_DEFINE
2552 #include <lib/sort_template.h>
2553 
2554 /*
2555  * BufferSync -- Write out all dirty buffers in the pool.
2556  *
2557  * This is called at checkpoint time to write out all dirty shared buffers.
2558  * The checkpoint request flags should be passed in. If CHECKPOINT_IMMEDIATE
2559  * is set, we disable delays between writes; if CHECKPOINT_IS_SHUTDOWN,
2560  * CHECKPOINT_END_OF_RECOVERY or CHECKPOINT_FLUSH_ALL is set, we write even
2561  * unlogged buffers, which are otherwise skipped. The remaining flags
2562  * currently have no effect here.
2563  */
2564 static void
2565 BufferSync(int flags)
2566 {
2567  uint32 buf_state;
2568  int buf_id;
2569  int num_to_scan;
2570  int num_spaces;
2571  int num_processed;
2572  int num_written;
2573  CkptTsStatus *per_ts_stat = NULL;
2574  Oid last_tsid;
2575  binaryheap *ts_heap;
2576  int i;
2577  int mask = BM_DIRTY;
2578  WritebackContext wb_context;
2579 
2580  /*
2581  * Unless this is a shutdown checkpoint or we have been explicitly told,
2582  * we write only permanent, dirty buffers. But at shutdown or end of
2583  * recovery, we write all dirty buffers.
2584  */
2585  if (!((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
2586  CHECKPOINT_FLUSH_ALL))))
2587  mask |= BM_PERMANENT;
2588 
2589  /*
2590  * Loop over all buffers, and mark the ones that need to be written with
2591  * BM_CHECKPOINT_NEEDED. Count them as we go (num_to_scan), so that we
2592  * can estimate how much work needs to be done.
2593  *
2594  * This allows us to write only those pages that were dirty when the
2595  * checkpoint began, and not those that get dirtied while it proceeds.
2596  * Whenever a page with BM_CHECKPOINT_NEEDED is written out, either by us
2597  * later in this function, or by normal backends or the bgwriter cleaning
2598  * scan, the flag is cleared. Any buffer dirtied after this point won't
2599  * have the flag set.
2600  *
2601  * Note that if we fail to write some buffer, we may leave buffers with
2602  * BM_CHECKPOINT_NEEDED still set. This is OK since any such buffer would
2603  * certainly need to be written for the next checkpoint attempt, too.
2604  */
2605  num_to_scan = 0;
2606  for (buf_id = 0; buf_id < NBuffers; buf_id++)
2607  {
2608  BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
2609 
2610  /*
2611  * Header spinlock is enough to examine BM_DIRTY, see comment in
2612  * SyncOneBuffer.
2613  */
2614  buf_state = LockBufHdr(bufHdr);
2615 
2616  if ((buf_state & mask) == mask)
2617  {
2618  CkptSortItem *item;
2619 
2620  buf_state |= BM_CHECKPOINT_NEEDED;
2621 
2622  item = &CkptBufferIds[num_to_scan++];
2623  item->buf_id = buf_id;
2624  item->tsId = bufHdr->tag.spcOid;
2625  item->relNumber = BufTagGetRelNumber(&bufHdr->tag);
2626  item->forkNum = BufTagGetForkNum(&bufHdr->tag);
2627  item->blockNum = bufHdr->tag.blockNum;
2628  }
2629 
2630  UnlockBufHdr(bufHdr, buf_state);
2631 
2632  /* Check for barrier events in case NBuffers is large. */
2633  if (ProcSignalBarrierPending)
2634  ProcessProcSignalBarrier();
2635  }
2636 
2637  if (num_to_scan == 0)
2638  return; /* nothing to do */
2639 
2640  WritebackContextInit(&wb_context, &checkpoint_flush_after);
2641 
2642  TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_scan);
2643 
2644  /*
2645  * Sort buffers that need to be written to reduce the likelihood of random
2646  * IO. The sorting is also important for the implementation of balancing
2647  * writes between tablespaces. Without balancing writes we'd potentially
2648  * end up writing to the tablespaces one-by-one; possibly overloading the
2649  * underlying system.
2650  */
2651  sort_checkpoint_bufferids(CkptBufferIds, num_to_scan);
2652 
2653  num_spaces = 0;
2654 
2655  /*
2656  * Allocate progress status for each tablespace with buffers that need to
2657  * be flushed. This requires the to-be-flushed array to be sorted.
2658  */
2659  last_tsid = InvalidOid;
2660  for (i = 0; i < num_to_scan; i++)
2661  {
2662  CkptTsStatus *s;
2663  Oid cur_tsid;
2664 
2665  cur_tsid = CkptBufferIds[i].tsId;
2666 
2667  /*
2668  * Grow array of per-tablespace status structs, every time a new
2669  * tablespace is found.
2670  */
2671  if (last_tsid == InvalidOid || last_tsid != cur_tsid)
2672  {
2673  Size sz;
2674 
2675  num_spaces++;
2676 
2677  /*
2678  * Not worth adding grow-by-power-of-2 logic here - even with a
2679  * few hundred tablespaces this should be fine.
2680  */
2681  sz = sizeof(CkptTsStatus) * num_spaces;
2682 
2683  if (per_ts_stat == NULL)
2684  per_ts_stat = (CkptTsStatus *) palloc(sz);
2685  else
2686  per_ts_stat = (CkptTsStatus *) repalloc(per_ts_stat, sz);
2687 
2688  s = &per_ts_stat[num_spaces - 1];
2689  memset(s, 0, sizeof(*s));
2690  s->tsId = cur_tsid;
2691 
2692  /*
2693  * The first buffer in this tablespace. As CkptBufferIds is sorted
2694  * by tablespace all (s->num_to_scan) buffers in this tablespace
2695  * will follow afterwards.
2696  */
2697  s->index = i;
2698 
2699  /*
2700  * progress_slice will be determined once we know how many buffers
2701  * are in each tablespace, i.e. after this loop.
2702  */
2703 
2704  last_tsid = cur_tsid;
2705  }
2706  else
2707  {
2708  s = &per_ts_stat[num_spaces - 1];
2709  }
2710 
2711  s->num_to_scan++;
2712 
2713  /* Check for barrier events. */
2714  if (ProcSignalBarrierPending)
2715  ProcessProcSignalBarrier();
2716  }
2717 
2718  Assert(num_spaces > 0);
2719 
2720  /*
2721  * Build a min-heap over the write-progress in the individual tablespaces,
2722  * and compute how large a portion of the total progress a single
2723  * processed buffer is.
2724  */
2725  ts_heap = binaryheap_allocate(num_spaces,
2726  ts_ckpt_progress_comparator,
2727  NULL);
2728 
2729  for (i = 0; i < num_spaces; i++)
2730  {
2731  CkptTsStatus *ts_stat = &per_ts_stat[i];
2732 
2733  ts_stat->progress_slice = (float8) num_to_scan / ts_stat->num_to_scan;
2734 
2735  binaryheap_add_unordered(ts_heap, PointerGetDatum(ts_stat));
2736  }
2737 
2738  binaryheap_build(ts_heap);
2739 
2740  /*
2741  * Iterate through to-be-checkpointed buffers and write the ones (still)
2742  * marked with BM_CHECKPOINT_NEEDED. The writes are balanced between
2743  * tablespaces; otherwise the sorting would lead to only one tablespace
2744  * receiving writes at a time, making inefficient use of the hardware.
2745  */
2746  num_processed = 0;
2747  num_written = 0;
2748  while (!binaryheap_empty(ts_heap))
2749  {
2750  BufferDesc *bufHdr = NULL;
2751  CkptTsStatus *ts_stat = (CkptTsStatus *)
2752  DatumGetPointer(binaryheap_first(ts_heap));
2753 
2754  buf_id = CkptBufferIds[ts_stat->index].buf_id;
2755  Assert(buf_id != -1);
2756 
2757  bufHdr = GetBufferDescriptor(buf_id);
2758 
2759  num_processed++;
2760 
2761  /*
2762  * We don't need to acquire the lock here, because we're only looking
2763  * at a single bit. It's possible that someone else writes the buffer
2764  * and clears the flag right after we check, but that doesn't matter
2765  * since SyncOneBuffer will then do nothing. However, there is a
2766  * further race condition: it's conceivable that between the time we
2767  * examine the bit here and the time SyncOneBuffer acquires the lock,
2768  * someone else not only wrote the buffer but replaced it with another
2769  * page and dirtied it. In that improbable case, SyncOneBuffer will
2770  * write the buffer though we didn't need to. It doesn't seem worth
2771  * guarding against this, though.
2772  */
2773  if (pg_atomic_read_u32(&bufHdr->state) & BM_CHECKPOINT_NEEDED)
2774  {
2775  if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
2776  {
2777  TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
2778  PendingCheckpointerStats.buffers_written++;
2779  num_written++;
2780  }
2781  }
2782 
2783  /*
2784  * Measure progress independent of actually having to flush the buffer
2785  * - otherwise writing becomes unbalanced.
2786  */
2787  ts_stat->progress += ts_stat->progress_slice;
2788  ts_stat->num_scanned++;
2789  ts_stat->index++;
2790 
2791  /* Have all the buffers from the tablespace been processed? */
2792  if (ts_stat->num_scanned == ts_stat->num_to_scan)
2793  {
2794  binaryheap_remove_first(ts_heap);
2795  }
2796  else
2797  {
2798  /* update heap with the new progress */
2799  binaryheap_replace_first(ts_heap, PointerGetDatum(ts_stat));
2800  }
2801 
2802  /*
2803  * Sleep to throttle our I/O rate.
2804  *
2805  * (This will check for barrier events even if it doesn't sleep.)
2806  */
2807  CheckpointWriteDelay(flags, (double) num_processed / num_to_scan);
2808  }
2809 
2810  /*
2811  * Issue all pending flushes. Only checkpointer calls BufferSync(), so
2812  * IOContext will always be IOCONTEXT_NORMAL.
2813  */
2814  IssuePendingWritebacks(&wb_context, IOCONTEXT_NORMAL);
2815 
2816  pfree(per_ts_stat);
2817  per_ts_stat = NULL;
2818  binaryheap_free(ts_heap);
2819 
2820  /*
2821  * Update checkpoint statistics. As noted above, this doesn't include
2822  * buffers written by other backends or bgwriter scan.
2823  */
2824  CheckpointStats.ckpt_bufs_written += num_written;
2825 
2826  TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_scan);
2827 }
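
A standalone illustration of the tablespace-balancing arithmetic used above; it is not part of bufmgr.c and the page counts are invented. Each written page advances its tablespace's progress by total/num_to_scan-of-that-tablespace, so all tablespaces reach the common finish line together and the min-heap always schedules the one that is furthest behind.

#include <stdio.h>

int
main(void)
{
	double		total = 1000.0;		/* all to-be-checkpointed pages */
	double		per_ts[] = {800.0, 150.0, 50.0};	/* hypothetical per-tablespace counts */

	for (int i = 0; i < 3; i++)
		printf("tablespace %d: progress_slice = %.2f per page written\n",
			   i, total / per_ts[i]);
	return 0;
}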
2828 
2829 /*
2830  * BgBufferSync -- Write out some dirty buffers in the pool.
2831  *
2832  * This is called periodically by the background writer process.
2833  *
2834  * Returns true if it's appropriate for the bgwriter process to go into
2835  * low-power hibernation mode. (This happens if the strategy clock sweep
2836  * has been "lapped" and no buffer allocations have occurred recently,
2837  * or if the bgwriter has been effectively disabled by setting
2838  * bgwriter_lru_maxpages to 0.)
2839  */
2840 bool
2841 BgBufferSync(WritebackContext *wb_context)
2842 {
2843  /* info obtained from freelist.c */
2844  int strategy_buf_id;
2845  uint32 strategy_passes;
2846  uint32 recent_alloc;
2847 
2848  /*
2849  * Information saved between calls so we can determine the strategy
2850  * point's advance rate and avoid scanning already-cleaned buffers.
2851  */
2852  static bool saved_info_valid = false;
2853  static int prev_strategy_buf_id;
2854  static uint32 prev_strategy_passes;
2855  static int next_to_clean;
2856  static uint32 next_passes;
2857 
2858  /* Moving averages of allocation rate and clean-buffer density */
2859  static float smoothed_alloc = 0;
2860  static float smoothed_density = 10.0;
2861 
2862  /* Potentially these could be tunables, but for now, not */
2863  float smoothing_samples = 16;
2864  float scan_whole_pool_milliseconds = 120000.0;
2865 
2866  /* Used to compute how far we scan ahead */
2867  long strategy_delta;
2868  int bufs_to_lap;
2869  int bufs_ahead;
2870  float scans_per_alloc;
2871  int reusable_buffers_est;
2872  int upcoming_alloc_est;
2873  int min_scan_buffers;
2874 
2875  /* Variables for the scanning loop proper */
2876  int num_to_scan;
2877  int num_written;
2878  int reusable_buffers;
2879 
2880  /* Variables for final smoothed_density update */
2881  long new_strategy_delta;
2882  uint32 new_recent_alloc;
2883 
2884  /*
2885  * Find out where the freelist clock sweep currently is, and how many
2886  * buffer allocations have happened since our last call.
2887  */
2888  strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc);
2889 
2890  /* Report buffer alloc counts to pgstat */
2891  PendingBgWriterStats.buf_alloc += recent_alloc;
2892 
2893  /*
2894  * If we're not running the LRU scan, just stop after doing the stats
2895  * stuff. We mark the saved state invalid so that we can recover sanely
2896  * if LRU scan is turned back on later.
2897  */
2898  if (bgwriter_lru_maxpages <= 0)
2899  {
2900  saved_info_valid = false;
2901  return true;
2902  }
2903 
2904  /*
2905  * Compute strategy_delta = how many buffers have been scanned by the
2906  * clock sweep since last time. If first time through, assume none. Then
2907  * see if we are still ahead of the clock sweep, and if so, how many
2908  * buffers we could scan before we'd catch up with it and "lap" it. Note:
2909  * weird-looking coding of xxx_passes comparisons are to avoid bogus
2910  * behavior when the passes counts wrap around.
2911  */
2912  if (saved_info_valid)
2913  {
2914  int32 passes_delta = strategy_passes - prev_strategy_passes;
2915 
2916  strategy_delta = strategy_buf_id - prev_strategy_buf_id;
2917  strategy_delta += (long) passes_delta * NBuffers;
2918 
2919  Assert(strategy_delta >= 0);
2920 
2921  if ((int32) (next_passes - strategy_passes) > 0)
2922  {
2923  /* we're one pass ahead of the strategy point */
2924  bufs_to_lap = strategy_buf_id - next_to_clean;
2925 #ifdef BGW_DEBUG
2926  elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
2927  next_passes, next_to_clean,
2928  strategy_passes, strategy_buf_id,
2929  strategy_delta, bufs_to_lap);
2930 #endif
2931  }
2932  else if (next_passes == strategy_passes &&
2933  next_to_clean >= strategy_buf_id)
2934  {
2935  /* on same pass, but ahead or at least not behind */
2936  bufs_to_lap = NBuffers - (next_to_clean - strategy_buf_id);
2937 #ifdef BGW_DEBUG
2938  elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
2939  next_passes, next_to_clean,
2940  strategy_passes, strategy_buf_id,
2941  strategy_delta, bufs_to_lap);
2942 #endif
2943  }
2944  else
2945  {
2946  /*
2947  * We're behind, so skip forward to the strategy point and start
2948  * cleaning from there.
2949  */
2950 #ifdef BGW_DEBUG
2951  elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
2952  next_passes, next_to_clean,
2953  strategy_passes, strategy_buf_id,
2954  strategy_delta);
2955 #endif
2956  next_to_clean = strategy_buf_id;
2957  next_passes = strategy_passes;
2958  bufs_to_lap = NBuffers;
2959  }
2960  }
2961  else
2962  {
2963  /*
2964  * Initializing at startup or after LRU scanning had been off. Always
2965  * start at the strategy point.
2966  */
2967 #ifdef BGW_DEBUG
2968  elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
2969  strategy_passes, strategy_buf_id);
2970 #endif
2971  strategy_delta = 0;
2972  next_to_clean = strategy_buf_id;
2973  next_passes = strategy_passes;
2974  bufs_to_lap = NBuffers;
2975  }
2976 
2977  /* Update saved info for next time */
2978  prev_strategy_buf_id = strategy_buf_id;
2979  prev_strategy_passes = strategy_passes;
2980  saved_info_valid = true;
2981 
2982  /*
2983  * Compute how many buffers had to be scanned for each new allocation, ie,
2984  * 1/density of reusable buffers, and track a moving average of that.
2985  *
2986  * If the strategy point didn't move, we don't update the density estimate
2987  */
2988  if (strategy_delta > 0 && recent_alloc > 0)
2989  {
2990  scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
2991  smoothed_density += (scans_per_alloc - smoothed_density) /
2992  smoothing_samples;
2993  }
2994 
2995  /*
2996  * Estimate how many reusable buffers there are between the current
2997  * strategy point and where we've scanned ahead to, based on the smoothed
2998  * density estimate.
2999  */
3000  bufs_ahead = NBuffers - bufs_to_lap;
3001  reusable_buffers_est = (float) bufs_ahead / smoothed_density;
3002 
3003  /*
3004  * Track a moving average of recent buffer allocations. Here, rather than
3005  * a true average we want a fast-attack, slow-decline behavior: we
3006  * immediately follow any increase.
3007  */
3008  if (smoothed_alloc <= (float) recent_alloc)
3009  smoothed_alloc = recent_alloc;
3010  else
3011  smoothed_alloc += ((float) recent_alloc - smoothed_alloc) /
3012  smoothing_samples;
3013 
3014  /* Scale the estimate by a GUC to allow more aggressive tuning. */
3015  upcoming_alloc_est = (int) (smoothed_alloc * bgwriter_lru_multiplier);
3016 
3017  /*
3018  * If recent_alloc remains at zero for many cycles, smoothed_alloc will
3019  * eventually underflow to zero, and the underflows produce annoying
3020  * kernel warnings on some platforms. Once upcoming_alloc_est has gone to
3021  * zero, there's no point in tracking smaller and smaller values of
3022  * smoothed_alloc, so just reset it to exactly zero to avoid this
3023  * syndrome. It will pop back up as soon as recent_alloc increases.
3024  */
3025  if (upcoming_alloc_est == 0)
3026  smoothed_alloc = 0;
3027 
3028  /*
3029  * Even in cases where there's been little or no buffer allocation
3030  * activity, we want to make a small amount of progress through the buffer
3031  * cache so that as many reusable buffers as possible are clean after an
3032  * idle period.
3033  *
3034  * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many times
3035  * the BGW will be called during the scan_whole_pool time; slice the
3036  * buffer pool into that many sections.
3037  */
3038  min_scan_buffers = (int) (NBuffers / (scan_whole_pool_milliseconds / BgWriterDelay));
3039 
3040  if (upcoming_alloc_est < (min_scan_buffers + reusable_buffers_est))
3041  {
3042 #ifdef BGW_DEBUG
3043  elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
3044  upcoming_alloc_est, min_scan_buffers, reusable_buffers_est);
3045 #endif
3046  upcoming_alloc_est = min_scan_buffers + reusable_buffers_est;
3047  }
3048 
3049  /*
3050  * Now write out dirty reusable buffers, working forward from the
3051  * next_to_clean point, until we have lapped the strategy scan, or cleaned
3052  * enough buffers to match our estimate of the next cycle's allocation
3053  * requirements, or hit the bgwriter_lru_maxpages limit.
3054  */
3055 
3056  num_to_scan = bufs_to_lap;
3057  num_written = 0;
3058  reusable_buffers = reusable_buffers_est;
3059 
3060  /* Execute the LRU scan */
3061  while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
3062  {
3063  int sync_state = SyncOneBuffer(next_to_clean, true,
3064  wb_context);
3065 
3066  if (++next_to_clean >= NBuffers)
3067  {
3068  next_to_clean = 0;
3069  next_passes++;
3070  }
3071  num_to_scan--;
3072 
3073  if (sync_state & BUF_WRITTEN)
3074  {
3075  reusable_buffers++;
3076  if (++num_written >= bgwriter_lru_maxpages)
3077  {
3078  PendingBgWriterStats.maxwritten_clean++;
3079  break;
3080  }
3081  }
3082  else if (sync_state & BUF_REUSABLE)
3083  reusable_buffers++;
3084  }
3085 
3086  PendingBgWriterStats.buf_written_clean += num_written;
3087 
3088 #ifdef BGW_DEBUG
3089  elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
3090  recent_alloc, smoothed_alloc, strategy_delta, bufs_ahead,
3091  smoothed_density, reusable_buffers_est, upcoming_alloc_est,
3092  bufs_to_lap - num_to_scan,
3093  num_written,
3094  reusable_buffers - reusable_buffers_est);
3095 #endif
3096 
3097  /*
3098  * Consider the above scan as being like a new allocation scan.
3099  * Characterize its density and update the smoothed one based on it. This
3100  * effectively halves the moving average period in cases where both the
3101  * strategy and the background writer are doing some useful scanning,
3102  * which is helpful because a long memory isn't as desirable on the
3103  * density estimates.
3104  */
3105  new_strategy_delta = bufs_to_lap - num_to_scan;
3106  new_recent_alloc = reusable_buffers - reusable_buffers_est;
3107  if (new_strategy_delta > 0 && new_recent_alloc > 0)
3108  {
3109  scans_per_alloc = (float) new_strategy_delta / (float) new_recent_alloc;
3110  smoothed_density += (scans_per_alloc - smoothed_density) /
3111  smoothing_samples;
3112 
3113 #ifdef BGW_DEBUG
3114  elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f",
3115  new_recent_alloc, new_strategy_delta,
3116  scans_per_alloc, smoothed_density);
3117 #endif
3118  }
3119 
3120  /* Return true if OK to hibernate */
3121  return (bufs_to_lap == 0 && recent_alloc == 0);
3122 }
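
A standalone illustration of the exponential smoothing used above for the density and allocation estimates. Not part of bufmgr.c; the sample values are invented.

#include <stdio.h>

int
main(void)
{
	float		smoothed_density = 10.0f;	/* same starting value as BgBufferSync */
	float		smoothing_samples = 16.0f;
	float		samples[] = {12.0f, 8.0f, 20.0f};	/* hypothetical scans-per-allocation */

	for (int i = 0; i < 3; i++)
	{
		smoothed_density += (samples[i] - smoothed_density) / smoothing_samples;
		printf("after sample %.1f: smoothed_density = %.3f\n",
			   samples[i], smoothed_density);
	}
	return 0;
}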
3123 
3124 /*
3125  * SyncOneBuffer -- process a single buffer during syncing.
3126  *
3127  * If skip_recently_used is true, we don't write currently-pinned buffers, nor
3128  * buffers marked recently used, as these are not replacement candidates.
3129  *
3130  * Returns a bitmask containing the following flag bits:
3131  * BUF_WRITTEN: we wrote the buffer.
3132  * BUF_REUSABLE: buffer is available for replacement, ie, it has
3133  * pin count 0 and usage count 0.
3134  *
3135  * (BUF_WRITTEN could be set in error if FlushBuffer finds the buffer clean
3136  * after locking it, but we don't care all that much.)
3137  */
3138 static int
3139 SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
3140 {
3141  BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
3142  int result = 0;
3143  uint32 buf_state;
3144  BufferTag tag;
3145 
3146  /* Make sure we can handle the pin */
3147  ReservePrivateRefCountEntry();
3148  ResourceOwnerEnlarge(CurrentResourceOwner);
3149 
3150  /*
3151  * Check whether buffer needs writing.
3152  *
3153  * We can make this check without taking the buffer content lock so long
3154  * as we mark pages dirty in access methods *before* logging changes with
3155  * XLogInsert(): if someone marks the buffer dirty just after our check we
3156  * don't worry because our checkpoint.redo points before the log record for
3157  * upcoming changes, and so we are not required to write such a dirty buffer.
3158  */
3159  buf_state = LockBufHdr(bufHdr);
3160 
3161  if (BUF_STATE_GET_REFCOUNT(buf_state) == 0 &&
3162  BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
3163  {
3164  result |= BUF_REUSABLE;
3165  }
3166  else if (skip_recently_used)
3167  {
3168  /* Caller told us not to write recently-used buffers */
3169  UnlockBufHdr(bufHdr, buf_state);
3170  return result;
3171  }
3172 
3173  if (!(buf_state & BM_VALID) || !(buf_state & BM_DIRTY))
3174  {
3175  /* It's clean, so nothing to do */
3176  UnlockBufHdr(bufHdr, buf_state);
3177  return result;
3178  }
3179 
3180  /*
3181  * Pin it, share-lock it, write it. (FlushBuffer will do nothing if the
3182  * buffer is clean by the time we've locked it.)
3183  */
3184  PinBuffer_Locked(bufHdr);
3185  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
3186 
3187  FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
3188 
3189  LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
3190 
3191  tag = bufHdr->tag;
3192 
3193  UnpinBuffer(bufHdr);
3194 
3195  /*
3196  * SyncOneBuffer() is only called by checkpointer and bgwriter, so
3197  * IOContext will always be IOCONTEXT_NORMAL.
3198  */
3199  ScheduleBufferTagForWriteback(wb_context, IOCONTEXT_NORMAL, &tag);
3200 
3201  return result | BUF_WRITTEN;
3202 }
3203 
3204 /*
3205  * AtEOXact_Buffers - clean up at end of transaction.
3206  *
3207  * As of PostgreSQL 8.0, buffer pins should get released by the
3208  * ResourceOwner mechanism. This routine is just a debugging
3209  * cross-check that no pins remain.
3210  */
3211 void
3212 AtEOXact_Buffers(bool isCommit)
3213 {
3214  CheckForBufferLeaks();
3215 
3216  AtEOXact_LocalBuffers(isCommit);
3217 
3218  Assert(PrivateRefCountOverflowed == 0);
3219 }
3220 
3221 /*
3222  * Initialize access to shared buffer pool
3223  *
3224  * This is called during backend startup (whether standalone or under the
3225  * postmaster). It sets up for this backend's access to the already-existing
3226  * buffer pool.
3227  */
3228 void
3230 {
3231  HASHCTL hash_ctl;
3232 
3233  memset(&PrivateRefCountArray, 0, sizeof(PrivateRefCountArray));
3234 
3235  hash_ctl.keysize = sizeof(int32);
3236  hash_ctl.entrysize = sizeof(PrivateRefCountEntry);
3237 
3238  PrivateRefCountHash = hash_create("PrivateRefCount", 100, &hash_ctl,
3239  HASH_ELEM | HASH_BLOBS);
3240 
3241  /*
3242  * AtProcExit_Buffers needs LWLock access, and thereby has to be called at
3243  * the corresponding phase of backend shutdown.
3244  */
3245  Assert(MyProc != NULL);
3246  on_shmem_exit(AtProcExit_Buffers, 0);
3247 }
3248 
3249 /*
3250  * During backend exit, ensure that we released all shared-buffer locks and
3251  * assert that we have no remaining pins.
3252  */
3253 static void
3254 AtProcExit_Buffers(int code, Datum arg)
3255 {
3256  UnlockBuffers();
3257 
3258  CheckForBufferLeaks();
3259 
3260  /* localbuf.c needs a chance too */
3261  AtProcExit_LocalBuffers();
3262 }
3263 
3264 /*
3265  * CheckForBufferLeaks - ensure this backend holds no buffer pins
3266  *
3267  * As of PostgreSQL 8.0, buffer pins should get released by the
3268  * ResourceOwner mechanism. This routine is just a debugging
3269  * cross-check that no pins remain.
3270  */
3271 static void
3272 CheckForBufferLeaks(void)
3273 {
3274 #ifdef USE_ASSERT_CHECKING
3275  int RefCountErrors = 0;
3276  PrivateRefCountEntry *res;
3277  int i;
3278  char *s;
3279 
3280  /* check the array */
3281  for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
3282  {
3283  res = &PrivateRefCountArray[i];
3284 
3285  if (res->buffer != InvalidBuffer)
3286  {
3287  s = DebugPrintBufferRefcount(res->buffer);
3288  elog(WARNING, "buffer refcount leak: %s", s);
3289  pfree(s);
3290 
3291  RefCountErrors++;
3292  }
3293  }
3294 
3295  /* if necessary search the hash */
3296  if (PrivateRefCountOverflowed)
3297  {
3298  HASH_SEQ_STATUS hstat;
3299 
3300  hash_seq_init(&hstat, PrivateRefCountHash);
3301  while ((res = (PrivateRefCountEntry *) hash_seq_search(&hstat)) != NULL)
3302  {
3303  s = DebugPrintBufferRefcount(res->buffer);
3304  elog(WARNING, "buffer refcount leak: %s", s);
3305  pfree(s);
3306  RefCountErrors++;
3307  }
3308  }
3309 
3310  Assert(RefCountErrors == 0);
3311 #endif
3312 }
3313 
3314 /*
3315  * Helper routine to issue warnings when a buffer is unexpectedly pinned
3316  */
3317 char *
3318 DebugPrintBufferRefcount(Buffer buffer)
3319 {
3320  BufferDesc *buf;
3321  int32 loccount;
3322  char *path;
3323  char *result;
3324  ProcNumber backend;
3325  uint32 buf_state;
3326 
3328  if (BufferIsLocal(buffer))
3329  {
3331  loccount = LocalRefCount[-buffer - 1];
3332  backend = MyProcNumber;
3333  }
3334  else
3335  {
3337  loccount = GetPrivateRefCount(buffer);
3338  backend = INVALID_PROC_NUMBER;
3339  }
3340 
3341  /* theoretically we should lock the bufhdr here */
3342  path = relpathbackend(BufTagGetRelFileLocator(&buf->tag), backend,
3343  BufTagGetForkNum(&buf->tag));
3344  buf_state = pg_atomic_read_u32(&buf->state);
3345 
3346  result = psprintf("[%03d] (rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
3347  buffer, path,
3348  buf->tag.blockNum, buf_state & BUF_FLAG_MASK,
3349  BUF_STATE_GET_REFCOUNT(buf_state), loccount);
3350  pfree(path);
3351  return result;
3352 }
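
A hedged sketch of calling the helper above from ad-hoc debugging code. Not part of bufmgr.c; it assumes the buffer is currently pinned by this backend.

static void
log_buffer_refcount_example(Buffer buf)
{
	char	   *s = DebugPrintBufferRefcount(buf);

	elog(LOG, "buffer state: %s", s);
	pfree(s);					/* the helper returns a palloc'd string */
}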
3353 
3354 /*
3355  * CheckPointBuffers
3356  *
3357  * Flush all dirty blocks in buffer pool to disk at checkpoint time.
3358  *
3359  * Note: temporary relations do not participate in checkpoints, so they don't
3360  * need to be flushed.
3361  */
3362 void
3364 {
3365  BufferSync(flags);
3366 }
3367 
3368 /*
3369  * BufferGetBlockNumber
3370  * Returns the block number associated with a buffer.
3371  *
3372  * Note:
3373  * Assumes that the buffer is valid and pinned, else the
3374  * value may be obsolete immediately...
3375  */
3376 BlockNumber
3377 BufferGetBlockNumber(Buffer buffer)
3378 {
3379  BufferDesc *bufHdr;
3380 
3381  Assert(BufferIsPinned(buffer));
3382 
3383  if (BufferIsLocal(buffer))
3384  bufHdr = GetLocalBufferDescriptor(-buffer - 1);
3385  else
3386  bufHdr = GetBufferDescriptor(buffer - 1);
3387 
3388  /* pinned, so OK to read tag without spinlock */
3389  return bufHdr->tag.blockNum;
3390 }
3391 
3392 /*
3393  * BufferGetTag
3394  * Returns the relfilelocator, fork number and block number associated with
3395  * a buffer.
3396  */
3397 void
3398 BufferGetTag(Buffer buffer, RelFileLocator *rlocator, ForkNumber *forknum,
3399  BlockNumber *blknum)
3400 {
3401  BufferDesc *bufHdr;
3402 
3403  /* Do the same checks as BufferGetBlockNumber. */
3404  Assert(BufferIsPinned(buffer));
3405 
3406  if (BufferIsLocal(buffer))
3407  bufHdr = GetLocalBufferDescriptor(-buffer - 1);
3408  else
3409  bufHdr = GetBufferDescriptor(buffer - 1);
3410 
3411  /* pinned, so OK to read tag without spinlock */
3412  *rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
3413  *forknum = BufTagGetForkNum(&bufHdr->tag);
3414  *blknum = bufHdr->tag.blockNum;
3415 }
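
A hedged sketch using the two accessors above to report a pinned buffer's identity, e.g. for debugging output. Not part of bufmgr.c.

static void
report_buffer_identity_example(Buffer buf)
{
	RelFileLocator rlocator;
	ForkNumber	forknum;
	BlockNumber blknum;

	BufferGetTag(buf, &rlocator, &forknum, &blknum);
	elog(DEBUG1, "buffer %d: block %u of fork %d in relation %u (BufferGetBlockNumber(): %u)",
		 buf, blknum, (int) forknum, rlocator.relNumber,
		 BufferGetBlockNumber(buf));
}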
3416 
3417 /*
3418  * FlushBuffer
3419  * Physically write out a shared buffer.
3420  *
3421  * NOTE: this actually just passes the buffer contents to the kernel; the
3422  * real write to disk won't happen until the kernel feels like it. This
3423  * is okay from our point of view since we can redo the changes from WAL.
3424  * However, we will need to force the changes to disk via fsync before
3425  * we can checkpoint WAL.
3426  *
3427  * The caller must hold a pin on the buffer and have share-locked the
3428  * buffer contents. (Note: a share-lock does not prevent updates of
3429  * hint bits in the buffer, so the page could change while the write
3430  * is in progress, but we assume that that will not invalidate the data
3431  * written.)
3432  *
3433  * If the caller has an smgr reference for the buffer's relation, pass it
3434  * as the second parameter. If not, pass NULL.
3435  */
3436 static void
3437 FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
3438  IOContext io_context)
3439 {
3440  XLogRecPtr recptr;
3441  ErrorContextCallback errcallback;
3442  instr_time io_start;
3443  Block bufBlock;
3444  char *bufToWrite;
3445  uint32 buf_state;
3446 
3447  /*
3448  * Try to start an I/O operation. If StartBufferIO returns false, then
3449  * someone else flushed the buffer before we could, so we need not do
3450  * anything.
3451  */
3452  if (!StartBufferIO(buf, false))
3453  return;
3454 
3455  /* Setup error traceback support for ereport() */
3456  errcallback.callback = shared_buffer_write_error_callback;
3457  errcallback.arg = (void *) buf;
3458  errcallback.previous = error_context_stack;
3459  error_context_stack = &errcallback;
3460 
3461  /* Find smgr relation for buffer */
3462  if (reln == NULL)
3464 
3465  TRACE_POSTGRESQL_BUFFER_FLUSH_START(BufTagGetForkNum(&buf->tag),
3466  buf->tag.blockNum,
3467  reln->smgr_rlocator.locator.spcOid,
3468  reln->smgr_rlocator.locator.dbOid,
3469  reln->smgr_rlocator.locator.relNumber);
3470 
3471  buf_state = LockBufHdr(buf);
3472 
3473  /*
3474  * Run PageGetLSN while holding header lock, since we don't have the
3475  * buffer locked exclusively in all cases.
3476  */
3477  recptr = BufferGetLSN(buf);
3478 
3479  /* To check if block content changes while flushing. - vadim 01/17/97 */
3480  buf_state &= ~BM_JUST_DIRTIED;
3481  UnlockBufHdr(buf, buf_state);
3482 
3483  /*
3484  * Force XLOG flush up to buffer's LSN. This implements the basic WAL
3485  * rule that log updates must hit disk before any of the data-file changes
3486  * they describe do.
3487  *
3488  * However, this rule does not apply to unlogged relations, which will be
3489  * lost after a crash anyway. Most unlogged relation pages do not bear
3490  * LSNs since we never emit WAL records for them, and therefore flushing
3491  * up through the buffer LSN would be useless, but harmless. However,
3492  * GiST indexes use LSNs internally to track page-splits, and therefore
3493  * unlogged GiST pages bear "fake" LSNs generated by
3494  * GetFakeLSNForUnloggedRel. It is unlikely but possible that the fake
3495  * LSN counter could advance past the WAL insertion point; and if it did
3496  * happen, attempting to flush WAL through that location would fail, with
3497  * disastrous system-wide consequences. To make sure that can't happen,
3498  * skip the flush if the buffer isn't permanent.
3499  */
3500  if (buf_state & BM_PERMANENT)
3501  XLogFlush(recptr);
3502 
3503  /*
3504  * Now it's safe to write buffer to disk. Note that no one else should
3505  * have been able to write it while we were busy with log flushing because
3506  * only one process at a time can set the BM_IO_IN_PROGRESS bit.
3507  */
3508  bufBlock = BufHdrGetBlock(buf);
3509 
3510  /*
3511  * Update page checksum if desired. Since we have only shared lock on the
3512  * buffer, other processes might be updating hint bits in it, so we must
3513  * copy the page to private storage if we do checksumming.
3514  */
3515  bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
3516 
3518 
3519  /*
3520  * bufToWrite is either the shared buffer or a copy, as appropriate.
3521  */
3522  smgrwrite(reln,
3523  BufTagGetForkNum(&buf->tag),
3524  buf->tag.blockNum,
3525  bufToWrite,
3526  false);
3527 
3528  /*
3529  * When a strategy is in use, only flushes of dirty buffers already in the
3530  * strategy ring are counted as strategy writes (IOCONTEXT
3531  * [BULKREAD|BULKWRITE|VACUUM] IOOP_WRITE) for the purpose of IO
3532  * statistics tracking.
3533  *
3534  * If a shared buffer initially added to the ring must be flushed before
3535  * being used, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE.
3536  *
3537  * If a shared buffer which was added to the ring later because the
3538  * current strategy buffer is pinned or in use or because all strategy
3539  * buffers were dirty and rejected (for BAS_BULKREAD operations only)
3540  * requires flushing, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE
3541  * (from_ring will be false).
3542  *
3543  * When a strategy is not in use, the write can only be a "regular" write
3544  * of a dirty shared buffer (IOCONTEXT_NORMAL IOOP_WRITE).
3545  */
3546  pgstat_count_io_op_time(io_object, io_context,
3547  IOOP_WRITE, io_start, 1);
3548 
3549  pgBufferUsage.shared_blks_written++;
3550 
3550 
3551  /*
3552  * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
3553  * end the BM_IO_IN_PROGRESS state.
3554  */
3555  TerminateBufferIO(buf, true, 0, true);
3556 
3557  TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(BufTagGetForkNum(&buf->tag),
3558  buf->tag.blockNum,
3559  reln->smgr_rlocator.locator.spcOid,
3560  reln->smgr_rlocator.locator.dbOid,
3561  reln->smgr_rlocator.locator.relNumber);
3562 
3563  /* Pop the error context stack */
3564  error_context_stack = errcallback.previous;
3565 }
3566 
3567 /*
3568  * RelationGetNumberOfBlocksInFork
3569  * Determines the current number of pages in the specified relation fork.
3570  *
3571  * Note that the accuracy of the result will depend on the details of the
3572  * relation's storage. For builtin AMs it'll be accurate, but for external AMs
3573  * it might not be.
3574  */
3575 BlockNumber
3576 RelationGetNumberOfBlocksInFork(Relation relation, ForkNumber forkNum)
3577 {
3578  if (RELKIND_HAS_TABLE_AM(relation->rd_rel->relkind))
3579  {
3580  /*
3581  * Not every table AM uses BLCKSZ wide fixed size blocks. Therefore
3582  * tableam returns the size in bytes - but for the purpose of this
3583  * routine, we want the number of blocks. Therefore divide, rounding
3584  * up.
3585  */
3586  uint64 szbytes;
3587 
3588  szbytes = table_relation_size(relation, forkNum);
3589 
3590  return (szbytes + (BLCKSZ - 1)) / BLCKSZ;
3591  }
3592  else if (RELKIND_HAS_STORAGE(relation->rd_rel->relkind))
3593  {
3594  return smgrnblocks(RelationGetSmgr(relation), forkNum);
3595  }
3596  else
3597  Assert(false);
3598 
3599  return 0; /* keep compiler quiet */
3600 }
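
A standalone illustration of the round-up byte-to-block conversion above. Not part of bufmgr.c; it assumes BLCKSZ = 8192.

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t	blcksz = 8192;
	uint64_t	sizes[] = {0, 1, 8192, 8193, 123456789};

	for (int i = 0; i < 5; i++)
		printf("%llu bytes -> %llu blocks\n",
			   (unsigned long long) sizes[i],
			   (unsigned long long) ((sizes[i] + (blcksz - 1)) / blcksz));
	return 0;
}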
3601 
3602 /*
3603  * BufferIsPermanent
3604  * Determines whether a buffer will potentially still be around after
3605  * a crash. Caller must hold a buffer pin.
3606  */
3607 bool
3608 BufferIsPermanent(Buffer buffer)
3609 {
3610  BufferDesc *bufHdr;
3611 
3612  /* Local buffers are used only for temp relations. */
3613  if (BufferIsLocal(buffer))
3614  return false;
3615 
3616  /* Make sure we've got a real buffer, and that we hold a pin on it. */
3617  Assert(BufferIsValid(buffer));
3618  Assert(BufferIsPinned(buffer));
3619 
3620  /*
3621  * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
3622  * need not bother with the buffer header spinlock. Even if someone else
3623  * changes the buffer header state while we're doing this, the state is
3624  * changed atomically, so we'll read the old value or the new value, but
3625  * not random garbage.
3626  */
3627  bufHdr = GetBufferDescriptor(buffer - 1);
3628  return (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT) != 0;
3629 }
3630 
3631 /*
3632  * BufferGetLSNAtomic
3633  * Retrieves the LSN of the buffer atomically using a buffer header lock.
3634  * This is necessary for some callers who may not have an exclusive lock
3635  * on the buffer.
3636  */
3637 XLogRecPtr
3638 BufferGetLSNAtomic(Buffer buffer)
3639 {
3640  BufferDesc *bufHdr = GetBufferDescriptor(buffer - 1);
3641  char *page = BufferGetPage(buffer);
3642  XLogRecPtr lsn;
3643  uint32 buf_state;
3644 
3645  /*
3646  * If we don't need locking for correctness, fastpath out.
3647  */
3648  if (!XLogHintBitIsNeeded() || BufferIsLocal(buffer))
3649  return PageGetLSN(page);
3650 
3651  /* Make sure we've got a real buffer, and that we hold a pin on it. */
3652  Assert(BufferIsValid(buffer));
3653  Assert(BufferIsPinned(buffer));
3654 
3655  buf_state = LockBufHdr(bufHdr);
3656  lsn = PageGetLSN(page);
3657  UnlockBufHdr(bufHdr, buf_state);
3658 
3659  return lsn;
3660 }
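
A hedged sketch of a caller that only holds a share lock and therefore uses the atomic variant above rather than reading PageGetLSN() directly. Not part of bufmgr.c.

static bool
page_lsn_advanced_example(Buffer buf, XLogRecPtr prev_lsn)
{
	/* safe under a share lock; the header lock makes the LSN read atomic */
	return BufferGetLSNAtomic(buf) > prev_lsn;
}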
3661 
3662 /* ---------------------------------------------------------------------
3663  * DropRelationBuffers
3664  *
3665  * This function removes from the buffer pool all the pages of the
3666  * specified relation forks that have block numbers >= firstDelBlock.
3667  * (In particular, with firstDelBlock = 0, all pages are removed.)
3668  * Dirty pages are simply dropped, without bothering to write them
3669  * out first. Therefore, this is NOT rollback-able, and so should be
3670  * used only with extreme caution!
3671  *
3672  * Currently, this is called only from smgr.c when the underlying file
3673  * is about to be deleted or truncated (firstDelBlock is needed for
3674  * the truncation case). The data in the affected pages would therefore
3675  * be deleted momentarily anyway, and there is no point in writing it.
3676  * It is the responsibility of higher-level code to ensure that the
3677  * deletion or truncation does not lose any data that could be needed
3678  * later. It is also the responsibility of higher-level code to ensure
3679  * that no other process could be trying to load more pages of the
3680  * relation into buffers.
3681  * --------------------------------------------------------------------
3682  */
3683 void
3684 DropRelationBuffers(SMgrRelation smgr_reln, ForkNumber *forkNum,
3685  int nforks, BlockNumber *firstDelBlock)
3686 {
3687  int i;
3688  int j;
3689  RelFileLocatorBackend rlocator;
3690  BlockNumber nForkBlock[MAX_FORKNUM];
3691  uint64 nBlocksToInvalidate = 0;
3692 
3693  rlocator = smgr_reln->smgr_rlocator;
3694 
3695  /* If it's a local relation, it's localbuf.c's problem. */
3696  if (RelFileLocatorBackendIsTemp(rlocator))
3697  {
3698  if (rlocator.backend == MyProcNumber)
3699  {
3700  for (j = 0; j < nforks; j++)
3701  DropRelationLocalBuffers(rlocator.locator, forkNum[j],
3702  firstDelBlock[j]);
3703  }
3704  return;
3705  }
3706 
3707  /*
3708  * To remove all the pages of the specified relation forks from the buffer
3709  * pool, we need to scan the entire buffer pool but we can optimize it by
3710  * finding the buffers from BufMapping table provided we know the exact
3711  * size of each fork of the relation. The exact size is required to ensure
3712  * that we don't leave any buffer for the relation being dropped as
3713  * otherwise the background writer or checkpointer could hit a PANIC
3714  * while flushing buffers corresponding to files that don't exist.
3715  *
3716  * To know the exact size, we rely on the size cached for each fork by us
3717  * during recovery, which limits the optimization to recovery and to
3718  * standbys, but we can easily extend it once we have a shared cache for
3719  * relation size.
3720  *
3721  * In recovery, we cache the value returned by the first lseek(SEEK_END)
3722  * and future writes keep the cached value up-to-date. See
3723  * smgrextend. It is possible that the value of the first lseek is smaller
3724  * than the actual number of existing blocks in the file due to buggy
3725  * Linux kernels that might not have accounted for the recent write. But
3726  * that should be fine because there must not be any buffers after that
3727  * file size.
3728  */
3729  for (i = 0; i < nforks; i++)
3730  {
3731  /* Get the number of blocks for a relation's fork */
3732  nForkBlock[i] = smgrnblocks_cached(smgr_reln, forkNum[i]);
3733 
3734  if (nForkBlock[i] == InvalidBlockNumber)
3735  {
3736  nBlocksToInvalidate = InvalidBlockNumber;
3737  break;
3738  }
3739 
3740  /* calculate the number of blocks to be invalidated */
3741  nBlocksToInvalidate += (nForkBlock[i] - firstDelBlock[i]);
3742  }
3743 
3744  /*
3745  * We apply the optimization iff the total number of blocks to invalidate
3746  * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
3747  */
3748  if (BlockNumberIsValid(nBlocksToInvalidate) &&
3749  nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
3750  {
3751  for (j = 0; j < nforks; j++)
3752  FindAndDropRelationBuffers(rlocator.locator, forkNum[j],
3753  nForkBlock[j], firstDelBlock[j]);
3754  return;
3755  }
3756 
3757  for (i = 0; i < NBuffers; i++)
3758  {
3759  BufferDesc *bufHdr = GetBufferDescriptor(i);
3760  uint32 buf_state;
3761 
3762  /*
3763  * We can make this a tad faster by prechecking the buffer tag before
3764  * we attempt to lock the buffer; this saves a lot of lock
3765  * acquisitions in typical cases. It should be safe because the
3766  * caller must have AccessExclusiveLock on the relation, or some other
3767  * reason to be certain that no one is loading new pages of the rel
3768  * into the buffer pool. (Otherwise we might well miss such pages
3769  * entirely.) Therefore, while the tag might be changing while we
3770  * look at it, it can't be changing *to* a value we care about, only
3771  * *away* from such a value. So false negatives are impossible, and
3772  * false positives are safe because we'll recheck after getting the
3773  * buffer lock.
3774  *
3775  * We could check forkNum and blockNum as well as the rlocator, but
3776  * the incremental win from doing so seems small.
3777  */
3778  if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator))
3779  continue;
3780 
3781  buf_state = LockBufHdr(bufHdr);
3782 
3783  for (j = 0; j < nforks; j++)
3784  {
3785  if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator) &&
3786  BufTagGetForkNum(&bufHdr->tag) == forkNum[j] &&
3787  bufHdr->tag.blockNum >= firstDelBlock[j])
3788  {
3789  InvalidateBuffer(bufHdr); /* releases spinlock */
3790  break;
3791  }
3792  }
3793  if (j >= nforks)
3794  UnlockBufHdr(bufHdr, buf_state);
3795  }
3796 }
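As a hedged illustration of the caller's side (the helper below is hypothetical, not taken from smgr.c): when a fork is being truncated, the buffers at and beyond the new length are simply thrown away before the file itself is shortened.

#include "postgres.h"
#include "storage/bufmgr.h"
#include "storage/smgr.h"

/* Illustrative only: discard buffers beyond a hypothetical truncation point. */
static void
truncate_main_fork_buffers(SMgrRelation reln, BlockNumber nblocks)
{
	ForkNumber	fork = MAIN_FORKNUM;

	/* all main-fork pages with blockNum >= nblocks are thrown away */
	DropRelationBuffers(reln, &fork, 1, &nblocks);
}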
3797 
3798 /* ---------------------------------------------------------------------
3799  * DropRelationsAllBuffers
3800  *
3801  * This function removes from the buffer pool all the pages of all
3802  * forks of the specified relations. It's equivalent to calling
3803  * DropRelationBuffers once per fork per relation with firstDelBlock = 0.
3804  * --------------------------------------------------------------------
3805  */
3806 void
3807 DropRelationsAllBuffers(SMgrRelation *smgr_reln, int nlocators)
3808 {
3809  int i;
3810  int n = 0;
3811  SMgrRelation *rels;
3812  BlockNumber (*block)[MAX_FORKNUM + 1];
3813  uint64 nBlocksToInvalidate = 0;
3814  RelFileLocator *locators;
3815  bool cached = true;
3816  bool use_bsearch;
3817 
3818  if (nlocators == 0)
3819  return;
3820 
3821  rels = palloc(sizeof(SMgrRelation) * nlocators); /* non-local relations */
3822 
3823  /* If it's a local relation, it's localbuf.c's problem. */
3824  for (i = 0; i < nlocators; i++)
3825  {
3826  if (RelFileLocatorBackendIsTemp(smgr_reln[i]->smgr_rlocator))
3827  {
3828  if (smgr_reln[i]->smgr_rlocator.backend == MyProcNumber)
3829  DropRelationAllLocalBuffers(smgr_reln[i]->smgr_rlocator.locator);
3830  }
3831  else
3832  rels[n++] = smgr_reln[i];
3833  }
3834 
3835  /*
3836  * If there are no non-local relations, then we're done. Release the
3837  * memory and return.
3838  */
3839  if (n == 0)
3840  {
3841  pfree(rels);
3842  return;
3843  }
3844 
3845  /*
3846  * This is used to remember the number of blocks for all the relations'
3847  * forks.
3848  */
3849  block = (BlockNumber (*)[MAX_FORKNUM + 1])
3850  palloc(sizeof(BlockNumber) * n * (MAX_FORKNUM + 1));
3851 
3852  /*
3853  * We can avoid scanning the entire buffer pool if we know the exact size
3854  * of each of the given relation forks. See DropRelationBuffers.
3855  */
3856  for (i = 0; i < n && cached; i++)
3857  {
3858  for (int j = 0; j <= MAX_FORKNUM; j++)
3859  {
3860  /* Get the number of blocks for a relation's fork. */
3861  block[i][j] = smgrnblocks_cached(rels[i], j);
3862 
3863  /* We only need to consider relation forks that exist. */
3864  if (block[i][j] == InvalidBlockNumber)
3865  {
3866  if (!smgrexists(rels[i], j))
3867  continue;
3868  cached = false;
3869  break;
3870  }
3871 
3872  /* calculate the total number of blocks to be invalidated */
3873  nBlocksToInvalidate += block[i][j];
3874  }
3875  }
3876 
3877  /*
3878  * We apply the optimization iff the total number of blocks to invalidate
3879  * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
3880  */
3881  if (cached && nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
3882  {
3883  for (i = 0; i < n; i++)
3884  {
3885  for (int j = 0; j <= MAX_FORKNUM; j++)
3886  {
3887  /* ignore relation forks that don't exist */
3888  if (!BlockNumberIsValid(block[i][j]))
3889  continue;
3890 
3891  /* drop all the buffers for a particular relation fork */
3892  FindAndDropRelationBuffers(rels[i]->smgr_rlocator.locator,
3893  j, block[i][j], 0);
3894  }
3895  }
3896 
3897  pfree(block);
3898  pfree(rels);
3899  return;
3900  }
3901 
3902  pfree(block);
3903  locators = palloc(sizeof(RelFileLocator) * n); /* non-local relations */
3904  for (i = 0; i < n; i++)
3905  locators[i] = rels[i]->smgr_rlocator.locator;
3906 
3907  /*
3908  * For a small number of relations to drop, just use a simple walk-through
3909  * to save the bsearch overhead. The threshold is more of a guess than an
3910  * exactly determined value, as it depends on many factors (CPU and RAM
3911  * speeds, amount of shared buffers, etc.).
3912  */
3913  use_bsearch = n > RELS_BSEARCH_THRESHOLD;
3914 
3915  /* sort the list of rlocators if necessary */
3916  if (use_bsearch)
3917  qsort(locators, n, sizeof(RelFileLocator), rlocator_comparator);
3918 
3919  for (i = 0; i < NBuffers; i++)
3920  {
3921  RelFileLocator *rlocator = NULL;
3922  BufferDesc *bufHdr = GetBufferDescriptor(i);
3923  uint32 buf_state;
3924 
3925  /*
3926  * As in DropRelationBuffers, an unlocked precheck should be safe and
3927  * saves some cycles.
3928  */
3929 
3930  if (!use_bsearch)
3931  {
3932  int j;
3933 
3934  for (j = 0; j < n; j++)
3935  {
3936  if (BufTagMatchesRelFileLocator(&bufHdr->tag, &locators[j]))
3937  {
3938  rlocator = &locators[j];
3939  break;
3940  }
3941  }
3942  }
3943  else
3944  {
3945  RelFileLocator locator;
3946 
3947  locator = BufTagGetRelFileLocator(&bufHdr->tag);
3948  rlocator = bsearch((const void *) &(locator),
3949  locators, n, sizeof(RelFileLocator),
3951  }
3952 
3953  /* buffer doesn't belong to any of the given relfilelocators; skip it */
3954  if (rlocator == NULL)
3955  continue;
3956 
3957  buf_state = LockBufHdr(bufHdr);
3958  if (BufTagMatchesRelFileLocator(&bufHdr->tag, rlocator))
3959  InvalidateBuffer(bufHdr); /* releases spinlock */
3960  else
3961  UnlockBufHdr(bufHdr, buf_state);
3962  }
3963 
3964  pfree(locators);
3965  pfree(rels);
3966 }
3967 
3968 /* ---------------------------------------------------------------------
3969  * FindAndDropRelationBuffers
3970  *
3971  * This function performs lookups in the BufMapping table and removes from
3972  * the buffer pool all the pages of the specified relation fork that have
3973  * block numbers >= firstDelBlock. (In particular, with firstDelBlock = 0, all
3974  * pages are removed.)
3975  * --------------------------------------------------------------------
3976  */
3977 static void
3978 FindAndDropRelationBuffers(RelFileLocator rlocator, ForkNumber forkNum,
3979  BlockNumber nForkBlock,
3980  BlockNumber firstDelBlock)
3981 {
3982  BlockNumber curBlock;
3983 
3984  for (curBlock = firstDelBlock; curBlock < nForkBlock; curBlock++)
3985  {
3986  uint32 bufHash; /* hash value for tag */
3987  BufferTag bufTag; /* identity of requested block */
3988  LWLock *bufPartitionLock; /* buffer partition lock for it */
3989  int buf_id;
3990  BufferDesc *bufHdr;
3991  uint32 buf_state;
3992 
3993  /* create a tag so we can lookup the buffer */
3994  InitBufferTag(&bufTag, &rlocator, forkNum, curBlock);
3995 
3996  /* determine its hash code and partition lock ID */
3997  bufHash = BufTableHashCode(&bufTag);
3998  bufPartitionLock = BufMappingPartitionLock(bufHash);
3999 
4000  /* Check that it is in the buffer pool. If not, do nothing. */
4001  LWLockAcquire(bufPartitionLock, LW_SHARED);
4002  buf_id = BufTableLookup(&bufTag, bufHash);
4003  LWLockRelease(bufPartitionLock);
4004 
4005  if (buf_id < 0)
4006  continue;
4007 
4008  bufHdr = GetBufferDescriptor(buf_id);
4009 
4010  /*
4011  * We need to lock the buffer header and recheck if the buffer is
4012  * still associated with the same block because the buffer could be
4013  * evicted by some other backend loading blocks for a different
4014  * relation after we release lock on the BufMapping table.
4015  */
4016  buf_state = LockBufHdr(bufHdr);
4017 
4018  if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator) &&
4019  BufTagGetForkNum(&bufHdr->tag) == forkNum &&
4020  bufHdr->tag.blockNum >= firstDelBlock)
4021  InvalidateBuffer(bufHdr); /* releases spinlock */
4022  else
4023  UnlockBufHdr(bufHdr, buf_state);
4024  }
4025 }
4026 
4027 /* ---------------------------------------------------------------------
4028  * DropDatabaseBuffers
4029  *
4030  * This function removes all the buffers in the buffer cache for a
4031  * particular database. Dirty pages are simply dropped, without
4032  * bothering to write them out first. This is used when we destroy a
4033  * database, to avoid trying to flush data to disk when the directory
4034  * tree no longer exists. Implementation is pretty similar to
4035  * DropRelationBuffers() which is for destroying just one relation.
4036  * --------------------------------------------------------------------
4037  */
4038 void
4039 DropDatabaseBuffers(Oid dbid)
4040 {
4041  int i;
4042 
4043  /*
4044  * We needn't consider local buffers, since by assumption the target
4045  * database isn't our own.
4046  */
4047 
4048  for (i = 0; i < NBuffers; i++)
4049  {
4050  BufferDesc *bufHdr = GetBufferDescriptor(i);
4051  uint32 buf_state;
4052 
4053  /*
4054  * As in DropRelationBuffers, an unlocked precheck should be safe and
4055  * saves some cycles.
4056  */
4057  if (bufHdr->tag.dbOid != dbid)
4058  continue;
4059 
4060  buf_state = LockBufHdr(bufHdr);
4061  if (bufHdr->tag.dbOid == dbid)
4062  InvalidateBuffer(bufHdr); /* releases spinlock */
4063  else
4064  UnlockBufHdr(bufHdr, buf_state);
4065  }
4066 }
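A minimal sketch of the intended call order, with a hypothetical helper name: the buffers must be invalidated before the database's directory tree is unlinked, or a later flush could target files that no longer exist.

#include "postgres.h"
#include "storage/bufmgr.h"

/*
 * Hypothetical helper: invalidate the database's buffers before its
 * directory tree is removed, so nothing tries to flush into it later.
 */
static void
drop_database_storage(Oid db_id)
{
	DropDatabaseBuffers(db_id);

	/* ... the caller would now delete the on-disk directory tree ... */
}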
4067 
4068 /* -----------------------------------------------------------------
4069  * PrintBufferDescs
4070  *
4071  * this function prints all the buffer descriptors, for debugging
4072  * use only.
4073  * -----------------------------------------------------------------
4074  */
4075 #ifdef NOT_USED
4076 void
4077 PrintBufferDescs(void)
4078 {
4079  int i;
4080 
4081  for (i = 0; i < NBuffers; ++i)
4082  {
4085 
4086  /* theoretically we should lock the bufhdr here */
4087  elog(LOG,
4088  "[%02d] (freeNext=%d, rel=%s, "
4089  "blockNum=%u, flags=0x%x, refcount=%u %d)",
4090  i, buf->freeNext,
4093  buf->tag.blockNum, buf->flags,
4094  buf->refcount, GetPrivateRefCount(b));
4095  }
4096 }
4097 #endif
4098 
4099 #ifdef NOT_USED
4100 void
4101 PrintPinnedBufs(void)
4102 {
4103  int i;
4104 
4105  for (i = 0; i < NBuffers; ++i)
4106  {
4109 
4110  if (GetPrivateRefCount(b) > 0)
4111  {
4112  /* theoretically we should lock the bufhdr here */
4113  elog(LOG,
4114  "[%02d] (freeNext=%d, rel=%s, "
4115  "blockNum=%u, flags=0x%x, refcount=%u %d)",
4116  i, buf->freeNext,
4118  BufTagGetForkNum(&buf->tag)),
4119  buf->tag.blockNum, buf->flags,
4120  buf->refcount, GetPrivateRefCount(b));
4121  }
4122  }
4123 }
4124 #endif
4125 
4126 /* ---------------------------------------------------------------------
4127  * FlushRelationBuffers
4128  *
4129  * This function writes all dirty pages of a relation out to disk
4130  * (or more accurately, out to kernel disk buffers), ensuring that the
4131  * kernel has an up-to-date view of the relation.
4132  *
4133  * Generally, the caller should be holding AccessExclusiveLock on the
4134  * target relation to ensure that no other backend is busy dirtying
4135  * more blocks of the relation; the effects can't be expected to last
4136  * after the lock is released.
4137  *
4138  * XXX currently it sequentially searches the buffer pool, should be
4139  * changed to more clever ways of searching. This routine is not
4140  * used in any performance-critical code paths, so it's not worth
4141  * adding additional overhead to normal paths to make it go faster.
4142  * --------------------------------------------------------------------
4143  */
4144 void
4145 FlushRelationBuffers(Relation rel)
4146 {
4147  int i;
4148  BufferDesc *bufHdr;
4149  SMgrRelation srel = RelationGetSmgr(rel);
4150 
4151  if (RelationUsesLocalBuffers(rel))
4152  {
4153  for (i = 0; i < NLocBuffer; i++)
4154  {
4155  uint32 buf_state;
4156  instr_time io_start;
4157 
4158  bufHdr = GetLocalBufferDescriptor(i);
4159  if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
4160  ((buf_state = pg_atomic_read_u32(&bufHdr->state)) &
4161  (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
4162  {
4163  ErrorContextCallback errcallback;
4164  Page localpage;
4165 
4166  localpage = (char *) LocalBufHdrGetBlock(bufHdr);
4167 
4168  /* Setup error traceback support for ereport() */
4170  errcallback.arg = (void *) bufHdr;
4171  errcallback.previous = error_context_stack;
4172  error_context_stack = &errcallback;
4173 
4174  PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
4175 
4177 
4178  smgrwrite(srel,
4179  BufTagGetForkNum(&bufHdr->tag),
4180  bufHdr->tag.blockNum,
4181  localpage,
4182  false);
4183 
4186  io_start, 1);
4187 
4188  buf_state &= ~(BM_DIRTY | BM_JUST_DIRTIED);
4189  pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
4190 
4192 
4193  /* Pop the error context stack */
4194  error_context_stack = errcallback.previous;
4195  }
4196  }
4197 
4198  return;
4199  }
4200 
4201  for (i = 0; i < NBuffers; i++)
4202  {
4203  uint32 buf_state;
4204 
4205  bufHdr = GetBufferDescriptor(i);
4206 
4207  /*
4208  * As in DropRelationBuffers, an unlocked precheck should be safe and
4209  * saves some cycles.
4210  */
4211  if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator))
4212  continue;
4213 
4214  /* Make sure we can handle the pin */
4217 
4218  buf_state = LockBufHdr(bufHdr);
4219  if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
4220  (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
4221  {
4222  PinBuffer_Locked(bufHdr);
4226  UnpinBuffer(bufHdr);
4227  }
4228  else
4229  UnlockBufHdr(bufHdr, buf_state);
4230  }
4231 }
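A usage sketch (hypothetical helper, not from this tree): a caller that has written a relation without WAL typically flushes its buffers to the kernel and then forces the underlying file to stable storage.

#include "postgres.h"
#include "storage/bufmgr.h"
#include "storage/smgr.h"
#include "utils/rel.h"

/* Sketch: push all dirty pages of "rel" to the kernel, then to disk. */
static void
flush_and_sync_relation(Relation rel)
{
	FlushRelationBuffers(rel);							/* dirty pages -> kernel */
	smgrimmedsync(RelationGetSmgr(rel), MAIN_FORKNUM);	/* kernel -> disk */
}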
4232 
4233 /* ---------------------------------------------------------------------
4234  * FlushRelationsAllBuffers
4235  *
4236  * This function flushes out of the buffer pool all the pages of all
4237  * forks of the specified smgr relations. It's equivalent to calling
4238  * FlushRelationBuffers once per relation. The relations are assumed not
4239  * to use local buffers.
4240  * --------------------------------------------------------------------
4241  */
4242 void
4243 FlushRelationsAllBuffers(SMgrRelation *smgrs, int nrels)
4244 {
4245  int i;
4246  SMgrSortArray *srels;
4247  bool use_bsearch;
4248 
4249  if (nrels == 0)
4250  return;
4251 
4252  /* fill-in array for qsort */
4253  srels = palloc(sizeof(SMgrSortArray) * nrels);
4254 
4255  for (i = 0; i < nrels; i++)
4256  {
4257  Assert(!RelFileLocatorBackendIsTemp(smgrs[i]->smgr_rlocator));
4258 
4259  srels[i].rlocator = smgrs[i]->smgr_rlocator.locator;
4260  srels[i].srel = smgrs[i];
4261  }
4262 
4263  /*
4264  * Save the bsearch overhead for a low number of relations to sync. See
4265  * DropRelationsAllBuffers for details.
4266  */
4267  use_bsearch = nrels > RELS_BSEARCH_THRESHOLD;
4268 
4269  /* sort the list of SMgrRelations if necessary */
4270  if (use_bsearch)
4271  qsort(srels, nrels, sizeof(SMgrSortArray), rlocator_comparator);
4272 
4273  for (i = 0; i < NBuffers; i++)
4274  {
4275  SMgrSortArray *srelent = NULL;
4276  BufferDesc *bufHdr = GetBufferDescriptor(i);
4277  uint32 buf_state;
4278 
4279  /*
4280  * As in DropRelationBuffers, an unlocked precheck should be safe and
4281  * saves some cycles.
4282  */
4283 
4284  if (!use_bsearch)
4285  {
4286  int j;
4287 
4288  for (j = 0; j < nrels; j++)
4289  {
4290  if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srels[j].rlocator))
4291  {
4292  srelent = &srels[j];
4293  break;
4294  }
4295  }
4296  }
4297  else
4298  {
4299  RelFileLocator rlocator;
4300 
4301  rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
4302  srelent = bsearch((const void *) &(rlocator),
4303  srels, nrels, sizeof(SMgrSortArray),
4305  }
4306 
4307  /* buffer doesn't belong to any of the given relfilelocators; skip it */
4308  if (srelent == NULL)
4309  continue;
4310 
4311  /* Make sure we can handle the pin */
4314 
4315  buf_state = LockBufHdr(bufHdr);
4316  if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srelent->rlocator) &&
4317  (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
4318  {
4319  PinBuffer_Locked(bufHdr);
4321  FlushBuffer(bufHdr, srelent->srel, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
4323  UnpinBuffer(bufHdr);
4324  }
4325  else
4326  UnlockBufHdr(bufHdr, buf_state);
4327  }
4328 
4329  pfree(srels);
4330 }
4331 
4332 /* ---------------------------------------------------------------------
4333  * RelationCopyStorageUsingBuffer
4334  *
4335  * Copy a fork's data using the buffer manager. Same as RelationCopyStorage,
4336  * but instead of using smgrread and smgrextend, this copies using bufmgr APIs.
4337  *
4338  * Refer to the comments atop CreateAndCopyRelationData() for details about
4339  * the 'permanent' parameter.
4340  * --------------------------------------------------------------------
4341  */
4342 static void
4343 RelationCopyStorageUsingBuffer(RelFileLocator srclocator,
4344  RelFileLocator dstlocator,
4345  ForkNumber forkNum, bool permanent)
4346 {
4347  Buffer srcBuf;
4348  Buffer dstBuf;
4349  Page srcPage;
4350  Page dstPage;
4351  bool use_wal;
4352  BlockNumber nblocks;
4353  BlockNumber blkno;
4355  BufferAccessStrategy bstrategy_src;
4356  BufferAccessStrategy bstrategy_dst;
4357 
4358  /*
4359  * In general, we want to write WAL whenever wal_level > 'minimal', but we
4360  * can skip it when copying any fork of an unlogged relation other than
4361  * the init fork.
4362  */
4363  use_wal = XLogIsNeeded() && (permanent || forkNum == INIT_FORKNUM);
4364 
4365  /* Get number of blocks in the source relation. */
4366  nblocks = smgrnblocks(smgropen(srclocator, INVALID_PROC_NUMBER),
4367  forkNum);
4368 
4369  /* Nothing to copy; just return. */
4370  if (nblocks == 0)
4371  return;
4372 
4373  /*
4374  * Bulk extend the destination relation to the same size as the source
4375  * relation before starting to copy block by block.
4376  */
4377  memset(buf.data, 0, BLCKSZ);
4378  smgrextend(smgropen(dstlocator, INVALID_PROC_NUMBER), forkNum, nblocks - 1,
4379  buf.data, true);
4380 
4381  /* This is a bulk operation, so use buffer access strategies. */
4382  bstrategy_src = GetAccessStrategy(BAS_BULKREAD);
4383  bstrategy_dst = GetAccessStrategy(BAS_BULKWRITE);
4384 
4385  /* Iterate over each block of the source relation file. */
4386  for (blkno = 0; blkno < nblocks; blkno++)
4387  {
4389 
4390  /* Read block from source relation. */
4391  srcBuf = ReadBufferWithoutRelcache(srclocator, forkNum, blkno,
4392  RBM_NORMAL, bstrategy_src,
4393  permanent);
4394  LockBuffer(srcBuf, BUFFER_LOCK_SHARE);
4395  srcPage = BufferGetPage(srcBuf);
4396 
4397  dstBuf = ReadBufferWithoutRelcache(dstlocator, forkNum, blkno,
4398  RBM_ZERO_AND_LOCK, bstrategy_dst,
4399  permanent);
4400  dstPage = BufferGetPage(dstBuf);
4401 
4403 
4404  /* Copy page data from the source to the destination. */
4405  memcpy(dstPage, srcPage, BLCKSZ);
4406  MarkBufferDirty(dstBuf);
4407 
4408  /* WAL-log the copied page. */
4409  if (use_wal)
4410  log_newpage_buffer(dstBuf, true);
4411 
4412  END_CRIT_SECTION();
4413 
4414  UnlockReleaseBuffer(dstBuf);
4415  UnlockReleaseBuffer(srcBuf);
4416  }
4417 
4418  FreeAccessStrategy(bstrategy_src);
4419  FreeAccessStrategy(bstrategy_dst);
4420 }
4421 
4422 /* ---------------------------------------------------------------------
4423  * CreateAndCopyRelationData
4424  *
4425  * Create destination relation storage and copy all forks from the
4426  * source relation to the destination.
4427  *
4428  * Pass permanent as true for permanent relations and false for
4429  * unlogged relations. Currently this API is not supported for
4430  * temporary relations.
4431  * --------------------------------------------------------------------
4432  */
4433 void
4434 CreateAndCopyRelationData(RelFileLocator src_rlocator,
4435  RelFileLocator dst_rlocator, bool permanent)
4436 {
4437  char relpersistence;
4438  SMgrRelation src_rel;
4439  SMgrRelation dst_rel;
4440 
4441  /* Set the relpersistence. */
4442  relpersistence = permanent ?
4443  RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED;
4444 
4445  src_rel = smgropen(src_rlocator, INVALID_PROC_NUMBER);
4446  dst_rel = smgropen(dst_rlocator, INVALID_PROC_NUMBER);
4447 
4448  /*
4449  * Create and copy all forks of the relation. During CREATE DATABASE we
4450  * have a separate cleanup mechanism that deletes the complete database
4451  * directory. Therefore, each individual relation doesn't need to be
4452  * registered for cleanup.
4453  */
4454  RelationCreateStorage(dst_rlocator, relpersistence, false);
4455 
4456  /* copy main fork. */
4457  RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, MAIN_FORKNUM,
4458  permanent);
4459 
4460  /* copy those extra forks that exist */
4461  for (ForkNumber forkNum = MAIN_FORKNUM + 1;
4462  forkNum <= MAX_FORKNUM; forkNum++)
4463  {
4464  if (smgrexists(src_rel, forkNum))
4465  {
4466  smgrcreate(dst_rel, forkNum, false);
4467 
4468  /*
4469  * WAL log creation if the relation is persistent, or this is the
4470  * init fork of an unlogged relation.
4471  */
4472  if (permanent || forkNum == INIT_FORKNUM)
4473  log_smgrcreate(&dst_rlocator, forkNum);
4474 
4475  /* Copy a fork's data, block by block. */
4476  RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, forkNum,
4477  permanent);
4478  }
4479  }
4480 }
4481 
4482 /* ---------------------------------------------------------------------
4483  * FlushDatabaseBuffers
4484  *
4485  * This function writes all dirty pages of a database out to disk
4486  * (or more accurately, out to kernel disk buffers), ensuring that the
4487  * kernel has an up-to-date view of the database.
4488  *
4489  * Generally, the caller should be holding an appropriate lock to ensure
4490  * no other backend is active in the target database; otherwise more
4491  * pages could get dirtied.
4492  *
4493  * Note we don't worry about flushing any pages of temporary relations.
4494  * It's assumed these wouldn't be interesting.
4495  * --------------------------------------------------------------------
4496  */
4497 void
4498 FlushDatabaseBuffers(Oid dbid)
4499 {
4500  int i;
4501  BufferDesc *bufHdr;
4502 
4503  for (i = 0; i < NBuffers; i++)
4504  {
4505  uint32 buf_state;
4506 
4507  bufHdr = GetBufferDescriptor(i);
4508 
4509  /*
4510  * As in DropRelationBuffers, an unlocked precheck should be safe and
4511  * saves some cycles.
4512  */
4513  if (bufHdr->tag.dbOid != dbid)
4514  continue;
4515 
4516  /* Make sure we can handle the pin */
4519 
4520  buf_state = LockBufHdr(bufHdr);
4521  if (bufHdr->tag.dbOid == dbid &&
4522  (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
4523  {
4524  PinBuffer_Locked(bufHdr);
4528  UnpinBuffer(bufHdr);
4529  }
4530  else
4531  UnlockBufHdr(bufHdr, buf_state);
4532  }
4533 }
4534 
4535 /*
4536  * Flush a previously pinned and (share- or exclusive-) locked buffer to the
4537  * OS.
4538  */
4539 void
4540 FlushOneBuffer(Buffer buffer)
4541 {
4542  BufferDesc *bufHdr;
4543 
4544  /* currently not needed, but no fundamental reason not to support */
4546 
4548 
4549  bufHdr = GetBufferDescriptor(buffer - 1);
4550 
4552 
4554 }
4555 
4556 /*
4557  * ReleaseBuffer -- release the pin on a buffer
4558  */
4559 void
4560 ReleaseBuffer(Buffer buffer)
4561 {
4562  if (!BufferIsValid(buffer))
4563  elog(ERROR, "bad buffer ID: %d", buffer);
4564 
4565  if (BufferIsLocal(buffer))
4567  else
4569 }
4570 
4571 /*
4572  * UnlockReleaseBuffer -- release the content lock and pin on a buffer
4573  *
4574  * This is just a shorthand for a common combination.
4575  */
4576 void
4577 UnlockReleaseBuffer(Buffer buffer)
4578 {
4579  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
4580  ReleaseBuffer(buffer);
4581 }
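For reference, a sketch of the common cycle these entry points are combined into; the helper and the page modification are placeholders, and a real caller would make the modification inside a critical section together with WAL logging.

#include "postgres.h"
#include "storage/bufmgr.h"

/* Sketch of the usual pin / lock / modify / unlock-and-unpin cycle. */
static void
touch_page(Relation rel, BlockNumber blkno)
{
	Buffer		buf = ReadBuffer(rel, blkno);	/* pin */

	LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);		/* content lock */

	/* ... modify the page via BufferGetPage(buf), WAL-log the change ... */
	MarkBufferDirty(buf);

	UnlockReleaseBuffer(buf);					/* drop the lock, then the pin */
}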
4582 
4583 /*
4584  * IncrBufferRefCount
4585  * Increment the pin count on a buffer that we have *already* pinned
4586  * at least once.
4587  *
4588  * This function cannot be used on a buffer we do not have pinned,
4589  * because it doesn't change the shared buffer state.
4590  */
4591 void
4592 IncrBufferRefCount(Buffer buffer)
4593 {
4596  if (BufferIsLocal(buffer))
4597  LocalRefCount[-buffer - 1]++;
4598  else
4599  {
4600  PrivateRefCountEntry *ref;
4601 
4602  ref = GetPrivateRefCountEntry(buffer, true);
4603  Assert(ref != NULL);
4604  ref->refcount++;
4605  }
4607 }
4608 
4609 /*
4610  * MarkBufferDirtyHint
4611  *
4612  * Mark a buffer dirty for non-critical changes.
4613  *
4614  * This is essentially the same as MarkBufferDirty, except:
4615  *
4616  * 1. The caller does not write WAL; so if checksums are enabled, we may need
4617  * to write an XLOG_FPI_FOR_HINT WAL record to protect against torn pages.
4618  * 2. The caller might have only share-lock instead of exclusive-lock on the
4619  * buffer's content lock.
4620  * 3. This function does not guarantee that the buffer is always marked dirty
4621  * (due to a race condition), so it cannot be used for important changes.
4622  */
4623 void
4624 MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
4625 {
4626  BufferDesc *bufHdr;
4627  Page page = BufferGetPage(buffer);
4628 
4629  if (!BufferIsValid(buffer))
4630  elog(ERROR, "bad buffer ID: %d", buffer);
4631 
4632  if (BufferIsLocal(buffer))
4633  {
4635  return;
4636  }
4637 
4638  bufHdr = GetBufferDescriptor(buffer - 1);
4639 
4641  /* here, either share or exclusive lock is OK */
4643 
4644  /*
4645  * This routine might get called many times on the same page, if we are
4646  * making the first scan after commit of an xact that added/deleted many
4647  * tuples. So, be as quick as we can if the buffer is already dirty. We
4648  * do this by not acquiring spinlock if it looks like the status bits are
4649  * already set. Since we make this test unlocked, there's a chance we
4650  * might fail to notice that the flags have just been cleared, and failed
4651  * to reset them, due to memory-ordering issues. But since this function
4652  * is only intended to be used in cases where failing to write out the
4653  * data would be harmless anyway, it doesn't really matter.
4654  */
4655  if ((pg_atomic_read_u32(&bufHdr->state) & (BM_DIRTY | BM_JUST_DIRTIED)) !=
4657  {
4659  bool dirtied = false;
4660  bool delayChkptFlags = false;
4661  uint32 buf_state;
4662 
4663  /*
4664  * If we need to protect hint bit updates from torn writes, WAL-log a
4665  * full page image of the page. This full page image is only necessary
4666  * if the hint bit update is the first change to the page since the
4667  * last checkpoint.
4668  *
4669  * We don't check full_page_writes here because that logic is included
4670  * when we call XLogInsert() since the value changes dynamically.
4671  */
4672  if (XLogHintBitIsNeeded() &&
4673  (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT))
4674  {
4675  /*
4676  * If we must not write WAL, due to a relfilelocator-specific
4677  * condition or being in recovery, don't dirty the page. We can
4678  * set the hint, just not dirty the page as a result so the hint
4679  * is lost when we evict the page or shutdown.
4680  *
4681  * See src/backend/storage/page/README for longer discussion.
4682  */
4683  if (RecoveryInProgress() ||
4685  return;
4686 
4687  /*
4688  * If the block is already dirty because we either made a change
4689  * or set a hint already, then we don't need to write a full page
4690  * image. Note that aggressive cleaning of blocks dirtied by hint
4691  * bit setting would increase the call rate. Bulk setting of hint
4692  * bits would reduce the call rate...
4693  *
4694  * We must issue the WAL record before we mark the buffer dirty.
4695  * Otherwise we might write the page before we write the WAL. That
4696  * causes a race condition, since a checkpoint might occur between
4697  * writing the WAL record and marking the buffer dirty. We solve
4698  * that with a kluge, but one that is already in use during
4699  * transaction commit to prevent race conditions. Basically, we
4700  * simply prevent the checkpoint WAL record from being written
4701  * until we have marked the buffer dirty. We don't start the
4702  * checkpoint flush until we have marked dirty, so our checkpoint
4703  * must flush the change to disk successfully or the checkpoint
4704  * never gets written, in which case crash recovery will fix things up.
4705  *
4706  * It's possible we may enter here without an xid, so it is
4707  * essential that CreateCheckPoint waits for virtual transactions
4708  * rather than full transactionids.
4709  */
4712  delayChkptFlags = true;
4713  lsn = XLogSaveBufferForHint(buffer, buffer_std);
4714  }
4715 
4716  buf_state = LockBufHdr(bufHdr);
4717 
4718  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
4719 
4720  if (!(buf_state & BM_DIRTY))
4721  {
4722  dirtied = true; /* Means "will be dirtied by this action" */
4723 
4724  /*
4725  * Set the page LSN if we wrote a backup block. We aren't supposed
4726  * to set this when only holding a share lock but as long as we
4727  * serialise it somehow we're OK. We choose to set LSN while
4728  * holding the buffer header lock, which causes any reader of an
4729  * LSN who holds only a share lock to also obtain a buffer header
4730  * lock before using PageGetLSN(), which is enforced in
4731  * BufferGetLSNAtomic().
4732  *
4733  * If checksums are enabled, you might think we should reset the
4734  * checksum here. That will happen when the page is written
4735  * sometime later in this checkpoint cycle.
4736  */
4737  if (!XLogRecPtrIsInvalid(lsn))
4738  PageSetLSN(page, lsn);
4739  }
4740 
4741  buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
4742  UnlockBufHdr(bufHdr, buf_state);
4743 
4744  if (delayChkptFlags)
4746 
4747  if (dirtied)
4748  {
4749  VacuumPageDirty++;
4751  if (VacuumCostActive)
4753  }
4754  }
4755 }
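A minimal sketch of the intended usage, assuming the buffer is already pinned and share-locked; PageSetFull() stands in here for whatever hint is being recorded, and the helper name is hypothetical.

#include "postgres.h"
#include "storage/bufmgr.h"
#include "storage/bufpage.h"

/* Sketch: record a non-critical hint on a pinned, share-locked page. */
static void
set_page_hint(Buffer buf)
{
	Page		page = BufferGetPage(buf);

	PageSetFull(page);					/* the hint itself */
	MarkBufferDirtyHint(buf, true);		/* "true": standard page layout */
}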
4756 
4757 /*
4758  * Release buffer content locks for shared buffers.
4759  *
4760  * Used to clean up after errors.
4761  *
4762  * Currently, we can expect that lwlock.c's LWLockReleaseAll() took care
4763  * of releasing buffer content locks per se; the only thing we need to deal
4764  * with here is clearing any PIN_COUNT request that was in progress.
4765  */
4766 void
4767 UnlockBuffers(void)
4768 {
4770 
4771  if (buf)
4772  {
4773  uint32 buf_state;
4774 
4775  buf_state = LockBufHdr(buf);
4776 
4777  /*
4778  * Don't complain if flag bit not set; it could have been reset but we
4779  * got a cancel/die interrupt before getting the signal.
4780  */
4781  if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
4782  buf->wait_backend_pgprocno == MyProcNumber)
4783  buf_state &= ~BM_PIN_COUNT_WAITER;
4784 
4785  UnlockBufHdr(buf, buf_state);
4786 
4787  PinCountWaitBuf = NULL;
4788  }
4789 }
4790 
4791 /*
4792  * Acquire or release the content_lock for the buffer.
4793  */
4794 void
4795 LockBuffer(Buffer buffer, int mode)
4796 {
4797  BufferDesc *buf;
4798 
4800  if (BufferIsLocal(buffer))
4801  return; /* local buffers need no lock */
4802 
4804 
4805  if (mode == BUFFER_LOCK_UNLOCK)
4807  else if (mode == BUFFER_LOCK_SHARE)
4809  else if (mode == BUFFER_LOCK_EXCLUSIVE)
4811  else
4812  elog(ERROR, "unrecognized buffer lock mode: %d", mode);
4813 }
4814 
4815 /*
4816  * Acquire the content_lock for the buffer, but only if we don't have to wait.
4817  *
4818  * This assumes the caller wants BUFFER_LOCK_EXCLUSIVE mode.
4819  */
4820 bool
4821 ConditionalLockBuffer(Buffer buffer)
4822 {
4823  BufferDesc *buf;
4824 
4826  if (BufferIsLocal(buffer))
4827  return true; /* act as though we got it */
4828 
4830 
4832  LW_EXCLUSIVE);
4833 }
4834 
4835 /*
4836  * Verify that this backend is pinning the buffer exactly once.
4837  *
4838  * NOTE: Like in BufferIsPinned(), what we check here is that *this* backend
4839  * holds a pin on the buffer. We do not care whether some other backend does.
4840  */
4841 void
4842 CheckBufferIsPinnedOnce(Buffer buffer)
4843 {
4844  if (BufferIsLocal(buffer))
4845  {
4846  if (LocalRefCount[-buffer - 1] != 1)
4847  elog(ERROR, "incorrect local pin count: %d",
4848  LocalRefCount[-buffer - 1]);
4849  }
4850  else
4851  {
4852  if (GetPrivateRefCount(buffer) != 1)
4853  elog(ERROR, "incorrect local pin count: %d",
4855  }
4856 }
4857 
4858 /*
4859  * LockBufferForCleanup - lock a buffer in preparation for deleting items
4860  *
4861  * Items may be deleted from a disk page only when the caller (a) holds an
4862  * exclusive lock on the buffer and (b) has observed that no other backend
4863  * holds a pin on the buffer. If there is a pin, then the other backend
4864  * might have a pointer into the buffer (for example, a heapscan reference
4865  * to an item --- see README for more details). It's OK if a pin is added
4866  * after the cleanup starts, however; the newly-arrived backend will be
4867  * unable to look at the page until we release the exclusive lock.
4868  *
4869  * To implement this protocol, a would-be deleter must pin the buffer and
4870  * then call LockBufferForCleanup(). LockBufferForCleanup() is similar to
4871  * LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE), except that it loops until
4872  * it has successfully observed pin count = 1.
4873  */
4874 void
4875 LockBufferForCleanup(Buffer buffer)
4876 {
4877  BufferDesc *bufHdr;
4878  TimestampTz waitStart = 0;
4879  bool waiting = false;
4880  bool logged_recovery_conflict = false;
4881 
4883  Assert(PinCountWaitBuf == NULL);
4884 
4886 
4887  /* Nobody else to wait for */
4888  if (BufferIsLocal(buffer))
4889  return;
4890 
4891  bufHdr = GetBufferDescriptor(buffer - 1);
4892 
4893  for (;;)
4894  {
4895  uint32 buf_state;
4896 
4897  /* Try to acquire lock */
4899  buf_state = LockBufHdr(bufHdr);
4900 
4901  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
4902  if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
4903  {
4904  /* Successfully acquired exclusive lock with pincount 1 */
4905  UnlockBufHdr(bufHdr, buf_state);
4906 
4907  /*
4908  * Emit the log message if recovery conflict on buffer pin was
4909  * resolved but the startup process waited longer than
4910  * deadlock_timeout for it.
4911  */
4912  if (logged_recovery_conflict)
4914  waitStart, GetCurrentTimestamp(),
4915  NULL, false);
4916 
4917  if (waiting)
4918  {
4919  /* reset ps display to remove the suffix if we added one */
4921  waiting = false;
4922  }
4923  return;
4924  }
4925  /* Failed, so mark myself as waiting for pincount 1 */
4926  if (buf_state & BM_PIN_COUNT_WAITER)
4927  {
4928  UnlockBufHdr(bufHdr, buf_state);
4930  elog(ERROR, "multiple backends attempting to wait for pincount 1");
4931  }
4933  PinCountWaitBuf = bufHdr;
4934  buf_state |= BM_PIN_COUNT_WAITER;
4935  UnlockBufHdr(bufHdr, buf_state);
4937 
4938  /* Wait to be signaled by UnpinBuffer() */
4939  if (InHotStandby)
4940  {
4941  if (!waiting)
4942  {
4943  /* adjust the process title to indicate that it's waiting */
4944  set_ps_display_suffix("waiting");
4945  waiting = true;
4946  }
4947 
4948  /*
4949  * Emit the log message if the startup process is waiting longer
4950  * than deadlock_timeout for recovery conflict on buffer pin.
4951  *
4952  * Skip this if first time through because the startup process has
4953  * not started waiting yet in this case. So, the wait start
4954  * timestamp is set after this logic.
4955  */
4956  if (waitStart != 0 && !logged_recovery_conflict)
4957  {
4959 
4960  if (TimestampDifferenceExceeds(waitStart, now,
4961  DeadlockTimeout))
4962  {
4964  waitStart, now, NULL, true);
4965  logged_recovery_conflict = true;
4966  }
4967  }
4968 
4969  /*
4970  * Set the wait start timestamp if logging is enabled and first
4971  * time through.
4972  */
4973  if (log_recovery_conflict_waits && waitStart == 0)
4974  waitStart = GetCurrentTimestamp();
4975 
4976  /* Publish the bufid that Startup process waits on */
4978  /* Set alarm and then wait to be signaled by UnpinBuffer() */
4980  /* Reset the published bufid */
4982  }
4983  else
4984  ProcWaitForSignal(WAIT_EVENT_BUFFER_PIN);
4985 
4986  /*
4987  * Remove flag marking us as waiter. Normally this will not be set
4988  * anymore, but ProcWaitForSignal() can return for other signals as
4989  * well. We take care to only reset the flag if we're the waiter, as
4990  * theoretically another backend could have started waiting. That's
4991  * impossible with the current usages due to table level locking, but
4992  * better be safe.
4993  */
4994  buf_state = LockBufHdr(bufHdr);
4995  if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
4997  buf_state &= ~BM_PIN_COUNT_WAITER;
4998  UnlockBufHdr(bufHdr, buf_state);
4999 
5000  PinCountWaitBuf = NULL;
5001  /* Loop back and try again */
5002  }
5003 }
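A sketch of the would-be deleter's side of the protocol described above; the helper name and the cleanup work itself are placeholders.

#include "postgres.h"
#include "storage/bufmgr.h"

/* Sketch of the cleanup protocol: pin first, then wait for pin count == 1. */
static void
cleanup_block(Relation rel, BlockNumber blkno)
{
	Buffer		buf = ReadBuffer(rel, blkno);	/* (a) take our own pin */

	LockBufferForCleanup(buf);					/* (b) exclusive lock, sole pin */

	/* ... now it is safe to delete items from the page ... */

	UnlockReleaseBuffer(buf);
}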
5004 
5005 /*
5006  * Check called from ProcessRecoveryConflictInterrupts() when Startup process
5007  * requests cancellation of all pin holders that are blocking it.
5008  */
5009 bool
5010 HoldingBufferPinThatDelaysRecovery(void)
5011 {
5012  int bufid = GetStartupBufferPinWaitBufId();
5013 
5014  /*
5015  * If we get woken slowly then it's possible that the Startup process was
5016  * already woken by other backends before we got here. Also possible that
5017  * we get here by multiple interrupts or interrupts at inappropriate
5018  * times, so make sure we do nothing if the bufid is not set.
5019  */
5020  if (bufid < 0)
5021  return false;
5022 
5023  if (GetPrivateRefCount(bufid + 1) > 0)
5024  return true;
5025 
5026  return false;
5027 }
5028 
5029 /*
5030  * ConditionalLockBufferForCleanup - as above, but don't wait to get the lock
5031  *
5032  * We won't loop, but just check once to see if the pin count is OK. If
5033  * not, return false with no lock held.
5034  */
5035 bool
5036 ConditionalLockBufferForCleanup(Buffer buffer)
5037 {
5038  BufferDesc *bufHdr;
5039  uint32 buf_state,
5040  refcount;
5041 
5043 
5044  if (BufferIsLocal(buffer))
5045  {
5046  refcount = LocalRefCount[-buffer - 1];
5047  /* There should be exactly one pin */
5048  Assert(refcount > 0);
5049  if (refcount != 1)
5050  return false;
5051  /* Nobody else to wait for */
5052  return true;
5053  }
5054 
5055  /* There should be exactly one local pin */
5057  Assert(refcount);
5058  if (refcount != 1)
5059  return false;
5060 
5061  /* Try to acquire lock */
5063  return false;
5064 
5065  bufHdr = GetBufferDescriptor(buffer - 1);
5066  buf_state = LockBufHdr(bufHdr);
5067  refcount = BUF_STATE_GET_REFCOUNT(buf_state);
5068 
5069  Assert(refcount > 0);
5070  if (refcount == 1)
5071  {
5072  /* Successfully acquired exclusive lock with pincount 1 */
5073  UnlockBufHdr(bufHdr, buf_state);
5074  return true;
5075  }
5076 
5077  /* Failed, so release the lock */
5078  UnlockBufHdr(bufHdr, buf_state);
5080  return false;
5081 }
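A sketch of the "skip it if busy" pattern this variant enables, assuming the caller already holds a pin on the buffer; the helper is hypothetical.

#include "postgres.h"
#include "storage/bufmgr.h"

/* Sketch: try to get a cleanup lock, but skip the page if someone else has a pin. */
static bool
try_cleanup(Buffer buf)
{
	if (!ConditionalLockBufferForCleanup(buf))
		return false;			/* busy: the caller can come back later */

	/* ... cleanup work, as with LockBufferForCleanup ... */

	LockBuffer(buf, BUFFER_LOCK_UNLOCK);
	return true;
}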
5082 
5083 /*
5084  * IsBufferCleanupOK - as above, but we already have the lock
5085  *
5086  * Check whether it's OK to perform cleanup on a buffer we've already
5087  * locked. If we observe that the pin count is 1, our exclusive lock
5088  * happens to be a cleanup lock, and we can proceed with anything that
5089  * would have been allowable had we sought a cleanup lock originally.
5090  */
5091 bool
5092 IsBufferCleanupOK(Buffer buffer)
5093 {
5094  BufferDesc *bufHdr;
5095  uint32 buf_state;
5096 
5098 
5099  if (BufferIsLocal(buffer))
5100  {
5101  /* There should be exactly one pin */
5102  if (LocalRefCount[-buffer - 1] != 1)
5103  return false;
5104  /* Nobody else to wait for */
5105  return true;
5106  }
5107 
5108  /* There should be exactly one local pin */
5109  if (GetPrivateRefCount(buffer) != 1)
5110  return false;
5111 
5112  bufHdr = GetBufferDescriptor(buffer - 1);
5113 
5114  /* caller must hold exclusive lock on buffer */
5116  LW_EXCLUSIVE));
5117 
5118  buf_state = LockBufHdr(bufHdr);
5119 
5120  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
5121  if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
5122  {
5123  /* pincount is OK. */
5124  UnlockBufHdr(bufHdr, buf_state);
5125  return true;
5126  }
5127 
5128  UnlockBufHdr(bufHdr, buf_state);
5129  return false;
5130 }
5131 
5132 
5133 /*
5134  * Functions for buffer I/O handling
5135  *
5136  * Note: We assume that nested buffer I/O never occurs.
5137  * i.e. at most one BM_IO_IN_PROGRESS bit is set per proc.
5138  *
5139  * Also note that these are used only for shared buffers, not local ones.
5140  */
5141 
5142 /*
5143  * WaitIO -- Block until the IO_IN_PROGRESS flag on 'buf' is cleared.
5144  */
5145 static void
5146 WaitIO(BufferDesc *buf)
5147 {
5149 
5151  for (;;)
5152  {
5153  uint32 buf_state;
5154 
5155  /*
5156  * It may not be necessary to acquire the spinlock to check the flag
5157  * here, but since this test is essential for correctness, we'd better
5158  * play it safe.
5159  */
5160  buf_state = LockBufHdr(buf);
5161  UnlockBufHdr(buf, buf_state);
5162 
5163  if (!(buf_state & BM_IO_IN_PROGRESS))
5164  break;
5165  ConditionVariableSleep(cv, WAIT_EVENT_BUFFER_IO);
5166  }
5168 }
5169 
5170 /*
5171  * StartBufferIO: begin I/O on this buffer
5172  * (Assumptions)
5173  * My process is executing no IO
5174  * The buffer is Pinned
5175  *
5176  * In some scenarios there are race conditions in which multiple backends
5177  * could attempt the same I/O operation concurrently. If someone else
5178  * has already started I/O on this buffer then we will block on the
5179  * I/O condition variable until it's done.
5180  *
5181  * Input operations are only attempted on buffers that are not BM_VALID,
5182  * and output operations only on buffers that are BM_VALID and BM_DIRTY,
5183  * so we can always tell if the work is already done.
5184  *
5185  * Returns true if we successfully marked the buffer as I/O busy,
5186  * false if someone else already did the work.
5187  */
5188 static bool
5189 StartBufferIO(BufferDesc *buf, bool forInput)
5190 {
5191  uint32 buf_state;
5192 
5194 
5195  for (;;)
5196  {
5197  buf_state = LockBufHdr(buf);
5198 
5199  if (!(buf_state & BM_IO_IN_PROGRESS))
5200  break;
5201  UnlockBufHdr(buf, buf_state);
5202  WaitIO(buf);
5203  }
5204 
5205  /* Once we get here, there is definitely no I/O active on this buffer */
5206 
5207  if (forInput ? (buf_state & BM_VALID) : !(buf_state & BM_DIRTY))
5208  {
5209  /* someone else already did the I/O */
5210  UnlockBufHdr(buf, buf_state);
5211  return false;
5212  }
5213 
5214  buf_state |= BM_IO_IN_PROGRESS;
5215  UnlockBufHdr(buf, buf_state);
5216 
5219 
5220  return true;
5221 }
5222 
5223 /*
5224  * TerminateBufferIO: release a buffer we were doing I/O on
5225  * (Assumptions)
5226  * My process is executing IO for the buffer
5227  * BM_IO_IN_PROGRESS bit is set for the buffer
5228  * The buffer is Pinned
5229  *
5230  * If clear_dirty is true and BM_JUST_DIRTIED is not set, we clear the
5231  * buffer's BM_DIRTY flag. This is appropriate when terminating a
5232  * successful write. The check on BM_JUST_DIRTIED is necessary to avoid
5233  * marking the buffer clean if it was re-dirtied while we were writing.
5234  *
5235  * set_flag_bits gets ORed into the buffer's flags. It must include
5236  * BM_IO_ERROR in a failure case. For successful completion it could
5237  * be 0, or BM_VALID if we just finished reading in the page.
5238  *
5239  * If forget_owner is true, we release the buffer I/O from the current
5240  * resource owner. (forget_owner=false is used when the resource owner itself
5241  * is being released)
5242  */
5243 static void
5244 TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits,
5245  bool forget_owner)
5246 {
5247  uint32 buf_state;
5248 
5249  buf_state = LockBufHdr(buf);
5250 
5251  Assert(buf_state & BM_IO_IN_PROGRESS);
5252 
5253  buf_state &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR);
5254  if (clear_dirty && !(buf_state & BM_JUST_DIRTIED))
5255  buf_state &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
5256 
5257  buf_state |= set_flag_bits;
5258  UnlockBufHdr(buf, buf_state);
5259 
5260  if (forget_owner)
5263 
5265 }
5266 
5267 /*
5268  * AbortBufferIO: Clean up active buffer I/O after an error.
5269  *
5270  * All LWLocks we might have held have been released,
5271  * but we haven't yet released buffer pins, so the buffer is still pinned.
5272  *
5273  * If I/O was in progress, we always set BM_IO_ERROR, even though it's
5274  * possible the error condition wasn't related to the I/O.
5275  *
5276  * Note: this does not remove the buffer I/O from the resource owner.
5277  * That's correct when we're releasing the whole resource owner, but
5278  * beware if you use this in other contexts.
5279  */
5280 static void
5281 AbortBufferIO(Buffer buffer)
5282 {
5283  BufferDesc *buf_hdr = GetBufferDescriptor(buffer - 1);
5284  uint32 buf_state;
5285 
5286  buf_state = LockBufHdr(buf_hdr);
5287  Assert(buf_state & (BM_IO_IN_PROGRESS | BM_TAG_VALID));
5288 
5289  if (!(buf_state & BM_VALID))
5290  {
5291  Assert(!(buf_state & BM_DIRTY));
5292  UnlockBufHdr(buf_hdr, buf_state);
5293  }
5294  else
5295  {
5296  Assert(buf_state & BM_DIRTY);
5297  UnlockBufHdr(buf_hdr, buf_state);
5298 
5299  /* Issue notice if this is not the first failure... */
5300  if (buf_state & BM_IO_ERROR)
5301  {
5302  /* Buffer is pinned, so we can read tag without spinlock */
5303  char *path;
5304 
5305  path = relpathperm(BufTagGetRelFileLocator(&buf_hdr->tag),
5306  BufTagGetForkNum(&buf_hdr->tag));
5307  ereport(WARNING,
5308  (errcode(ERRCODE_IO_ERROR),
5309  errmsg("could not write block %u of %s",
5310  buf_hdr->tag.blockNum, path),
5311  errdetail("Multiple failures --- write error might be permanent.")));
5312  pfree(path);
5313  }
5314  }
5315 
5316  TerminateBufferIO(buf_hdr, false, BM_IO_ERROR, false);
5317 }
5318 
5319 /*
5320  * Error context callback for errors occurring during shared buffer writes.
5321  */
5322 static void
5323 shared_buffer_write_error_callback(void *arg)
5324 {
5325  BufferDesc *bufHdr = (BufferDesc *) arg;
5326 
5327  /* Buffer is pinned, so we can read the tag without locking the spinlock */
5328  if (bufHdr != NULL)
5329  {
5330  char *path = relpathperm(BufTagGetRelFileLocator(&bufHdr->tag),
5331  BufTagGetForkNum(&bufHdr->tag));
5332 
5333  errcontext("writing block %u of relation %s",
5334  bufHdr->tag.blockNum, path);
5335  pfree(path);
5336  }
5337 }
5338 
5339 /*
5340  * Error context callback for errors occurring during local buffer writes.
5341  */
5342 static void
5343 local_buffer_write_error_callback(void *arg)
5344 {
5345  BufferDesc *bufHdr = (BufferDesc *) arg;
5346 
5347  if (bufHdr != NULL)
5348  {
5349  char *path = relpathbackend(BufTagGetRelFileLocator(&bufHdr->tag),
5350  MyProcNumber,
5351  BufTagGetForkNum(&bufHdr->tag));
5352 
5353  errcontext("writing block %u of relation %s",
5354  bufHdr->tag.blockNum, path);
5355  pfree(path);
5356  }
5357 }
5358 
5359 /*
5360  * RelFileLocator qsort/bsearch comparator; see RelFileLocatorEquals.
5361  */
5362 static int
5363 rlocator_comparator(const void *p1, const void *p2)
5364 {
5365  RelFileLocator n1 = *(const RelFileLocator *) p1;
5366  RelFileLocator n2 = *(const RelFileLocator *) p2;
5367 
5368  if (n1.relNumber < n2.relNumber)
5369  return -1;
5370  else if (n1.relNumber > n2.relNumber)
5371  return 1;
5372 
5373  if (n1.dbOid < n2.dbOid)
5374  return -1;
5375  else if (n1.dbOid > n2.dbOid)
5376  return 1;
5377 
5378  if (n1.spcOid < n2.spcOid)
5379  return -1;
5380  else if (n1.spcOid > n2.spcOid)
5381  return 1;
5382  else
5383  return 0;
5384 }
5385 
5386 /*
5387  * Lock buffer header - set BM_LOCKED in buffer state.
5388  */
5389 uint32
5390 LockBufHdr(BufferDesc *desc)
5391 {
5392  SpinDelayStatus delayStatus;
5393  uint32 old_buf_state;
5394 
5396 
5397  init_local_spin_delay(&delayStatus);
5398 
5399  while (true)
5400  {
5401  /* set BM_LOCKED flag */
5402  old_buf_state = pg_atomic_fetch_or_u32(&desc->state, BM_LOCKED);
5403  /* if it wasn't set before we're OK */
5404  if (!(old_buf_state & BM_LOCKED))
5405  break;
5406  perform_spin_delay(&delayStatus);
5407  }
5408  finish_spin_delay(&delayStatus);
5409  return old_buf_state | BM_LOCKED;
5410 }
5411 
5412 /*
5413  * Wait until the BM_LOCKED flag isn't set anymore and return the buffer's
5414  * state at that point.
5415  *
5416  * Obviously the buffer could be locked by the time the value is returned, so
5417  * this is primarily useful in CAS style loops.
5418  */
5419 static uint32
5420 WaitBufHdrUnlocked(BufferDesc *buf)
5421 {
5422  SpinDelayStatus delayStatus;
5423  uint32 buf_state;
5424 
5425  init_local_spin_delay(&delayStatus);
5426 
5427  buf_state = pg_atomic_read_u32(&buf->state);
5428 
5429  while (buf_state & BM_LOCKED)
5430  {
5431  perform_spin_delay(&delayStatus);
5432  buf_state = pg_atomic_read_u32(&buf->state);
5433  }
5434 
5435  finish_spin_delay(&delayStatus);
5436 
5437  return buf_state;
5438 }
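The CAS-style loops mentioned above look roughly like the sketch below, modeled on this file's pin path; the helper name is hypothetical, and the code assumes it lives inside bufmgr.c, since WaitBufHdrUnlocked() is static here.

/* Hypothetical helper inside bufmgr.c: bump the refcount with a CAS loop. */
static void
pin_buffer_cas_sketch(BufferDesc *buf)
{
	uint32		old_buf_state = pg_atomic_read_u32(&buf->state);
	uint32		buf_state;

	for (;;)
	{
		/* spin only while the header spinlock is held by someone else */
		if (old_buf_state & BM_LOCKED)
			old_buf_state = WaitBufHdrUnlocked(buf);

		buf_state = old_buf_state;
		buf_state += BUF_REFCOUNT_ONE;		/* the state change we want */

		/* on failure, old_buf_state is reloaded with the current value */
		if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
										   buf_state))
			break;
	}
}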
5439 
5440 /*
5441  * BufferTag comparator.
5442  */
5443 static inline int
5444 buffertag_comparator(const BufferTag *ba, const BufferTag *bb)
5445 {
5446  int ret;
5447  RelFileLocator rlocatora;
5448  RelFileLocator rlocatorb;
5449 
5450  rlocatora = BufTagGetRelFileLocator(ba);
5451  rlocatorb = BufTagGetRelFileLocator(bb);
5452 
5453  ret = rlocator_comparator(&rlocatora, &rlocatorb);
5454 
5455  if (ret != 0)
5456  return ret;
5457 
5458  if (BufTagGetForkNum(ba) < BufTagGetForkNum(bb))
5459  return -1;
5460  if (BufTagGetForkNum(ba) > BufTagGetForkNum(bb))
5461  return 1;
5462 
5463  if (ba->blockNum < bb->blockNum)
5464  return -1;
5465  if (ba->blockNum > bb->blockNum)
5466  return 1;
5467 
5468  return 0;
5469 }
5470 
5471 /*
5472  * Comparator determining the writeout order in a checkpoint.
5473  *
5474  * It is important that tablespaces are compared first; the logic balancing
5475  * writes between tablespaces relies on it.
5476  */
5477 static inline int
5478 ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b)
5479 {
5480  /* compare tablespace */
5481  if (a->tsId < b->tsId)
5482  return -1;
5483  else if (a->tsId > b->tsId)
5484  return 1;
5485  /* compare relation */
5486  if (a->relNumber < b->relNumber)
5487  return -1;
5488  else if (a->relNumber > b->relNumber)
5489  return 1;
5490  /* compare fork */
5491  else if (a->forkNum < b->forkNum)
5492  return -1;
5493  else if (a->forkNum > b->forkNum)
5494  return 1;
5495  /* compare block number */
5496  else if (a->blockNum < b->blockNum)
5497  return -1;
5498  else if (a->blockNum > b->blockNum)
5499  return 1;
5500  /* equal page IDs are unlikely, but not impossible */
5501  return 0;
5502 }
5503 
5504 /*
5505  * Comparator for a Min-Heap over the per-tablespace checkpoint completion
5506  * progress.
5507  */
5508 static int
5509 ts_ckpt_progress_comparator(Datum a, Datum b, void *arg)
5510 {
5511  CkptTsStatus *sa = (CkptTsStatus *) a;
5512  CkptTsStatus *sb = (CkptTsStatus *) b;
5513 
5514  /* we want a min-heap, so return 1 when a < b */
5515  if (sa->progress < sb->progress)
5516  return 1;
5517  else if (sa->progress == sb->progress)
5518  return 0;
5519  else
5520  return -1;
5521 }
5522 
5523 /*
5524  * Initialize a writeback context, discarding potential previous state.
5525  *
5526  * *max_pending is a pointer instead of an immediate value, so the coalesce
5527  * limits can easily be changed by the GUC mechanism, and so calling code does
5528  * not have to check the current configuration. A value of 0 means that no
5529  * writeback control will be performed.
5530  */
5531 void
5532 WritebackContextInit(WritebackContext *context, int *max_pending)
5533 {
5534  Assert(*max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
5535 
5536  context->max_pending = max_pending;
5537  context->nr_pending = 0;
5538 }
5539 
5540 /*
5541  * Add buffer to list of pending writeback requests.
5542  */
5543 void
5544 ScheduleBufferTagForWriteback(WritebackContext *wb_context, IOContext io_context,
5545  BufferTag *tag)
5546 {
5547  PendingWriteback *pending;
5548 
5550  return;
5551 
5552  /*
5553  * Add buffer to the pending writeback array, unless writeback control is
5554  * disabled.
5555  */
5556  if (*wb_context->max_pending > 0)
5557  {
5559 
5560  pending = &wb_context->pending_writebacks[wb_context->nr_pending++];
5561 
5562  pending->tag = *tag;
5563  }
5564 
5565  /*
5566  * Perform pending flushes if the writeback limit is exceeded. This
5567  * includes the case where previously an item has been added, but control
5568  * is now disabled.
5569  */
5570  if (wb_context->nr_pending >= *wb_context->max_pending)
5571  IssuePendingWritebacks(wb_context, io_context);
5572 }
5573 
5574 #define ST_SORT sort_pending_writebacks
5575 #define ST_ELEMENT_TYPE PendingWriteback
5576 #define ST_COMPARE(a, b) buffertag_comparator(&a->tag, &b->tag)
5577 #define ST_SCOPE static
5578 #define ST_DEFINE
5579 #include <lib/sort_template.h>
5580 
5581 /*
5582  * Issue all pending writeback requests, previously scheduled with
5583  * ScheduleBufferTagForWriteback, to the OS.
5584  *
5585  * Because this is only used to improve the OS's I/O scheduling, we try to never
5586  * error out - it's just a hint.
5587  */
5588 void
5589 IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context)
5590 {
5591  instr_time io_start;
5592  int i;
5593 
5594  if (wb_context->nr_pending == 0)
5595  return;
5596 
5597  /*
5598  * Executing the writes in order can make them a lot faster, and allows us to
5599  * merge writeback requests for consecutive blocks into larger writebacks.
5600  */
5601  sort_pending_writebacks(wb_context->pending_writebacks,
5602  wb_context->nr_pending);
5603 
5605 
5606  /*
5607  * Coalesce neighbouring writes, but nothing else. For that we iterate
5608  * through the now-sorted array of pending flushes, and look forward to
5609  * find all neighbouring (or identical) writes.
5610  */
5611  for (i = 0; i < wb_context->nr_pending; i++)
5612  {
5615  SMgrRelation reln;
5616  int ahead;
5617  BufferTag tag;
5618  RelFileLocator currlocator;
5619  Size nblocks = 1;
5620 
5621  cur = &wb_context->pending_writebacks[i];
5622  tag = cur->tag;
5623  currlocator = BufTagGetRelFileLocator(&tag);
5624 
5625  /*
5626  * Peek ahead, into following writeback requests, to see if they can
5627  * be combined with the current one.
5628  */
5629  for (ahead = 0; i + ahead + 1 < wb_context->nr_pending; ahead++)
5630  {
5631 
5632  next = &wb_context->pending_writebacks[i + ahead + 1];
5633 
5634  /* different file, stop */
5635  if (!RelFileLocatorEquals(currlocator,
5636  BufTagGetRelFileLocator(&next->tag)) ||
5637  BufTagGetForkNum(&cur->tag) != BufTagGetForkNum(&next->tag))
5638  break;
5639 
5640  /* ok, block queued twice, skip */
5641  if (cur->tag.blockNum == next->tag.blockNum)
5642  continue;
5643 
5644  /* only merge consecutive writes */
5645  if (cur->tag.blockNum + 1 != next->tag.blockNum)
5646  break;
5647 
5648  nblocks++;
5649  cur = next;
5650  }
5651 
5652  i += ahead;
5653 
5654  /* and finally tell the kernel to write the data to storage */
5655  reln = smgropen(currlocator, INVALID_PROC_NUMBER);
5656  smgrwriteback(reln, BufTagGetForkNum(&tag), tag.blockNum, nblocks);
5657  }
5658 
5659  /*
5660  * Assume that writeback requests are only issued for buffers containing
5661  * blocks of permanent relations.
5662  */
5664  IOOP_WRITEBACK, io_start, wb_context->nr_pending);
5665 
5666  wb_context->nr_pending = 0;
5667 }
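Putting the three writeback-control routines together, a flushing loop would drive them roughly as sketched below; the helper, the tag array, and the *flush_after* limit are placeholders, not code from this tree.

#include "postgres.h"
#include "pgstat.h"
#include "storage/buf_internals.h"

/*
 * Sketch: how a flushing loop would use writeback control.  Each written
 * buffer's tag is queued, and anything still pending is issued at the end.
 */
static void
flush_with_writeback_control(BufferTag *tags, int ntags, int *flush_after)
{
	WritebackContext wb_context;

	WritebackContextInit(&wb_context, flush_after);

	for (int i = 0; i < ntags; i++)
	{
		/* ... a FlushBuffer()-style write of the buffer happens here ... */
		ScheduleBufferTagForWriteback(&wb_context, IOCONTEXT_NORMAL, &tags[i]);
	}

	/* push out anything still queued */
	IssuePendingWritebacks(&wb_context, IOCONTEXT_NORMAL);
}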
5668 
5669 /* ResourceOwner callbacks */
5670 
5671 static void
5672 ResOwnerReleaseBufferIO(Datum res)
5673 {
5675 
5677 }
5678 
5679 static char *
5680 ResOwnerPrintBufferIO(Datum res)
5681 {
5683 
5684  return psprintf("lost track of buffer IO on buffer %d", buffer);
5685 }
5686 
5687 static void
5688 ResOwnerReleaseBufferPin(Datum res)
5689 {
5690  Buffer buffer = DatumGetInt32(res);
5691 
5692  /* Like ReleaseBuffer, but don't call ResourceOwnerForgetBuffer */
5693  if (!BufferIsValid(buffer))
5694  elog(ERROR, "bad buffer ID: %d", buffer);
5695 
5696  if (BufferIsLocal(buffer))
5697  UnpinLocalBufferNoOwner(buffer);
5698  else
5699  UnpinBufferNoOwner(GetBufferDescriptor(buffer - 1));
5700 }
5701 
5702 static char *
5703 ResOwnerPrintBufferPin(Datum res)
5704 {
5705  return DebugPrintBufferRefcount(DatumGetInt32(res));
5706 }
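/*
 * The four callbacks above are hooked into the generic ResourceOwner
 * machinery via the ResourceOwnerDesc constants defined near the top of this
 * file (buffer_io_resowner_desc and buffer_pin_resowner_desc), so that pins
 * and buffer I/O left behind at transaction abort get cleaned up and
 * reported.  A rough sketch of that wiring, reconstructed here for
 * illustration only (the real descriptors in bufmgr.c may differ in detail):
 */
static const ResourceOwnerDesc buffer_pin_resowner_desc_sketch =
{
	.name = "buffer pin",
	.release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
	.release_priority = RELEASE_PRIO_BUFFER_PINS,
	.ReleaseResource = ResOwnerReleaseBufferPin,
	.DebugPrint = ResOwnerPrintBufferPin
};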