PostgreSQL Source Code  git master
bufmgr.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * bufmgr.c
4  * buffer manager interface routines
5  *
6  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/storage/buffer/bufmgr.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 /*
16  * Principal entry points:
17  *
18  * ReadBuffer() -- find or create a buffer holding the requested page,
19  * and pin it so that no one can destroy it while this process
20  * is using it.
21  *
22  * StartReadBuffer() -- as above, with separate wait step
23  * StartReadBuffers() -- multiple block version
24  * WaitReadBuffers() -- second step of above
25  *
26  * ReleaseBuffer() -- unpin a buffer
27  *
28  * MarkBufferDirty() -- mark a pinned buffer's contents as "dirty".
29  * The disk write is delayed until buffer replacement or checkpoint.
30  *
31  * See also these files:
32  * freelist.c -- chooses victim for buffer replacement
33  * buf_table.c -- manages the buffer lookup table
34  */
35 #include "postgres.h"
36 
37 #include <sys/file.h>
38 #include <unistd.h>
39 
40 #include "access/tableam.h"
41 #include "access/xloginsert.h"
42 #include "access/xlogutils.h"
43 #include "catalog/storage.h"
44 #include "catalog/storage_xlog.h"
45 #include "executor/instrument.h"
46 #include "lib/binaryheap.h"
47 #include "miscadmin.h"
48 #include "pg_trace.h"
49 #include "pgstat.h"
50 #include "postmaster/bgwriter.h"
51 #include "storage/buf_internals.h"
52 #include "storage/bufmgr.h"
53 #include "storage/fd.h"
54 #include "storage/ipc.h"
55 #include "storage/lmgr.h"
56 #include "storage/proc.h"
57 #include "storage/read_stream.h"
58 #include "storage/smgr.h"
59 #include "storage/standby.h"
60 #include "utils/memdebug.h"
61 #include "utils/ps_status.h"
62 #include "utils/rel.h"
63 #include "utils/resowner.h"
64 #include "utils/timestamp.h"
65 
66 
67 /* Note: these two macros only work on shared buffers, not local ones! */
68 #define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
69 #define BufferGetLSN(bufHdr) (PageGetLSN(BufHdrGetBlock(bufHdr)))
70 
71 /* Note: this macro only works on local buffers, not shared ones! */
72 #define LocalBufHdrGetBlock(bufHdr) \
73  LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
74 
75 /* Bits in SyncOneBuffer's return value */
76 #define BUF_WRITTEN 0x01
77 #define BUF_REUSABLE 0x02
78 
79 #define RELS_BSEARCH_THRESHOLD 20
80 
81 /*
82  * This is the size (in the number of blocks) above which we scan the
83  * entire buffer pool to remove the buffers for all the pages of relation
84  * being dropped. For the relations with size below this threshold, we find
85  * the buffers by doing lookups in BufMapping table.
86  */
87 #define BUF_DROP_FULL_SCAN_THRESHOLD (uint64) (NBuffers / 32)
88 
89 typedef struct PrivateRefCountEntry
90 {
94 
95 /* 64 bytes, about the size of a cache line on common systems */
96 #define REFCOUNT_ARRAY_ENTRIES 8
97 
98 /*
99  * Status of buffers to checkpoint for a particular tablespace, used
100  * internally in BufferSync.
101  */
102 typedef struct CkptTsStatus
103 {
104  /* oid of the tablespace */
106 
107  /*
108  * Checkpoint progress for this tablespace. To make progress comparable
109  * between tablespaces the progress is, for each tablespace, measured as a
110  * number between 0 and the total number of to-be-checkpointed pages. Each
111  * page checkpointed in this tablespace increments this space's progress
112  * by progress_slice.
113  */
116 
117  /* number of to-be checkpointed pages in this tablespace */
119  /* already processed pages in this tablespace */
121 
122  /* current offset in CkptBufferIds for this tablespace */
123  int index;
125 
126 /*
127  * Type for array used to sort SMgrRelations
128  *
129  * FlushRelationsAllBuffers shares the same comparator function with
130  * DropRelationsAllBuffers. Pointer to this struct and RelFileLocator must be
131  * compatible.
132  */
133 typedef struct SMgrSortArray
134 {
135  RelFileLocator rlocator; /* This must be the first member */
138 
139 /*
140  * Helper struct for read stream object used in
141  * RelationCopyStorageUsingBuffer() function.
142  */
144 {
147 };
148 
149 /*
150  * Callback function to get next block for read stream object used in
151  * RelationCopyStorageUsingBuffer() function.
152  */
153 static BlockNumber
155  void *callback_private_data,
156  void *per_buffer_data)
157 {
158  struct copy_storage_using_buffer_read_stream_private *p = callback_private_data;
159 
160  if (p->blocknum < p->nblocks)
161  return p->blocknum++;
162 
163  return InvalidBlockNumber;
164 }
165 
166 /* GUC variables */
167 bool zero_damaged_pages = false;
170 bool track_io_timing = false;
171 
172 /*
173  * How many buffers PrefetchBuffer callers should try to stay ahead of their
174  * ReadBuffer calls by. Zero means "never prefetch". This value is only used
175  * for buffers not belonging to tablespaces that have their
176  * effective_io_concurrency parameter set.
177  */
179 
180 /*
181  * Like effective_io_concurrency, but used by maintenance code paths that might
182  * benefit from a higher setting because they work on behalf of many sessions.
183  * Overridden by the tablespace setting of the same name.
184  */
186 
187 /*
188  * Limit on how many blocks should be handled in single I/O operations.
189  * StartReadBuffers() callers should respect it, as should other operations
190  * that call smgr APIs directly.
191  */
193 
194 /*
195  * GUC variables about triggering kernel writeback for buffers written; OS
196  * dependent defaults are set via the GUC mechanism.
197  */
201 
202 /* local state for LockBufferForCleanup */
204 
205 /*
206  * Backend-Private refcount management:
207  *
208  * Each buffer also has a private refcount that keeps track of the number of
209  * times the buffer is pinned in the current process. This is so that the
210  * shared refcount needs to be modified only once if a buffer is pinned more
211  * than once by an individual backend. It's also used to check that no buffers
212  * are still pinned at the end of transactions and when exiting.
213  *
214  *
215  * To avoid - as we used to - requiring an array with NBuffers entries to keep
216  * track of local buffers, we use a small sequentially searched array
217  * (PrivateRefCountArray) and an overflow hash table (PrivateRefCountHash) to
218  * keep track of backend local pins.
219  *
220  * Until no more than REFCOUNT_ARRAY_ENTRIES buffers are pinned at once, all
221  * refcounts are kept track of in the array; after that, new array entries
222  * displace old ones into the hash table. That way a frequently used entry
223  * can't get "stuck" in the hashtable while infrequent ones clog the array.
224  *
225  * Note that in most scenarios the number of pinned buffers will not exceed
226  * REFCOUNT_ARRAY_ENTRIES.
227  *
228  *
229  * To enter a buffer into the refcount tracking mechanism first reserve a free
230  * entry using ReservePrivateRefCountEntry() and then later, if necessary,
231  * fill it with NewPrivateRefCountEntry(). That split lets us avoid doing
232  * memory allocations in NewPrivateRefCountEntry() which can be important
233  * because in some scenarios it's called with a spinlock held...
234  */
236 static HTAB *PrivateRefCountHash = NULL;
240 
241 static void ReservePrivateRefCountEntry(void);
244 static inline int32 GetPrivateRefCount(Buffer buffer);
246 
247 /* ResourceOwner callbacks to hold in-progress I/Os and buffer pins */
248 static void ResOwnerReleaseBufferIO(Datum res);
249 static char *ResOwnerPrintBufferIO(Datum res);
250 static void ResOwnerReleaseBufferPin(Datum res);
251 static char *ResOwnerPrintBufferPin(Datum res);
252 
254 {
255  .name = "buffer io",
256  .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
257  .release_priority = RELEASE_PRIO_BUFFER_IOS,
258  .ReleaseResource = ResOwnerReleaseBufferIO,
259  .DebugPrint = ResOwnerPrintBufferIO
260 };
261 
263 {
264  .name = "buffer pin",
265  .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
266  .release_priority = RELEASE_PRIO_BUFFER_PINS,
267  .ReleaseResource = ResOwnerReleaseBufferPin,
268  .DebugPrint = ResOwnerPrintBufferPin
269 };
270 
271 /*
272  * Ensure that the PrivateRefCountArray has sufficient space to store one more
273  * entry. This has to be called before using NewPrivateRefCountEntry() to fill
274  * a new entry - but it's perfectly fine to not use a reserved entry.
275  */
276 static void
278 {
279  /* Already reserved (or freed), nothing to do */
280  if (ReservedRefCountEntry != NULL)
281  return;
282 
283  /*
284  * First search for a free entry the array, that'll be sufficient in the
285  * majority of cases.
286  */
287  {
288  int i;
289 
290  for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
291  {
293 
295 
296  if (res->buffer == InvalidBuffer)
297  {
299  return;
300  }
301  }
302  }
303 
304  /*
305  * No luck. All array entries are full. Move one array entry into the hash
306  * table.
307  */
308  {
309  /*
310  * Move entry from the current clock position in the array into the
311  * hashtable. Use that slot.
312  */
313  PrivateRefCountEntry *hashent;
314  bool found;
315 
316  /* select victim slot */
319 
320  /* Better be used, otherwise we shouldn't get here. */
322 
323  /* enter victim array entry into hashtable */
326  HASH_ENTER,
327  &found);
328  Assert(!found);
330 
331  /* clear the now free array slot */
334 
336  }
337 }
338 
339 /*
340  * Fill a previously reserved refcount entry.
341  */
342 static PrivateRefCountEntry *
344 {
346 
347  /* only allowed to be called when a reservation has been made */
348  Assert(ReservedRefCountEntry != NULL);
349 
350  /* use up the reserved entry */
352  ReservedRefCountEntry = NULL;
353 
354  /* and fill it */
355  res->buffer = buffer;
356  res->refcount = 0;
357 
358  return res;
359 }
360 
361 /*
362  * Return the PrivateRefCount entry for the passed buffer.
363  *
364  * Returns NULL if a buffer doesn't have a refcount entry. Otherwise, if
365  * do_move is true, and the entry resides in the hashtable the entry is
366  * optimized for frequent access by moving it to the array.
367  */
368 static PrivateRefCountEntry *
370 {
372  int i;
373 
376 
377  /*
378  * First search for references in the array, that'll be sufficient in the
379  * majority of cases.
380  */
381  for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
382  {
384 
385  if (res->buffer == buffer)
386  return res;
387  }
388 
389  /*
390  * By here we know that the buffer, if already pinned, isn't residing in
391  * the array.
392  *
393  * Only look up the buffer in the hashtable if we've previously overflowed
394  * into it.
395  */
396  if (PrivateRefCountOverflowed == 0)
397  return NULL;
398 
400 
401  if (res == NULL)
402  return NULL;
403  else if (!do_move)
404  {
405  /* caller doesn't want us to move the hash entry into the array */
406  return res;
407  }
408  else
409  {
410  /* move buffer from hashtable into the free array slot */
411  bool found;
413 
414  /* Ensure there's a free array slot */
416 
417  /* Use up the reserved slot */
418  Assert(ReservedRefCountEntry != NULL);
420  ReservedRefCountEntry = NULL;
421  Assert(free->buffer == InvalidBuffer);
422 
423  /* and fill it */
424  free->buffer = buffer;
425  free->refcount = res->refcount;
426 
427  /* delete from hashtable */
429  Assert(found);
432 
433  return free;
434  }
435 }
436 
437 /*
438  * Returns how many times the passed buffer is pinned by this backend.
439  *
440  * Only works for shared memory buffers!
441  */
442 static inline int32
444 {
446 
449 
450  /*
451  * Not moving the entry - that's ok for the current users, but we might
452  * want to change this one day.
453  */
454  ref = GetPrivateRefCountEntry(buffer, false);
455 
456  if (ref == NULL)
457  return 0;
458  return ref->refcount;
459 }
460 
461 /*
462  * Release resources used to track the reference count of a buffer which we no
463  * longer have pinned and don't want to pin again immediately.
464  */
465 static void
467 {
468  Assert(ref->refcount == 0);
469 
470  if (ref >= &PrivateRefCountArray[0] &&
472  {
473  ref->buffer = InvalidBuffer;
474 
475  /*
476  * Mark the just used entry as reserved - in many scenarios that
477  * allows us to avoid ever having to search the array/hash for free
478  * entries.
479  */
480  ReservedRefCountEntry = ref;
481  }
482  else
483  {
484  bool found;
485  Buffer buffer = ref->buffer;
486 
488  Assert(found);
491  }
492 }
493 
494 /*
495  * BufferIsPinned
496  * True iff the buffer is pinned (also checks for valid buffer number).
497  *
498  * NOTE: what we check here is that *this* backend holds a pin on
499  * the buffer. We do not care whether some other backend does.
500  */
501 #define BufferIsPinned(bufnum) \
502 ( \
503  !BufferIsValid(bufnum) ? \
504  false \
505  : \
506  BufferIsLocal(bufnum) ? \
507  (LocalRefCount[-(bufnum) - 1] > 0) \
508  : \
509  (GetPrivateRefCount(bufnum) > 0) \
510 )
511 
512 
514  SMgrRelation smgr, char smgr_persistence,
515  ForkNumber forkNum, BlockNumber blockNum,
518  ForkNumber fork,
519  BufferAccessStrategy strategy,
520  uint32 flags,
521  uint32 extend_by,
522  BlockNumber extend_upto,
523  Buffer *buffers,
524  uint32 *extended_by);
526  ForkNumber fork,
527  BufferAccessStrategy strategy,
528  uint32 flags,
529  uint32 extend_by,
530  BlockNumber extend_upto,
531  Buffer *buffers,
532  uint32 *extended_by);
533 static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy);
534 static void PinBuffer_Locked(BufferDesc *buf);
535 static void UnpinBuffer(BufferDesc *buf);
536 static void UnpinBufferNoOwner(BufferDesc *buf);
537 static void BufferSync(int flags);
539 static int SyncOneBuffer(int buf_id, bool skip_recently_used,
540  WritebackContext *wb_context);
541 static void WaitIO(BufferDesc *buf);
542 static bool StartBufferIO(BufferDesc *buf, bool forInput, bool nowait);
543 static void TerminateBufferIO(BufferDesc *buf, bool clear_dirty,
544  uint32 set_flag_bits, bool forget_owner);
545 static void AbortBufferIO(Buffer buffer);
546 static void shared_buffer_write_error_callback(void *arg);
547 static void local_buffer_write_error_callback(void *arg);
548 static inline BufferDesc *BufferAlloc(SMgrRelation smgr,
549  char relpersistence,
550  ForkNumber forkNum,
551  BlockNumber blockNum,
552  BufferAccessStrategy strategy,
553  bool *foundPtr, IOContext io_context);
554 static Buffer GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context);
555 static void FlushBuffer(BufferDesc *buf, SMgrRelation reln,
556  IOObject io_object, IOContext io_context);
557 static void FindAndDropRelationBuffers(RelFileLocator rlocator,
558  ForkNumber forkNum,
559  BlockNumber nForkBlock,
560  BlockNumber firstDelBlock);
561 static void RelationCopyStorageUsingBuffer(RelFileLocator srclocator,
562  RelFileLocator dstlocator,
563  ForkNumber forkNum, bool permanent);
564 static void AtProcExit_Buffers(int code, Datum arg);
565 static void CheckForBufferLeaks(void);
566 static int rlocator_comparator(const void *p1, const void *p2);
567 static inline int buffertag_comparator(const BufferTag *ba, const BufferTag *bb);
568 static inline int ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b);
569 static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg);
570 
571 
572 /*
573  * Implementation of PrefetchBuffer() for shared buffers.
574  */
577  ForkNumber forkNum,
578  BlockNumber blockNum)
579 {
580  PrefetchBufferResult result = {InvalidBuffer, false};
581  BufferTag newTag; /* identity of requested block */
582  uint32 newHash; /* hash value for newTag */
583  LWLock *newPartitionLock; /* buffer partition lock for it */
584  int buf_id;
585 
586  Assert(BlockNumberIsValid(blockNum));
587 
588  /* create a tag so we can lookup the buffer */
589  InitBufferTag(&newTag, &smgr_reln->smgr_rlocator.locator,
590  forkNum, blockNum);
591 
592  /* determine its hash code and partition lock ID */
593  newHash = BufTableHashCode(&newTag);
594  newPartitionLock = BufMappingPartitionLock(newHash);
595 
596  /* see if the block is in the buffer pool already */
597  LWLockAcquire(newPartitionLock, LW_SHARED);
598  buf_id = BufTableLookup(&newTag, newHash);
599  LWLockRelease(newPartitionLock);
600 
601  /* If not in buffers, initiate prefetch */
602  if (buf_id < 0)
603  {
604 #ifdef USE_PREFETCH
605  /*
606  * Try to initiate an asynchronous read. This returns false in
607  * recovery if the relation file doesn't exist.
608  */
609  if ((io_direct_flags & IO_DIRECT_DATA) == 0 &&
610  smgrprefetch(smgr_reln, forkNum, blockNum, 1))
611  {
612  result.initiated_io = true;
613  }
614 #endif /* USE_PREFETCH */
615  }
616  else
617  {
618  /*
619  * Report the buffer it was in at that time. The caller may be able
620  * to avoid a buffer table lookup, but it's not pinned and it must be
621  * rechecked!
622  */
623  result.recent_buffer = buf_id + 1;
624  }
625 
626  /*
627  * If the block *is* in buffers, we do nothing. This is not really ideal:
628  * the block might be just about to be evicted, which would be stupid
629  * since we know we are going to need it soon. But the only easy answer
630  * is to bump the usage_count, which does not seem like a great solution:
631  * when the caller does ultimately touch the block, usage_count would get
632  * bumped again, resulting in too much favoritism for blocks that are
633  * involved in a prefetch sequence. A real fix would involve some
634  * additional per-buffer state, and it's not clear that there's enough of
635  * a problem to justify that.
636  */
637 
638  return result;
639 }
640 
641 /*
642  * PrefetchBuffer -- initiate asynchronous read of a block of a relation
643  *
644  * This is named by analogy to ReadBuffer but doesn't actually allocate a
645  * buffer. Instead it tries to ensure that a future ReadBuffer for the given
646  * block will not be delayed by the I/O. Prefetching is optional.
647  *
648  * There are three possible outcomes:
649  *
650  * 1. If the block is already cached, the result includes a valid buffer that
651  * could be used by the caller to avoid the need for a later buffer lookup, but
652  * it's not pinned, so the caller must recheck it.
653  *
654  * 2. If the kernel has been asked to initiate I/O, the initiated_io member is
655  * true. Currently there is no way to know if the data was already cached by
656  * the kernel and therefore didn't really initiate I/O, and no way to know when
657  * the I/O completes other than using synchronous ReadBuffer().
658  *
659  * 3. Otherwise, the buffer wasn't already cached by PostgreSQL, and
660  * USE_PREFETCH is not defined (this build doesn't support prefetching due to
661  * lack of a kernel facility), direct I/O is enabled, or the underlying
662  * relation file wasn't found and we are in recovery. (If the relation file
663  * wasn't found and we are not in recovery, an error is raised).
664  */
667 {
668  Assert(RelationIsValid(reln));
669  Assert(BlockNumberIsValid(blockNum));
670 
671  if (RelationUsesLocalBuffers(reln))
672  {
673  /* see comments in ReadBufferExtended */
674  if (RELATION_IS_OTHER_TEMP(reln))
675  ereport(ERROR,
676  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
677  errmsg("cannot access temporary tables of other sessions")));
678 
679  /* pass it off to localbuf.c */
680  return PrefetchLocalBuffer(RelationGetSmgr(reln), forkNum, blockNum);
681  }
682  else
683  {
684  /* pass it to the shared buffer version */
685  return PrefetchSharedBuffer(RelationGetSmgr(reln), forkNum, blockNum);
686  }
687 }
688 
689 /*
690  * ReadRecentBuffer -- try to pin a block in a recently observed buffer
691  *
692  * Compared to ReadBuffer(), this avoids a buffer mapping lookup when it's
693  * successful. Return true if the buffer is valid and still has the expected
694  * tag. In that case, the buffer is pinned and the usage count is bumped.
695  */
696 bool
698  Buffer recent_buffer)
699 {
700  BufferDesc *bufHdr;
701  BufferTag tag;
702  uint32 buf_state;
703  bool have_private_ref;
704 
705  Assert(BufferIsValid(recent_buffer));
706 
709  InitBufferTag(&tag, &rlocator, forkNum, blockNum);
710 
711  if (BufferIsLocal(recent_buffer))
712  {
713  int b = -recent_buffer - 1;
714 
715  bufHdr = GetLocalBufferDescriptor(b);
716  buf_state = pg_atomic_read_u32(&bufHdr->state);
717 
718  /* Is it still valid and holding the right tag? */
719  if ((buf_state & BM_VALID) && BufferTagsEqual(&tag, &bufHdr->tag))
720  {
721  PinLocalBuffer(bufHdr, true);
722 
724 
725  return true;
726  }
727  }
728  else
729  {
730  bufHdr = GetBufferDescriptor(recent_buffer - 1);
731  have_private_ref = GetPrivateRefCount(recent_buffer) > 0;
732 
733  /*
734  * Do we already have this buffer pinned with a private reference? If
735  * so, it must be valid and it is safe to check the tag without
736  * locking. If not, we have to lock the header first and then check.
737  */
738  if (have_private_ref)
739  buf_state = pg_atomic_read_u32(&bufHdr->state);
740  else
741  buf_state = LockBufHdr(bufHdr);
742 
743  if ((buf_state & BM_VALID) && BufferTagsEqual(&tag, &bufHdr->tag))
744  {
745  /*
746  * It's now safe to pin the buffer. We can't pin first and ask
747  * questions later, because it might confuse code paths like
748  * InvalidateBuffer() if we pinned a random non-matching buffer.
749  */
750  if (have_private_ref)
751  PinBuffer(bufHdr, NULL); /* bump pin count */
752  else
753  PinBuffer_Locked(bufHdr); /* pin for first time */
754 
756 
757  return true;
758  }
759 
760  /* If we locked the header above, now unlock. */
761  if (!have_private_ref)
762  UnlockBufHdr(bufHdr, buf_state);
763  }
764 
765  return false;
766 }
767 
768 /*
769  * ReadBuffer -- a shorthand for ReadBufferExtended, for reading from main
770  * fork with RBM_NORMAL mode and default strategy.
771  */
772 Buffer
774 {
775  return ReadBufferExtended(reln, MAIN_FORKNUM, blockNum, RBM_NORMAL, NULL);
776 }
777 
778 /*
779  * ReadBufferExtended -- returns a buffer containing the requested
780  * block of the requested relation. If the blknum
781  * requested is P_NEW, extend the relation file and
782  * allocate a new block. (Caller is responsible for
783  * ensuring that only one backend tries to extend a
784  * relation at the same time!)
785  *
786  * Returns: the buffer number for the buffer containing
787  * the block read. The returned buffer has been pinned.
788  * Does not return on error --- elog's instead.
789  *
790  * Assume when this function is called, that reln has been opened already.
791  *
792  * In RBM_NORMAL mode, the page is read from disk, and the page header is
793  * validated. An error is thrown if the page header is not valid. (But
794  * note that an all-zero page is considered "valid"; see
795  * PageIsVerifiedExtended().)
796  *
797  * RBM_ZERO_ON_ERROR is like the normal mode, but if the page header is not
798  * valid, the page is zeroed instead of throwing an error. This is intended
799  * for non-critical data, where the caller is prepared to repair errors.
800  *
801  * In RBM_ZERO_AND_LOCK mode, if the page isn't in buffer cache already, it's
802  * filled with zeros instead of reading it from disk. Useful when the caller
803  * is going to fill the page from scratch, since this saves I/O and avoids
804  * unnecessary failure if the page-on-disk has corrupt page headers.
805  * The page is returned locked to ensure that the caller has a chance to
806  * initialize the page before it's made visible to others.
807  * Caution: do not use this mode to read a page that is beyond the relation's
808  * current physical EOF; that is likely to cause problems in md.c when
809  * the page is modified and written out. P_NEW is OK, though.
810  *
811  * RBM_ZERO_AND_CLEANUP_LOCK is the same as RBM_ZERO_AND_LOCK, but acquires
812  * a cleanup-strength lock on the page.
813  *
814  * RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here.
815  *
816  * If strategy is not NULL, a nondefault buffer access strategy is used.
817  * See buffer/README for details.
818  */
819 inline Buffer
822 {
823  Buffer buf;
824 
825  /*
826  * Reject attempts to read non-local temporary relations; we would be
827  * likely to get wrong data since we have no visibility into the owning
828  * session's local buffers.
829  */
830  if (RELATION_IS_OTHER_TEMP(reln))
831  ereport(ERROR,
832  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
833  errmsg("cannot access temporary tables of other sessions")));
834 
835  /*
836  * Read the buffer, and update pgstat counters to reflect a cache hit or
837  * miss.
838  */
839  buf = ReadBuffer_common(reln, RelationGetSmgr(reln), 0,
840  forkNum, blockNum, mode, strategy);
841 
842  return buf;
843 }
844 
845 
846 /*
847  * ReadBufferWithoutRelcache -- like ReadBufferExtended, but doesn't require
848  * a relcache entry for the relation.
849  *
850  * Pass permanent = true for a RELPERSISTENCE_PERMANENT relation, and
851  * permanent = false for a RELPERSISTENCE_UNLOGGED relation. This function
852  * cannot be used for temporary relations (and making that work might be
853  * difficult, unless we only want to read temporary relations for our own
854  * ProcNumber).
855  */
856 Buffer
858  BlockNumber blockNum, ReadBufferMode mode,
859  BufferAccessStrategy strategy, bool permanent)
860 {
861  SMgrRelation smgr = smgropen(rlocator, INVALID_PROC_NUMBER);
862 
863  return ReadBuffer_common(NULL, smgr,
864  permanent ? RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED,
865  forkNum, blockNum,
866  mode, strategy);
867 }
868 
869 /*
870  * Convenience wrapper around ExtendBufferedRelBy() extending by one block.
871  */
872 Buffer
874  ForkNumber forkNum,
875  BufferAccessStrategy strategy,
876  uint32 flags)
877 {
878  Buffer buf;
879  uint32 extend_by = 1;
880 
881  ExtendBufferedRelBy(bmr, forkNum, strategy, flags, extend_by,
882  &buf, &extend_by);
883 
884  return buf;
885 }
886 
887 /*
888  * Extend relation by multiple blocks.
889  *
890  * Tries to extend the relation by extend_by blocks. Depending on the
891  * availability of resources the relation may end up being extended by a
892  * smaller number of pages (unless an error is thrown, always by at least one
893  * page). *extended_by is updated to the number of pages the relation has been
894  * extended to.
895  *
896  * buffers needs to be an array that is at least extend_by long. Upon
897  * completion, the first extend_by array elements will point to a pinned
898  * buffer.
899  *
900  * If EB_LOCK_FIRST is part of flags, the first returned buffer is
901  * locked. This is useful for callers that want a buffer that is guaranteed to
902  * be empty.
903  */
906  ForkNumber fork,
907  BufferAccessStrategy strategy,
908  uint32 flags,
909  uint32 extend_by,
910  Buffer *buffers,
911  uint32 *extended_by)
912 {
913  Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
914  Assert(bmr.smgr == NULL || bmr.relpersistence != 0);
915  Assert(extend_by > 0);
916 
917  if (bmr.smgr == NULL)
918  {
919  bmr.smgr = RelationGetSmgr(bmr.rel);
920  bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
921  }
922 
923  return ExtendBufferedRelCommon(bmr, fork, strategy, flags,
924  extend_by, InvalidBlockNumber,
925  buffers, extended_by);
926 }
927 
928 /*
929  * Extend the relation so it is at least extend_to blocks large, return buffer
930  * (extend_to - 1).
931  *
932  * This is useful for callers that want to write a specific page, regardless
933  * of the current size of the relation (e.g. useful for visibilitymap and for
934  * crash recovery).
935  */
936 Buffer
938  ForkNumber fork,
939  BufferAccessStrategy strategy,
940  uint32 flags,
941  BlockNumber extend_to,
943 {
945  uint32 extended_by = 0;
947  Buffer buffers[64];
948 
949  Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
950  Assert(bmr.smgr == NULL || bmr.relpersistence != 0);
951  Assert(extend_to != InvalidBlockNumber && extend_to > 0);
952 
953  if (bmr.smgr == NULL)
954  {
955  bmr.smgr = RelationGetSmgr(bmr.rel);
956  bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
957  }
958 
959  /*
960  * If desired, create the file if it doesn't exist. If
961  * smgr_cached_nblocks[fork] is positive then it must exist, no need for
962  * an smgrexists call.
963  */
964  if ((flags & EB_CREATE_FORK_IF_NEEDED) &&
965  (bmr.smgr->smgr_cached_nblocks[fork] == 0 ||
967  !smgrexists(bmr.smgr, fork))
968  {
970 
971  /* recheck, fork might have been created concurrently */
972  if (!smgrexists(bmr.smgr, fork))
973  smgrcreate(bmr.smgr, fork, flags & EB_PERFORMING_RECOVERY);
974 
976  }
977 
978  /*
979  * If requested, invalidate size cache, so that smgrnblocks asks the
980  * kernel.
981  */
982  if (flags & EB_CLEAR_SIZE_CACHE)
984 
985  /*
986  * Estimate how many pages we'll need to extend by. This avoids acquiring
987  * unnecessarily many victim buffers.
988  */
989  current_size = smgrnblocks(bmr.smgr, fork);
990 
991  /*
992  * Since no-one else can be looking at the page contents yet, there is no
993  * difference between an exclusive lock and a cleanup-strength lock. Note
994  * that we pass the original mode to ReadBuffer_common() below, when
995  * falling back to reading the buffer to a concurrent relation extension.
996  */
998  flags |= EB_LOCK_TARGET;
999 
1000  while (current_size < extend_to)
1001  {
1002  uint32 num_pages = lengthof(buffers);
1003  BlockNumber first_block;
1004 
1005  if ((uint64) current_size + num_pages > extend_to)
1006  num_pages = extend_to - current_size;
1007 
1008  first_block = ExtendBufferedRelCommon(bmr, fork, strategy, flags,
1009  num_pages, extend_to,
1010  buffers, &extended_by);
1011 
1012  current_size = first_block + extended_by;
1013  Assert(num_pages != 0 || current_size >= extend_to);
1014 
1015  for (uint32 i = 0; i < extended_by; i++)
1016  {
1017  if (first_block + i != extend_to - 1)
1018  ReleaseBuffer(buffers[i]);
1019  else
1020  buffer = buffers[i];
1021  }
1022  }
1023 
1024  /*
1025  * It's possible that another backend concurrently extended the relation.
1026  * In that case read the buffer.
1027  *
1028  * XXX: Should we control this via a flag?
1029  */
1030  if (buffer == InvalidBuffer)
1031  {
1032  Assert(extended_by == 0);
1034  fork, extend_to - 1, mode, strategy);
1035  }
1036 
1037  return buffer;
1038 }
1039 
1040 /*
1041  * Lock and optionally zero a buffer, as part of the implementation of
1042  * RBM_ZERO_AND_LOCK or RBM_ZERO_AND_CLEANUP_LOCK. The buffer must be already
1043  * pinned. If the buffer is not already valid, it is zeroed and made valid.
1044  */
1045 static void
1047 {
1048  BufferDesc *bufHdr;
1049  bool need_to_zero;
1050  bool isLocalBuf = BufferIsLocal(buffer);
1051 
1053 
1054  if (already_valid)
1055  {
1056  /*
1057  * If the caller already knew the buffer was valid, we can skip some
1058  * header interaction. The caller just wants to lock the buffer.
1059  */
1060  need_to_zero = false;
1061  }
1062  else if (isLocalBuf)
1063  {
1064  /* Simple case for non-shared buffers. */
1065  bufHdr = GetLocalBufferDescriptor(-buffer - 1);
1066  need_to_zero = (pg_atomic_read_u32(&bufHdr->state) & BM_VALID) == 0;
1067  }
1068  else
1069  {
1070  /*
1071  * Take BM_IO_IN_PROGRESS, or discover that BM_VALID has been set
1072  * concurrently. Even though we aren't doing I/O, that ensures that
1073  * we don't zero a page that someone else has pinned. An exclusive
1074  * content lock wouldn't be enough, because readers are allowed to
1075  * drop the content lock after determining that a tuple is visible
1076  * (see buffer access rules in README).
1077  */
1078  bufHdr = GetBufferDescriptor(buffer - 1);
1079  need_to_zero = StartBufferIO(bufHdr, true, false);
1080  }
1081 
1082  if (need_to_zero)
1083  {
1084  memset(BufferGetPage(buffer), 0, BLCKSZ);
1085 
1086  /*
1087  * Grab the buffer content lock before marking the page as valid, to
1088  * make sure that no other backend sees the zeroed page before the
1089  * caller has had a chance to initialize it.
1090  *
1091  * Since no-one else can be looking at the page contents yet, there is
1092  * no difference between an exclusive lock and a cleanup-strength
1093  * lock. (Note that we cannot use LockBuffer() or
1094  * LockBufferForCleanup() here, because they assert that the buffer is
1095  * already valid.)
1096  */
1097  if (!isLocalBuf)
1099 
1100  if (isLocalBuf)
1101  {
1102  /* Only need to adjust flags */
1103  uint32 buf_state = pg_atomic_read_u32(&bufHdr->state);
1104 
1105  buf_state |= BM_VALID;
1106  pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
1107  }
1108  else
1109  {
1110  /* Set BM_VALID, terminate IO, and wake up any waiters */
1111  TerminateBufferIO(bufHdr, false, BM_VALID, true);
1112  }
1113  }
1114  else if (!isLocalBuf)
1115  {
1116  /*
1117  * The buffer is valid, so we can't zero it. The caller still expects
1118  * the page to be locked on return.
1119  */
1120  if (mode == RBM_ZERO_AND_LOCK)
1122  else
1124  }
1125 }
1126 
1127 /*
1128  * Pin a buffer for a given block. *foundPtr is set to true if the block was
1129  * already present, or false if more work is required to either read it in or
1130  * zero it.
1131  */
1134  SMgrRelation smgr,
1135  char persistence,
1136  ForkNumber forkNum,
1137  BlockNumber blockNum,
1138  BufferAccessStrategy strategy,
1139  bool *foundPtr)
1140 {
1141  BufferDesc *bufHdr;
1142  IOContext io_context;
1143  IOObject io_object;
1144 
1145  Assert(blockNum != P_NEW);
1146 
1147  /* Persistence should be set before */
1148  Assert((persistence == RELPERSISTENCE_TEMP ||
1149  persistence == RELPERSISTENCE_PERMANENT ||
1150  persistence == RELPERSISTENCE_UNLOGGED));
1151 
1152  if (persistence == RELPERSISTENCE_TEMP)
1153  {
1154  io_context = IOCONTEXT_NORMAL;
1155  io_object = IOOBJECT_TEMP_RELATION;
1156  }
1157  else
1158  {
1159  io_context = IOContextForStrategy(strategy);
1160  io_object = IOOBJECT_RELATION;
1161  }
1162 
1163  TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
1165  smgr->smgr_rlocator.locator.dbOid,
1167  smgr->smgr_rlocator.backend);
1168 
1169  if (persistence == RELPERSISTENCE_TEMP)
1170  {
1171  bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, foundPtr);
1172  if (*foundPtr)
1174  }
1175  else
1176  {
1177  bufHdr = BufferAlloc(smgr, persistence, forkNum, blockNum,
1178  strategy, foundPtr, io_context);
1179  if (*foundPtr)
1181  }
1182  if (rel)
1183  {
1184  /*
1185  * While pgBufferUsage's "read" counter isn't bumped unless we reach
1186  * WaitReadBuffers() (so, not for hits, and not for buffers that are
1187  * zeroed instead), the per-relation stats always count them.
1188  */
1190  if (*foundPtr)
1192  }
1193  if (*foundPtr)
1194  {
1195  VacuumPageHit++;
1196  pgstat_count_io_op(io_object, io_context, IOOP_HIT);
1197  if (VacuumCostActive)
1199 
1200  TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
1202  smgr->smgr_rlocator.locator.dbOid,
1204  smgr->smgr_rlocator.backend,
1205  true);
1206  }
1207 
1208  return BufferDescriptorGetBuffer(bufHdr);
1209 }
1210 
1211 /*
1212  * ReadBuffer_common -- common logic for all ReadBuffer variants
1213  *
1214  * smgr is required, rel is optional unless using P_NEW.
1215  */
1217 ReadBuffer_common(Relation rel, SMgrRelation smgr, char smgr_persistence,
1218  ForkNumber forkNum,
1219  BlockNumber blockNum, ReadBufferMode mode,
1220  BufferAccessStrategy strategy)
1221 {
1222  ReadBuffersOperation operation;
1223  Buffer buffer;
1224  int flags;
1225  char persistence;
1226 
1227  /*
1228  * Backward compatibility path, most code should use ExtendBufferedRel()
1229  * instead, as acquiring the extension lock inside ExtendBufferedRel()
1230  * scales a lot better.
1231  */
1232  if (unlikely(blockNum == P_NEW))
1233  {
1235 
1236  /*
1237  * Since no-one else can be looking at the page contents yet, there is
1238  * no difference between an exclusive lock and a cleanup-strength
1239  * lock.
1240  */
1242  flags |= EB_LOCK_FIRST;
1243 
1244  return ExtendBufferedRel(BMR_REL(rel), forkNum, strategy, flags);
1245  }
1246 
1247  if (rel)
1248  persistence = rel->rd_rel->relpersistence;
1249  else
1250  persistence = smgr_persistence;
1251 
1253  mode == RBM_ZERO_AND_LOCK))
1254  {
1255  bool found;
1256 
1257  buffer = PinBufferForBlock(rel, smgr, persistence,
1258  forkNum, blockNum, strategy, &found);
1259  ZeroAndLockBuffer(buffer, mode, found);
1260  return buffer;
1261  }
1262 
1263  if (mode == RBM_ZERO_ON_ERROR)
1265  else
1266  flags = 0;
1267  operation.smgr = smgr;
1268  operation.rel = rel;
1269  operation.persistence = persistence;
1270  operation.forknum = forkNum;
1271  operation.strategy = strategy;
1272  if (StartReadBuffer(&operation,
1273  &buffer,
1274  blockNum,
1275  flags))
1276  WaitReadBuffers(&operation);
1277 
1278  return buffer;
1279 }
1280 
1281 static pg_attribute_always_inline bool
1283  Buffer *buffers,
1284  BlockNumber blockNum,
1285  int *nblocks,
1286  int flags)
1287 {
1288  int actual_nblocks = *nblocks;
1289  int io_buffers_len = 0;
1290 
1291  Assert(*nblocks > 0);
1292  Assert(*nblocks <= MAX_IO_COMBINE_LIMIT);
1293 
1294  for (int i = 0; i < actual_nblocks; ++i)
1295  {
1296  bool found;
1297 
1298  buffers[i] = PinBufferForBlock(operation->rel,
1299  operation->smgr,
1300  operation->persistence,
1301  operation->forknum,
1302  blockNum + i,
1303  operation->strategy,
1304  &found);
1305 
1306  if (found)
1307  {
1308  /*
1309  * Terminate the read as soon as we get a hit. It could be a
1310  * single buffer hit, or it could be a hit that follows a readable
1311  * range. We don't want to create more than one readable range,
1312  * so we stop here.
1313  */
1314  actual_nblocks = i + 1;
1315  break;
1316  }
1317  else
1318  {
1319  /* Extend the readable range to cover this block. */
1320  io_buffers_len++;
1321  }
1322  }
1323  *nblocks = actual_nblocks;
1324 
1325  if (likely(io_buffers_len == 0))
1326  return false;
1327 
1328  /* Populate information needed for I/O. */
1329  operation->buffers = buffers;
1330  operation->blocknum = blockNum;
1331  operation->flags = flags;
1332  operation->nblocks = actual_nblocks;
1333  operation->io_buffers_len = io_buffers_len;
1334 
1335  if (flags & READ_BUFFERS_ISSUE_ADVICE)
1336  {
1337  /*
1338  * In theory we should only do this if PinBufferForBlock() had to
1339  * allocate new buffers above. That way, if two calls to
1340  * StartReadBuffers() were made for the same blocks before
1341  * WaitReadBuffers(), only the first would issue the advice. That'd be
1342  * a better simulation of true asynchronous I/O, which would only
1343  * start the I/O once, but isn't done here for simplicity. Note also
1344  * that the following call might actually issue two advice calls if we
1345  * cross a segment boundary; in a true asynchronous version we might
1346  * choose to process only one real I/O at a time in that case.
1347  */
1348  smgrprefetch(operation->smgr,
1349  operation->forknum,
1350  blockNum,
1351  operation->io_buffers_len);
1352  }
1353 
1354  /* Indicate that WaitReadBuffers() should be called. */
1355  return true;
1356 }
1357 
1358 /*
1359  * Begin reading a range of blocks beginning at blockNum and extending for
1360  * *nblocks. On return, up to *nblocks pinned buffers holding those blocks
1361  * are written into the buffers array, and *nblocks is updated to contain the
1362  * actual number, which may be fewer than requested. Caller sets some of the
1363  * members of operation; see struct definition.
1364  *
1365  * If false is returned, no I/O is necessary. If true is returned, one I/O
1366  * has been started, and WaitReadBuffers() must be called with the same
1367  * operation object before the buffers are accessed. Along with the operation
1368  * object, the caller-supplied array of buffers must remain valid until
1369  * WaitReadBuffers() is called.
1370  *
1371  * Currently the I/O is only started with optional operating system advice if
1372  * requested by the caller with READ_BUFFERS_ISSUE_ADVICE, and the real I/O
1373  * happens synchronously in WaitReadBuffers(). In future work, true I/O could
1374  * be initiated here.
1375  */
1376 bool
1378  Buffer *buffers,
1379  BlockNumber blockNum,
1380  int *nblocks,
1381  int flags)
1382 {
1383  return StartReadBuffersImpl(operation, buffers, blockNum, nblocks, flags);
1384 }
1385 
1386 /*
1387  * Single block version of the StartReadBuffers(). This might save a few
1388  * instructions when called from another translation unit, because it is
1389  * specialized for nblocks == 1.
1390  */
1391 bool
1393  Buffer *buffer,
1394  BlockNumber blocknum,
1395  int flags)
1396 {
1397  int nblocks = 1;
1398  bool result;
1399 
1400  result = StartReadBuffersImpl(operation, buffer, blocknum, &nblocks, flags);
1401  Assert(nblocks == 1); /* single block can't be short */
1402 
1403  return result;
1404 }
1405 
1406 static inline bool
1408 {
1409  if (BufferIsLocal(buffer))
1410  {
1411  BufferDesc *bufHdr = GetLocalBufferDescriptor(-buffer - 1);
1412 
1413  return (pg_atomic_read_u32(&bufHdr->state) & BM_VALID) == 0;
1414  }
1415  else
1416  return StartBufferIO(GetBufferDescriptor(buffer - 1), true, nowait);
1417 }
1418 
1419 void
1421 {
1422  Buffer *buffers;
1423  int nblocks;
1424  BlockNumber blocknum;
1425  ForkNumber forknum;
1426  IOContext io_context;
1427  IOObject io_object;
1428  char persistence;
1429 
1430  /*
1431  * Currently operations are only allowed to include a read of some range,
1432  * with an optional extra buffer that is already pinned at the end. So
1433  * nblocks can be at most one more than io_buffers_len.
1434  */
1435  Assert((operation->nblocks == operation->io_buffers_len) ||
1436  (operation->nblocks == operation->io_buffers_len + 1));
1437 
1438  /* Find the range of the physical read we need to perform. */
1439  nblocks = operation->io_buffers_len;
1440  if (nblocks == 0)
1441  return; /* nothing to do */
1442 
1443  buffers = &operation->buffers[0];
1444  blocknum = operation->blocknum;
1445  forknum = operation->forknum;
1446  persistence = operation->persistence;
1447 
1448  if (persistence == RELPERSISTENCE_TEMP)
1449  {
1450  io_context = IOCONTEXT_NORMAL;
1451  io_object = IOOBJECT_TEMP_RELATION;
1452  }
1453  else
1454  {
1455  io_context = IOContextForStrategy(operation->strategy);
1456  io_object = IOOBJECT_RELATION;
1457  }
1458 
1459  /*
1460  * We count all these blocks as read by this backend. This is traditional
1461  * behavior, but might turn out to be not true if we find that someone
1462  * else has beaten us and completed the read of some of these blocks. In
1463  * that case the system globally double-counts, but we traditionally don't
1464  * count this as a "hit", and we don't have a separate counter for "miss,
1465  * but another backend completed the read".
1466  */
1467  if (persistence == RELPERSISTENCE_TEMP)
1468  pgBufferUsage.local_blks_read += nblocks;
1469  else
1470  pgBufferUsage.shared_blks_read += nblocks;
1471 
1472  for (int i = 0; i < nblocks; ++i)
1473  {
1474  int io_buffers_len;
1475  Buffer io_buffers[MAX_IO_COMBINE_LIMIT];
1476  void *io_pages[MAX_IO_COMBINE_LIMIT];
1477  instr_time io_start;
1478  BlockNumber io_first_block;
1479 
1480  /*
1481  * Skip this block if someone else has already completed it. If an
1482  * I/O is already in progress in another backend, this will wait for
1483  * the outcome: either done, or something went wrong and we will
1484  * retry.
1485  */
1486  if (!WaitReadBuffersCanStartIO(buffers[i], false))
1487  {
1488  /*
1489  * Report this as a 'hit' for this backend, even though it must
1490  * have started out as a miss in PinBufferForBlock().
1491  */
1492  TRACE_POSTGRESQL_BUFFER_READ_DONE(forknum, blocknum + i,
1493  operation->smgr->smgr_rlocator.locator.spcOid,
1494  operation->smgr->smgr_rlocator.locator.dbOid,
1495  operation->smgr->smgr_rlocator.locator.relNumber,
1496  operation->smgr->smgr_rlocator.backend,
1497  true);
1498  continue;
1499  }
1500 
1501  /* We found a buffer that we need to read in. */
1502  io_buffers[0] = buffers[i];
1503  io_pages[0] = BufferGetBlock(buffers[i]);
1504  io_first_block = blocknum + i;
1505  io_buffers_len = 1;
1506 
1507  /*
1508  * How many neighboring-on-disk blocks can we can scatter-read into
1509  * other buffers at the same time? In this case we don't wait if we
1510  * see an I/O already in progress. We already hold BM_IO_IN_PROGRESS
1511  * for the head block, so we should get on with that I/O as soon as
1512  * possible. We'll come back to this block again, above.
1513  */
1514  while ((i + 1) < nblocks &&
1515  WaitReadBuffersCanStartIO(buffers[i + 1], true))
1516  {
1517  /* Must be consecutive block numbers. */
1518  Assert(BufferGetBlockNumber(buffers[i + 1]) ==
1519  BufferGetBlockNumber(buffers[i]) + 1);
1520 
1521  io_buffers[io_buffers_len] = buffers[++i];
1522  io_pages[io_buffers_len++] = BufferGetBlock(buffers[i]);
1523  }
1524 
1526  smgrreadv(operation->smgr, forknum, io_first_block, io_pages, io_buffers_len);
1527  pgstat_count_io_op_time(io_object, io_context, IOOP_READ, io_start,
1528  io_buffers_len);
1529 
1530  /* Verify each block we read, and terminate the I/O. */
1531  for (int j = 0; j < io_buffers_len; ++j)
1532  {
1533  BufferDesc *bufHdr;
1534  Block bufBlock;
1535 
1536  if (persistence == RELPERSISTENCE_TEMP)
1537  {
1538  bufHdr = GetLocalBufferDescriptor(-io_buffers[j] - 1);
1539  bufBlock = LocalBufHdrGetBlock(bufHdr);
1540  }
1541  else
1542  {
1543  bufHdr = GetBufferDescriptor(io_buffers[j] - 1);
1544  bufBlock = BufHdrGetBlock(bufHdr);
1545  }
1546 
1547  /* check for garbage data */
1548  if (!PageIsVerifiedExtended((Page) bufBlock, io_first_block + j,
1550  {
1551  if ((operation->flags & READ_BUFFERS_ZERO_ON_ERROR) || zero_damaged_pages)
1552  {
1553  ereport(WARNING,
1555  errmsg("invalid page in block %u of relation %s; zeroing out page",
1556  io_first_block + j,
1557  relpath(operation->smgr->smgr_rlocator, forknum))));
1558  memset(bufBlock, 0, BLCKSZ);
1559  }
1560  else
1561  ereport(ERROR,
1563  errmsg("invalid page in block %u of relation %s",
1564  io_first_block + j,
1565  relpath(operation->smgr->smgr_rlocator, forknum))));
1566  }
1567 
1568  /* Terminate I/O and set BM_VALID. */
1569  if (persistence == RELPERSISTENCE_TEMP)
1570  {
1571  uint32 buf_state = pg_atomic_read_u32(&bufHdr->state);
1572 
1573  buf_state |= BM_VALID;
1574  pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
1575  }
1576  else
1577  {
1578  /* Set BM_VALID, terminate IO, and wake up any waiters */
1579  TerminateBufferIO(bufHdr, false, BM_VALID, true);
1580  }
1581 
1582  /* Report I/Os as completing individually. */
1583  TRACE_POSTGRESQL_BUFFER_READ_DONE(forknum, io_first_block + j,
1584  operation->smgr->smgr_rlocator.locator.spcOid,
1585  operation->smgr->smgr_rlocator.locator.dbOid,
1586  operation->smgr->smgr_rlocator.locator.relNumber,
1587  operation->smgr->smgr_rlocator.backend,
1588  false);
1589  }
1590 
1591  VacuumPageMiss += io_buffers_len;
1592  if (VacuumCostActive)
1593  VacuumCostBalance += VacuumCostPageMiss * io_buffers_len;
1594  }
1595 }
1596 
1597 /*
1598  * BufferAlloc -- subroutine for PinBufferForBlock. Handles lookup of a shared
1599  * buffer. If no buffer exists already, selects a replacement victim and
1600  * evicts the old page, but does NOT read in new page.
1601  *
1602  * "strategy" can be a buffer replacement strategy object, or NULL for
1603  * the default strategy. The selected buffer's usage_count is advanced when
1604  * using the default strategy, but otherwise possibly not (see PinBuffer).
1605  *
1606  * The returned buffer is pinned and is already marked as holding the
1607  * desired page. If it already did have the desired page, *foundPtr is
1608  * set true. Otherwise, *foundPtr is set false.
1609  *
1610  * io_context is passed as an output parameter to avoid calling
1611  * IOContextForStrategy() when there is a shared buffers hit and no IO
1612  * statistics need be captured.
1613  *
1614  * No locks are held either at entry or exit.
1615  */
1617 BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
1618  BlockNumber blockNum,
1619  BufferAccessStrategy strategy,
1620  bool *foundPtr, IOContext io_context)
1621 {
1622  BufferTag newTag; /* identity of requested block */
1623  uint32 newHash; /* hash value for newTag */
1624  LWLock *newPartitionLock; /* buffer partition lock for it */
1625  int existing_buf_id;
1626  Buffer victim_buffer;
1627  BufferDesc *victim_buf_hdr;
1628  uint32 victim_buf_state;
1629 
1630  /* Make sure we will have room to remember the buffer pin */
1633 
1634  /* create a tag so we can lookup the buffer */
1635  InitBufferTag(&newTag, &smgr->smgr_rlocator.locator, forkNum, blockNum);
1636 
1637  /* determine its hash code and partition lock ID */
1638  newHash = BufTableHashCode(&newTag);
1639  newPartitionLock = BufMappingPartitionLock(newHash);
1640 
1641  /* see if the block is in the buffer pool already */
1642  LWLockAcquire(newPartitionLock, LW_SHARED);
1643  existing_buf_id = BufTableLookup(&newTag, newHash);
1644  if (existing_buf_id >= 0)
1645  {
1646  BufferDesc *buf;
1647  bool valid;
1648 
1649  /*
1650  * Found it. Now, pin the buffer so no one can steal it from the
1651  * buffer pool, and check to see if the correct data has been loaded
1652  * into the buffer.
1653  */
1654  buf = GetBufferDescriptor(existing_buf_id);
1655 
1656  valid = PinBuffer(buf, strategy);
1657 
1658  /* Can release the mapping lock as soon as we've pinned it */
1659  LWLockRelease(newPartitionLock);
1660 
1661  *foundPtr = true;
1662 
1663  if (!valid)
1664  {
1665  /*
1666  * We can only get here if (a) someone else is still reading in
1667  * the page, (b) a previous read attempt failed, or (c) someone
1668  * called StartReadBuffers() but not yet WaitReadBuffers().
1669  */
1670  *foundPtr = false;
1671  }
1672 
1673  return buf;
1674  }
1675 
1676  /*
1677  * Didn't find it in the buffer pool. We'll have to initialize a new
1678  * buffer. Remember to unlock the mapping lock while doing the work.
1679  */
1680  LWLockRelease(newPartitionLock);
1681 
1682  /*
1683  * Acquire a victim buffer. Somebody else might try to do the same, we
1684  * don't hold any conflicting locks. If so we'll have to undo our work
1685  * later.
1686  */
1687  victim_buffer = GetVictimBuffer(strategy, io_context);
1688  victim_buf_hdr = GetBufferDescriptor(victim_buffer - 1);
1689 
1690  /*
1691  * Try to make a hashtable entry for the buffer under its new tag. If
1692  * somebody else inserted another buffer for the tag, we'll release the
1693  * victim buffer we acquired and use the already inserted one.
1694  */
1695  LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1696  existing_buf_id = BufTableInsert(&newTag, newHash, victim_buf_hdr->buf_id);
1697  if (existing_buf_id >= 0)
1698  {
1699  BufferDesc *existing_buf_hdr;
1700  bool valid;
1701 
1702  /*
1703  * Got a collision. Someone has already done what we were about to do.
1704  * We'll just handle this as if it were found in the buffer pool in
1705  * the first place. First, give up the buffer we were planning to
1706  * use.
1707  *
1708  * We could do this after releasing the partition lock, but then we'd
1709  * have to call ResourceOwnerEnlarge() & ReservePrivateRefCountEntry()
1710  * before acquiring the lock, for the rare case of such a collision.
1711  */
1712  UnpinBuffer(victim_buf_hdr);
1713 
1714  /*
1715  * The victim buffer we acquired previously is clean and unused, let
1716  * it be found again quickly
1717  */
1718  StrategyFreeBuffer(victim_buf_hdr);
1719 
1720  /* remaining code should match code at top of routine */
1721 
1722  existing_buf_hdr = GetBufferDescriptor(existing_buf_id);
1723 
1724  valid = PinBuffer(existing_buf_hdr, strategy);
1725 
1726  /* Can release the mapping lock as soon as we've pinned it */
1727  LWLockRelease(newPartitionLock);
1728 
1729  *foundPtr = true;
1730 
1731  if (!valid)
1732  {
1733  /*
1734  * We can only get here if (a) someone else is still reading in
1735  * the page, (b) a previous read attempt failed, or (c) someone
1736  * called StartReadBuffers() but not yet WaitReadBuffers().
1737  */
1738  *foundPtr = false;
1739  }
1740 
1741  return existing_buf_hdr;
1742  }
1743 
1744  /*
1745  * Need to lock the buffer header too in order to change its tag.
1746  */
1747  victim_buf_state = LockBufHdr(victim_buf_hdr);
1748 
1749  /* some sanity checks while we hold the buffer header lock */
1750  Assert(BUF_STATE_GET_REFCOUNT(victim_buf_state) == 1);
1751  Assert(!(victim_buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY | BM_IO_IN_PROGRESS)));
1752 
1753  victim_buf_hdr->tag = newTag;
1754 
1755  /*
1756  * Make sure BM_PERMANENT is set for buffers that must be written at every
1757  * checkpoint. Unlogged buffers only need to be written at shutdown
1758  * checkpoints, except for their "init" forks, which need to be treated
1759  * just like permanent relations.
1760  */
1761  victim_buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
1762  if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM)
1763  victim_buf_state |= BM_PERMANENT;
1764 
1765  UnlockBufHdr(victim_buf_hdr, victim_buf_state);
1766 
1767  LWLockRelease(newPartitionLock);
1768 
1769  /*
1770  * Buffer contents are currently invalid.
1771  */
1772  *foundPtr = false;
1773 
1774  return victim_buf_hdr;
1775 }
1776 
1777 /*
1778  * InvalidateBuffer -- mark a shared buffer invalid and return it to the
1779  * freelist.
1780  *
1781  * The buffer header spinlock must be held at entry. We drop it before
1782  * returning. (This is sane because the caller must have locked the
1783  * buffer in order to be sure it should be dropped.)
1784  *
1785  * This is used only in contexts such as dropping a relation. We assume
1786  * that no other backend could possibly be interested in using the page,
1787  * so the only reason the buffer might be pinned is if someone else is
1788  * trying to write it out. We have to let them finish before we can
1789  * reclaim the buffer.
1790  *
1791  * The buffer could get reclaimed by someone else while we are waiting
1792  * to acquire the necessary locks; if so, don't mess it up.
1793  */
1794 static void
1796 {
1797  BufferTag oldTag;
1798  uint32 oldHash; /* hash value for oldTag */
1799  LWLock *oldPartitionLock; /* buffer partition lock for it */
1800  uint32 oldFlags;
1801  uint32 buf_state;
1802 
1803  /* Save the original buffer tag before dropping the spinlock */
1804  oldTag = buf->tag;
1805 
1806  buf_state = pg_atomic_read_u32(&buf->state);
1807  Assert(buf_state & BM_LOCKED);
1808  UnlockBufHdr(buf, buf_state);
1809 
1810  /*
1811  * Need to compute the old tag's hashcode and partition lock ID. XXX is it
1812  * worth storing the hashcode in BufferDesc so we need not recompute it
1813  * here? Probably not.
1814  */
1815  oldHash = BufTableHashCode(&oldTag);
1816  oldPartitionLock = BufMappingPartitionLock(oldHash);
1817 
1818 retry:
1819 
1820  /*
1821  * Acquire exclusive mapping lock in preparation for changing the buffer's
1822  * association.
1823  */
1824  LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
1825 
1826  /* Re-lock the buffer header */
1827  buf_state = LockBufHdr(buf);
1828 
1829  /* If it's changed while we were waiting for lock, do nothing */
1830  if (!BufferTagsEqual(&buf->tag, &oldTag))
1831  {
1832  UnlockBufHdr(buf, buf_state);
1833  LWLockRelease(oldPartitionLock);
1834  return;
1835  }
1836 
1837  /*
1838  * We assume the only reason for it to be pinned is that someone else is
1839  * flushing the page out. Wait for them to finish. (This could be an
1840  * infinite loop if the refcount is messed up... it would be nice to time
1841  * out after awhile, but there seems no way to be sure how many loops may
1842  * be needed. Note that if the other guy has pinned the buffer but not
1843  * yet done StartBufferIO, WaitIO will fall through and we'll effectively
1844  * be busy-looping here.)
1845  */
1846  if (BUF_STATE_GET_REFCOUNT(buf_state) != 0)
1847  {
1848  UnlockBufHdr(buf, buf_state);
1849  LWLockRelease(oldPartitionLock);
1850  /* safety check: should definitely not be our *own* pin */
1852  elog(ERROR, "buffer is pinned in InvalidateBuffer");
1853  WaitIO(buf);
1854  goto retry;
1855  }
1856 
1857  /*
1858  * Clear out the buffer's tag and flags. We must do this to ensure that
1859  * linear scans of the buffer array don't think the buffer is valid.
1860  */
1861  oldFlags = buf_state & BUF_FLAG_MASK;
1862  ClearBufferTag(&buf->tag);
1863  buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
1864  UnlockBufHdr(buf, buf_state);
1865 
1866  /*
1867  * Remove the buffer from the lookup hashtable, if it was in there.
1868  */
1869  if (oldFlags & BM_TAG_VALID)
1870  BufTableDelete(&oldTag, oldHash);
1871 
1872  /*
1873  * Done with mapping lock.
1874  */
1875  LWLockRelease(oldPartitionLock);
1876 
1877  /*
1878  * Insert the buffer at the head of the list of free buffers.
1879  */
1881 }
1882 
1883 /*
1884  * Helper routine for GetVictimBuffer()
1885  *
1886  * Needs to be called on a buffer with a valid tag, pinned, but without the
1887  * buffer header spinlock held.
1888  *
1889  * Returns true if the buffer can be reused, in which case the buffer is only
1890  * pinned by this backend and marked as invalid, false otherwise.
1891  */
1892 static bool
1894 {
1895  uint32 buf_state;
1896  uint32 hash;
1897  LWLock *partition_lock;
1898  BufferTag tag;
1899 
1901 
1902  /* have buffer pinned, so it's safe to read tag without lock */
1903  tag = buf_hdr->tag;
1904 
1905  hash = BufTableHashCode(&tag);
1906  partition_lock = BufMappingPartitionLock(hash);
1907 
1908  LWLockAcquire(partition_lock, LW_EXCLUSIVE);
1909 
1910  /* lock the buffer header */
1911  buf_state = LockBufHdr(buf_hdr);
1912 
1913  /*
1914  * We have the buffer pinned nobody else should have been able to unset
1915  * this concurrently.
1916  */
1917  Assert(buf_state & BM_TAG_VALID);
1918  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
1919  Assert(BufferTagsEqual(&buf_hdr->tag, &tag));
1920 
1921  /*
1922  * If somebody else pinned the buffer since, or even worse, dirtied it,
1923  * give up on this buffer: It's clearly in use.
1924  */
1925  if (BUF_STATE_GET_REFCOUNT(buf_state) != 1 || (buf_state & BM_DIRTY))
1926  {
1927  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
1928 
1929  UnlockBufHdr(buf_hdr, buf_state);
1930  LWLockRelease(partition_lock);
1931 
1932  return false;
1933  }
1934 
1935  /*
1936  * Clear out the buffer's tag and flags and usagecount. This is not
1937  * strictly required, as BM_TAG_VALID/BM_VALID needs to be checked before
1938  * doing anything with the buffer. But currently it's beneficial, as the
1939  * cheaper pre-check for several linear scans of shared buffers use the
1940  * tag (see e.g. FlushDatabaseBuffers()).
1941  */
1942  ClearBufferTag(&buf_hdr->tag);
1943  buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
1944  UnlockBufHdr(buf_hdr, buf_state);
1945 
1946  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
1947 
1948  /* finally delete buffer from the buffer mapping table */
1949  BufTableDelete(&tag, hash);
1950 
1951  LWLockRelease(partition_lock);
1952 
1953  Assert(!(buf_state & (BM_DIRTY | BM_VALID | BM_TAG_VALID)));
1954  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
1956 
1957  return true;
1958 }
1959 
1960 static Buffer
1962 {
1963  BufferDesc *buf_hdr;
1964  Buffer buf;
1965  uint32 buf_state;
1966  bool from_ring;
1967 
1968  /*
1969  * Ensure, while the spinlock's not yet held, that there's a free refcount
1970  * entry, and a resource owner slot for the pin.
1971  */
1974 
1975  /* we return here if a prospective victim buffer gets used concurrently */
1976 again:
1977 
1978  /*
1979  * Select a victim buffer. The buffer is returned with its header
1980  * spinlock still held!
1981  */
1982  buf_hdr = StrategyGetBuffer(strategy, &buf_state, &from_ring);
1983  buf = BufferDescriptorGetBuffer(buf_hdr);
1984 
1985  Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 0);
1986 
1987  /* Pin the buffer and then release the buffer spinlock */
1988  PinBuffer_Locked(buf_hdr);
1989 
1990  /*
1991  * We shouldn't have any other pins for this buffer.
1992  */
1994 
1995  /*
1996  * If the buffer was dirty, try to write it out. There is a race
1997  * condition here, in that someone might dirty it after we released the
1998  * buffer header lock above, or even while we are writing it out (since
1999  * our share-lock won't prevent hint-bit updates). We will recheck the
2000  * dirty bit after re-locking the buffer header.
2001  */
2002  if (buf_state & BM_DIRTY)
2003  {
2004  LWLock *content_lock;
2005 
2006  Assert(buf_state & BM_TAG_VALID);
2007  Assert(buf_state & BM_VALID);
2008 
2009  /*
2010  * We need a share-lock on the buffer contents to write it out (else
2011  * we might write invalid data, eg because someone else is compacting
2012  * the page contents while we write). We must use a conditional lock
2013  * acquisition here to avoid deadlock. Even though the buffer was not
2014  * pinned (and therefore surely not locked) when StrategyGetBuffer
2015  * returned it, someone else could have pinned and exclusive-locked it
2016  * by the time we get here. If we try to get the lock unconditionally,
2017  * we'd block waiting for them; if they later block waiting for us,
2018  * deadlock ensues. (This has been observed to happen when two
2019  * backends are both trying to split btree index pages, and the second
2020  * one just happens to be trying to split the page the first one got
2021  * from StrategyGetBuffer.)
2022  */
2023  content_lock = BufferDescriptorGetContentLock(buf_hdr);
2024  if (!LWLockConditionalAcquire(content_lock, LW_SHARED))
2025  {
2026  /*
2027  * Someone else has locked the buffer, so give it up and loop back
2028  * to get another one.
2029  */
2030  UnpinBuffer(buf_hdr);
2031  goto again;
2032  }
2033 
2034  /*
2035  * If using a nondefault strategy, and writing the buffer would
2036  * require a WAL flush, let the strategy decide whether to go ahead
2037  * and write/reuse the buffer or to choose another victim. We need a
2038  * lock to inspect the page LSN, so this can't be done inside
2039  * StrategyGetBuffer.
2040  */
2041  if (strategy != NULL)
2042  {
2043  XLogRecPtr lsn;
2044 
2045  /* Read the LSN while holding buffer header lock */
2046  buf_state = LockBufHdr(buf_hdr);
2047  lsn = BufferGetLSN(buf_hdr);
2048  UnlockBufHdr(buf_hdr, buf_state);
2049 
2050  if (XLogNeedsFlush(lsn)
2051  && StrategyRejectBuffer(strategy, buf_hdr, from_ring))
2052  {
2053  LWLockRelease(content_lock);
2054  UnpinBuffer(buf_hdr);
2055  goto again;
2056  }
2057  }
2058 
2059  /* OK, do the I/O */
2060  FlushBuffer(buf_hdr, NULL, IOOBJECT_RELATION, io_context);
2061  LWLockRelease(content_lock);
2062 
2064  &buf_hdr->tag);
2065  }
2066 
2067 
2068  if (buf_state & BM_VALID)
2069  {
2070  /*
2071  * When a BufferAccessStrategy is in use, blocks evicted from shared
2072  * buffers are counted as IOOP_EVICT in the corresponding context
2073  * (e.g. IOCONTEXT_BULKWRITE). Shared buffers are evicted by a
2074  * strategy in two cases: 1) while initially claiming buffers for the
2075  * strategy ring 2) to replace an existing strategy ring buffer
2076  * because it is pinned or in use and cannot be reused.
2077  *
2078  * Blocks evicted from buffers already in the strategy ring are
2079  * counted as IOOP_REUSE in the corresponding strategy context.
2080  *
2081  * At this point, we can accurately count evictions and reuses,
2082  * because we have successfully claimed the valid buffer. Previously,
2083  * we may have been forced to release the buffer due to concurrent
2084  * pinners or erroring out.
2085  */
2087  from_ring ? IOOP_REUSE : IOOP_EVICT);
2088  }
2089 
2090  /*
2091  * If the buffer has an entry in the buffer mapping table, delete it. This
2092  * can fail because another backend could have pinned or dirtied the
2093  * buffer.
2094  */
2095  if ((buf_state & BM_TAG_VALID) && !InvalidateVictimBuffer(buf_hdr))
2096  {
2097  UnpinBuffer(buf_hdr);
2098  goto again;
2099  }
2100 
2101  /* a final set of sanity checks */
2102 #ifdef USE_ASSERT_CHECKING
2103  buf_state = pg_atomic_read_u32(&buf_hdr->state);
2104 
2105  Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1);
2106  Assert(!(buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY)));
2107 
2109 #endif
2110 
2111  return buf;
2112 }
2113 
2114 /*
2115  * Limit the number of pins a batch operation may additionally acquire, to
2116  * avoid running out of pinnable buffers.
2117  *
2118  * One additional pin is always allowed, as otherwise the operation likely
2119  * cannot be performed at all.
2120  *
2121  * The number of allowed pins for a backend is computed based on
2122  * shared_buffers and the maximum number of connections possible. That's very
2123  * pessimistic, but outside of toy-sized shared_buffers it should allow
2124  * sufficient pins.
2125  */
2126 void
2127 LimitAdditionalPins(uint32 *additional_pins)
2128 {
2129  uint32 max_backends;
2130  int max_proportional_pins;
2131 
2132  if (*additional_pins <= 1)
2133  return;
2134 
2135  max_backends = MaxBackends + NUM_AUXILIARY_PROCS;
2136  max_proportional_pins = NBuffers / max_backends;
2137 
2138  /*
2139  * Subtract the approximate number of buffers already pinned by this
2140  * backend. We get the number of "overflowed" pins for free, but don't
2141  * know the number of pins in PrivateRefCountArray. The cost of
2142  * calculating that exactly doesn't seem worth it, so just assume the max.
2143  */
2144  max_proportional_pins -= PrivateRefCountOverflowed + REFCOUNT_ARRAY_ENTRIES;
2145 
2146  if (max_proportional_pins <= 0)
2147  max_proportional_pins = 1;
2148 
2149  if (*additional_pins > max_proportional_pins)
2150  *additional_pins = max_proportional_pins;
2151 }
2152 
2153 /*
2154  * Logic shared between ExtendBufferedRelBy(), ExtendBufferedRelTo(). Just to
2155  * avoid duplicating the tracing and relpersistence related logic.
2156  */
2157 static BlockNumber
2159  ForkNumber fork,
2160  BufferAccessStrategy strategy,
2161  uint32 flags,
2162  uint32 extend_by,
2163  BlockNumber extend_upto,
2164  Buffer *buffers,
2165  uint32 *extended_by)
2166 {
2167  BlockNumber first_block;
2168 
2169  TRACE_POSTGRESQL_BUFFER_EXTEND_START(fork,
2173  bmr.smgr->smgr_rlocator.backend,
2174  extend_by);
2175 
2176  if (bmr.relpersistence == RELPERSISTENCE_TEMP)
2177  first_block = ExtendBufferedRelLocal(bmr, fork, flags,
2178  extend_by, extend_upto,
2179  buffers, &extend_by);
2180  else
2181  first_block = ExtendBufferedRelShared(bmr, fork, strategy, flags,
2182  extend_by, extend_upto,
2183  buffers, &extend_by);
2184  *extended_by = extend_by;
2185 
2186  TRACE_POSTGRESQL_BUFFER_EXTEND_DONE(fork,
2190  bmr.smgr->smgr_rlocator.backend,
2191  *extended_by,
2192  first_block);
2193 
2194  return first_block;
2195 }
2196 
2197 /*
2198  * Implementation of ExtendBufferedRelBy() and ExtendBufferedRelTo() for
2199  * shared buffers.
2200  */
2201 static BlockNumber
2203  ForkNumber fork,
2204  BufferAccessStrategy strategy,
2205  uint32 flags,
2206  uint32 extend_by,
2207  BlockNumber extend_upto,
2208  Buffer *buffers,
2209  uint32 *extended_by)
2210 {
2211  BlockNumber first_block;
2212  IOContext io_context = IOContextForStrategy(strategy);
2213  instr_time io_start;
2214 
2215  LimitAdditionalPins(&extend_by);
2216 
2217  /*
2218  * Acquire victim buffers for extension without holding extension lock.
2219  * Writing out victim buffers is the most expensive part of extending the
2220  * relation, particularly when doing so requires WAL flushes. Zeroing out
2221  * the buffers is also quite expensive, so do that before holding the
2222  * extension lock as well.
2223  *
2224  * These pages are pinned by us and not valid. While we hold the pin they
2225  * can't be acquired as victim buffers by another backend.
2226  */
2227  for (uint32 i = 0; i < extend_by; i++)
2228  {
2229  Block buf_block;
2230 
2231  buffers[i] = GetVictimBuffer(strategy, io_context);
2232  buf_block = BufHdrGetBlock(GetBufferDescriptor(buffers[i] - 1));
2233 
2234  /* new buffers are zero-filled */
2235  MemSet((char *) buf_block, 0, BLCKSZ);
2236  }
2237 
2238  /*
2239  * Lock relation against concurrent extensions, unless requested not to.
2240  *
2241  * We use the same extension lock for all forks. That's unnecessarily
2242  * restrictive, but currently extensions for forks don't happen often
2243  * enough to make it worth locking more granularly.
2244  *
2245  * Note that another backend might have extended the relation by the time
2246  * we get the lock.
2247  */
2248  if (!(flags & EB_SKIP_EXTENSION_LOCK))
2250 
2251  /*
2252  * If requested, invalidate size cache, so that smgrnblocks asks the
2253  * kernel.
2254  */
2255  if (flags & EB_CLEAR_SIZE_CACHE)
2257 
2258  first_block = smgrnblocks(bmr.smgr, fork);
2259 
2260  /*
2261  * Now that we have the accurate relation size, check if the caller wants
2262  * us to extend to only up to a specific size. If there were concurrent
2263  * extensions, we might have acquired too many buffers and need to release
2264  * them.
2265  */
2266  if (extend_upto != InvalidBlockNumber)
2267  {
2268  uint32 orig_extend_by = extend_by;
2269 
2270  if (first_block > extend_upto)
2271  extend_by = 0;
2272  else if ((uint64) first_block + extend_by > extend_upto)
2273  extend_by = extend_upto - first_block;
2274 
2275  for (uint32 i = extend_by; i < orig_extend_by; i++)
2276  {
2277  BufferDesc *buf_hdr = GetBufferDescriptor(buffers[i] - 1);
2278 
2279  /*
2280  * The victim buffer we acquired previously is clean and unused,
2281  * let it be found again quickly
2282  */
2283  StrategyFreeBuffer(buf_hdr);
2284  UnpinBuffer(buf_hdr);
2285  }
2286 
2287  if (extend_by == 0)
2288  {
2289  if (!(flags & EB_SKIP_EXTENSION_LOCK))
2291  *extended_by = extend_by;
2292  return first_block;
2293  }
2294  }
2295 
2296  /* Fail if relation is already at maximum possible length */
2297  if ((uint64) first_block + extend_by >= MaxBlockNumber)
2298  ereport(ERROR,
2299  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
2300  errmsg("cannot extend relation %s beyond %u blocks",
2301  relpath(bmr.smgr->smgr_rlocator, fork),
2302  MaxBlockNumber)));
2303 
2304  /*
2305  * Insert buffers into buffer table, mark as IO_IN_PROGRESS.
2306  *
2307  * This needs to happen before we extend the relation, because as soon as
2308  * we do, other backends can start to read in those pages.
2309  */
2310  for (uint32 i = 0; i < extend_by; i++)
2311  {
2312  Buffer victim_buf = buffers[i];
2313  BufferDesc *victim_buf_hdr = GetBufferDescriptor(victim_buf - 1);
2314  BufferTag tag;
2315  uint32 hash;
2316  LWLock *partition_lock;
2317  int existing_id;
2318 
2319  /* in case we need to pin an existing buffer below */
2322 
2323  InitBufferTag(&tag, &bmr.smgr->smgr_rlocator.locator, fork, first_block + i);
2324  hash = BufTableHashCode(&tag);
2325  partition_lock = BufMappingPartitionLock(hash);
2326 
2327  LWLockAcquire(partition_lock, LW_EXCLUSIVE);
2328 
2329  existing_id = BufTableInsert(&tag, hash, victim_buf_hdr->buf_id);
2330 
2331  /*
2332  * We get here only in the corner case where we are trying to extend
2333  * the relation but we found a pre-existing buffer. This can happen
2334  * because a prior attempt at extending the relation failed, and
2335  * because mdread doesn't complain about reads beyond EOF (when
2336  * zero_damaged_pages is ON) and so a previous attempt to read a block
2337  * beyond EOF could have left a "valid" zero-filled buffer.
2338  * Unfortunately, we have also seen this case occurring because of
2339  * buggy Linux kernels that sometimes return an lseek(SEEK_END) result
2340  * that doesn't account for a recent write. In that situation, the
2341  * pre-existing buffer would contain valid data that we don't want to
2342  * overwrite. Since the legitimate cases should always have left a
2343  * zero-filled buffer, complain if not PageIsNew.
2344  */
2345  if (existing_id >= 0)
2346  {
2347  BufferDesc *existing_hdr = GetBufferDescriptor(existing_id);
2348  Block buf_block;
2349  bool valid;
2350 
2351  /*
2352  * Pin the existing buffer before releasing the partition lock,
2353  * preventing it from being evicted.
2354  */
2355  valid = PinBuffer(existing_hdr, strategy);
2356 
2357  LWLockRelease(partition_lock);
2358 
2359  /*
2360  * The victim buffer we acquired previously is clean and unused,
2361  * let it be found again quickly
2362  */
2363  StrategyFreeBuffer(victim_buf_hdr);
2364  UnpinBuffer(victim_buf_hdr);
2365 
2366  buffers[i] = BufferDescriptorGetBuffer(existing_hdr);
2367  buf_block = BufHdrGetBlock(existing_hdr);
2368 
2369  if (valid && !PageIsNew((Page) buf_block))
2370  ereport(ERROR,
2371  (errmsg("unexpected data beyond EOF in block %u of relation %s",
2372  existing_hdr->tag.blockNum, relpath(bmr.smgr->smgr_rlocator, fork)),
2373  errhint("This has been seen to occur with buggy kernels; consider updating your system.")));
2374 
2375  /*
2376  * We *must* do smgr[zero]extend before succeeding, else the page
2377  * will not be reserved by the kernel, and the next P_NEW call
2378  * will decide to return the same page. Clear the BM_VALID bit,
2379  * do StartBufferIO() and proceed.
2380  *
2381  * Loop to handle the very small possibility that someone re-sets
2382  * BM_VALID between our clearing it and StartBufferIO inspecting
2383  * it.
2384  */
2385  do
2386  {
2387  uint32 buf_state = LockBufHdr(existing_hdr);
2388 
2389  buf_state &= ~BM_VALID;
2390  UnlockBufHdr(existing_hdr, buf_state);
2391  } while (!StartBufferIO(existing_hdr, true, false));
2392  }
2393  else
2394  {
2395  uint32 buf_state;
2396 
2397  buf_state = LockBufHdr(victim_buf_hdr);
2398 
2399  /* some sanity checks while we hold the buffer header lock */
2400  Assert(!(buf_state & (BM_VALID | BM_TAG_VALID | BM_DIRTY | BM_JUST_DIRTIED)));
2401  Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1);
2402 
2403  victim_buf_hdr->tag = tag;
2404 
2405  buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
2406  if (bmr.relpersistence == RELPERSISTENCE_PERMANENT || fork == INIT_FORKNUM)
2407  buf_state |= BM_PERMANENT;
2408 
2409  UnlockBufHdr(victim_buf_hdr, buf_state);
2410 
2411  LWLockRelease(partition_lock);
2412 
2413  /* XXX: could combine the locked operations in it with the above */
2414  StartBufferIO(victim_buf_hdr, true, false);
2415  }
2416  }
2417 
2419 
2420  /*
2421  * Note: if smgrzeroextend fails, we will end up with buffers that are
2422  * allocated but not marked BM_VALID. The next relation extension will
2423  * still select the same block number (because the relation didn't get any
2424  * longer on disk) and so future attempts to extend the relation will find
2425  * the same buffers (if they have not been recycled) but come right back
2426  * here to try smgrzeroextend again.
2427  *
2428  * We don't need to set checksum for all-zero pages.
2429  */
2430  smgrzeroextend(bmr.smgr, fork, first_block, extend_by, false);
2431 
2432  /*
2433  * Release the file-extension lock; it's now OK for someone else to extend
2434  * the relation some more.
2435  *
2436  * We remove IO_IN_PROGRESS after this, as waking up waiting backends can
2437  * take noticeable time.
2438  */
2439  if (!(flags & EB_SKIP_EXTENSION_LOCK))
2441 
2443  io_start, extend_by);
2444 
2445  /* Set BM_VALID, terminate IO, and wake up any waiters */
2446  for (uint32 i = 0; i < extend_by; i++)
2447  {
2448  Buffer buf = buffers[i];
2449  BufferDesc *buf_hdr = GetBufferDescriptor(buf - 1);
2450  bool lock = false;
2451 
2452  if (flags & EB_LOCK_FIRST && i == 0)
2453  lock = true;
2454  else if (flags & EB_LOCK_TARGET)
2455  {
2456  Assert(extend_upto != InvalidBlockNumber);
2457  if (first_block + i + 1 == extend_upto)
2458  lock = true;
2459  }
2460 
2461  if (lock)
2463 
2464  TerminateBufferIO(buf_hdr, false, BM_VALID, true);
2465  }
2466 
2467  pgBufferUsage.shared_blks_written += extend_by;
2468 
2469  *extended_by = extend_by;
2470 
2471  return first_block;
2472 }
2473 
2474 /*
2475  * BufferIsExclusiveLocked
2476  *
2477  * Checks if buffer is exclusive-locked.
2478  *
2479  * Buffer must be pinned.
2480  */
2481 bool
2483 {
2484  BufferDesc *bufHdr;
2485 
2486  if (BufferIsLocal(buffer))
2487  {
2488  int bufid = -buffer - 1;
2489 
2490  bufHdr = GetLocalBufferDescriptor(bufid);
2491  }
2492  else
2493  {
2494  bufHdr = GetBufferDescriptor(buffer - 1);
2495  }
2496 
2499  LW_EXCLUSIVE);
2500 }
2501 
2502 /*
2503  * BufferIsDirty
2504  *
2505  * Checks if buffer is already dirty.
2506  *
2507  * Buffer must be pinned and exclusive-locked. (Without an exclusive lock,
2508  * the result may be stale before it's returned.)
2509  */
2510 bool
2512 {
2513  BufferDesc *bufHdr;
2514 
2515  if (BufferIsLocal(buffer))
2516  {
2517  int bufid = -buffer - 1;
2518 
2519  bufHdr = GetLocalBufferDescriptor(bufid);
2520  }
2521  else
2522  {
2523  bufHdr = GetBufferDescriptor(buffer - 1);
2524  }
2525 
2528  LW_EXCLUSIVE));
2529 
2530  return pg_atomic_read_u32(&bufHdr->state) & BM_DIRTY;
2531 }
2532 
2533 /*
2534  * MarkBufferDirty
2535  *
2536  * Marks buffer contents as dirty (actual write happens later).
2537  *
2538  * Buffer must be pinned and exclusive-locked. (If caller does not hold
2539  * exclusive lock, then somebody could be in process of writing the buffer,
2540  * leading to risk of bad data written to disk.)
2541  */
2542 void
2544 {
2545  BufferDesc *bufHdr;
2546  uint32 buf_state;
2547  uint32 old_buf_state;
2548 
2549  if (!BufferIsValid(buffer))
2550  elog(ERROR, "bad buffer ID: %d", buffer);
2551 
2552  if (BufferIsLocal(buffer))
2553  {
2555  return;
2556  }
2557 
2558  bufHdr = GetBufferDescriptor(buffer - 1);
2559 
2562  LW_EXCLUSIVE));
2563 
2564  old_buf_state = pg_atomic_read_u32(&bufHdr->state);
2565  for (;;)
2566  {
2567  if (old_buf_state & BM_LOCKED)
2568  old_buf_state = WaitBufHdrUnlocked(bufHdr);
2569 
2570  buf_state = old_buf_state;
2571 
2572  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
2573  buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
2574 
2575  if (pg_atomic_compare_exchange_u32(&bufHdr->state, &old_buf_state,
2576  buf_state))
2577  break;
2578  }
2579 
2580  /*
2581  * If the buffer was not dirty already, do vacuum accounting.
2582  */
2583  if (!(old_buf_state & BM_DIRTY))
2584  {
2585  VacuumPageDirty++;
2587  if (VacuumCostActive)
2589  }
2590 }
2591 
2592 /*
2593  * ReleaseAndReadBuffer -- combine ReleaseBuffer() and ReadBuffer()
2594  *
2595  * Formerly, this saved one cycle of acquiring/releasing the BufMgrLock
2596  * compared to calling the two routines separately. Now it's mainly just
2597  * a convenience function. However, if the passed buffer is valid and
2598  * already contains the desired block, we just return it as-is; and that
2599  * does save considerable work compared to a full release and reacquire.
2600  *
2601  * Note: it is OK to pass buffer == InvalidBuffer, indicating that no old
2602  * buffer actually needs to be released. This case is the same as ReadBuffer,
2603  * but can save some tests in the caller.
2604  */
2605 Buffer
2607  Relation relation,
2608  BlockNumber blockNum)
2609 {
2610  ForkNumber forkNum = MAIN_FORKNUM;
2611  BufferDesc *bufHdr;
2612 
2613  if (BufferIsValid(buffer))
2614  {
2616  if (BufferIsLocal(buffer))
2617  {
2618  bufHdr = GetLocalBufferDescriptor(-buffer - 1);
2619  if (bufHdr->tag.blockNum == blockNum &&
2620  BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
2621  BufTagGetForkNum(&bufHdr->tag) == forkNum)
2622  return buffer;
2624  }
2625  else
2626  {
2627  bufHdr = GetBufferDescriptor(buffer - 1);
2628  /* we have pin, so it's ok to examine tag without spinlock */
2629  if (bufHdr->tag.blockNum == blockNum &&
2630  BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
2631  BufTagGetForkNum(&bufHdr->tag) == forkNum)
2632  return buffer;
2633  UnpinBuffer(bufHdr);
2634  }
2635  }
2636 
2637  return ReadBuffer(relation, blockNum);
2638 }
2639 
2640 /*
2641  * PinBuffer -- make buffer unavailable for replacement.
2642  *
2643  * For the default access strategy, the buffer's usage_count is incremented
2644  * when we first pin it; for other strategies we just make sure the usage_count
2645  * isn't zero. (The idea of the latter is that we don't want synchronized
2646  * heap scans to inflate the count, but we need it to not be zero to discourage
2647  * other backends from stealing buffers from our ring. As long as we cycle
2648  * through the ring faster than the global clock-sweep cycles, buffers in
2649  * our ring won't be chosen as victims for replacement by other backends.)
2650  *
2651  * This should be applied only to shared buffers, never local ones.
2652  *
2653  * Since buffers are pinned/unpinned very frequently, pin buffers without
2654  * taking the buffer header lock; instead update the state variable in loop of
2655  * CAS operations. Hopefully it's just a single CAS.
2656  *
2657  * Note that ResourceOwnerEnlarge() and ReservePrivateRefCountEntry()
2658  * must have been done already.
2659  *
2660  * Returns true if buffer is BM_VALID, else false. This provision allows
2661  * some callers to avoid an extra spinlock cycle.
2662  */
2663 static bool
2665 {
2667  bool result;
2668  PrivateRefCountEntry *ref;
2669 
2670  Assert(!BufferIsLocal(b));
2671  Assert(ReservedRefCountEntry != NULL);
2672 
2673  ref = GetPrivateRefCountEntry(b, true);
2674 
2675  if (ref == NULL)
2676  {
2677  uint32 buf_state;
2678  uint32 old_buf_state;
2679 
2680  ref = NewPrivateRefCountEntry(b);
2681 
2682  old_buf_state = pg_atomic_read_u32(&buf->state);
2683  for (;;)
2684  {
2685  if (old_buf_state & BM_LOCKED)
2686  old_buf_state = WaitBufHdrUnlocked(buf);
2687 
2688  buf_state = old_buf_state;
2689 
2690  /* increase refcount */
2691  buf_state += BUF_REFCOUNT_ONE;
2692 
2693  if (strategy == NULL)
2694  {
2695  /* Default case: increase usagecount unless already max. */
2697  buf_state += BUF_USAGECOUNT_ONE;
2698  }
2699  else
2700  {
2701  /*
2702  * Ring buffers shouldn't evict others from pool. Thus we
2703  * don't make usagecount more than 1.
2704  */
2705  if (BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
2706  buf_state += BUF_USAGECOUNT_ONE;
2707  }
2708 
2709  if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
2710  buf_state))
2711  {
2712  result = (buf_state & BM_VALID) != 0;
2713 
2714  /*
2715  * Assume that we acquired a buffer pin for the purposes of
2716  * Valgrind buffer client checks (even in !result case) to
2717  * keep things simple. Buffers that are unsafe to access are
2718  * not generally guaranteed to be marked undefined or
2719  * non-accessible in any case.
2720  */
2722  break;
2723  }
2724  }
2725  }
2726  else
2727  {
2728  /*
2729  * If we previously pinned the buffer, it is likely to be valid, but
2730  * it may not be if StartReadBuffers() was called and
2731  * WaitReadBuffers() hasn't been called yet. We'll check by loading
2732  * the flags without locking. This is racy, but it's OK to return
2733  * false spuriously: when WaitReadBuffers() calls StartBufferIO(),
2734  * it'll see that it's now valid.
2735  *
2736  * Note: We deliberately avoid a Valgrind client request here.
2737  * Individual access methods can optionally superimpose buffer page
2738  * client requests on top of our client requests to enforce that
2739  * buffers are only accessed while locked (and pinned). It's possible
2740  * that the buffer page is legitimately non-accessible here. We
2741  * cannot meddle with that.
2742  */
2743  result = (pg_atomic_read_u32(&buf->state) & BM_VALID) != 0;
2744  }
2745 
2746  ref->refcount++;
2747  Assert(ref->refcount > 0);
2749  return result;
2750 }
2751 
2752 /*
2753  * PinBuffer_Locked -- as above, but caller already locked the buffer header.
2754  * The spinlock is released before return.
2755  *
2756  * As this function is called with the spinlock held, the caller has to
2757  * previously call ReservePrivateRefCountEntry() and
2758  * ResourceOwnerEnlarge(CurrentResourceOwner);
2759  *
2760  * Currently, no callers of this function want to modify the buffer's
2761  * usage_count at all, so there's no need for a strategy parameter.
2762  * Also we don't bother with a BM_VALID test (the caller could check that for
2763  * itself).
2764  *
2765  * Also all callers only ever use this function when it's known that the
2766  * buffer can't have a preexisting pin by this backend. That allows us to skip
2767  * searching the private refcount array & hash, which is a boon, because the
2768  * spinlock is still held.
2769  *
2770  * Note: use of this routine is frequently mandatory, not just an optimization
2771  * to save a spin lock/unlock cycle, because we need to pin a buffer before
2772  * its state can change under us.
2773  */
2774 static void
2776 {
2777  Buffer b;
2778  PrivateRefCountEntry *ref;
2779  uint32 buf_state;
2780 
2781  /*
2782  * As explained, We don't expect any preexisting pins. That allows us to
2783  * manipulate the PrivateRefCount after releasing the spinlock
2784  */
2786 
2787  /*
2788  * Buffer can't have a preexisting pin, so mark its page as defined to
2789  * Valgrind (this is similar to the PinBuffer() case where the backend
2790  * doesn't already have a buffer pin)
2791  */
2793 
2794  /*
2795  * Since we hold the buffer spinlock, we can update the buffer state and
2796  * release the lock in one operation.
2797  */
2798  buf_state = pg_atomic_read_u32(&buf->state);
2799  Assert(buf_state & BM_LOCKED);
2800  buf_state += BUF_REFCOUNT_ONE;
2801  UnlockBufHdr(buf, buf_state);
2802 
2804 
2805  ref = NewPrivateRefCountEntry(b);
2806  ref->refcount++;
2807 
2809 }
2810 
2811 /*
2812  * UnpinBuffer -- make buffer available for replacement.
2813  *
2814  * This should be applied only to shared buffers, never local ones. This
2815  * always adjusts CurrentResourceOwner.
2816  */
2817 static void
2819 {
2821 
2824 }
2825 
2826 static void
2828 {
2829  PrivateRefCountEntry *ref;
2831 
2832  Assert(!BufferIsLocal(b));
2833 
2834  /* not moving as we're likely deleting it soon anyway */
2835  ref = GetPrivateRefCountEntry(b, false);
2836  Assert(ref != NULL);
2837  Assert(ref->refcount > 0);
2838  ref->refcount--;
2839  if (ref->refcount == 0)
2840  {
2841  uint32 buf_state;
2842  uint32 old_buf_state;
2843 
2844  /*
2845  * Mark buffer non-accessible to Valgrind.
2846  *
2847  * Note that the buffer may have already been marked non-accessible
2848  * within access method code that enforces that buffers are only
2849  * accessed while a buffer lock is held.
2850  */
2852 
2853  /* I'd better not still hold the buffer content lock */
2855 
2856  /*
2857  * Decrement the shared reference count.
2858  *
2859  * Since buffer spinlock holder can update status using just write,
2860  * it's not safe to use atomic decrement here; thus use a CAS loop.
2861  */
2862  old_buf_state = pg_atomic_read_u32(&buf->state);
2863  for (;;)
2864  {
2865  if (old_buf_state & BM_LOCKED)
2866  old_buf_state = WaitBufHdrUnlocked(buf);
2867 
2868  buf_state = old_buf_state;
2869 
2870  buf_state -= BUF_REFCOUNT_ONE;
2871 
2872  if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
2873  buf_state))
2874  break;
2875  }
2876 
2877  /* Support LockBufferForCleanup() */
2878  if (buf_state & BM_PIN_COUNT_WAITER)
2879  {
2880  /*
2881  * Acquire the buffer header lock, re-check that there's a waiter.
2882  * Another backend could have unpinned this buffer, and already
2883  * woken up the waiter. There's no danger of the buffer being
2884  * replaced after we unpinned it above, as it's pinned by the
2885  * waiter.
2886  */
2887  buf_state = LockBufHdr(buf);
2888 
2889  if ((buf_state & BM_PIN_COUNT_WAITER) &&
2890  BUF_STATE_GET_REFCOUNT(buf_state) == 1)
2891  {
2892  /* we just released the last pin other than the waiter's */
2893  int wait_backend_pgprocno = buf->wait_backend_pgprocno;
2894 
2895  buf_state &= ~BM_PIN_COUNT_WAITER;
2896  UnlockBufHdr(buf, buf_state);
2897  ProcSendSignal(wait_backend_pgprocno);
2898  }
2899  else
2900  UnlockBufHdr(buf, buf_state);
2901  }
2903  }
2904 }
2905 
2906 #define ST_SORT sort_checkpoint_bufferids
2907 #define ST_ELEMENT_TYPE CkptSortItem
2908 #define ST_COMPARE(a, b) ckpt_buforder_comparator(a, b)
2909 #define ST_SCOPE static
2910 #define ST_DEFINE
2911 #include <lib/sort_template.h>
2912 
2913 /*
2914  * BufferSync -- Write out all dirty buffers in the pool.
2915  *
2916  * This is called at checkpoint time to write out all dirty shared buffers.
2917  * The checkpoint request flags should be passed in. If CHECKPOINT_IMMEDIATE
2918  * is set, we disable delays between writes; if CHECKPOINT_IS_SHUTDOWN,
2919  * CHECKPOINT_END_OF_RECOVERY or CHECKPOINT_FLUSH_ALL is set, we write even
2920  * unlogged buffers, which are otherwise skipped. The remaining flags
2921  * currently have no effect here.
2922  */
2923 static void
2924 BufferSync(int flags)
2925 {
2926  uint32 buf_state;
2927  int buf_id;
2928  int num_to_scan;
2929  int num_spaces;
2930  int num_processed;
2931  int num_written;
2932  CkptTsStatus *per_ts_stat = NULL;
2933  Oid last_tsid;
2934  binaryheap *ts_heap;
2935  int i;
2936  int mask = BM_DIRTY;
2937  WritebackContext wb_context;
2938 
2939  /*
2940  * Unless this is a shutdown checkpoint or we have been explicitly told,
2941  * we write only permanent, dirty buffers. But at shutdown or end of
2942  * recovery, we write all dirty buffers.
2943  */
2946  mask |= BM_PERMANENT;
2947 
2948  /*
2949  * Loop over all buffers, and mark the ones that need to be written with
2950  * BM_CHECKPOINT_NEEDED. Count them as we go (num_to_scan), so that we
2951  * can estimate how much work needs to be done.
2952  *
2953  * This allows us to write only those pages that were dirty when the
2954  * checkpoint began, and not those that get dirtied while it proceeds.
2955  * Whenever a page with BM_CHECKPOINT_NEEDED is written out, either by us
2956  * later in this function, or by normal backends or the bgwriter cleaning
2957  * scan, the flag is cleared. Any buffer dirtied after this point won't
2958  * have the flag set.
2959  *
2960  * Note that if we fail to write some buffer, we may leave buffers with
2961  * BM_CHECKPOINT_NEEDED still set. This is OK since any such buffer would
2962  * certainly need to be written for the next checkpoint attempt, too.
2963  */
2964  num_to_scan = 0;
2965  for (buf_id = 0; buf_id < NBuffers; buf_id++)
2966  {
2967  BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
2968 
2969  /*
2970  * Header spinlock is enough to examine BM_DIRTY, see comment in
2971  * SyncOneBuffer.
2972  */
2973  buf_state = LockBufHdr(bufHdr);
2974 
2975  if ((buf_state & mask) == mask)
2976  {
2977  CkptSortItem *item;
2978 
2979  buf_state |= BM_CHECKPOINT_NEEDED;
2980 
2981  item = &CkptBufferIds[num_to_scan++];
2982  item->buf_id = buf_id;
2983  item->tsId = bufHdr->tag.spcOid;
2984  item->relNumber = BufTagGetRelNumber(&bufHdr->tag);
2985  item->forkNum = BufTagGetForkNum(&bufHdr->tag);
2986  item->blockNum = bufHdr->tag.blockNum;
2987  }
2988 
2989  UnlockBufHdr(bufHdr, buf_state);
2990 
2991  /* Check for barrier events in case NBuffers is large. */
2994  }
2995 
2996  if (num_to_scan == 0)
2997  return; /* nothing to do */
2998 
3000 
3001  TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_scan);
3002 
3003  /*
3004  * Sort buffers that need to be written to reduce the likelihood of random
3005  * IO. The sorting is also important for the implementation of balancing
3006  * writes between tablespaces. Without balancing writes we'd potentially
3007  * end up writing to the tablespaces one-by-one; possibly overloading the
3008  * underlying system.
3009  */
3010  sort_checkpoint_bufferids(CkptBufferIds, num_to_scan);
3011 
3012  num_spaces = 0;
3013 
3014  /*
3015  * Allocate progress status for each tablespace with buffers that need to
3016  * be flushed. This requires the to-be-flushed array to be sorted.
3017  */
3018  last_tsid = InvalidOid;
3019  for (i = 0; i < num_to_scan; i++)
3020  {
3021  CkptTsStatus *s;
3022  Oid cur_tsid;
3023 
3024  cur_tsid = CkptBufferIds[i].tsId;
3025 
3026  /*
3027  * Grow array of per-tablespace status structs, every time a new
3028  * tablespace is found.
3029  */
3030  if (last_tsid == InvalidOid || last_tsid != cur_tsid)
3031  {
3032  Size sz;
3033 
3034  num_spaces++;
3035 
3036  /*
3037  * Not worth adding grow-by-power-of-2 logic here - even with a
3038  * few hundred tablespaces this should be fine.
3039  */
3040  sz = sizeof(CkptTsStatus) * num_spaces;
3041 
3042  if (per_ts_stat == NULL)
3043  per_ts_stat = (CkptTsStatus *) palloc(sz);
3044  else
3045  per_ts_stat = (CkptTsStatus *) repalloc(per_ts_stat, sz);
3046 
3047  s = &per_ts_stat[num_spaces - 1];
3048  memset(s, 0, sizeof(*s));
3049  s->tsId = cur_tsid;
3050 
3051  /*
3052  * The first buffer in this tablespace. As CkptBufferIds is sorted
3053  * by tablespace all (s->num_to_scan) buffers in this tablespace
3054  * will follow afterwards.
3055  */
3056  s->index = i;
3057 
3058  /*
3059  * progress_slice will be determined once we know how many buffers
3060  * are in each tablespace, i.e. after this loop.
3061  */
3062 
3063  last_tsid = cur_tsid;
3064  }
3065  else
3066  {
3067  s = &per_ts_stat[num_spaces - 1];
3068  }
3069 
3070  s->num_to_scan++;
3071 
3072  /* Check for barrier events. */
3075  }
3076 
3077  Assert(num_spaces > 0);
3078 
3079  /*
3080  * Build a min-heap over the write-progress in the individual tablespaces,
3081  * and compute how large a portion of the total progress a single
3082  * processed buffer is.
3083  */
3084  ts_heap = binaryheap_allocate(num_spaces,
3086  NULL);
3087 
3088  for (i = 0; i < num_spaces; i++)
3089  {
3090  CkptTsStatus *ts_stat = &per_ts_stat[i];
3091 
3092  ts_stat->progress_slice = (float8) num_to_scan / ts_stat->num_to_scan;
3093 
3094  binaryheap_add_unordered(ts_heap, PointerGetDatum(ts_stat));
3095  }
3096 
3097  binaryheap_build(ts_heap);
3098 
3099  /*
3100  * Iterate through to-be-checkpointed buffers and write the ones (still)
3101  * marked with BM_CHECKPOINT_NEEDED. The writes are balanced between
3102  * tablespaces; otherwise the sorting would lead to only one tablespace
3103  * receiving writes at a time, making inefficient use of the hardware.
3104  */
3105  num_processed = 0;
3106  num_written = 0;
3107  while (!binaryheap_empty(ts_heap))
3108  {
3109  BufferDesc *bufHdr = NULL;
3110  CkptTsStatus *ts_stat = (CkptTsStatus *)
3112 
3113  buf_id = CkptBufferIds[ts_stat->index].buf_id;
3114  Assert(buf_id != -1);
3115 
3116  bufHdr = GetBufferDescriptor(buf_id);
3117 
3118  num_processed++;
3119 
3120  /*
3121  * We don't need to acquire the lock here, because we're only looking
3122  * at a single bit. It's possible that someone else writes the buffer
3123  * and clears the flag right after we check, but that doesn't matter
3124  * since SyncOneBuffer will then do nothing. However, there is a
3125  * further race condition: it's conceivable that between the time we
3126  * examine the bit here and the time SyncOneBuffer acquires the lock,
3127  * someone else not only wrote the buffer but replaced it with another
3128  * page and dirtied it. In that improbable case, SyncOneBuffer will
3129  * write the buffer though we didn't need to. It doesn't seem worth
3130  * guarding against this, though.
3131  */
3133  {
3134  if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
3135  {
3136  TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
3138  num_written++;
3139  }
3140  }
3141 
3142  /*
3143  * Measure progress independent of actually having to flush the buffer
3144  * - otherwise writing become unbalanced.
3145  */
3146  ts_stat->progress += ts_stat->progress_slice;
3147  ts_stat->num_scanned++;
3148  ts_stat->index++;
3149 
3150  /* Have all the buffers from the tablespace been processed? */
3151  if (ts_stat->num_scanned == ts_stat->num_to_scan)
3152  {
3153  binaryheap_remove_first(ts_heap);
3154  }
3155  else
3156  {
3157  /* update heap with the new progress */
3158  binaryheap_replace_first(ts_heap, PointerGetDatum(ts_stat));
3159  }
3160 
3161  /*
3162  * Sleep to throttle our I/O rate.
3163  *
3164  * (This will check for barrier events even if it doesn't sleep.)
3165  */
3166  CheckpointWriteDelay(flags, (double) num_processed / num_to_scan);
3167  }
3168 
3169  /*
3170  * Issue all pending flushes. Only checkpointer calls BufferSync(), so
3171  * IOContext will always be IOCONTEXT_NORMAL.
3172  */
3174 
3175  pfree(per_ts_stat);
3176  per_ts_stat = NULL;
3177  binaryheap_free(ts_heap);
3178 
3179  /*
3180  * Update checkpoint statistics. As noted above, this doesn't include
3181  * buffers written by other backends or bgwriter scan.
3182  */
3183  CheckpointStats.ckpt_bufs_written += num_written;
3184 
3185  TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_scan);
3186 }
3187 
3188 /*
3189  * BgBufferSync -- Write out some dirty buffers in the pool.
3190  *
3191  * This is called periodically by the background writer process.
3192  *
3193  * Returns true if it's appropriate for the bgwriter process to go into
3194  * low-power hibernation mode. (This happens if the strategy clock sweep
3195  * has been "lapped" and no buffer allocations have occurred recently,
3196  * or if the bgwriter has been effectively disabled by setting
3197  * bgwriter_lru_maxpages to 0.)
3198  */
3199 bool
3201 {
3202  /* info obtained from freelist.c */
3203  int strategy_buf_id;
3204  uint32 strategy_passes;
3205  uint32 recent_alloc;
3206 
3207  /*
3208  * Information saved between calls so we can determine the strategy
3209  * point's advance rate and avoid scanning already-cleaned buffers.
3210  */
3211  static bool saved_info_valid = false;
3212  static int prev_strategy_buf_id;
3213  static uint32 prev_strategy_passes;
3214  static int next_to_clean;
3215  static uint32 next_passes;
3216 
3217  /* Moving averages of allocation rate and clean-buffer density */
3218  static float smoothed_alloc = 0;
3219  static float smoothed_density = 10.0;
3220 
3221  /* Potentially these could be tunables, but for now, not */
3222  float smoothing_samples = 16;
3223  float scan_whole_pool_milliseconds = 120000.0;
3224 
3225  /* Used to compute how far we scan ahead */
3226  long strategy_delta;
3227  int bufs_to_lap;
3228  int bufs_ahead;
3229  float scans_per_alloc;
3230  int reusable_buffers_est;
3231  int upcoming_alloc_est;
3232  int min_scan_buffers;
3233 
3234  /* Variables for the scanning loop proper */
3235  int num_to_scan;
3236  int num_written;
3237  int reusable_buffers;
3238 
3239  /* Variables for final smoothed_density update */
3240  long new_strategy_delta;
3241  uint32 new_recent_alloc;
3242 
3243  /*
3244  * Find out where the freelist clock sweep currently is, and how many
3245  * buffer allocations have happened since our last call.
3246  */
3247  strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc);
3248 
3249  /* Report buffer alloc counts to pgstat */
3250  PendingBgWriterStats.buf_alloc += recent_alloc;
3251 
3252  /*
3253  * If we're not running the LRU scan, just stop after doing the stats
3254  * stuff. We mark the saved state invalid so that we can recover sanely
3255  * if LRU scan is turned back on later.
3256  */
3257  if (bgwriter_lru_maxpages <= 0)
3258  {
3259  saved_info_valid = false;
3260  return true;
3261  }
3262 
3263  /*
3264  * Compute strategy_delta = how many buffers have been scanned by the
3265  * clock sweep since last time. If first time through, assume none. Then
3266  * see if we are still ahead of the clock sweep, and if so, how many
3267  * buffers we could scan before we'd catch up with it and "lap" it. Note:
3268  * weird-looking coding of xxx_passes comparisons are to avoid bogus
3269  * behavior when the passes counts wrap around.
3270  */
3271  if (saved_info_valid)
3272  {
3273  int32 passes_delta = strategy_passes - prev_strategy_passes;
3274 
3275  strategy_delta = strategy_buf_id - prev_strategy_buf_id;
3276  strategy_delta += (long) passes_delta * NBuffers;
3277 
3278  Assert(strategy_delta >= 0);
3279 
3280  if ((int32) (next_passes - strategy_passes) > 0)
3281  {
3282  /* we're one pass ahead of the strategy point */
3283  bufs_to_lap = strategy_buf_id - next_to_clean;
3284 #ifdef BGW_DEBUG
3285  elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
3286  next_passes, next_to_clean,
3287  strategy_passes, strategy_buf_id,
3288  strategy_delta, bufs_to_lap);
3289 #endif
3290  }
3291  else if (next_passes == strategy_passes &&
3292  next_to_clean >= strategy_buf_id)
3293  {
3294  /* on same pass, but ahead or at least not behind */
3295  bufs_to_lap = NBuffers - (next_to_clean - strategy_buf_id);
3296 #ifdef BGW_DEBUG
3297  elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
3298  next_passes, next_to_clean,
3299  strategy_passes, strategy_buf_id,
3300  strategy_delta, bufs_to_lap);
3301 #endif
3302  }
3303  else
3304  {
3305  /*
3306  * We're behind, so skip forward to the strategy point and start
3307  * cleaning from there.
3308  */
3309 #ifdef BGW_DEBUG
3310  elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
3311  next_passes, next_to_clean,
3312  strategy_passes, strategy_buf_id,
3313  strategy_delta);
3314 #endif
3315  next_to_clean = strategy_buf_id;
3316  next_passes = strategy_passes;
3317  bufs_to_lap = NBuffers;
3318  }
3319  }
3320  else
3321  {
3322  /*
3323  * Initializing at startup or after LRU scanning had been off. Always
3324  * start at the strategy point.
3325  */
3326 #ifdef BGW_DEBUG
3327  elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
3328  strategy_passes, strategy_buf_id);
3329 #endif
3330  strategy_delta = 0;
3331  next_to_clean = strategy_buf_id;
3332  next_passes = strategy_passes;
3333  bufs_to_lap = NBuffers;
3334  }
3335 
3336  /* Update saved info for next time */
3337  prev_strategy_buf_id = strategy_buf_id;
3338  prev_strategy_passes = strategy_passes;
3339  saved_info_valid = true;
3340 
3341  /*
3342  * Compute how many buffers had to be scanned for each new allocation, ie,
3343  * 1/density of reusable buffers, and track a moving average of that.
3344  *
3345  * If the strategy point didn't move, we don't update the density estimate
3346  */
3347  if (strategy_delta > 0 && recent_alloc > 0)
3348  {
3349  scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
3350  smoothed_density += (scans_per_alloc - smoothed_density) /
3351  smoothing_samples;
3352  }
3353 
3354  /*
3355  * Estimate how many reusable buffers there are between the current
3356  * strategy point and where we've scanned ahead to, based on the smoothed
3357  * density estimate.
3358  */
3359  bufs_ahead = NBuffers - bufs_to_lap;
3360  reusable_buffers_est = (float) bufs_ahead / smoothed_density;
3361 
3362  /*
3363  * Track a moving average of recent buffer allocations. Here, rather than
3364  * a true average we want a fast-attack, slow-decline behavior: we
3365  * immediately follow any increase.
3366  */
3367  if (smoothed_alloc <= (float) recent_alloc)
3368  smoothed_alloc = recent_alloc;
3369  else
3370  smoothed_alloc += ((float) recent_alloc - smoothed_alloc) /
3371  smoothing_samples;
3372 
3373  /* Scale the estimate by a GUC to allow more aggressive tuning. */
3374  upcoming_alloc_est = (int) (smoothed_alloc * bgwriter_lru_multiplier);
3375 
3376  /*
3377  * If recent_alloc remains at zero for many cycles, smoothed_alloc will
3378  * eventually underflow to zero, and the underflows produce annoying
3379  * kernel warnings on some platforms. Once upcoming_alloc_est has gone to
3380  * zero, there's no point in tracking smaller and smaller values of
3381  * smoothed_alloc, so just reset it to exactly zero to avoid this
3382  * syndrome. It will pop back up as soon as recent_alloc increases.
3383  */
3384  if (upcoming_alloc_est == 0)
3385  smoothed_alloc = 0;
3386 
3387  /*
3388  * Even in cases where there's been little or no buffer allocation
3389  * activity, we want to make a small amount of progress through the buffer
3390  * cache so that as many reusable buffers as possible are clean after an
3391  * idle period.
3392  *
3393  * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many times
3394  * the BGW will be called during the scan_whole_pool time; slice the
3395  * buffer pool into that many sections.
3396  */
3397  min_scan_buffers = (int) (NBuffers / (scan_whole_pool_milliseconds / BgWriterDelay));
3398 
3399  if (upcoming_alloc_est < (min_scan_buffers + reusable_buffers_est))
3400  {
3401 #ifdef BGW_DEBUG
3402  elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
3403  upcoming_alloc_est, min_scan_buffers, reusable_buffers_est);
3404 #endif
3405  upcoming_alloc_est = min_scan_buffers + reusable_buffers_est;
3406  }
3407 
3408  /*
3409  * Now write out dirty reusable buffers, working forward from the
3410  * next_to_clean point, until we have lapped the strategy scan, or cleaned
3411  * enough buffers to match our estimate of the next cycle's allocation
3412  * requirements, or hit the bgwriter_lru_maxpages limit.
3413  */
3414 
3415  num_to_scan = bufs_to_lap;
3416  num_written = 0;
3417  reusable_buffers = reusable_buffers_est;
3418 
3419  /* Execute the LRU scan */
3420  while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
3421  {
3422  int sync_state = SyncOneBuffer(next_to_clean, true,
3423  wb_context);
3424 
3425  if (++next_to_clean >= NBuffers)
3426  {
3427  next_to_clean = 0;
3428  next_passes++;
3429  }
3430  num_to_scan--;
3431 
3432  if (sync_state & BUF_WRITTEN)
3433  {
3434  reusable_buffers++;
3435  if (++num_written >= bgwriter_lru_maxpages)
3436  {
3438  break;
3439  }
3440  }
3441  else if (sync_state & BUF_REUSABLE)
3442  reusable_buffers++;
3443  }
3444 
3445  PendingBgWriterStats.buf_written_clean += num_written;
3446 
3447 #ifdef BGW_DEBUG
3448  elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
3449  recent_alloc, smoothed_alloc, strategy_delta, bufs_ahead,
3450  smoothed_density, reusable_buffers_est, upcoming_alloc_est,
3451  bufs_to_lap - num_to_scan,
3452  num_written,
3453  reusable_buffers - reusable_buffers_est);
3454 #endif
3455 
3456  /*
3457  * Consider the above scan as being like a new allocation scan.
3458  * Characterize its density and update the smoothed one based on it. This
3459  * effectively halves the moving average period in cases where both the
3460  * strategy and the background writer are doing some useful scanning,
3461  * which is helpful because a long memory isn't as desirable on the
3462  * density estimates.
3463  */
3464  new_strategy_delta = bufs_to_lap - num_to_scan;
3465  new_recent_alloc = reusable_buffers - reusable_buffers_est;
3466  if (new_strategy_delta > 0 && new_recent_alloc > 0)
3467  {
3468  scans_per_alloc = (float) new_strategy_delta / (float) new_recent_alloc;
3469  smoothed_density += (scans_per_alloc - smoothed_density) /
3470  smoothing_samples;
3471 
3472 #ifdef BGW_DEBUG
3473  elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f",
3474  new_recent_alloc, new_strategy_delta,
3475  scans_per_alloc, smoothed_density);
3476 #endif
3477  }
3478 
3479  /* Return true if OK to hibernate */
3480  return (bufs_to_lap == 0 && recent_alloc == 0);
3481 }
3482 
3483 /*
3484  * SyncOneBuffer -- process a single buffer during syncing.
3485  *
3486  * If skip_recently_used is true, we don't write currently-pinned buffers, nor
3487  * buffers marked recently used, as these are not replacement candidates.
3488  *
3489  * Returns a bitmask containing the following flag bits:
3490  * BUF_WRITTEN: we wrote the buffer.
3491  * BUF_REUSABLE: buffer is available for replacement, ie, it has
3492  * pin count 0 and usage count 0.
3493  *
3494  * (BUF_WRITTEN could be set in error if FlushBuffer finds the buffer clean
3495  * after locking it, but we don't care all that much.)
3496  */
3497 static int
3498 SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
3499 {
3500  BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
3501  int result = 0;
3502  uint32 buf_state;
3503  BufferTag tag;
3504 
3505  /* Make sure we can handle the pin */
3508 
3509  /*
3510  * Check whether buffer needs writing.
3511  *
3512  * We can make this check without taking the buffer content lock so long
3513  * as we mark pages dirty in access methods *before* logging changes with
3514  * XLogInsert(): if someone marks the buffer dirty just after our check we
3515  * don't worry because our checkpoint.redo points before log record for
3516  * upcoming changes and so we are not required to write such dirty buffer.
3517  */
3518  buf_state = LockBufHdr(bufHdr);
3519 
3520  if (BUF_STATE_GET_REFCOUNT(buf_state) == 0 &&
3521  BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
3522  {
3523  result |= BUF_REUSABLE;
3524  }
3525  else if (skip_recently_used)
3526  {
3527  /* Caller told us not to write recently-used buffers */
3528  UnlockBufHdr(bufHdr, buf_state);
3529  return result;
3530  }
3531 
3532  if (!(buf_state & BM_VALID) || !(buf_state & BM_DIRTY))
3533  {
3534  /* It's clean, so nothing to do */
3535  UnlockBufHdr(bufHdr, buf_state);
3536  return result;
3537  }
3538 
3539  /*
3540  * Pin it, share-lock it, write it. (FlushBuffer will do nothing if the
3541  * buffer is clean by the time we've locked it.)
3542  */
3543  PinBuffer_Locked(bufHdr);
3545 
3547 
3549 
3550  tag = bufHdr->tag;
3551 
3552  UnpinBuffer(bufHdr);
3553 
3554  /*
3555  * SyncOneBuffer() is only called by checkpointer and bgwriter, so
3556  * IOContext will always be IOCONTEXT_NORMAL.
3557  */
3559 
3560  return result | BUF_WRITTEN;
3561 }
3562 
3563 /*
3564  * AtEOXact_Buffers - clean up at end of transaction.
3565  *
3566  * As of PostgreSQL 8.0, buffer pins should get released by the
3567  * ResourceOwner mechanism. This routine is just a debugging
3568  * cross-check that no pins remain.
3569  */
3570 void
3571 AtEOXact_Buffers(bool isCommit)
3572 {
3574 
3575  AtEOXact_LocalBuffers(isCommit);
3576 
3578 }
3579 
3580 /*
3581  * Initialize access to shared buffer pool
3582  *
3583  * This is called during backend startup (whether standalone or under the
3584  * postmaster). It sets up for this backend's access to the already-existing
3585  * buffer pool.
3586  */
3587 void
3589 {
3590  HASHCTL hash_ctl;
3591 
3592  memset(&PrivateRefCountArray, 0, sizeof(PrivateRefCountArray));
3593 
3594  hash_ctl.keysize = sizeof(int32);
3595  hash_ctl.entrysize = sizeof(PrivateRefCountEntry);
3596 
3597  PrivateRefCountHash = hash_create("PrivateRefCount", 100, &hash_ctl,
3598  HASH_ELEM | HASH_BLOBS);
3599 
3600  /*
3601  * AtProcExit_Buffers needs LWLock access, and thereby has to be called at
3602  * the corresponding phase of backend shutdown.
3603  */
3604  Assert(MyProc != NULL);
3606 }
3607 
3608 /*
3609  * During backend exit, ensure that we released all shared-buffer locks and
3610  * assert that we have no remaining pins.
3611  */
3612 static void
3614 {
3615  UnlockBuffers();
3616 
3618 
3619  /* localbuf.c needs a chance too */
3621 }
3622 
3623 /*
3624  * CheckForBufferLeaks - ensure this backend holds no buffer pins
3625  *
3626  * As of PostgreSQL 8.0, buffer pins should get released by the
3627  * ResourceOwner mechanism. This routine is just a debugging
3628  * cross-check that no pins remain.
3629  */
3630 static void
3632 {
3633 #ifdef USE_ASSERT_CHECKING
3634  int RefCountErrors = 0;
3636  int i;
3637  char *s;
3638 
3639  /* check the array */
3640  for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
3641  {
3643 
3644  if (res->buffer != InvalidBuffer)
3645  {
3646  s = DebugPrintBufferRefcount(res->buffer);
3647  elog(WARNING, "buffer refcount leak: %s", s);
3648  pfree(s);
3649 
3650  RefCountErrors++;
3651  }
3652  }
3653 
3654  /* if necessary search the hash */
3656  {
3657  HASH_SEQ_STATUS hstat;
3658 
3660  while ((res = (PrivateRefCountEntry *) hash_seq_search(&hstat)) != NULL)
3661  {
3662  s = DebugPrintBufferRefcount(res->buffer);
3663  elog(WARNING, "buffer refcount leak: %s", s);
3664  pfree(s);
3665  RefCountErrors++;
3666  }
3667  }
3668 
3669  Assert(RefCountErrors == 0);
3670 #endif
3671 }
3672 
3673 /*
3674  * Helper routine to issue warnings when a buffer is unexpectedly pinned
3675  */
3676 char *
3678 {
3679  BufferDesc *buf;
3680  int32 loccount;
3681  char *path;
3682  char *result;
3683  ProcNumber backend;
3684  uint32 buf_state;
3685 
3687  if (BufferIsLocal(buffer))
3688  {
3690  loccount = LocalRefCount[-buffer - 1];
3691  backend = MyProcNumber;
3692  }
3693  else
3694  {
3696  loccount = GetPrivateRefCount(buffer);
3697  backend = INVALID_PROC_NUMBER;
3698  }
3699 
3700  /* theoretically we should lock the bufhdr here */
3701  path = relpathbackend(BufTagGetRelFileLocator(&buf->tag), backend,
3702  BufTagGetForkNum(&buf->tag));
3703  buf_state = pg_atomic_read_u32(&buf->state);
3704 
3705  result = psprintf("[%03d] (rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
3706  buffer, path,
3707  buf->tag.blockNum, buf_state & BUF_FLAG_MASK,
3708  BUF_STATE_GET_REFCOUNT(buf_state), loccount);
3709  pfree(path);
3710  return result;
3711 }
3712 
3713 /*
3714  * CheckPointBuffers
3715  *
3716  * Flush all dirty blocks in buffer pool to disk at checkpoint time.
3717  *
3718  * Note: temporary relations do not participate in checkpoints, so they don't
3719  * need to be flushed.
3720  */
3721 void
3723 {
3724  BufferSync(flags);
3725 }
3726 
3727 /*
3728  * BufferGetBlockNumber
3729  * Returns the block number associated with a buffer.
3730  *
3731  * Note:
3732  * Assumes that the buffer is valid and pinned, else the
3733  * value may be obsolete immediately...
3734  */
3737 {
3738  BufferDesc *bufHdr;
3739 
3741 
3742  if (BufferIsLocal(buffer))
3743  bufHdr = GetLocalBufferDescriptor(-buffer - 1);
3744  else
3745  bufHdr = GetBufferDescriptor(buffer - 1);
3746 
3747  /* pinned, so OK to read tag without spinlock */
3748  return bufHdr->tag.blockNum;
3749 }
3750 
3751 /*
3752  * BufferGetTag
3753  * Returns the relfilelocator, fork number and block number associated with
3754  * a buffer.
3755  */
3756 void
3758  BlockNumber *blknum)
3759 {
3760  BufferDesc *bufHdr;
3761 
3762  /* Do the same checks as BufferGetBlockNumber. */
3764 
3765  if (BufferIsLocal(buffer))
3766  bufHdr = GetLocalBufferDescriptor(-buffer - 1);
3767  else
3768  bufHdr = GetBufferDescriptor(buffer - 1);
3769 
3770  /* pinned, so OK to read tag without spinlock */
3771  *rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
3772  *forknum = BufTagGetForkNum(&bufHdr->tag);
3773  *blknum = bufHdr->tag.blockNum;
3774 }
3775 
3776 /*
3777  * FlushBuffer
3778  * Physically write out a shared buffer.
3779  *
3780  * NOTE: this actually just passes the buffer contents to the kernel; the
3781  * real write to disk won't happen until the kernel feels like it. This
3782  * is okay from our point of view since we can redo the changes from WAL.
3783  * However, we will need to force the changes to disk via fsync before
3784  * we can checkpoint WAL.
3785  *
3786  * The caller must hold a pin on the buffer and have share-locked the
3787  * buffer contents. (Note: a share-lock does not prevent updates of
3788  * hint bits in the buffer, so the page could change while the write
3789  * is in progress, but we assume that that will not invalidate the data
3790  * written.)
3791  *
3792  * If the caller has an smgr reference for the buffer's relation, pass it
3793  * as the second parameter. If not, pass NULL.
3794  */
3795 static void
3797  IOContext io_context)
3798 {
3799  XLogRecPtr recptr;
3800  ErrorContextCallback errcallback;
3801  instr_time io_start;
3802  Block bufBlock;
3803  char *bufToWrite;
3804  uint32 buf_state;
3805 
3806  /*
3807  * Try to start an I/O operation. If StartBufferIO returns false, then
3808  * someone else flushed the buffer before we could, so we need not do
3809  * anything.
3810  */
3811  if (!StartBufferIO(buf, false, false))
3812  return;
3813 
3814  /* Setup error traceback support for ereport() */
3816  errcallback.arg = (void *) buf;
3817  errcallback.previous = error_context_stack;
3818  error_context_stack = &errcallback;
3819 
3820  /* Find smgr relation for buffer */
3821  if (reln == NULL)
3823 
3824  TRACE_POSTGRESQL_BUFFER_FLUSH_START(BufTagGetForkNum(&buf->tag),
3825  buf->tag.blockNum,
3827  reln->smgr_rlocator.locator.dbOid,
3829 
3830  buf_state = LockBufHdr(buf);
3831 
3832  /*
3833  * Run PageGetLSN while holding header lock, since we don't have the
3834  * buffer locked exclusively in all cases.
3835  */
3836  recptr = BufferGetLSN(buf);
3837 
3838  /* To check if block content changes while flushing. - vadim 01/17/97 */
3839  buf_state &= ~BM_JUST_DIRTIED;
3840  UnlockBufHdr(buf, buf_state);
3841 
3842  /*
3843  * Force XLOG flush up to buffer's LSN. This implements the basic WAL
3844  * rule that log updates must hit disk before any of the data-file changes
3845  * they describe do.
3846  *
3847  * However, this rule does not apply to unlogged relations, which will be
3848  * lost after a crash anyway. Most unlogged relation pages do not bear
3849  * LSNs since we never emit WAL records for them, and therefore flushing
3850  * up through the buffer LSN would be useless, but harmless. However,
3851  * GiST indexes use LSNs internally to track page-splits, and therefore
3852  * unlogged GiST pages bear "fake" LSNs generated by
3853  * GetFakeLSNForUnloggedRel. It is unlikely but possible that the fake
3854  * LSN counter could advance past the WAL insertion point; and if it did
3855  * happen, attempting to flush WAL through that location would fail, with
3856  * disastrous system-wide consequences. To make sure that can't happen,
3857  * skip the flush if the buffer isn't permanent.
3858  */
3859  if (buf_state & BM_PERMANENT)
3860  XLogFlush(recptr);
3861 
3862  /*
3863  * Now it's safe to write buffer to disk. Note that no one else should
3864  * have been able to write it while we were busy with log flushing because
3865  * only one process at a time can set the BM_IO_IN_PROGRESS bit.
3866  */
3867  bufBlock = BufHdrGetBlock(buf);
3868 
3869  /*
3870  * Update page checksum if desired. Since we have only shared lock on the
3871  * buffer, other processes might be updating hint bits in it, so we must
3872  * copy the page to private storage if we do checksumming.
3873  */
3874  bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
3875 
3877 
3878  /*
3879  * bufToWrite is either the shared buffer or a copy, as appropriate.
3880  */
3881  smgrwrite(reln,
3882  BufTagGetForkNum(&buf->tag),
3883  buf->tag.blockNum,
3884  bufToWrite,
3885  false);
3886 
3887  /*
3888  * When a strategy is in use, only flushes of dirty buffers already in the
3889  * strategy ring are counted as strategy writes (IOCONTEXT
3890  * [BULKREAD|BULKWRITE|VACUUM] IOOP_WRITE) for the purpose of IO
3891  * statistics tracking.
3892  *
3893  * If a shared buffer initially added to the ring must be flushed before
3894  * being used, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE.
3895  *
3896  * If a shared buffer which was added to the ring later because the
3897  * current strategy buffer is pinned or in use or because all strategy
3898  * buffers were dirty and rejected (for BAS_BULKREAD operations only)
3899  * requires flushing, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE
3900  * (from_ring will be false).
3901  *
3902  * When a strategy is not in use, the write can only be a "regular" write
3903  * of a dirty shared buffer (IOCONTEXT_NORMAL IOOP_WRITE).
3904  */
3906  IOOP_WRITE, io_start, 1);
3907 
3909 
3910  /*
3911  * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
3912  * end the BM_IO_IN_PROGRESS state.
3913  */
3914  TerminateBufferIO(buf, true, 0, true);
3915 
3916  TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(BufTagGetForkNum(&buf->tag),
3917  buf->tag.blockNum,
3919  reln->smgr_rlocator.locator.dbOid,
3921 
3922  /* Pop the error context stack */
3923  error_context_stack = errcallback.previous;
3924 }
3925 
3926 /*
3927  * RelationGetNumberOfBlocksInFork
3928  * Determines the current number of pages in the specified relation fork.
3929  *
3930  * Note that the accuracy of the result will depend on the details of the
3931  * relation's storage. For builtin AMs it'll be accurate, but for external AMs
3932  * it might not be.
3933  */
3936 {
3937  if (RELKIND_HAS_TABLE_AM(relation->rd_rel->relkind))
3938  {
3939  /*
3940  * Not every table AM uses BLCKSZ wide fixed size blocks. Therefore
3941  * tableam returns the size in bytes - but for the purpose of this
3942  * routine, we want the number of blocks. Therefore divide, rounding
3943  * up.
3944  */
3945  uint64 szbytes;
3946 
3947  szbytes = table_relation_size(relation, forkNum);
3948 
3949  return (szbytes + (BLCKSZ - 1)) / BLCKSZ;
3950  }
3951  else if (RELKIND_HAS_STORAGE(relation->rd_rel->relkind))
3952  {
3953  return smgrnblocks(RelationGetSmgr(relation), forkNum);
3954  }
3955  else
3956  Assert(false);
3957 
3958  return 0; /* keep compiler quiet */
3959 }
3960 
3961 /*
3962  * BufferIsPermanent
3963  * Determines whether a buffer will potentially still be around after
3964  * a crash. Caller must hold a buffer pin.
3965  */
3966 bool
3968 {
3969  BufferDesc *bufHdr;
3970 
3971  /* Local buffers are used only for temp relations. */
3972  if (BufferIsLocal(buffer))
3973  return false;
3974 
3975  /* Make sure we've got a real buffer, and that we hold a pin on it. */
3978 
3979  /*
3980  * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
3981  * need not bother with the buffer header spinlock. Even if someone else
3982  * changes the buffer header state while we're doing this, the state is
3983  * changed atomically, so we'll read the old value or the new value, but
3984  * not random garbage.
3985  */
3986  bufHdr = GetBufferDescriptor(buffer - 1);
3987  return (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT) != 0;
3988 }
3989 
3990 /*
3991  * BufferGetLSNAtomic
3992  * Retrieves the LSN of the buffer atomically using a buffer header lock.
3993  * This is necessary for some callers who may not have an exclusive lock
3994  * on the buffer.
3995  */
3996 XLogRecPtr
3998 {
3999  BufferDesc *bufHdr = GetBufferDescriptor(buffer - 1);
4000  char *page = BufferGetPage(buffer);
4001  XLogRecPtr lsn;
4002  uint32 buf_state;
4003 
4004  /*
4005  * If we don't need locking for correctness, fastpath out.
4006  */
4008  return PageGetLSN(page);
4009 
4010  /* Make sure we've got a real buffer, and that we hold a pin on it. */
4013 
4014  buf_state = LockBufHdr(bufHdr);
4015  lsn = PageGetLSN(page);
4016  UnlockBufHdr(bufHdr, buf_state);
4017 
4018  return lsn;
4019 }
4020 
4021 /* ---------------------------------------------------------------------
4022  * DropRelationBuffers
4023  *
4024  * This function removes from the buffer pool all the pages of the
4025  * specified relation forks that have block numbers >= firstDelBlock.
4026  * (In particular, with firstDelBlock = 0, all pages are removed.)
4027  * Dirty pages are simply dropped, without bothering to write them
4028  * out first. Therefore, this is NOT rollback-able, and so should be
4029  * used only with extreme caution!
4030  *
4031  * Currently, this is called only from smgr.c when the underlying file
4032  * is about to be deleted or truncated (firstDelBlock is needed for
4033  * the truncation case). The data in the affected pages would therefore
4034  * be deleted momentarily anyway, and there is no point in writing it.
4035  * It is the responsibility of higher-level code to ensure that the
4036  * deletion or truncation does not lose any data that could be needed
4037  * later. It is also the responsibility of higher-level code to ensure
4038  * that no other process could be trying to load more pages of the
4039  * relation into buffers.
4040  * --------------------------------------------------------------------
4041  */
4042 void
4044  int nforks, BlockNumber *firstDelBlock)
4045 {
4046  int i;
4047  int j;
4048  RelFileLocatorBackend rlocator;
4049  BlockNumber nForkBlock[MAX_FORKNUM];
4050  uint64 nBlocksToInvalidate = 0;
4051 
4052  rlocator = smgr_reln->smgr_rlocator;
4053 
4054  /* If it's a local relation, it's localbuf.c's problem. */
4055  if (RelFileLocatorBackendIsTemp(rlocator))
4056  {
4057  if (rlocator.backend == MyProcNumber)
4058  {
4059  for (j = 0; j < nforks; j++)
4060  DropRelationLocalBuffers(rlocator.locator, forkNum[j],
4061  firstDelBlock[j]);
4062  }
4063  return;
4064  }
4065 
4066  /*
4067  * To remove all the pages of the specified relation forks from the buffer
4068  * pool, we need to scan the entire buffer pool but we can optimize it by
4069  * finding the buffers from BufMapping table provided we know the exact
4070  * size of each fork of the relation. The exact size is required to ensure
4071  * that we don't leave any buffer for the relation being dropped as
4072  * otherwise the background writer or checkpointer can lead to a PANIC
4073  * error while flushing buffers corresponding to files that don't exist.
4074  *
4075  * To know the exact size, we rely on the size cached for each fork by us
4076  * during recovery which limits the optimization to recovery and on
4077  * standbys but we can easily extend it once we have shared cache for
4078  * relation size.
4079  *
4080  * In recovery, we cache the value returned by the first lseek(SEEK_END)
4081  * and the future writes keeps the cached value up-to-date. See
4082  * smgrextend. It is possible that the value of the first lseek is smaller
4083  * than the actual number of existing blocks in the file due to buggy
4084  * Linux kernels that might not have accounted for the recent write. But
4085  * that should be fine because there must not be any buffers after that
4086  * file size.
4087  */
4088  for (i = 0; i < nforks; i++)
4089  {
4090  /* Get the number of blocks for a relation's fork */
4091  nForkBlock[i] = smgrnblocks_cached(smgr_reln, forkNum[i]);
4092 
4093  if (nForkBlock[i] == InvalidBlockNumber)
4094  {
4095  nBlocksToInvalidate = InvalidBlockNumber;
4096  break;
4097  }
4098 
4099  /* calculate the number of blocks to be invalidated */
4100  nBlocksToInvalidate += (nForkBlock[i] - firstDelBlock[i]);
4101  }
4102 
4103  /*
4104  * We apply the optimization iff the total number of blocks to invalidate
4105  * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
4106  */
4107  if (BlockNumberIsValid(nBlocksToInvalidate) &&
4108  nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
4109  {
4110  for (j = 0; j < nforks; j++)
4111  FindAndDropRelationBuffers(rlocator.locator, forkNum[j],
4112  nForkBlock[j], firstDelBlock[j]);
4113  return;
4114  }
4115 
4116  for (i = 0; i < NBuffers; i++)
4117  {
4118  BufferDesc *bufHdr = GetBufferDescriptor(i);
4119  uint32 buf_state;
4120 
4121  /*
4122  * We can make this a tad faster by prechecking the buffer tag before
4123  * we attempt to lock the buffer; this saves a lot of lock
4124  * acquisitions in typical cases. It should be safe because the
4125  * caller must have AccessExclusiveLock on the relation, or some other
4126  * reason to be certain that no one is loading new pages of the rel
4127  * into the buffer pool. (Otherwise we might well miss such pages
4128  * entirely.) Therefore, while the tag might be changing while we
4129  * look at it, it can't be changing *to* a value we care about, only
4130  * *away* from such a value. So false negatives are impossible, and
4131  * false positives are safe because we'll recheck after getting the
4132  * buffer lock.
4133  *
4134  * We could check forkNum and blockNum as well as the rlocator, but
4135  * the incremental win from doing so seems small.
4136  */
4137  if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator))
4138  continue;
4139 
4140  buf_state = LockBufHdr(bufHdr);
4141 
4142  for (j = 0; j < nforks; j++)
4143  {
4144  if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator) &&
4145  BufTagGetForkNum(&bufHdr->tag) == forkNum[j] &&
4146  bufHdr->tag.blockNum >= firstDelBlock[j])
4147  {
4148  InvalidateBuffer(bufHdr); /* releases spinlock */
4149  break;
4150  }
4151  }
4152  if (j >= nforks)
4153  UnlockBufHdr(bufHdr, buf_state);
4154  }
4155 }
4156 
4157 /* ---------------------------------------------------------------------
4158  * DropRelationsAllBuffers
4159  *
4160  * This function removes from the buffer pool all the pages of all
4161  * forks of the specified relations. It's equivalent to calling
4162  * DropRelationBuffers once per fork per relation with firstDelBlock = 0.
4163  * --------------------------------------------------------------------
4164  */
4165 void
4166 DropRelationsAllBuffers(SMgrRelation *smgr_reln, int nlocators)
4167 {
4168  int i;
4169  int n = 0;
4170  SMgrRelation *rels;
4171  BlockNumber (*block)[MAX_FORKNUM + 1];
4172  uint64 nBlocksToInvalidate = 0;
4173  RelFileLocator *locators;
4174  bool cached = true;
4175  bool use_bsearch;
4176 
4177  if (nlocators == 0)
4178  return;
4179 
4180  rels = palloc(sizeof(SMgrRelation) * nlocators); /* non-local relations */
4181 
4182  /* If it's a local relation, it's localbuf.c's problem. */
4183  for (i = 0; i < nlocators; i++)
4184  {
4185  if (RelFileLocatorBackendIsTemp(smgr_reln[i]->smgr_rlocator))
4186  {
4187  if (smgr_reln[i]->smgr_rlocator.backend == MyProcNumber)
4188  DropRelationAllLocalBuffers(smgr_reln[i]->smgr_rlocator.locator);
4189  }
4190  else
4191  rels[n++] = smgr_reln[i];
4192  }
4193 
4194  /*
4195  * If there are no non-local relations, then we're done. Release the
4196  * memory and return.
4197  */
4198  if (n == 0)
4199  {
4200  pfree(rels);
4201  return;
4202  }
4203 
4204  /*
4205  * This is used to remember the number of blocks for all the relations
4206  * forks.
4207  */
4208  block = (BlockNumber (*)[MAX_FORKNUM + 1])
4209  palloc(sizeof(BlockNumber) * n * (MAX_FORKNUM + 1));
4210 
4211  /*
4212  * We can avoid scanning the entire buffer pool if we know the exact size
4213  * of each of the given relation forks. See DropRelationBuffers.
4214  */
4215  for (i = 0; i < n && cached; i++)
4216  {
4217  for (int j = 0; j <= MAX_FORKNUM; j++)
4218  {
4219  /* Get the number of blocks for a relation's fork. */
4220  block[i][j] = smgrnblocks_cached(rels[i], j);
4221 
4222  /* We need to only consider the relation forks that exists. */
4223  if (block[i][j] == InvalidBlockNumber)
4224  {
4225  if (!smgrexists(rels[i], j))
4226  continue;
4227  cached = false;
4228  break;
4229  }
4230 
4231  /* calculate the total number of blocks to be invalidated */
4232  nBlocksToInvalidate += block[i][j];
4233  }
4234  }
4235 
4236  /*
4237  * We apply the optimization iff the total number of blocks to invalidate
4238  * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
4239  */
4240  if (cached && nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
4241  {
4242  for (i = 0; i < n; i++)
4243  {
4244  for (int j = 0; j <= MAX_FORKNUM; j++)
4245  {
4246  /* ignore relation forks that doesn't exist */
4247  if (!BlockNumberIsValid(block[i][j]))
4248  continue;
4249 
4250  /* drop all the buffers for a particular relation fork */
4251  FindAndDropRelationBuffers(rels[i]->smgr_rlocator.locator,
4252  j, block[i][j], 0);
4253  }
4254  }
4255 
4256  pfree(block);
4257  pfree(rels);
4258  return;
4259  }
4260 
4261  pfree(block);
4262  locators = palloc(sizeof(RelFileLocator) * n); /* non-local relations */
4263  for (i = 0; i < n; i++)
4264  locators[i] = rels[i]->smgr_rlocator.locator;
4265 
4266  /*
4267  * For low number of relations to drop just use a simple walk through, to
4268  * save the bsearch overhead. The threshold to use is rather a guess than
4269  * an exactly determined value, as it depends on many factors (CPU and RAM
4270  * speeds, amount of shared buffers etc.).
4271  */
4272  use_bsearch = n > RELS_BSEARCH_THRESHOLD;
4273 
4274  /* sort the list of rlocators if necessary */
4275  if (use_bsearch)
4276  qsort(locators, n, sizeof(RelFileLocator), rlocator_comparator);
4277 
4278  for (i = 0; i < NBuffers; i++)
4279  {
4280  RelFileLocator *rlocator = NULL;
4281  BufferDesc *bufHdr = GetBufferDescriptor(i);
4282  uint32 buf_state;
4283 
4284  /*
4285  * As in DropRelationBuffers, an unlocked precheck should be safe and
4286  * saves some cycles.
4287  */
4288 
4289  if (!use_bsearch)
4290  {
4291  int j;
4292 
4293  for (j = 0; j < n; j++)
4294  {
4295  if (BufTagMatchesRelFileLocator(&bufHdr->tag, &locators[j]))
4296  {
4297  rlocator = &locators[j];
4298  break;
4299  }
4300  }
4301  }
4302  else
4303  {
4304  RelFileLocator locator;
4305 
4306  locator = BufTagGetRelFileLocator(&bufHdr->tag);
4307  rlocator = bsearch((const void *) &(locator),
4308  locators, n, sizeof(RelFileLocator),
4310  }
4311 
4312  /* buffer doesn't belong to any of the given relfilelocators; skip it */
4313  if (rlocator == NULL)
4314  continue;
4315 
4316  buf_state = LockBufHdr(bufHdr);
4317  if (BufTagMatchesRelFileLocator(&bufHdr->tag, rlocator))
4318  InvalidateBuffer(bufHdr); /* releases spinlock */
4319  else
4320  UnlockBufHdr(bufHdr, buf_state);
4321  }
4322 
4323  pfree(locators);
4324  pfree(rels);
4325 }
4326 
4327 /* ---------------------------------------------------------------------
4328  * FindAndDropRelationBuffers
4329  *
4330  * This function performs look up in BufMapping table and removes from the
4331  * buffer pool all the pages of the specified relation fork that has block
4332  * number >= firstDelBlock. (In particular, with firstDelBlock = 0, all
4333  * pages are removed.)
4334  * --------------------------------------------------------------------
4335  */
4336 static void
4338  BlockNumber nForkBlock,
4339  BlockNumber firstDelBlock)
4340 {
4341  BlockNumber curBlock;
4342 
4343  for (curBlock = firstDelBlock; curBlock < nForkBlock; curBlock++)
4344  {
4345  uint32 bufHash; /* hash value for tag */
4346  BufferTag bufTag; /* identity of requested block */
4347  LWLock *bufPartitionLock; /* buffer partition lock for it */
4348  int buf_id;
4349  BufferDesc *bufHdr;
4350  uint32 buf_state;
4351 
4352  /* create a tag so we can lookup the buffer */
4353  InitBufferTag(&bufTag, &rlocator, forkNum, curBlock);
4354 
4355  /* determine its hash code and partition lock ID */
4356  bufHash = BufTableHashCode(&bufTag);
4357  bufPartitionLock = BufMappingPartitionLock(bufHash);
4358 
4359  /* Check that it is in the buffer pool. If not, do nothing. */
4360  LWLockAcquire(bufPartitionLock, LW_SHARED);
4361  buf_id = BufTableLookup(&bufTag, bufHash);
4362  LWLockRelease(bufPartitionLock);
4363 
4364  if (buf_id < 0)
4365  continue;
4366 
4367  bufHdr = GetBufferDescriptor(buf_id);
4368 
4369  /*
4370  * We need to lock the buffer header and recheck if the buffer is
4371  * still associated with the same block because the buffer could be
4372  * evicted by some other backend loading blocks for a different
4373  * relation after we release lock on the BufMapping table.
4374  */
4375  buf_state = LockBufHdr(bufHdr);
4376 
4377  if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator) &&
4378  BufTagGetForkNum(&bufHdr->tag) == forkNum &&
4379  bufHdr->tag.blockNum >= firstDelBlock)
4380  InvalidateBuffer(bufHdr); /* releases spinlock */
4381  else
4382  UnlockBufHdr(bufHdr, buf_state);
4383  }
4384 }
4385 
4386 /* ---------------------------------------------------------------------
4387  * DropDatabaseBuffers
4388  *
4389  * This function removes all the buffers in the buffer cache for a
4390  * particular database. Dirty pages are simply dropped, without
4391  * bothering to write them out first. This is used when we destroy a
4392  * database, to avoid trying to flush data to disk when the directory
4393  * tree no longer exists. Implementation is pretty similar to
4394  * DropRelationBuffers() which is for destroying just one relation.
4395  * --------------------------------------------------------------------
4396  */
4397 void
4399 {
4400  int i;
4401 
4402  /*
4403  * We needn't consider local buffers, since by assumption the target
4404  * database isn't our own.
4405  */
4406 
4407  for (i = 0; i < NBuffers; i++)
4408  {
4409  BufferDesc *bufHdr = GetBufferDescriptor(i);
4410  uint32 buf_state;
4411 
4412  /*
4413  * As in DropRelationBuffers, an unlocked precheck should be safe and
4414  * saves some cycles.
4415  */
4416  if (bufHdr->tag.dbOid != dbid)
4417  continue;
4418 
4419  buf_state = LockBufHdr(bufHdr);
4420  if (bufHdr->tag.dbOid == dbid)
4421  InvalidateBuffer(bufHdr); /* releases spinlock */
4422  else
4423  UnlockBufHdr(bufHdr, buf_state);
4424  }
4425 }
4426 
4427 /* -----------------------------------------------------------------
4428  * PrintBufferDescs
4429  *
4430  * this function prints all the buffer descriptors, for debugging
4431  * use only.
4432  * -----------------------------------------------------------------
4433  */
4434 #ifdef NOT_USED
4435 void
4436 PrintBufferDescs(void)
4437 {
4438  int i;
4439 
4440  for (i = 0; i < NBuffers; ++i)
4441  {
4444 
4445  /* theoretically we should lock the bufhdr here */
4446  elog(LOG,
4447  "[%02d] (freeNext=%d, rel=%s, "
4448  "blockNum=%u, flags=0x%x, refcount=%u %d)",
4449  i, buf->freeNext,
4452  buf->tag.blockNum, buf->flags,
4453  buf->refcount, GetPrivateRefCount(b));
4454  }
4455 }
4456 #endif
4457 
4458 #ifdef NOT_USED
4459 void
4460 PrintPinnedBufs(void)
4461 {
4462  int i;
4463 
4464  for (i = 0; i < NBuffers; ++i)
4465  {
4468 
4469  if (GetPrivateRefCount(b) > 0)
4470  {
4471  /* theoretically we should lock the bufhdr here */
4472  elog(LOG,
4473  "[%02d] (freeNext=%d, rel=%s, "
4474  "blockNum=%u, flags=0x%x, refcount=%u %d)",
4475  i, buf->freeNext,
4477  BufTagGetForkNum(&buf->tag)),
4478  buf->tag.blockNum, buf->flags,
4479  buf->refcount, GetPrivateRefCount(b));
4480  }
4481  }
4482 }
4483 #endif
4484 
4485 /* ---------------------------------------------------------------------
4486  * FlushRelationBuffers
4487  *
4488  * This function writes all dirty pages of a relation out to disk
4489  * (or more accurately, out to kernel disk buffers), ensuring that the
4490  * kernel has an up-to-date view of the relation.
4491  *
4492  * Generally, the caller should be holding AccessExclusiveLock on the
4493  * target relation to ensure that no other backend is busy dirtying
4494  * more blocks of the relation; the effects can't be expected to last
4495  * after the lock is released.
4496  *
4497  * XXX currently it sequentially searches the buffer pool, should be
4498  * changed to more clever ways of searching. This routine is not
4499  * used in any performance-critical code paths, so it's not worth
4500  * adding additional overhead to normal paths to make it go faster.
4501  * --------------------------------------------------------------------
4502  */
4503 void
4505 {
4506  int i;
4507  BufferDesc *bufHdr;
4508  SMgrRelation srel = RelationGetSmgr(rel);
4509 
4510  if (RelationUsesLocalBuffers(rel))
4511  {
4512  for (i = 0; i < NLocBuffer; i++)
4513  {
4514  uint32 buf_state;
4515  instr_time io_start;
4516 
4517  bufHdr = GetLocalBufferDescriptor(i);
4518  if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
4519  ((buf_state = pg_atomic_read_u32(&bufHdr->state)) &
4520  (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
4521  {
4522  ErrorContextCallback errcallback;
4523  Page localpage;
4524 
4525  localpage = (char *) LocalBufHdrGetBlock(bufHdr);
4526 
4527  /* Setup error traceback support for ereport() */
4529  errcallback.arg = (void *) bufHdr;
4530  errcallback.previous = error_context_stack;
4531  error_context_stack = &errcallback;
4532 
4533  PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
4534 
4536 
4537  smgrwrite(srel,
4538  BufTagGetForkNum(&bufHdr->tag),
4539  bufHdr->tag.blockNum,
4540  localpage,
4541  false);
4542 
4545  io_start, 1);
4546 
4547  buf_state &= ~(BM_DIRTY | BM_JUST_DIRTIED);
4548  pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
4549 
4551 
4552  /* Pop the error context stack */
4553  error_context_stack = errcallback.previous;
4554  }
4555  }
4556 
4557  return;
4558  }
4559 
4560  for (i = 0; i < NBuffers; i++)
4561  {
4562  uint32 buf_state;
4563 
4564  bufHdr = GetBufferDescriptor(i);
4565 
4566  /*
4567  * As in DropRelationBuffers, an unlocked precheck should be safe and
4568  * saves some cycles.
4569  */
4570  if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator))
4571  continue;
4572 
4573  /* Make sure we can handle the pin */
4576 
4577  buf_state = LockBufHdr(bufHdr);
4578  if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
4579  (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
4580  {
4581  PinBuffer_Locked(bufHdr);
4585  UnpinBuffer(bufHdr);
4586  }
4587  else
4588  UnlockBufHdr(bufHdr, buf_state);
4589  }
4590 }
4591 
4592 /* ---------------------------------------------------------------------
4593  * FlushRelationsAllBuffers
4594  *
4595  * This function flushes out of the buffer pool all the pages of all
4596  * forks of the specified smgr relations. It's equivalent to calling
4597  * FlushRelationBuffers once per relation. The relations are assumed not
4598  * to use local buffers.
4599  * --------------------------------------------------------------------
4600  */
4601 void
4603 {
4604  int i;
4605  SMgrSortArray *srels;
4606  bool use_bsearch;
4607 
4608  if (nrels == 0)
4609  return;
4610 
4611  /* fill-in array for qsort */
4612  srels = palloc(sizeof(SMgrSortArray) * nrels);
4613 
4614  for (i = 0; i < nrels; i++)
4615  {
4616  Assert(!RelFileLocatorBackendIsTemp(smgrs[i]->smgr_rlocator));
4617 
4618  srels[i].rlocator = smgrs[i]->smgr_rlocator.locator;
4619  srels[i].srel = smgrs[i];
4620  }
4621 
4622  /*
4623  * Save the bsearch overhead for low number of relations to sync. See
4624  * DropRelationsAllBuffers for details.
4625  */
4626  use_bsearch = nrels > RELS_BSEARCH_THRESHOLD;
4627 
4628  /* sort the list of SMgrRelations if necessary */
4629  if (use_bsearch)
4630  qsort(srels, nrels, sizeof(SMgrSortArray), rlocator_comparator);
4631 
4632  for (i = 0; i < NBuffers; i++)
4633  {
4634  SMgrSortArray *srelent = NULL;
4635  BufferDesc *bufHdr = GetBufferDescriptor(i);
4636  uint32 buf_state;
4637 
4638  /*
4639  * As in DropRelationBuffers, an unlocked precheck should be safe and
4640  * saves some cycles.
4641  */
4642 
4643  if (!use_bsearch)
4644  {
4645  int j;
4646 
4647  for (j = 0; j < nrels; j++)
4648  {
4649  if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srels[j].rlocator))
4650  {
4651  srelent = &srels[j];
4652  break;
4653  }
4654  }
4655  }
4656  else
4657  {
4658  RelFileLocator rlocator;
4659 
4660  rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
4661  srelent = bsearch((const void *) &(rlocator),
4662  srels, nrels, sizeof(SMgrSortArray),
4664  }
4665 
4666  /* buffer doesn't belong to any of the given relfilelocators; skip it */
4667  if (srelent == NULL)
4668  continue;
4669 
4670  /* Make sure we can handle the pin */
4673 
4674  buf_state = LockBufHdr(bufHdr);
4675  if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srelent->rlocator) &&
4676  (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
4677  {
4678  PinBuffer_Locked(bufHdr);
4680  FlushBuffer(bufHdr, srelent->srel, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
4682  UnpinBuffer(bufHdr);
4683  }
4684  else
4685  UnlockBufHdr(bufHdr, buf_state);
4686  }
4687 
4688  pfree(srels);
4689 }
4690 
4691 /* ---------------------------------------------------------------------
4692  * RelationCopyStorageUsingBuffer
4693  *
4694  * Copy fork's data using bufmgr. Same as RelationCopyStorage but instead
4695  * of using smgrread and smgrextend this will copy using bufmgr APIs.
4696  *
4697  * Refer comments atop CreateAndCopyRelationData() for details about
4698  * 'permanent' parameter.
4699  * --------------------------------------------------------------------
4700  */
4701 static void
4703  RelFileLocator dstlocator,
4704  ForkNumber forkNum, bool permanent)
4705 {
4706  Buffer srcBuf;
4707  Buffer dstBuf;
4708  Page srcPage;
4709  Page dstPage;
4710  bool use_wal;
4711  BlockNumber nblocks;
4712  BlockNumber blkno;
4714  BufferAccessStrategy bstrategy_src;
4715  BufferAccessStrategy bstrategy_dst;
4717  ReadStream *src_stream;
4718  SMgrRelation src_smgr;
4719 
4720  /*
4721  * In general, we want to write WAL whenever wal_level > 'minimal', but we
4722  * can skip it when copying any fork of an unlogged relation other than
4723  * the init fork.
4724  */
4725  use_wal = XLogIsNeeded() && (permanent || forkNum == INIT_FORKNUM);
4726 
4727  /* Get number of blocks in the source relation. */
4729  forkNum);
4730 
4731  /* Nothing to copy; just return. */
4732  if (nblocks == 0)
4733  return;
4734 
4735  /*
4736  * Bulk extend the destination relation of the same size as the source
4737  * relation before starting to copy block by block.
4738  */
4739  memset(buf.data, 0, BLCKSZ);
4740  smgrextend(smgropen(dstlocator, INVALID_PROC_NUMBER), forkNum, nblocks - 1,
4741  buf.data, true);
4742 
4743  /* This is a bulk operation, so use buffer access strategies. */
4744  bstrategy_src = GetAccessStrategy(BAS_BULKREAD);
4745  bstrategy_dst = GetAccessStrategy(BAS_BULKWRITE);
4746 
4747  /* Initalize streaming read */
4748  p.blocknum = 0;
4749  p.nblocks = nblocks;
4750  src_smgr = smgropen(srclocator, INVALID_PROC_NUMBER);
4752  bstrategy_src,
4753  src_smgr,
4754  permanent ? RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED,
4755  forkNum,
4757  &p,
4758  0);
4759 
4760  /* Iterate over each block of the source relation file. */
4761  for (blkno = 0; blkno < nblocks; blkno++)
4762  {
4764 
4765  /* Read block from source relation. */
4766  srcBuf = read_stream_next_buffer(src_stream, NULL);
4767  LockBuffer(srcBuf, BUFFER_LOCK_SHARE);
4768  srcPage = BufferGetPage(srcBuf);
4769 
4770  dstBuf = ReadBufferWithoutRelcache(dstlocator, forkNum,
4771  BufferGetBlockNumber(srcBuf),
4772  RBM_ZERO_AND_LOCK, bstrategy_dst,
4773  permanent);
4774  dstPage = BufferGetPage(dstBuf);
4775 
4777 
4778  /* Copy page data from the source to the destination. */
4779  memcpy(dstPage, srcPage, BLCKSZ);
4780  MarkBufferDirty(dstBuf);
4781 
4782  /* WAL-log the copied page. */
4783  if (use_wal)
4784  log_newpage_buffer(dstBuf, true);
4785 
4786  END_CRIT_SECTION();
4787 
4788  UnlockReleaseBuffer(dstBuf);
4789  UnlockReleaseBuffer(srcBuf);
4790  }
4791  Assert(read_stream_next_buffer(src_stream, NULL) == InvalidBuffer);
4792  read_stream_end(src_stream);
4793 
4794  FreeAccessStrategy(bstrategy_src);
4795  FreeAccessStrategy(bstrategy_dst);
4796 }
4797 
4798 /* ---------------------------------------------------------------------
4799  * CreateAndCopyRelationData
4800  *
4801  * Create destination relation storage and copy all forks from the
4802  * source relation to the destination.
4803  *
4804  * Pass permanent as true for permanent relations and false for
4805  * unlogged relations. Currently this API is not supported for
4806  * temporary relations.
4807  * --------------------------------------------------------------------
4808  */
4809 void
4811  RelFileLocator dst_rlocator, bool permanent)
4812 {
4813  char relpersistence;
4814  SMgrRelation src_rel;
4815  SMgrRelation dst_rel;
4816 
4817  /* Set the relpersistence. */
4818  relpersistence = permanent ?
4819  RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED;
4820 
4821  src_rel = smgropen(src_rlocator, INVALID_PROC_NUMBER);
4822  dst_rel = smgropen(dst_rlocator, INVALID_PROC_NUMBER);
4823 
4824  /*
4825  * Create and copy all forks of the relation. During create database we
4826  * have a separate cleanup mechanism which deletes complete database
4827  * directory. Therefore, each individual relation doesn't need to be
4828  * registered for cleanup.
4829  */
4830  RelationCreateStorage(dst_rlocator, relpersistence, false);
4831 
4832  /* copy main fork. */
4833  RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, MAIN_FORKNUM,
4834  permanent);
4835 
4836  /* copy those extra forks that exist */
4837  for (ForkNumber forkNum = MAIN_FORKNUM + 1;
4838  forkNum <= MAX_FORKNUM; forkNum++)
4839  {
4840  if (smgrexists(src_rel, forkNum))
4841  {
4842  smgrcreate(dst_rel, forkNum, false);
4843 
4844  /*
4845  * WAL log creation if the relation is persistent, or this is the
4846  * init fork of an unlogged relation.
4847  */
4848  if (permanent || forkNum == INIT_FORKNUM)
4849  log_smgrcreate(&dst_rlocator, forkNum);
4850 
4851  /* Copy a fork's data, block by block. */
4852  RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, forkNum,
4853  permanent);
4854  }
4855  }
4856 }
4857 
4858 /* ---------------------------------------------------------------------
4859  * FlushDatabaseBuffers
4860  *
4861  * This function writes all dirty pages of a database out to disk
4862  * (or more accurately, out to kernel disk buffers), ensuring that the
4863  * kernel has an up-to-date view of the database.
4864  *
4865  * Generally, the caller should be holding an appropriate lock to ensure
4866  * no other backend is active in the target database; otherwise more
4867  * pages could get dirtied.
4868  *
4869  * Note we don't worry about flushing any pages of temporary relations.
4870  * It's assumed these wouldn't be interesting.
4871  * --------------------------------------------------------------------
4872  */
4873 void
4875 {
4876  int i;
4877  BufferDesc *bufHdr;
4878 
4879  for (i = 0; i < NBuffers; i++)
4880  {
4881  uint32 buf_state;
4882 
4883  bufHdr = GetBufferDescriptor(i);
4884 
4885  /*
4886  * As in DropRelationBuffers, an unlocked precheck should be safe and
4887  * saves some cycles.
4888  */
4889  if (bufHdr->tag.dbOid != dbid)
4890  continue;
4891 
4892  /* Make sure we can handle the pin */
4895 
4896  buf_state = LockBufHdr(bufHdr);
4897  if (bufHdr->tag.dbOid == dbid &&
4898  (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
4899  {
4900  PinBuffer_Locked(bufHdr);
4904  UnpinBuffer(bufHdr);
4905  }
4906  else
4907  UnlockBufHdr(bufHdr, buf_state);
4908  }
4909 }
4910 
4911 /*
4912  * Flush a previously, shared or exclusively, locked and pinned buffer to the
4913  * OS.
4914  */
4915 void
4917 {
4918  BufferDesc *bufHdr;
4919 
4920  /* currently not needed, but no fundamental reason not to support */
4921  Assert(!BufferIsLocal(buffer));
4922 
4923  Assert(BufferIsPinned(buffer));
4924 
4925  bufHdr = GetBufferDescriptor(buffer - 1);
4926 
4928 
4930 }
4931 
4932 /*
4933  * ReleaseBuffer -- release the pin on a buffer
4934  */
4935 void
4937 {
4938  if (!BufferIsValid(buffer))
4939  elog(ERROR, "bad buffer ID: %d", buffer);
4940 
4941  if (BufferIsLocal(buffer))
4942  UnpinLocalBuffer(buffer);
4943  else
4944  UnpinBuffer(GetBufferDescriptor(buffer - 1));
4945 }
4946 
4947 /*
4948  * UnlockReleaseBuffer -- release the content lock and pin on a buffer
4949  *
4950  * This is just a shorthand for a common combination.
4951  */
4952 void
4954 {
4955  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
4956  ReleaseBuffer(buffer);
4957 }
4958 
4959 /*
4960  * IncrBufferRefCount
4961  * Increment the pin count on a buffer that we have *already* pinned
4962  * at least once.
4963  *
4964  * This function cannot be used on a buffer we do not have pinned,
4965  * because it doesn't change the shared buffer state.
4966  */
4967 void
4969 {
4970  Assert(BufferIsPinned(buffer));
4972  if (BufferIsLocal(buffer))
4973  LocalRefCount[-buffer - 1]++;
4974  else
4975  {
4976  PrivateRefCountEntry *ref;
4977 
4978  ref = GetPrivateRefCountEntry(buffer, true);
4979  Assert(ref != NULL);
4980  ref->refcount++;
4981  }
4983 }
4984 
4985 /*
4986  * MarkBufferDirtyHint
4987  *
4988  * Mark a buffer dirty for non-critical changes.
4989  *
4990  * This is essentially the same as MarkBufferDirty, except:
4991  *
4992  * 1. The caller does not write WAL; so if checksums are enabled, we may need
4993  * to write an XLOG_FPI_FOR_HINT WAL record to protect against torn pages.
4994  * 2. The caller might have only share-lock instead of exclusive-lock on the
4995  * buffer's content lock.
4996  * 3. This function does not guarantee that the buffer is always marked dirty
4997  * (due to a race condition), so it cannot be used for important changes.
4998  */
4999 void
5000 MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
5001 {
5002  BufferDesc *bufHdr;
5003  Page page = BufferGetPage(buffer);
5004 
5005  if (!BufferIsValid(buffer))
5006  elog(ERROR, "bad buffer ID: %d", buffer);
5007 
5008  if (BufferIsLocal(buffer))
5009  {
5010  MarkLocalBufferDirty(buffer);
5011  return;
5012  }
5013 
5014  bufHdr = GetBufferDescriptor(buffer - 1);
5015 
5016  Assert(GetPrivateRefCount(buffer) > 0);
5017  /* here, either share or exclusive lock is OK */
5019 
5020  /*
5021  * This routine might get called many times on the same page, if we are
5022  * making the first scan after commit of an xact that added/deleted many
5023  * tuples. So, be as quick as we can if the buffer is already dirty. We
5024  * do this by not acquiring spinlock if it looks like the status bits are
5025  * already set. Since we make this test unlocked, there's a chance we
5026  * might fail to notice that the flags have just been cleared, and failed
5027  * to reset them, due to memory-ordering issues. But since this function
5028  * is only intended to be used in cases where failing to write out the
5029  * data would be harmless anyway, it doesn't really matter.
5030  */
5031  if ((pg_atomic_read_u32(&bufHdr->state) & (BM_DIRTY | BM_JUST_DIRTIED)) !=
5033  {
5035  bool dirtied = false;
5036  bool delayChkptFlags = false;
5037  uint32 buf_state;
5038 
5039  /*
5040  * If we need to protect hint bit updates from torn writes, WAL-log a
5041  * full page image of the page. This full page image is only necessary
5042  * if the hint bit update is the first change to the page since the
5043  * last checkpoint.
5044  *
5045  * We don't check full_page_writes here because that logic is included
5046  * when we call XLogInsert() since the value changes dynamically.
5047  */
5048  if (XLogHintBitIsNeeded() &&
5049  (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT))
5050  {
5051  /*
5052  * If we must not write WAL, due to a relfilelocator-specific
5053  * condition or being in recovery, don't dirty the page. We can
5054  * set the hint, just not dirty the page as a result so the hint
5055  * is lost when we evict the page or shutdown.
5056  *
5057  * See src/backend/storage/page/README for longer discussion.
5058  */
5059  if (RecoveryInProgress() ||
5061  return;
5062 
5063  /*
5064  * If the block is already dirty because we either made a change
5065  * or set a hint already, then we don't need to write a full page
5066  * image. Note that aggressive cleaning of blocks dirtied by hint
5067  * bit setting would increase the call rate. Bulk setting of hint
5068  * bits would reduce the call rate...
5069  *
5070  * We must issue the WAL record before we mark the buffer dirty.
5071  * Otherwise we might write the page before we write the WAL. That
5072  * causes a race condition, since a checkpoint might occur between
5073  * writing the WAL record and marking the buffer dirty. We solve
5074  * that with a kluge, but one that is already in use during
5075  * transaction commit to prevent race conditions. Basically, we
5076  * simply prevent the checkpoint WAL record from being written
5077  * until we have marked the buffer dirty. We don't start the
5078  * checkpoint flush until we have marked dirty, so our checkpoint
5079  * must flush the change to disk successfully or the checkpoint
5080  * never gets written, so crash recovery will fix.
5081  *
5082  * It's possible we may enter here without an xid, so it is
5083  * essential that CreateCheckPoint waits for virtual transactions
5084  * rather than full transactionids.
5085  */
5088  delayChkptFlags = true;
5089  lsn = XLogSaveBufferForHint(buffer, buffer_std);
5090  }
5091 
5092  buf_state = LockBufHdr(bufHdr);
5093 
5094  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
5095 
5096  if (!(buf_state & BM_DIRTY))
5097  {
5098  dirtied = true; /* Means "will be dirtied by this action" */
5099 
5100  /*
5101  * Set the page LSN if we wrote a backup block. We aren't supposed
5102  * to set this when only holding a share lock but as long as we
5103  * serialise it somehow we're OK. We choose to set LSN while
5104  * holding the buffer header lock, which causes any reader of an
5105  * LSN who holds only a share lock to also obtain a buffer header
5106  * lock before using PageGetLSN(), which is enforced in
5107  * BufferGetLSNAtomic().
5108  *
5109  * If checksums are enabled, you might think we should reset the
5110  * checksum here. That will happen when the page is written
5111  * sometime later in this checkpoint cycle.
5112  */
5113  if (!XLogRecPtrIsInvalid(lsn))
5114  PageSetLSN(page, lsn);
5115  }
5116 
5117  buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
5118  UnlockBufHdr(bufHdr, buf_state);
5119 
5120  if (delayChkptFlags)
5122 
5123  if (dirtied)
5124  {
5125  VacuumPageDirty++;
5127  if (VacuumCostActive)
5129  }
5130  }
5131 }
5132 
5133 /*
5134  * Release buffer content locks for shared buffers.
5135  *
5136  * Used to clean up after errors.
5137  *
5138  * Currently, we can expect that lwlock.c's LWLockReleaseAll() took care
5139  * of releasing buffer content locks per se; the only thing we need to deal
5140  * with here is clearing any PIN_COUNT request that was in progress.
5141  */
5142 void
5144 {
5146 
5147  if (buf)
5148  {
5149  uint32 buf_state;
5150 
5151  buf_state = LockBufHdr(buf);
5152 
5153  /*
5154  * Don't complain if flag bit not set; it could have been reset but we
5155  * got a cancel/die interrupt before getting the signal.
5156  */
5157  if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
5158  buf->wait_backend_pgprocno == MyProcNumber)
5159  buf_state &= ~BM_PIN_COUNT_WAITER;
5160 
5161  UnlockBufHdr(buf, buf_state);
5162 
5163  PinCountWaitBuf = NULL;
5164  }
5165 }
5166 
5167 /*
5168  * Acquire or release the content_lock for the buffer.
5169  */
5170 void
5171 LockBuffer(Buffer buffer, int mode)
5172 {
5173  BufferDesc *buf;
5174 
5175  Assert(BufferIsPinned(buffer));
5176  if (BufferIsLocal(buffer))
5177  return; /* local buffers need no lock */
5178 
5179  buf = GetBufferDescriptor(buffer - 1);
5180 
5181  if (mode == BUFFER_LOCK_UNLOCK)
5183  else if (mode == BUFFER_LOCK_SHARE)
5185  else if (mode == BUFFER_LOCK_EXCLUSIVE)
5187  else
5188  elog(ERROR, "unrecognized buffer lock mode: %d", mode);
5189 }
5190 
5191 /*
5192  * Acquire the content_lock for the buffer, but only if we don't have to wait.
5193  *
5194  * This assumes the caller wants BUFFER_LOCK_EXCLUSIVE mode.
5195  */
5196 bool
5198 {
5199  BufferDesc *buf;
5200 
5201  Assert(BufferIsPinned(buffer));
5202  if (BufferIsLocal(buffer))
5203  return true; /* act as though we got it */
5204 
5205  buf = GetBufferDescriptor(buffer - 1);
5206 
5208  LW_EXCLUSIVE);
5209 }
5210 
5211 /*
5212  * Verify that this backend is pinning the buffer exactly once.
5213  *
5214  * NOTE: Like in BufferIsPinned(), what we check here is that *this* backend
5215  * holds a pin on the buffer. We do not care whether some other backend does.
5216  */
5217 void
5219 {
5220  if (BufferIsLocal(buffer))
5221  {
5222  if (LocalRefCount[-buffer - 1] != 1)
5223  elog(ERROR, "incorrect local pin count: %d",
5224  LocalRefCount[-buffer - 1]);
5225  }
5226  else
5227  {
5228  if (GetPrivateRefCount(buffer) != 1)
5229  elog(ERROR, "incorrect local pin count: %d",
5230  GetPrivateRefCount(buffer));
5231  }
5232 }
5233 
5234 /*
5235  * LockBufferForCleanup - lock a buffer in preparation for deleting items
5236  *
5237  * Items may be deleted from a disk page only when the caller (a) holds an
5238  * exclusive lock on the buffer and (b) has observed that no other backend
5239  * holds a pin on the buffer. If there is a pin, then the other backend
5240  * might have a pointer into the buffer (for example, a heapscan reference
5241  * to an item --- see README for more details). It's OK if a pin is added
5242  * after the cleanup starts, however; the newly-arrived backend will be
5243  * unable to look at the page until we release the exclusive lock.
5244  *
5245  * To implement this protocol, a would-be deleter must pin the buffer and
5246  * then call LockBufferForCleanup(). LockBufferForCleanup() is similar to
5247  * LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE), except that it loops until
5248  * it has successfully observed pin count = 1.
5249  */
5250 void
5252 {
5253  BufferDesc *bufHdr;
5254  TimestampTz waitStart = 0;
5255  bool waiting = false;
5256  bool logged_recovery_conflict = false;
5257 
5258  Assert(BufferIsPinned(buffer));
5259  Assert(PinCountWaitBuf == NULL);
5260 
5261  CheckBufferIsPinnedOnce(buffer);
5262 
5263  /* Nobody else to wait for */
5264  if (BufferIsLocal(buffer))
5265  return;
5266 
5267  bufHdr = GetBufferDescriptor(buffer - 1);
5268 
5269  for (;;)
5270  {
5271  uint32 buf_state;
5272 
5273  /* Try to acquire lock */
5275  buf_state = LockBufHdr(bufHdr);
5276 
5277  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
5278  if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
5279  {
5280  /* Successfully acquired exclusive lock with pincount 1 */
5281  UnlockBufHdr(bufHdr, buf_state);
5282 
5283  /*
5284  * Emit the log message if recovery conflict on buffer pin was
5285  * resolved but the startup process waited longer than
5286  * deadlock_timeout for it.
5287  */
5288  if (logged_recovery_conflict)
5290  waitStart, GetCurrentTimestamp(),
5291  NULL, false);
5292 
5293  if (waiting)
5294  {
5295  /* reset ps display to remove the suffix if we added one */
5297  waiting = false;
5298  }
5299  return;
5300  }
5301  /* Failed, so mark myself as waiting for pincount 1 */
5302  if (buf_state & BM_PIN_COUNT_WAITER)
5303  {
5304  UnlockBufHdr(bufHdr, buf_state);
5305  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
5306  elog(ERROR, "multiple backends attempting to wait for pincount 1");
5307  }
5309  PinCountWaitBuf = bufHdr;
5310  buf_state |= BM_PIN_COUNT_WAITER;
5311  UnlockBufHdr(bufHdr, buf_state);
5312  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
5313 
5314  /* Wait to be signaled by UnpinBuffer() */
5315  if (InHotStandby)
5316  {
5317  if (!waiting)
5318  {
5319  /* adjust the process title to indicate that it's waiting */
5320  set_ps_display_suffix("waiting");
5321  waiting = true;
5322  }
5323 
5324  /*
5325  * Emit the log message if the startup process is waiting longer
5326  * than deadlock_timeout for recovery conflict on buffer pin.
5327  *
5328  * Skip this if first time through because the startup process has
5329  * not started waiting yet in this case. So, the wait start
5330  * timestamp is set after this logic.
5331  */
5332  if (waitStart != 0 && !logged_recovery_conflict)
5333  {
5335 
5336  if (TimestampDifferenceExceeds(waitStart, now,
5337  DeadlockTimeout))
5338  {
5340  waitStart, now, NULL, true);
5341  logged_recovery_conflict = true;
5342  }
5343  }
5344 
5345  /*
5346  * Set the wait start timestamp if logging is enabled and first
5347  * time through.
5348  */
5349  if (log_recovery_conflict_waits && waitStart == 0)
5350  waitStart = GetCurrentTimestamp();
5351 
5352  /* Publish the bufid that Startup process waits on */
5353  SetStartupBufferPinWaitBufId(buffer - 1);
5354  /* Set alarm and then wait to be signaled by UnpinBuffer() */
5356  /* Reset the published bufid */
5358  }
5359  else
5360  ProcWaitForSignal(WAIT_EVENT_BUFFER_PIN);
5361 
5362  /*
5363  * Remove flag marking us as waiter. Normally this will not be set
5364  * anymore, but ProcWaitForSignal() can return for other signals as
5365  * well. We take care to only reset the flag if we're the waiter, as
5366  * theoretically another backend could have started waiting. That's
5367  * impossible with the current usages due to table level locking, but
5368  * better be safe.
5369  */
5370  buf_state = LockBufHdr(bufHdr);
5371  if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
5373  buf_state &= ~BM_PIN_COUNT_WAITER;
5374  UnlockBufHdr(bufHdr, buf_state);
5375 
5376  PinCountWaitBuf = NULL;
5377  /* Loop back and try again */
5378  }
5379 }
5380 
5381 /*
5382  * Check called from ProcessRecoveryConflictInterrupts() when Startup process
5383  * requests cancellation of all pin holders that are blocking it.
5384  */
5385 bool
5387 {
5388  int bufid = GetStartupBufferPinWaitBufId();
5389 
5390  /*
5391  * If we get woken slowly then it's possible that the Startup process was
5392  * already woken by other backends before we got here. Also possible that
5393  * we get here by multiple interrupts or interrupts at inappropriate
5394  * times, so make sure we do nothing if the bufid is not set.
5395  */
5396  if (bufid < 0)
5397  return false;
5398 
5399  if (GetPrivateRefCount(bufid + 1) > 0)
5400  return true;
5401 
5402  return false;
5403 }
5404 
5405 /*
5406  * ConditionalLockBufferForCleanup - as above, but don't wait to get the lock
5407  *
5408  * We won't loop, but just check once to see if the pin count is OK. If
5409  * not, return false with no lock held.
5410  */
5411 bool
5413 {
5414  BufferDesc *bufHdr;
5415  uint32 buf_state,
5416  refcount;
5417 
5418  Assert(BufferIsValid(buffer));
5419 
5420  if (BufferIsLocal(buffer))
5421  {
5422  refcount = LocalRefCount[-buffer - 1];
5423  /* There should be exactly one pin */
5424  Assert(refcount > 0);
5425  if (refcount != 1)
5426  return false;
5427  /* Nobody else to wait for */
5428  return true;
5429  }
5430 
5431  /* There should be exactly one local pin */
5432  refcount = GetPrivateRefCount(buffer);
5433  Assert(refcount);
5434  if (refcount != 1)
5435  return false;
5436 
5437  /* Try to acquire lock */
5438  if (!ConditionalLockBuffer(buffer))
5439  return false;
5440 
5441  bufHdr = GetBufferDescriptor(buffer - 1);
5442  buf_state = LockBufHdr(bufHdr);
5443  refcount = BUF_STATE_GET_REFCOUNT(buf_state);
5444 
5445  Assert(refcount > 0);
5446  if (refcount == 1)
5447  {
5448  /* Successfully acquired exclusive lock with pincount 1 */
5449  UnlockBufHdr(bufHdr, buf_state);
5450  return true;
5451  }
5452 
5453  /* Failed, so release the lock */
5454  UnlockBufHdr(bufHdr, buf_state);
5455  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
5456  return false;
5457 }
5458 
5459 /*
5460  * IsBufferCleanupOK - as above, but we already have the lock
5461  *
5462  * Check whether it's OK to perform cleanup on a buffer we've already
5463  * locked. If we observe that the pin count is 1, our exclusive lock
5464  * happens to be a cleanup lock, and we can proceed with anything that
5465  * would have been allowable had we sought a cleanup lock originally.
5466  */
5467 bool
5469 {
5470  BufferDesc *bufHdr;
5471  uint32 buf_state;
5472 
5473  Assert(BufferIsValid(buffer));
5474 
5475  if (BufferIsLocal(buffer))
5476  {
5477  /* There should be exactly one pin */
5478  if (LocalRefCount[-buffer - 1] != 1)
5479  return false;
5480  /* Nobody else to wait for */
5481  return true;
5482  }
5483 
5484  /* There should be exactly one local pin */
5485  if (GetPrivateRefCount(buffer) != 1)
5486  return false;
5487 
5488  bufHdr = GetBufferDescriptor(buffer - 1);
5489 
5490  /* caller must hold exclusive lock on buffer */
5492  LW_EXCLUSIVE));
5493 
5494  buf_state = LockBufHdr(bufHdr);
5495 
5496  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
5497  if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
5498  {
5499  /* pincount is OK. */
5500  UnlockBufHdr(bufHdr, buf_state);
5501  return true;
5502  }
5503 
5504  UnlockBufHdr(bufHdr, buf_state);
5505  return false;
5506 }
5507 
5508 
5509 /*
5510  * Functions for buffer I/O handling
5511  *
5512  * Note: We assume that nested buffer I/O never occurs.
5513  * i.e at most one BM_IO_IN_PROGRESS bit is set per proc.
5514  *
5515  * Also note that these are used only for shared buffers, not local ones.
5516  */
5517 
5518 /*
5519  * WaitIO -- Block until the IO_IN_PROGRESS flag on 'buf' is cleared.
5520  */
5521 static void
5523 {
5525 
5527  for (;;)
5528  {
5529  uint32 buf_state;
5530 
5531  /*
5532  * It may not be necessary to acquire the spinlock to check the flag
5533  * here, but since this test is essential for correctness, we'd better
5534  * play it safe.
5535  */
5536  buf_state = LockBufHdr(buf);
5537  UnlockBufHdr(buf, buf_state);
5538 
5539  if (!(buf_state & BM_IO_IN_PROGRESS))
5540  break;
5541  ConditionVariableSleep(cv, WAIT_EVENT_BUFFER_IO);
5542  }
5544 }
5545 
5546 /*
5547  * StartBufferIO: begin I/O on this buffer
5548  * (Assumptions)
5549  * My process is executing no IO
5550  * The buffer is Pinned
5551  *
5552  * In some scenarios there are race conditions in which multiple backends
5553  * could attempt the same I/O operation concurrently. If someone else
5554  * has already started I/O on this buffer then we will block on the
5555  * I/O condition variable until he's done.
5556  *
5557  * Input operations are only attempted on buffers that are not BM_VALID,
5558  * and output operations only on buffers that are BM_VALID and BM_DIRTY,
5559  * so we can always tell if the work is already done.
5560  *
5561  * Returns true if we successfully marked the buffer as I/O busy,
5562  * false if someone else already did the work.
5563  *
5564  * If nowait is true, then we don't wait for an I/O to be finished by another
5565  * backend. In that case, false indicates either that the I/O was already
5566  * finished, or is still in progress. This is useful for callers that want to
5567  * find out if they can perform the I/O as part of a larger operation, without
5568  * waiting for the answer or distinguishing the reasons why not.
5569  */
5570 static bool
5571 StartBufferIO(BufferDesc *buf, bool forInput, bool nowait)
5572 {
5573  uint32 buf_state;
5574 
5576 
5577  for (;;)
5578  {
5579  buf_state = LockBufHdr(buf);
5580 
5581  if (!(buf_state & BM_IO_IN_PROGRESS))
5582  break;
5583  UnlockBufHdr(buf, buf_state);
5584  if (nowait)
5585  return false;
5586  WaitIO(buf);
5587  }
5588 
5589  /* Once we get here, there is definitely no I/O active on this buffer */
5590 
5591  if (forInput ? (buf_state & BM_VALID) : !(buf_state & BM_DIRTY))
5592  {
5593  /* someone else already did the I/O */
5594  UnlockBufHdr(buf, buf_state);
5595  return false;
5596  }
5597 
5598  buf_state |= BM_IO_IN_PROGRESS;
5599  UnlockBufHdr(buf, buf_state);
5600 
5603 
5604  return true;
5605 }
5606 
5607 /*
5608  * TerminateBufferIO: release a buffer we were doing I/O on
5609  * (Assumptions)
5610  * My process is executing IO for the buffer
5611  * BM_IO_IN_PROGRESS bit is set for the buffer
5612  * The buffer is Pinned
5613  *
5614  * If clear_dirty is true and BM_JUST_DIRTIED is not set, we clear the
5615  * buffer's BM_DIRTY flag. This is appropriate when terminating a
5616  * successful write. The check on BM_JUST_DIRTIED is necessary to avoid
5617  * marking the buffer clean if it was re-dirtied while we were writing.
5618  *
5619  * set_flag_bits gets ORed into the buffer's flags. It must include
5620  * BM_IO_ERROR in a failure case. For successful completion it could
5621  * be 0, or BM_VALID if we just finished reading in the page.
5622  *
5623  * If forget_owner is true, we release the buffer I/O from the current
5624  * resource owner. (forget_owner=false is used when the resource owner itself
5625  * is being released)
5626  */
5627 static void
5628 TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits,
5629  bool forget_owner)
5630 {
5631  uint32 buf_state;
5632 
5633  buf_state = LockBufHdr(buf);
5634 
5635  Assert(buf_state & BM_IO_IN_PROGRESS);
5636 
5637  buf_state &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR);
5638  if (clear_dirty && !(buf_state & BM_JUST_DIRTIED))
5639  buf_state &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
5640 
5641  buf_state |= set_flag_bits;
5642  UnlockBufHdr(buf, buf_state);
5643 
5644  if (forget_owner)
5647 
5649 }
5650 
5651 /*
5652  * AbortBufferIO: Clean up active buffer I/O after an error.
5653  *
5654  * All LWLocks we might have held have been released,
5655  * but we haven't yet released buffer pins, so the buffer is still pinned.
5656  *
5657  * If I/O was in progress, we always set BM_IO_ERROR, even though it's
5658  * possible the error condition wasn't related to the I/O.
5659  *
5660  * Note: this does not remove the buffer I/O from the resource owner.
5661  * That's correct when we're releasing the whole resource owner, but
5662  * beware if you use this in other contexts.
5663  */
5664 static void
5666 {
5667  BufferDesc *buf_hdr = GetBufferDescriptor(buffer - 1);
5668  uint32 buf_state;
5669 
5670  buf_state = LockBufHdr(buf_hdr);
5671  Assert(buf_state & (BM_IO_IN_PROGRESS | BM_TAG_VALID));
5672 
5673  if (!(buf_state & BM_VALID))
5674  {
5675  Assert(!(buf_state & BM_DIRTY));
5676  UnlockBufHdr(buf_hdr, buf_state);
5677  }
5678  else
5679  {
5680  Assert(buf_state & BM_DIRTY);
5681  UnlockBufHdr(buf_hdr, buf_state);
5682 
5683  /* Issue notice if this is not the first failure... */
5684  if (buf_state & BM_IO_ERROR)
5685  {
5686  /* Buffer is pinned, so we can read tag without spinlock */
5687  char *path;
5688 
5689  path = relpathperm(BufTagGetRelFileLocator(&buf_hdr->tag),
5690  BufTagGetForkNum(&buf_hdr->tag));
5691  ereport(WARNING,
5692  (errcode(ERRCODE_IO_ERROR),
5693  errmsg("could not write block %u of %s",
5694  buf_hdr->tag.blockNum, path),
5695  errdetail("Multiple failures --- write error might be permanent.")));
5696  pfree(path);
5697  }
5698  }
5699 
5700  TerminateBufferIO(buf_hdr, false, BM_IO_ERROR, false);
5701 }
5702 
5703 /*
5704  * Error context callback for errors occurring during shared buffer writes.
5705  */
5706 static void
5708 {
5709  BufferDesc *bufHdr = (BufferDesc *) arg;
5710 
5711  /* Buffer is pinned, so we can read the tag without locking the spinlock */
5712  if (bufHdr != NULL)
5713  {
5714  char *path = relpathperm(BufTagGetRelFileLocator(&bufHdr->tag),
5715  BufTagGetForkNum(&bufHdr->tag));
5716 
5717  errcontext("writing block %u of relation %s",
5718  bufHdr->tag.blockNum, path);
5719  pfree(path);
5720  }
5721 }
5722 
5723 /*
5724  * Error context callback for errors occurring during local buffer writes.
5725  */
5726 static void
5728 {
5729  BufferDesc *bufHdr = (BufferDesc *) arg;
5730 
5731  if (bufHdr != NULL)
5732  {
5733  char *path = relpathbackend(BufTagGetRelFileLocator(&bufHdr->tag),
5734  MyProcNumber,
5735  BufTagGetForkNum(&bufHdr->tag));
5736 
5737  errcontext("writing block %u of relation %s",
5738  bufHdr->tag.blockNum, path);
5739  pfree(path);
5740  }
5741 }
5742 
5743 /*
5744  * RelFileLocator qsort/bsearch comparator; see RelFileLocatorEquals.
5745  */
5746 static int
5747 rlocator_comparator(const void *p1, const void *p2)
5748 {
5749  RelFileLocator n1 = *(const RelFileLocator *) p1;
5750  RelFileLocator n2 = *(const RelFileLocator *) p2;
5751 
5752  if (n1.relNumber < n2.relNumber)
5753  return -1;
5754  else if (n1.relNumber > n2.relNumber)
5755  return 1;
5756 
5757  if (n1.dbOid < n2.dbOid)
5758  return -1;
5759  else if (n1.dbOid > n2.dbOid)
5760  return 1;
5761 
5762  if (n1.spcOid < n2.spcOid)
5763  return -1;
5764  else if (n1.spcOid > n2.spcOid)
5765  return 1;
5766  else
5767  return 0;
5768 }
5769 
5770 /*
5771  * Lock buffer header - set BM_LOCKED in buffer state.
5772  */
5773 uint32
5775 {
5776  SpinDelayStatus delayStatus;
5777  uint32 old_buf_state;
5778 
5780 
5781  init_local_spin_delay(&delayStatus);
5782 
5783  while (true)
5784  {
5785  /* set BM_LOCKED flag */
5786  old_buf_state = pg_atomic_fetch_or_u32(&desc->state, BM_LOCKED);
5787  /* if it wasn't set before we're OK */
5788  if (!(old_buf_state & BM_LOCKED))
5789  break;
5790  perform_spin_delay(&delayStatus);
5791  }
5792  finish_spin_delay(&delayStatus);
5793  return old_buf_state | BM_LOCKED;
5794 }
5795 
5796 /*
5797  * Wait until the BM_LOCKED flag isn't set anymore and return the buffer's
5798  * state at that point.
5799  *
5800  * Obviously the buffer could be locked by the time the value is returned, so
5801  * this is primarily useful in CAS style loops.
5802  */
5803 static uint32
5805 {
5806  SpinDelayStatus delayStatus;
5807  uint32 buf_state;
5808 
5809  init_local_spin_delay(&delayStatus);
5810 
5811  buf_state = pg_atomic_read_u32(&buf->state);
5812 
5813  while (buf_state & BM_LOCKED)
5814  {
5815  perform_spin_delay(&delayStatus);
5816  buf_state = pg_atomic_read_u32(&buf->state);
5817  }
5818 
5819  finish_spin_delay(&delayStatus);
5820 
5821  return buf_state;
5822 }
5823 
5824 /*
5825  * BufferTag comparator.
5826  */
5827 static inline int
5829 {
5830  int ret;
5831  RelFileLocator rlocatora;
5832  RelFileLocator rlocatorb;
5833 
5834  rlocatora = BufTagGetRelFileLocator(ba);
5835  rlocatorb = BufTagGetRelFileLocator(bb);
5836 
5837  ret = rlocator_comparator(&rlocatora, &rlocatorb);
5838 
5839  if (ret != 0)
5840  return ret;
5841 
5842  if (BufTagGetForkNum(ba) < BufTagGetForkNum(bb))
5843  return -1;
5844  if (BufTagGetForkNum(ba) > BufTagGetForkNum(bb))
5845  return 1;
5846 
5847  if (ba->blockNum < bb->blockNum)
5848  return -1;
5849  if (ba->blockNum > bb->blockNum)
5850  return 1;
5851 
5852  return 0;
5853 }
5854 
5855 /*
5856  * Comparator determining the writeout order in a checkpoint.
5857  *
5858  * It is important that tablespaces are compared first, the logic balancing
5859  * writes between tablespaces relies on it.
5860  */
5861 static inline int
5863 {
5864  /* compare tablespace */
5865  if (a->tsId < b->tsId)
5866  return -1;
5867  else if (a->tsId > b->tsId)
5868  return 1;
5869  /* compare relation */
5870  if (a->relNumber < b->relNumber)
5871  return -1;
5872  else if (a->relNumber > b->relNumber)
5873  return 1;
5874  /* compare fork */
5875  else if (a->forkNum < b->forkNum)
5876  return -1;
5877  else if (a->forkNum > b->forkNum)
5878  return 1;
5879  /* compare block number */
5880  else if (a->blockNum < b->blockNum)
5881  return -1;
5882  else if (a->blockNum > b->blockNum)
5883  return 1;
5884  /* equal page IDs are unlikely, but not impossible */
5885  return 0;
5886 }
5887 
5888 /*
5889  * Comparator for a Min-Heap over the per-tablespace checkpoint completion
5890  * progress.
5891  */
5892 static int
5894 {
5895  CkptTsStatus *sa = (CkptTsStatus *) a;
5896  CkptTsStatus *sb = (CkptTsStatus *) b;
5897 
5898  /* we want a min-heap, so return 1 for the a < b */
5899  if (sa->progress < sb->progress)
5900  return 1;
5901  else if (sa->progress == sb->progress)
5902  return 0;
5903  else
5904  return -1;
5905 }
5906 
5907 /*
5908  * Initialize a writeback context, discarding potential previous state.
5909  *
5910  * *max_pending is a pointer instead of an immediate value, so the coalesce
5911  * limits can easily changed by the GUC mechanism, and so calling code does
5912  * not have to check the current configuration. A value of 0 means that no
5913  * writeback control will be performed.
5914  */
5915 void
5917 {
5918  Assert(*max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
5919 
5920  context->max_pending = max_pending;
5921  context->nr_pending = 0;
5922 }
5923 
5924 /*
5925  * Add buffer to list of pending writeback requests.
5926  */
5927 void
5929  BufferTag *tag)
5930 {
5931  PendingWriteback *pending;
5932 
5934  return;
5935 
5936  /*
5937  * Add buffer to the pending writeback array, unless writeback control is
5938  * disabled.
5939  */
5940  if (*wb_context->max_pending > 0)
5941  {
5943 
5944  pending = &wb_context->pending_writebacks[wb_context->nr_pending++];
5945 
5946  pending->tag = *tag;
5947  }
5948 
5949  /*
5950  * Perform pending flushes if the writeback limit is exceeded. This
5951  * includes the case where previously an item has been added, but control
5952  * is now disabled.
5953  */
5954  if (wb_context->nr_pending >= *wb_context->max_pending)
5955  IssuePendingWritebacks(wb_context, io_context);
5956 }
5957 
5958 #define ST_SORT sort_pending_writebacks
5959 #define ST_ELEMENT_TYPE PendingWriteback
5960 #define ST_COMPARE(a, b) buffertag_comparator(&a->tag, &b->tag)
5961 #define ST_SCOPE static
5962 #define ST_DEFINE
5963 #include <lib/sort_template.h>
5964 
5965 /*
5966  * Issue all pending writeback requests, previously scheduled with
5967  * ScheduleBufferTagForWriteback, to the OS.
5968  *
5969  * Because this is only used to improve the OSs IO scheduling we try to never
5970  * error out - it's just a hint.
5971  */
5972 void
5974 {
5975  instr_time io_start;
5976  int i;
5977 
5978  if (wb_context->nr_pending == 0)
5979  return;
5980 
5981  /*
5982  * Executing the writes in-order can make them a lot faster, and allows to
5983  * merge writeback requests to consecutive blocks into larger writebacks.
5984  */
5985  sort_pending_writebacks(wb_context->pending_writebacks,
5986  wb_context->nr_pending);
5987 
5989 
5990  /*
5991  * Coalesce neighbouring writes, but nothing else. For that we iterate
5992  * through the, now sorted, array of pending flushes, and look forward to
5993  * find all neighbouring (or identical) writes.
5994  */
5995  for (i = 0; i < wb_context->nr_pending; i++)
5996  {
5999  SMgrRelation reln;
6000  int ahead;
6001  BufferTag tag;
6002  RelFileLocator currlocator;
6003  Size nblocks = 1;
6004 
6005  cur = &wb_context->pending_writebacks[i];
6006  tag = cur->tag;
6007  currlocator = BufTagGetRelFileLocator(&tag);
6008 
6009  /*
6010  * Peek ahead, into following writeback requests, to see if they can
6011  * be combined with the current one.
6012  */
6013  for (ahead = 0; i + ahead + 1 < wb_context->nr_pending; ahead++)
6014  {
6015 
6016  next = &wb_context->pending_writebacks[i + ahead + 1];
6017 
6018  /* different file, stop */
6019  if (!RelFileLocatorEquals(currlocator,
6020  BufTagGetRelFileLocator(&next->tag)) ||
6021  BufTagGetForkNum(&cur->tag) != BufTagGetForkNum(&next->tag))
6022  break;
6023 
6024  /* ok, block queued twice, skip */
6025  if (cur->tag.blockNum == next->tag.blockNum)
6026  continue;
6027 
6028  /* only merge consecutive writes */
6029  if (cur->tag.blockNum + 1 != next->tag.blockNum)
6030  break;
6031 
6032  nblocks++;
6033  cur = next;
6034  }
6035 
6036  i += ahead;
6037 
6038  /* and finally tell the kernel to write the data to storage */
6039  reln = smgropen(currlocator, INVALID_PROC_NUMBER);
6040  smgrwriteback(reln, BufTagGetForkNum(&tag), tag.blockNum, nblocks);
6041  }
6042 
6043  /*
6044  * Assume that writeback requests are only issued for buffers containing
6045  * blocks of permanent relations.
6046  */
6048  IOOP_WRITEBACK, io_start, wb_context->nr_pending);
6049 
6050  wb_context->nr_pending = 0;
6051 }
6052 
6053 /* ResourceOwner callbacks */
6054 
6055 static void
6057 {
6058  Buffer buffer = DatumGetInt32(res);
6059 
6060  AbortBufferIO(buffer);
6061 }
6062 
6063 static char *
6065 {
6066  Buffer buffer = DatumGetInt32(res);
6067 
6068  return psprintf("lost track of buffer IO on buffer %d", buffer);
6069 }
6070 
6071 static void
6073 {
6074  Buffer buffer = DatumGetInt32(res);
6075 
6076  /* Like ReleaseBuffer, but don't call ResourceOwnerForgetBuffer */
6077  if (!BufferIsValid(buffer))
6078  elog(ERROR, "bad buffer ID: %d", buffer);
6079 
6080  if (BufferIsLocal(buffer))
6081  UnpinLocalBufferNoOwner(buffer);
6082  else
6084 }
6085 
6086 static char *
6088 {
6090 }
6091 
6092 /*
6093  * Try to evict the current block in a shared buffer.
6094  *
6095  * This function is intended for testing/development use only!
6096  *
6097  * To succeed, the buffer must not be pinned on entry, so if the caller had a
6098  * particular block in mind, it might already have been replaced by some other
6099  * block by the time this function runs. It's also unpinned on return, so the
6100  * buffer might be occupied again by the time control is returned, potentially
6101  * even by the same block. This inherent raciness without other interlocking
6102  * makes the function unsuitable for non-testing usage.
6103  *
6104  * Returns true if the buffer was valid and it has now been made invalid.
6105  * Returns false if it wasn't valid, if it couldn't be evicted due to a pin,
6106  * or if the buffer becomes dirty again while we're trying to write it out.
6107  */
6108 bool
6110 {
6111  BufferDesc *desc;
6112  uint32 buf_state;
6113  bool result;
6114 
6115  /* Make sure we can pin the buffer. */
6118 
6120  desc = GetBufferDescriptor(buf - 1);
6121 
6122  /* Lock the header and check if it's valid. */
6123  buf_state = LockBufHdr(desc);
6124  if ((buf_state & BM_VALID) == 0)
6125  {
6126  UnlockBufHdr(desc, buf_state);
6127  return false;
6128  }
6129 
6130  /* Check that it's not pinned already. */
6131  if (BUF_STATE_GET_REFCOUNT(buf_state) > 0)
6132  {
6133  UnlockBufHdr(desc, buf_state);
6134  return false;
6135  }
6136 
6137  PinBuffer_Locked(desc); /* releases spinlock */
6138 
6139  /* If it was dirty, try to clean it once. */
6140  if (buf_state & BM_DIRTY)
6141  {
6145  }
6146 
6147  /* This will return false if it becomes dirty or someone else pins it. */
6148  result = InvalidateVictimBuffer(desc);
6149 
6150  UnpinBuffer(desc);
6151 
6152  return result;
6153 }
static bool pg_atomic_compare_exchange_u32(volatile pg_atomic_uint32 *ptr, uint32 *expected, uint32 newval)
Definition: atomics.h:342
static uint32 pg_atomic_fetch_or_u32(volatile pg_atomic_uint32 *ptr, uint32 or_)
Definition: atomics.h:403
static void pg_atomic_unlocked_write_u32(volatile pg_atomic_uint32 *ptr, uint32 val)
Definition: atomics.h:288
static uint32 pg_atomic_read_u32(volatile pg_atomic_uint32 *ptr)
Definition: atomics.h:232
bool TimestampDifferenceExceeds(TimestampTz start_time, TimestampTz stop_time, int msec)
Definition: timestamp.c:1791
TimestampTz GetCurrentTimestamp(void)
Definition: timestamp.c:1655
Datum now(PG_FUNCTION_ARGS)
Definition: timestamp.c:1619
int BgWriterDelay
Definition: bgwriter.c:57
void binaryheap_build(binaryheap *heap)
Definition: binaryheap.c:138
void binaryheap_replace_first(binaryheap *heap, bh_node_type d)
Definition: binaryheap.c:255
bh_node_type binaryheap_first(binaryheap *heap)
Definition: binaryheap.c:177
bh_node_type binaryheap_remove_first(binaryheap *heap)
Definition: binaryheap.c:192
binaryheap * binaryheap_allocate(int capacity, binaryheap_comparator compare, void *arg)
Definition: binaryheap.c:39
void binaryheap_free(binaryheap *heap)
Definition: binaryheap.c:75
void binaryheap_add_unordered(binaryheap *heap, bh_node_type d)
Definition: binaryheap.c:116
#define binaryheap_empty(h)
Definition: binaryheap.h:65
uint32 BlockNumber
Definition: block.h:31
#define InvalidBlockNumber
Definition: block.h:33
static bool BlockNumberIsValid(BlockNumber blockNumber)
Definition: block.h:71
#define MaxBlockNumber
Definition: block.h:35
static int32 next
Definition: blutils.c:221
int Buffer
Definition: buf.h:23
#define InvalidBuffer
Definition: buf.h:25
#define BufferIsLocal(buffer)
Definition: buf.h:37
CkptSortItem * CkptBufferIds
Definition: buf_init.c:25
WritebackContext BackendWritebackContext
Definition: buf_init.c:24
#define BM_MAX_USAGE_COUNT
Definition: buf_internals.h:78
static void InitBufferTag(BufferTag *tag, const RelFileLocator *rlocator, ForkNumber forkNum, BlockNumber blockNum)
#define BM_TAG_VALID
Definition: buf_internals.h:63
#define BM_PERMANENT
Definition: buf_internals.h:69
#define BUF_USAGECOUNT_MASK
Definition: buf_internals.h:45
static ForkNumber BufTagGetForkNum(const BufferTag *tag)
#define BUF_REFCOUNT_ONE
Definition: buf_internals.h:43
static BufferDesc * GetLocalBufferDescriptor(uint32 id)
static BufferDesc * GetBufferDescriptor(uint32 id)
static void UnlockBufHdr(BufferDesc *desc, uint32 buf_state)
static bool BufferTagsEqual(const BufferTag *tag1, const BufferTag *tag2)
static RelFileNumber BufTagGetRelNumber(const BufferTag *tag)
static bool BufTagMatchesRelFileLocator(const BufferTag *tag, const RelFileLocator *rlocator)
#define BUF_FLAG_MASK
Definition: buf_internals.h:48
static LWLock * BufMappingPartitionLock(uint32 hashcode)
#define BM_PIN_COUNT_WAITER
Definition: buf_internals.h:67
#define BM_DIRTY
Definition: buf_internals.h:61
static void ResourceOwnerRememberBufferIO(ResourceOwner owner, Buffer buffer)
#define BM_LOCKED
Definition: buf_internals.h:60
#define BM_JUST_DIRTIED
Definition: buf_internals.h:66
#define BUF_STATE_GET_USAGECOUNT(state)
Definition: buf_internals.h:52
static void ResourceOwnerForgetBufferIO(ResourceOwner owner, Buffer buffer)
#define BM_IO_IN_PROGRESS
Definition: buf_internals.h:64
static void ClearBufferTag(BufferTag *tag)
static void ResourceOwnerRememberBuffer(ResourceOwner owner, Buffer buffer)
static void ResourceOwnerForgetBuffer(ResourceOwner owner, Buffer buffer)
static ConditionVariable * BufferDescriptorGetIOCV(const BufferDesc *bdesc)
#define BUF_USAGECOUNT_ONE
Definition: buf_internals.h:46
#define BUF_STATE_GET_REFCOUNT(state)
Definition: buf_internals.h:51
static RelFileLocator BufTagGetRelFileLocator(const BufferTag *tag)
#define BM_VALID
Definition: buf_internals.h:62
#define BM_IO_ERROR
Definition: buf_internals.h:65
static Buffer BufferDescriptorGetBuffer(const BufferDesc *bdesc)
#define BM_CHECKPOINT_NEEDED
Definition: buf_internals.h:68
static LWLock * BufferDescriptorGetContentLock(const BufferDesc *bdesc)
void BufTableDelete(BufferTag *tagPtr, uint32 hashcode)
Definition: buf_table.c:148
int BufTableLookup(BufferTag *tagPtr, uint32 hashcode)
Definition: buf_table.c:90
uint32 BufTableHashCode(BufferTag *tagPtr)
Definition: buf_table.c:78
int BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id)
Definition: buf_table.c:118
bool track_io_timing
Definition: bufmgr.c:170
void CheckBufferIsPinnedOnce(Buffer buffer)
Definition: bufmgr.c:5218
void FlushRelationsAllBuffers(SMgrRelation *smgrs, int nrels)
Definition: bufmgr.c:4602
void IncrBufferRefCount(Buffer buffer)
Definition: bufmgr.c:4968
void DropDatabaseBuffers(Oid dbid)
Definition: bufmgr.c:4398
static int ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b)
Definition: bufmgr.c:5862
bool BufferIsExclusiveLocked(Buffer buffer)
Definition: bufmgr.c:2482
const ResourceOwnerDesc buffer_pin_resowner_desc
Definition: bufmgr.c:262
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:3736
static PrivateRefCountEntry * NewPrivateRefCountEntry(Buffer buffer)
Definition: bufmgr.c:343
void DropRelationBuffers(SMgrRelation smgr_reln, ForkNumber *forkNum, int nforks, BlockNumber *firstDelBlock)
Definition: bufmgr.c:4043
Buffer ReleaseAndReadBuffer(Buffer buffer, Relation relation, BlockNumber blockNum)
Definition: bufmgr.c:2606
PrefetchBufferResult PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
Definition: bufmgr.c:666
static uint32 PrivateRefCountClock
Definition: bufmgr.c:238
static void FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object, IOContext io_context)
Definition: bufmgr.c:3796
static void ResOwnerReleaseBufferIO(Datum res)
Definition: bufmgr.c:6056
bool StartReadBuffers(ReadBuffersOperation *operation, Buffer *buffers, BlockNumber blockNum, int *nblocks, int flags)
Definition: bufmgr.c:1377
static bool WaitReadBuffersCanStartIO(Buffer buffer, bool nowait)
Definition: bufmgr.c:1407
static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
Definition: bufmgr.c:2664
const ResourceOwnerDesc buffer_io_resowner_desc
Definition: bufmgr.c:253
bool zero_damaged_pages
Definition: bufmgr.c:167
#define BUF_DROP_FULL_SCAN_THRESHOLD
Definition: bufmgr.c:87
static void PinBuffer_Locked(BufferDesc *buf)
Definition: bufmgr.c:2775
static uint32 WaitBufHdrUnlocked(BufferDesc *buf)
Definition: bufmgr.c:5804
static int buffertag_comparator(const BufferTag *ba, const BufferTag *bb)
Definition: bufmgr.c:5828
#define LocalBufHdrGetBlock(bufHdr)
Definition: bufmgr.c:72
bool IsBufferCleanupOK(Buffer buffer)
Definition: bufmgr.c:5468
#define BufferGetLSN(bufHdr)
Definition: bufmgr.c:69
static void TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits, bool forget_owner)
Definition: bufmgr.c:5628
static char * ResOwnerPrintBufferIO(Datum res)
Definition: bufmgr.c:6064
Buffer ExtendBufferedRel(BufferManagerRelation bmr, ForkNumber forkNum, BufferAccessStrategy strategy, uint32 flags)
Definition: bufmgr.c:873
void AtEOXact_Buffers(bool isCommit)
Definition: bufmgr.c:3571
char * DebugPrintBufferRefcount(Buffer buffer)
Definition: bufmgr.c:3677
static void AbortBufferIO(Buffer buffer)
Definition: bufmgr.c:5665
BlockNumber ExtendBufferedRelBy(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, Buffer *buffers, uint32 *extended_by)
Definition: bufmgr.c:905
static Buffer ReadBuffer_common(Relation rel, SMgrRelation smgr, char smgr_persistence, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
Definition: bufmgr.c:1217
static void ZeroAndLockBuffer(Buffer buffer, ReadBufferMode mode, bool already_valid)
Definition: bufmgr.c:1046
static BufferDesc * BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, bool *foundPtr, IOContext io_context)
Definition: bufmgr.c:1617
static void CheckForBufferLeaks(void)
Definition: bufmgr.c:3631
void CreateAndCopyRelationData(RelFileLocator src_rlocator, RelFileLocator dst_rlocator, bool permanent)
Definition: bufmgr.c:4810
void DropRelationsAllBuffers(SMgrRelation *smgr_reln, int nlocators)
Definition: bufmgr.c:4166
static int rlocator_comparator(const void *p1, const void *p2)
Definition: bufmgr.c:5747
Buffer ExtendBufferedRelTo(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, BlockNumber extend_to, ReadBufferMode mode)
Definition: bufmgr.c:937
static BlockNumber copy_storage_using_buffer_read_stream_next_block(ReadStream *stream, void *callback_private_data, void *per_buffer_data)
Definition: bufmgr.c:154
struct SMgrSortArray SMgrSortArray
static bool InvalidateVictimBuffer(BufferDesc *buf_hdr)
Definition: bufmgr.c:1893
static void AtProcExit_Buffers(int code, Datum arg)
Definition: bufmgr.c:3613
static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg)
Definition: bufmgr.c:5893
void BufferGetTag(Buffer buffer, RelFileLocator *rlocator, ForkNumber *forknum, BlockNumber *blknum)
Definition: bufmgr.c:3757
#define BufHdrGetBlock(bufHdr)
Definition: bufmgr.c:68
#define BUF_REUSABLE
Definition: bufmgr.c:77
static void local_buffer_write_error_callback(void *arg)
Definition: bufmgr.c:5727
static void BufferSync(int flags)
Definition: bufmgr.c:2924
static char * ResOwnerPrintBufferPin(Datum res)
Definition: bufmgr.c:6087
void CheckPointBuffers(int flags)
Definition: bufmgr.c:3722
bool BufferIsDirty(Buffer buffer)
Definition: bufmgr.c:2511
static BlockNumber ExtendBufferedRelShared(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
Definition: bufmgr.c:2202
bool BgBufferSync(WritebackContext *wb_context)
Definition: bufmgr.c:3200
bool BufferIsPermanent(Buffer buffer)
Definition: bufmgr.c:3967
#define REFCOUNT_ARRAY_ENTRIES
Definition: bufmgr.c:96
void UnlockBuffers(void)
Definition: bufmgr.c:5143
PrefetchBufferResult PrefetchSharedBuffer(SMgrRelation smgr_reln, ForkNumber forkNum, BlockNumber blockNum)
Definition: bufmgr.c:576
static Buffer GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
Definition: bufmgr.c:1961
bool ConditionalLockBuffer(Buffer buffer)
Definition: bufmgr.c:5197
BlockNumber RelationGetNumberOfBlocksInFork(Relation relation, ForkNumber forkNum)
Definition: bufmgr.c:3935
int bgwriter_flush_after
Definition: bufmgr.c:199
void ReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:4936
static void FindAndDropRelationBuffers(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber nForkBlock, BlockNumber firstDelBlock)
Definition: bufmgr.c:4337
XLogRecPtr BufferGetLSNAtomic(Buffer buffer)
Definition: bufmgr.c:3997
bool HoldingBufferPinThatDelaysRecovery(void)
Definition: bufmgr.c:5386
int checkpoint_flush_after
Definition: bufmgr.c:198
void UnlockReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:4953
static pg_attribute_always_inline Buffer PinBufferForBlock(Relation rel, SMgrRelation smgr, char persistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, bool *foundPtr)
Definition: bufmgr.c:1133
static void UnpinBufferNoOwner(BufferDesc *buf)
Definition: bufmgr.c:2827
static void shared_buffer_write_error_callback(void *arg)
Definition: bufmgr.c:5707
void ScheduleBufferTagForWriteback(WritebackContext *wb_context, IOContext io_context, BufferTag *tag)
Definition: bufmgr.c:5928
void WaitReadBuffers(ReadBuffersOperation *operation)
Definition: bufmgr.c:1420
void WritebackContextInit(WritebackContext *context, int *max_pending)
Definition: bufmgr.c:5916
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:2543
void InitBufferPoolAccess(void)
Definition: bufmgr.c:3588
static pg_attribute_always_inline bool StartReadBuffersImpl(ReadBuffersOperation *operation, Buffer *buffers, BlockNumber blockNum, int *nblocks, int flags)
Definition: bufmgr.c:1282
#define BufferIsPinned(bufnum)
Definition: bufmgr.c:501
double bgwriter_lru_multiplier
Definition: bufmgr.c:169
int backend_flush_after
Definition: bufmgr.c:200
void LimitAdditionalPins(uint32 *additional_pins)
Definition: bufmgr.c:2127
static void ReservePrivateRefCountEntry(void)
Definition: bufmgr.c:277
static BufferDesc * PinCountWaitBuf
Definition: bufmgr.c:203
static int32 GetPrivateRefCount(Buffer buffer)
Definition: bufmgr.c:443
static BlockNumber ExtendBufferedRelCommon(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
Definition: bufmgr.c:2158
void LockBufferForCleanup(Buffer buffer)
Definition: bufmgr.c:5251
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:5171
static PrivateRefCountEntry * ReservedRefCountEntry
Definition: bufmgr.c:239
void MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
Definition: bufmgr.c:5000
void FlushRelationBuffers(Relation rel)
Definition: bufmgr.c:4504
void IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context)
Definition: bufmgr.c:5973
static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref)
Definition: bufmgr.c:466
Buffer ReadBufferWithoutRelcache(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy, bool permanent)
Definition: bufmgr.c:857
bool ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum, Buffer recent_buffer)
Definition: bufmgr.c:697
#define RELS_BSEARCH_THRESHOLD
Definition: bufmgr.c:79
int maintenance_io_concurrency
Definition: bufmgr.c:185
static void UnpinBuffer(BufferDesc *buf)
Definition: bufmgr.c:2818
void FlushDatabaseBuffers(Oid dbid)
Definition: bufmgr.c:4874
static void InvalidateBuffer(BufferDesc *buf)
Definition: bufmgr.c:1795
static void RelationCopyStorageUsingBuffer(RelFileLocator srclocator, RelFileLocator dstlocator, ForkNumber forkNum, bool permanent)
Definition: bufmgr.c:4702
int effective_io_concurrency
Definition: bufmgr.c:178
static PrivateRefCountEntry * GetPrivateRefCountEntry(Buffer buffer, bool do_move)
Definition: bufmgr.c:369
bool EvictUnpinnedBuffer(Buffer buf)
Definition: bufmgr.c:6109
struct PrivateRefCountEntry PrivateRefCountEntry
struct CkptTsStatus CkptTsStatus
bool StartReadBuffer(ReadBuffersOperation *operation, Buffer *buffer, BlockNumber blocknum, int flags)
Definition: bufmgr.c:1392
Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
Definition: bufmgr.c:820
uint32 LockBufHdr(BufferDesc *desc)
Definition: bufmgr.c:5774
static void ResOwnerReleaseBufferPin(Datum res)
Definition: bufmgr.c:6072
static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES]
Definition: bufmgr.c:235
int io_combine_limit
Definition: bufmgr.c:192
static int SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
Definition: bufmgr.c:3498
Buffer ReadBuffer(Relation reln, BlockNumber blockNum)
Definition: bufmgr.c:773
static HTAB * PrivateRefCountHash
Definition: bufmgr.c:236
static int32 PrivateRefCountOverflowed
Definition: bufmgr.c:237
bool ConditionalLockBufferForCleanup(Buffer buffer)
Definition: bufmgr.c:5412
int bgwriter_lru_maxpages
Definition: bufmgr.c:168
static void WaitIO(BufferDesc *buf)
Definition: bufmgr.c:5522
static bool StartBufferIO(BufferDesc *buf, bool forInput, bool nowait)
Definition: bufmgr.c:5571
#define BUF_WRITTEN
Definition: bufmgr.c:76
void FlushOneBuffer(Buffer buffer)
Definition: bufmgr.c:4916
@ BAS_BULKREAD
Definition: bufmgr.h:36
@ BAS_BULKWRITE
Definition: bufmgr.h:38
#define BUFFER_LOCK_UNLOCK
Definition: bufmgr.h:189
#define BUFFER_LOCK_SHARE
Definition: bufmgr.h:190
#define P_NEW
Definition: bufmgr.h:184
#define READ_BUFFERS_ZERO_ON_ERROR
Definition: bufmgr.h:111
static Page BufferGetPage(Buffer buffer)
Definition: bufmgr.h:400
#define DEFAULT_IO_COMBINE_LIMIT
Definition: bufmgr.h:165
static Block BufferGetBlock(Buffer buffer)
Definition: bufmgr.h:367
#define READ_BUFFERS_ISSUE_ADVICE
Definition: bufmgr.h:113
#define MAX_IO_COMBINE_LIMIT
Definition: bufmgr.h:164
#define DEFAULT_EFFECTIVE_IO_CONCURRENCY
Definition: bufmgr.h:158
#define DEFAULT_MAINTENANCE_IO_CONCURRENCY
Definition: bufmgr.h:159
void * Block
Definition: bufmgr.h:25
@ EB_LOCK_TARGET
Definition: bufmgr.h:92
@ EB_CLEAR_SIZE_CACHE
Definition: bufmgr.h:89
@ EB_PERFORMING_RECOVERY
Definition: bufmgr.h:77
@ EB_CREATE_FORK_IF_NEEDED
Definition: bufmgr.h:83
@ EB_SKIP_EXTENSION_LOCK
Definition: bufmgr.h:74
@ EB_LOCK_FIRST
Definition: bufmgr.h:86
#define BUFFER_LOCK_EXCLUSIVE
Definition: bufmgr.h:191
ReadBufferMode
Definition: bufmgr.h:44
@ RBM_ZERO_ON_ERROR
Definition: bufmgr.h:50
@ RBM_ZERO_AND_CLEANUP_LOCK
Definition: bufmgr.h:48
@ RBM_ZERO_AND_LOCK
Definition: bufmgr.h:46
@ RBM_NORMAL
Definition: bufmgr.h:45
#define BMR_REL(p_rel)
Definition: bufmgr.h:107
static bool BufferIsValid(Buffer bufnum)
Definition: bufmgr.h:351
bool PageIsVerifiedExtended(Page page, BlockNumber blkno, int flags)
Definition: bufpage.c:88
void PageSetChecksumInplace(Page page, BlockNumber blkno)
Definition: bufpage.c:1542
char * PageSetChecksumCopy(Page page, BlockNumber blkno)
Definition: bufpage.c:1510
Pointer Page
Definition: bufpage.h:81
#define PIV_LOG_WARNING
Definition: bufpage.h:468
static bool PageIsNew(Page page)
Definition: bufpage.h:233
static void PageSetLSN(Page page, XLogRecPtr lsn)
Definition: bufpage.h:391
static XLogRecPtr PageGetLSN(Page page)
Definition: bufpage.h:386
#define PIV_REPORT_STAT
Definition: bufpage.h:469
unsigned int uint32
Definition: c.h:506
#define likely(x)
Definition: c.h:310
signed int int32
Definition: c.h:494
#define Assert(condition)
Definition: c.h:858
double float8
Definition: c.h:630
#define pg_attribute_always_inline
Definition: c.h:234
#define unlikely(x)
Definition: c.h:311
#define lengthof(array)
Definition: c.h:788
#define MemSet(start, val, len)
Definition: c.h:1020
size_t Size
Definition: c.h:605
void CheckpointWriteDelay(int flags, double progress)
Definition: checkpointer.c:711
bool ConditionVariableCancelSleep(void)
void ConditionVariableBroadcast(ConditionVariable *cv)
void ConditionVariablePrepareToSleep(ConditionVariable *cv)
void ConditionVariableSleep(ConditionVariable *cv, uint32 wait_event_info)
int64 TimestampTz
Definition: timestamp.h:39
void * hash_search(HTAB *hashp, const void *keyPtr, HASHACTION action, bool *foundPtr)
Definition: dynahash.c:955
HTAB * hash_create(const char *tabname, long nelem, const HASHCTL *info, int flags)
Definition: dynahash.c:352
void * hash_seq_search(HASH_SEQ_STATUS *status)
Definition: dynahash.c:1395
void hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
Definition: dynahash.c:1385
struct cursor * cur
Definition: ecpg.c:28
int errdetail(const char *fmt,...)
Definition: elog.c:1203
ErrorContextCallback * error_context_stack
Definition: elog.c:94
int errhint(const char *fmt,...)
Definition: elog.c:1317
int errcode(int sqlerrcode)
Definition: elog.c:853
int errmsg(const char *fmt,...)
Definition: elog.c:1070
#define LOG
Definition: elog.h:31
#define errcontext
Definition: elog.h:196
#define WARNING
Definition: elog.h:36
#define DEBUG2
Definition: elog.h:29
#define DEBUG1
Definition: elog.h:30
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:224
#define ereport(elevel,...)
Definition: elog.h:149
int io_direct_flags
Definition: fd.c:168
#define IO_DIRECT_DATA
Definition: fd.h:54
int StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
Definition: freelist.c:394
BufferAccessStrategy GetAccessStrategy(BufferAccessStrategyType btype)
Definition: freelist.c:541
BufferDesc * StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_ring)
Definition: freelist.c:196
void FreeAccessStrategy(BufferAccessStrategy strategy)
Definition: freelist.c:681
IOContext IOContextForStrategy(BufferAccessStrategy strategy)
Definition: freelist.c:758
void StrategyFreeBuffer(BufferDesc *buf)
Definition: freelist.c:363
bool StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_ring)
Definition: freelist.c:798
volatile sig_atomic_t ProcSignalBarrierPending
Definition: globals.c:39
int64 VacuumPageHit
Definition: globals.c:155
int NBuffers
Definition: globals.c:140
ProcNumber MyProcNumber
Definition: globals.c:88
int VacuumCostPageMiss
Definition: globals.c:150
int64 VacuumPageMiss
Definition: globals.c:156
bool VacuumCostActive
Definition: globals.c:160
int64 VacuumPageDirty
Definition: globals.c:157
int VacuumCostBalance
Definition: globals.c:159
int MaxBackends
Definition: globals.c:144
int VacuumCostPageDirty
Definition: globals.c:151
int VacuumCostPageHit
Definition: globals.c:149
#define free(a)
Definition: header.h:65
@ HASH_FIND
Definition: hsearch.h:113
@ HASH_REMOVE
Definition: hsearch.h:115
@ HASH_ENTER
Definition: hsearch.h:114
#define HASH_ELEM
Definition: hsearch.h:95
#define HASH_BLOBS
Definition: hsearch.h:97
BufferUsage pgBufferUsage
Definition: instrument.c:20
void on_shmem_exit(pg_on_exit_callback function, Datum arg)
Definition: ipc.c:365
int b
Definition: isn.c:70
int a
Definition: isn.c:69
int j
Definition: isn.c:74
int i
Definition: isn.c:73
if(TABLE==NULL||TABLE_index==NULL)
Definition: isn.c:77
static volatile sig_atomic_t waiting
Definition: latch.c:162
void LockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition: lmgr.c:420
void UnlockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition: lmgr.c:470
int32 * LocalRefCount
Definition: localbuf.c:46
void UnpinLocalBuffer(Buffer buffer)
Definition: localbuf.c:681
BufferDesc * LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, bool *foundPtr)
Definition: localbuf.c:116
void AtEOXact_LocalBuffers(bool isCommit)
Definition: localbuf.c:819
void DropRelationLocalBuffers(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber firstDelBlock)
Definition: localbuf.c:489
void AtProcExit_LocalBuffers(void)
Definition: localbuf.c:830
bool PinLocalBuffer(BufferDesc *buf_hdr, bool adjust_usagecount)
Definition: localbuf.c:655
void MarkLocalBufferDirty(Buffer buffer)
Definition: localbuf.c:449
void DropRelationAllLocalBuffers(RelFileLocator rlocator)
Definition: localbuf.c:537
int NLocBuffer
Definition: localbuf.c:42
PrefetchBufferResult PrefetchLocalBuffer(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum)
Definition: localbuf.c:69
BlockNumber ExtendBufferedRelLocal(BufferManagerRelation bmr, ForkNumber fork, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
Definition: localbuf.c:313
void UnpinLocalBufferNoOwner(Buffer buffer)
Definition: localbuf.c:688
#define ExclusiveLock
Definition: lockdefs.h:42
bool LWLockHeldByMe(LWLock *lock)
Definition: lwlock.c:1893
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1168
bool LWLockHeldByMeInMode(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1937
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1781
bool LWLockConditionalAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1339
@ LW_SHARED
Definition: lwlock.h:115
@ LW_EXCLUSIVE
Definition: lwlock.h:114
void pfree(void *pointer)
Definition: mcxt.c:1521
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1541
void * palloc(Size size)
Definition: mcxt.c:1317
#define VALGRIND_MAKE_MEM_DEFINED(addr, size)
Definition: memdebug.h:26
#define VALGRIND_MAKE_MEM_NOACCESS(addr, size)
Definition: memdebug.h:27
#define START_CRIT_SECTION()
Definition: miscadmin.h:149
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:122
#define END_CRIT_SECTION()
Definition: miscadmin.h:151
void * arg
#define ERRCODE_DATA_CORRUPTED
Definition: pg_basebackup.c:41
static PgChecksumMode mode
Definition: pg_checksums.c:56
static int64 current_size
Definition: pg_checksums.c:64
#define WRITEBACK_MAX_PENDING_FLUSHES
#define DEFAULT_BACKEND_FLUSH_AFTER
#define DEFAULT_CHECKPOINT_FLUSH_AFTER
#define DEFAULT_BGWRITER_FLUSH_AFTER
static char * buf
Definition: pg_test_fsync.c:73
IOObject
Definition: pgstat.h:279
@ IOOBJECT_RELATION
Definition: pgstat.h:280
@ IOOBJECT_TEMP_RELATION
Definition: pgstat.h:281
#define pgstat_count_buffer_read(rel)
Definition: pgstat.h:635
IOContext
Definition: pgstat.h:287
@ IOCONTEXT_NORMAL
Definition: pgstat.h:290
@ IOOP_EXTEND
Definition: pgstat.h:299
@ IOOP_READ
Definition: pgstat.h:302
@ IOOP_WRITEBACK
Definition: pgstat.h:305
@ IOOP_HIT
Definition: pgstat.h:301
@ IOOP_EVICT
Definition: pgstat.h:298
@ IOOP_REUSE
Definition: pgstat.h:303
@ IOOP_WRITE
Definition: pgstat.h:304
#define pgstat_count_buffer_hit(rel)
Definition: pgstat.h:640
PgStat_BgWriterStats PendingBgWriterStats
PgStat_CheckpointerStats PendingCheckpointerStats
instr_time pgstat_prepare_io_time(bool track_io_guc)
Definition: pgstat_io.c:100
void pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op, instr_time start_time, uint32 cnt)
Definition: pgstat_io.c:122
void pgstat_count_io_op(IOObject io_object, IOContext io_context, IOOp io_op)
Definition: pgstat_io.c:77
#define qsort(a, b, c, d)
Definition: port.h:453
static Datum PointerGetDatum(const void *X)
Definition: postgres.h:322
uintptr_t Datum
Definition: postgres.h:64
static Pointer DatumGetPointer(Datum X)
Definition: postgres.h:312
static int32 DatumGetInt32(Datum X)
Definition: postgres.h:202
#define InvalidOid
Definition: postgres_ext.h:36
unsigned int Oid
Definition: postgres_ext.h:31
#define NUM_AUXILIARY_PROCS
Definition: proc.h:439
#define DELAY_CHKPT_START
Definition: proc.h:114
#define INVALID_PROC_NUMBER
Definition: procnumber.h:26
int ProcNumber
Definition: procnumber.h:24
void ProcessProcSignalBarrier(void)
Definition: procsignal.c:464
@ PROCSIG_RECOVERY_CONFLICT_BUFFERPIN
Definition: procsignal.h:47
void set_ps_display_remove_suffix(void)
Definition: ps_status.c:421
void set_ps_display_suffix(const char *suffix)
Definition: ps_status.c:369
char * psprintf(const char *fmt,...)
Definition: psprintf.c:46
tree context
Definition: radixtree.h:1835
Buffer read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
Definition: read_stream.c:620
ReadStream * read_stream_begin_smgr_relation(int flags, BufferAccessStrategy strategy, SMgrRelation smgr, char smgr_persistence, ForkNumber forknum, ReadStreamBlockNumberCB callback, void *callback_private_data, size_t per_buffer_data_size)
Definition: read_stream.c:590
void read_stream_end(ReadStream *stream)
Definition: read_stream.c:850
#define READ_STREAM_FULL
Definition: read_stream.h:43
static unsigned hash(unsigned *uv, int n)
Definition: rege_dfa.c:715
static SMgrRelation RelationGetSmgr(Relation rel)
Definition: rel.h:567
#define RelationUsesLocalBuffers(relation)
Definition: rel.h:637
#define RELATION_IS_OTHER_TEMP(relation)
Definition: rel.h:658
#define RelationIsValid(relation)
Definition: rel.h:478
#define RelFileLocatorBackendIsTemp(rlocator)
#define RelFileLocatorEquals(locator1, locator2)
ForkNumber
Definition: relpath.h:48
@ MAIN_FORKNUM
Definition: relpath.h:50
@ INIT_FORKNUM
Definition: relpath.h:53
#define MAX_FORKNUM
Definition: relpath.h:62
#define relpath(rlocator, forknum)
Definition: relpath.h:94
#define relpathbackend(rlocator, backend, forknum)
Definition: relpath.h:85
#define relpathperm(rlocator, forknum)
Definition: relpath.h:90
ResourceOwner CurrentResourceOwner
Definition: resowner.c:165
void ResourceOwnerEnlarge(ResourceOwner owner)
Definition: resowner.c:442
#define RELEASE_PRIO_BUFFER_IOS
Definition: resowner.h:62
@ RESOURCE_RELEASE_BEFORE_LOCKS
Definition: resowner.h:54
#define RELEASE_PRIO_BUFFER_PINS
Definition: resowner.h:63
void perform_spin_delay(SpinDelayStatus *status)
Definition: s_lock.c:127
void finish_spin_delay(SpinDelayStatus *status)
Definition: s_lock.c:187
#define init_local_spin_delay(status)
Definition: s_lock.h:778
BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:655
void smgrwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks)
Definition: smgr.c:643
SMgrRelation smgropen(RelFileLocator rlocator, ProcNumber backend)
Definition: smgr.c:198
void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
Definition: smgr.c:411
BlockNumber smgrnblocks_cached(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:679
void smgrzeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks, bool skipFsync)
Definition: smgr.c:560
void smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync)
Definition: smgr.c:535
bool smgrexists(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:398
void smgrreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void **buffers, BlockNumber nblocks)
Definition: smgr.c:600
bool smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks)
Definition: smgr.c:585
static void smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync)
Definition: smgr.h:121
void ProcSendSignal(ProcNumber procNumber)
Definition: proc.c:1878
PGPROC * MyProc
Definition: proc.c:66
int GetStartupBufferPinWaitBufId(void)
Definition: proc.c:671
int DeadlockTimeout
Definition: proc.c:57
void SetStartupBufferPinWaitBufId(int bufid)
Definition: proc.c:659
void ProcWaitForSignal(uint32 wait_event_info)
Definition: proc.c:1866
void ResolveRecoveryConflictWithBufferPin(void)
Definition: standby.c:792
bool log_recovery_conflict_waits
Definition: standby.c:41
void LogRecoveryConflict(ProcSignalReason reason, TimestampTz wait_start, TimestampTz now, VirtualTransactionId *wait_list, bool still_waiting)
Definition: standby.c:273
bool RelFileLocatorSkippingWAL(RelFileLocator rlocator)
Definition: storage.c:532
SMgrRelation RelationCreateStorage(RelFileLocator rlocator, char relpersistence, bool register_delete)
Definition: storage.c:121
void log_smgrcreate(const RelFileLocator *rlocator, ForkNumber forkNum)
Definition: storage.c:186
int wait_backend_pgprocno
BufferTag tag
pg_atomic_uint32 state
struct SMgrRelationData * smgr
Definition: bufmgr.h:103
int64 shared_blks_dirtied
Definition: instrument.h:28
int64 local_blks_hit
Definition: instrument.h:30
int64 local_blks_written
Definition: instrument.h:33
int64 shared_blks_read
Definition: instrument.h:27
int64 shared_blks_written
Definition: instrument.h:29
int64 local_blks_read
Definition: instrument.h:31
int64 shared_blks_hit
Definition: instrument.h:26
int ckpt_bufs_written
Definition: xlog.h:167
ForkNumber forkNum
RelFileNumber relNumber
BlockNumber blockNum
float8 progress_slice
Definition: bufmgr.c:115
int index
Definition: bufmgr.c:123
int num_scanned
Definition: bufmgr.c:120
float8 progress
Definition: bufmgr.c:114
int num_to_scan
Definition: bufmgr.c:118
Oid tsId
Definition: bufmgr.c:105
struct ErrorContextCallback * previous
Definition: elog.h:295
void(* callback)(void *arg)
Definition: elog.h:296
Size keysize
Definition: hsearch.h:75
Size entrysize
Definition: hsearch.h:76
Definition: dynahash.c:220
Definition: lwlock.h:42
int delayChkptFlags
Definition: proc.h:235
PgStat_Counter buf_written_clean
Definition: pgstat.h:255
PgStat_Counter maxwritten_clean
Definition: pgstat.h:256
PgStat_Counter buf_alloc
Definition: pgstat.h:257
PgStat_Counter buffers_written
Definition: pgstat.h:270
Buffer recent_buffer
Definition: bufmgr.h:60
ForkNumber forknum
Definition: bufmgr.h:121
int16 io_buffers_len
Definition: bufmgr.h:133
Buffer * buffers
Definition: bufmgr.h:129
BufferAccessStrategy strategy
Definition: bufmgr.h:122
BlockNumber blocknum
Definition: bufmgr.h:130
struct SMgrRelationData * smgr
Definition: bufmgr.h:119
RelFileLocator locator
RelFileNumber relNumber
RelFileLocator rd_locator
Definition: rel.h:57
Form_pg_class rd_rel
Definition: rel.h:111
const char * name
Definition: resowner.h:93
BlockNumber smgr_cached_nblocks[MAX_FORKNUM+1]
Definition: smgr.h:46
RelFileLocatorBackend smgr_rlocator
Definition: smgr.h:37
SMgrRelation srel
Definition: bufmgr.c:136
RelFileLocator rlocator
Definition: bufmgr.c:135
PendingWriteback pending_writebacks[WRITEBACK_MAX_PENDING_FLUSHES]
BlockNumber blockNum
Definition: buf_internals.h:98
Oid spcOid
Definition: buf_internals.h:94
Oid dbOid
Definition: buf_internals.h:95
static uint64 table_relation_size(Relation rel, ForkNumber forkNumber)
Definition: tableam.h:1868
bool RecoveryInProgress(void)
Definition: xlog.c:6304
bool XLogNeedsFlush(XLogRecPtr record)
Definition: xlog.c:3125
CheckpointStatsData CheckpointStats
Definition: xlog.c:207
void XLogFlush(XLogRecPtr record)
Definition: xlog.c:2794
#define CHECKPOINT_END_OF_RECOVERY
Definition: xlog.h:140
#define CHECKPOINT_FLUSH_ALL
Definition: xlog.h:143
#define CHECKPOINT_IS_SHUTDOWN
Definition: xlog.h:139
#define XLogIsNeeded()
Definition: xlog.h:109
#define XLogHintBitIsNeeded()
Definition: xlog.h:120
#define XLogRecPtrIsInvalid(r)
Definition: xlogdefs.h:29
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define InvalidXLogRecPtr
Definition: xlogdefs.h:28
XLogRecPtr XLogSaveBufferForHint(Buffer buffer, bool buffer_std)
Definition: xloginsert.c:1065
XLogRecPtr log_newpage_buffer(Buffer buffer, bool page_std)
Definition: xloginsert.c:1237
#define InHotStandby
Definition: xlogutils.h:60