bufmgr.c
1 /*-------------------------------------------------------------------------
2  *
3  * bufmgr.c
4  * buffer manager interface routines
5  *
6  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/storage/buffer/bufmgr.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 /*
16  * Principal entry points:
17  *
18  * ReadBuffer() -- find or create a buffer holding the requested page,
19  * and pin it so that no one can destroy it while this process
20  * is using it.
21  *
22  * StartReadBuffer() -- as above, with separate wait step
23  * StartReadBuffers() -- multiple block version
24  * WaitReadBuffers() -- second step of above
25  *
26  * ReleaseBuffer() -- unpin a buffer
27  *
28  * MarkBufferDirty() -- mark a pinned buffer's contents as "dirty".
29  * The disk write is delayed until buffer replacement or checkpoint.
30  *
31  * See also these files:
32  * freelist.c -- chooses victim for buffer replacement
33  * buf_table.c -- manages the buffer lookup table
34  */
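/*
 * [Editor's illustrative sketch, not part of bufmgr.c] A typical caller of
 * the entry points listed above, assuming an already-opened Relation and a
 * valid block number; LockBuffer()/UnlockReleaseBuffer() come from
 * storage/bufmgr.h, and WAL-logging and error handling are omitted.
 */
static void
example_touch_page(Relation rel, BlockNumber blkno)
{
	Buffer		buf;
	Page		page;

	/* Find or read the block; the returned buffer is pinned. */
	buf = ReadBuffer(rel, blkno);

	/* Take the content lock before modifying the page. */
	LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
	page = BufferGetPage(buf);

	/* ... modify "page" here ... */
	(void) page;

	/* The physical write is deferred to replacement or checkpoint. */
	MarkBufferDirty(buf);

	/* Drop the content lock and the pin. */
	UnlockReleaseBuffer(buf);
}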
35 #include "postgres.h"
36 
37 #include <sys/file.h>
38 #include <unistd.h>
39 
40 #include "access/tableam.h"
41 #include "access/xloginsert.h"
42 #include "access/xlogutils.h"
43 #include "catalog/storage.h"
44 #include "catalog/storage_xlog.h"
45 #include "executor/instrument.h"
46 #include "lib/binaryheap.h"
47 #include "miscadmin.h"
48 #include "pg_trace.h"
49 #include "pgstat.h"
50 #include "postmaster/bgwriter.h"
51 #include "storage/buf_internals.h"
52 #include "storage/bufmgr.h"
53 #include "storage/fd.h"
54 #include "storage/ipc.h"
55 #include "storage/lmgr.h"
56 #include "storage/proc.h"
57 #include "storage/read_stream.h"
58 #include "storage/smgr.h"
59 #include "storage/standby.h"
60 #include "utils/memdebug.h"
61 #include "utils/ps_status.h"
62 #include "utils/rel.h"
63 #include "utils/resowner.h"
64 #include "utils/timestamp.h"
65 
66 
67 /* Note: these two macros only work on shared buffers, not local ones! */
68 #define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
69 #define BufferGetLSN(bufHdr) (PageGetLSN(BufHdrGetBlock(bufHdr)))
70 
71 /* Note: this macro only works on local buffers, not shared ones! */
72 #define LocalBufHdrGetBlock(bufHdr) \
73  LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
74 
75 /* Bits in SyncOneBuffer's return value */
76 #define BUF_WRITTEN 0x01
77 #define BUF_REUSABLE 0x02
78 
79 #define RELS_BSEARCH_THRESHOLD 20
80 
81 /*
82  * This is the size (in blocks) above which we scan the
83  * entire buffer pool to remove the buffers for all the pages of the relation
84  * being dropped. For relations smaller than this threshold, we find
85  * the buffers by doing lookups in the BufMapping table.
86  */
87 #define BUF_DROP_FULL_SCAN_THRESHOLD (uint64) (NBuffers / 32)
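/*
 * [Editor's illustrative sketch, not part of bufmgr.c] How a drop path might
 * choose between the two strategies described above; "nblocks" stands for
 * the total number of blocks being dropped.  The real decision is made in
 * DropRelationBuffers() and DropRelationsAllBuffers() elsewhere in this file.
 */
static bool
example_use_full_scan(uint64 nblocks)
{
	return nblocks >= BUF_DROP_FULL_SCAN_THRESHOLD;
}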
88 
89 typedef struct PrivateRefCountEntry
90 {
91  Buffer buffer;
92  int32 refcount;
93 } PrivateRefCountEntry;
94 
95 /* 64 bytes, about the size of a cache line on common systems */
96 #define REFCOUNT_ARRAY_ENTRIES 8
97 
98 /*
99  * Status of buffers to checkpoint for a particular tablespace, used
100  * internally in BufferSync.
101  */
102 typedef struct CkptTsStatus
103 {
104  /* oid of the tablespace */
105  Oid tsId;
106 
107  /*
108  * Checkpoint progress for this tablespace. To make progress comparable
109  * between tablespaces the progress is, for each tablespace, measured as a
110  * number between 0 and the total number of to-be-checkpointed pages. Each
111  * page checkpointed in this tablespace increments this space's progress
112  * by progress_slice.
113  */
114  double progress;
115  double progress_slice;
116 
117  /* number of to-be checkpointed pages in this tablespace */
118  int num_to_scan;
119  /* already processed pages in this tablespace */
120  int num_scanned;
121 
122  /* current offset in CkptBufferIds for this tablespace */
123  int index;
124 } CkptTsStatus;
125 
126 /*
127  * Type for array used to sort SMgrRelations
128  *
129  * FlushRelationsAllBuffers shares the same comparator function with
130  * DropRelationsAllBuffers. Pointer to this struct and RelFileLocator must be
131  * compatible.
132  */
133 typedef struct SMgrSortArray
134 {
135  RelFileLocator rlocator; /* This must be the first member */
136  SMgrRelation srel;
137 } SMgrSortArray;
138 
139 /* GUC variables */
140 bool zero_damaged_pages = false;
141 int bgwriter_lru_maxpages = 100;
142 double bgwriter_lru_multiplier = 2.0;
143 bool track_io_timing = false;
144 
145 /*
146  * How many buffers PrefetchBuffer callers should try to stay ahead of their
147  * ReadBuffer calls by. Zero means "never prefetch". This value is only used
148  * for buffers not belonging to tablespaces that have their
149  * effective_io_concurrency parameter set.
150  */
151 int effective_io_concurrency = DEFAULT_EFFECTIVE_IO_CONCURRENCY;
152 
153 /*
154  * Like effective_io_concurrency, but used by maintenance code paths that might
155  * benefit from a higher setting because they work on behalf of many sessions.
156  * Overridden by the tablespace setting of the same name.
157  */
158 int maintenance_io_concurrency = DEFAULT_MAINTENANCE_IO_CONCURRENCY;
159 
160 /*
161  * Limit on how many blocks should be handled in single I/O operations.
162  * StartReadBuffers() callers should respect it, as should other operations
163  * that call smgr APIs directly.
164  */
165 int io_combine_limit = DEFAULT_IO_COMBINE_LIMIT;
166 
167 /*
168  * GUC variables about triggering kernel writeback for buffers written; OS
169  * dependent defaults are set via the GUC mechanism.
170  */
171 int checkpoint_flush_after = DEFAULT_CHECKPOINT_FLUSH_AFTER;
172 int bgwriter_flush_after = DEFAULT_BGWRITER_FLUSH_AFTER;
173 int backend_flush_after = DEFAULT_BACKEND_FLUSH_AFTER;
174 
175 /* local state for LockBufferForCleanup */
176 static BufferDesc *PinCountWaitBuf = NULL;
177 
178 /*
179  * Backend-Private refcount management:
180  *
181  * Each buffer also has a private refcount that keeps track of the number of
182  * times the buffer is pinned in the current process. This is so that the
183  * shared refcount needs to be modified only once if a buffer is pinned more
184  * than once by an individual backend. It's also used to check that no buffers
185  * are still pinned at the end of transactions and when exiting.
186  *
187  *
188  * To avoid - as we used to - requiring an array with NBuffers entries to keep
189  * track of local buffers, we use a small sequentially searched array
190  * (PrivateRefCountArray) and an overflow hash table (PrivateRefCountHash) to
191  * keep track of backend local pins.
192  *
193  * As long as no more than REFCOUNT_ARRAY_ENTRIES buffers are pinned at once,
194  * all refcounts are tracked in the array; after that, new array entries
195  * displace old ones into the hash table. That way a frequently used entry
196  * can't get "stuck" in the hashtable while infrequent ones clog the array.
197  *
198  * Note that in most scenarios the number of pinned buffers will not exceed
199  * REFCOUNT_ARRAY_ENTRIES.
200  *
201  *
202  * To enter a buffer into the refcount tracking mechanism first reserve a free
203  * entry using ReservePrivateRefCountEntry() and then later, if necessary,
204  * fill it with NewPrivateRefCountEntry(). That split lets us avoid doing
205  * memory allocations in NewPrivateRefCountEntry() which can be important
206  * because in some scenarios it's called with a spinlock held...
207  */
208 static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES];
209 static HTAB *PrivateRefCountHash = NULL;
210 static int32 PrivateRefCountOverflowed = 0;
211 static uint32 PrivateRefCountClock = 0;
212 static PrivateRefCountEntry *ReservedRefCountEntry = NULL;
213 
214 static void ReservePrivateRefCountEntry(void);
215 static PrivateRefCountEntry *NewPrivateRefCountEntry(Buffer buffer);
216 static PrivateRefCountEntry *GetPrivateRefCountEntry(Buffer buffer, bool do_move);
217 static inline int32 GetPrivateRefCount(Buffer buffer);
218 static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref);
219 
220 /* ResourceOwner callbacks to hold in-progress I/Os and buffer pins */
221 static void ResOwnerReleaseBufferIO(Datum res);
222 static char *ResOwnerPrintBufferIO(Datum res);
223 static void ResOwnerReleaseBufferPin(Datum res);
224 static char *ResOwnerPrintBufferPin(Datum res);
225 
226 static const ResourceOwnerDesc buffer_io_resowner_desc =
227 {
228  .name = "buffer io",
229  .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
230  .release_priority = RELEASE_PRIO_BUFFER_IOS,
231  .ReleaseResource = ResOwnerReleaseBufferIO,
232  .DebugPrint = ResOwnerPrintBufferIO
233 };
234 
235 static const ResourceOwnerDesc buffer_pin_resowner_desc =
236 {
237  .name = "buffer pin",
238  .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
239  .release_priority = RELEASE_PRIO_BUFFER_PINS,
240  .ReleaseResource = ResOwnerReleaseBufferPin,
241  .DebugPrint = ResOwnerPrintBufferPin
242 };
243 
244 /*
245  * Ensure that the PrivateRefCountArray has sufficient space to store one more
246  * entry. This has to be called before using NewPrivateRefCountEntry() to fill
247  * a new entry - but it's perfectly fine to not use a reserved entry.
248  */
249 static void
250 ReservePrivateRefCountEntry(void)
251 {
252  /* Already reserved (or freed), nothing to do */
253  if (ReservedRefCountEntry != NULL)
254  return;
255 
256  /*
257  * First search for a free entry in the array; that'll be sufficient in the
258  * majority of cases.
259  */
260  {
261  int i;
262 
263  for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
264  {
266 
268 
269  if (res->buffer == InvalidBuffer)
270  {
272  return;
273  }
274  }
275  }
276 
277  /*
278  * No luck. All array entries are full. Move one array entry into the hash
279  * table.
280  */
281  {
282  /*
283  * Move entry from the current clock position in the array into the
284  * hashtable. Use that slot.
285  */
286  PrivateRefCountEntry *hashent;
287  bool found;
288 
289  /* select victim slot */
292 
293  /* Better be used, otherwise we shouldn't get here. */
295 
296  /* enter victim array entry into hashtable */
299  HASH_ENTER,
300  &found);
301  Assert(!found);
303 
304  /* clear the now free array slot */
307 
309  }
310 }
311 
312 /*
313  * Fill a previously reserved refcount entry.
314  */
315 static PrivateRefCountEntry *
316 NewPrivateRefCountEntry(Buffer buffer)
317 {
319 
320  /* only allowed to be called when a reservation has been made */
321  Assert(ReservedRefCountEntry != NULL);
322 
323  /* use up the reserved entry */
325  ReservedRefCountEntry = NULL;
326 
327  /* and fill it */
328  res->buffer = buffer;
329  res->refcount = 0;
330 
331  return res;
332 }
333 
334 /*
335  * Return the PrivateRefCount entry for the passed buffer.
336  *
337  * Returns NULL if a buffer doesn't have a refcount entry. Otherwise, if
338  * do_move is true, and the entry resides in the hashtable the entry is
339  * optimized for frequent access by moving it to the array.
340  */
341 static PrivateRefCountEntry *
342 GetPrivateRefCountEntry(Buffer buffer, bool do_move)
343 {
345  int i;
346 
349 
350  /*
351  * First search for references in the array; that'll be sufficient in the
352  * majority of cases.
353  */
354  for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
355  {
357 
358  if (res->buffer == buffer)
359  return res;
360  }
361 
362  /*
363  * By here we know that the buffer, if already pinned, isn't residing in
364  * the array.
365  *
366  * Only look up the buffer in the hashtable if we've previously overflowed
367  * into it.
368  */
369  if (PrivateRefCountOverflowed == 0)
370  return NULL;
371 
373 
374  if (res == NULL)
375  return NULL;
376  else if (!do_move)
377  {
378  /* caller doesn't want us to move the hash entry into the array */
379  return res;
380  }
381  else
382  {
383  /* move buffer from hashtable into the free array slot */
384  bool found;
386 
387  /* Ensure there's a free array slot */
389 
390  /* Use up the reserved slot */
391  Assert(ReservedRefCountEntry != NULL);
393  ReservedRefCountEntry = NULL;
394  Assert(free->buffer == InvalidBuffer);
395 
396  /* and fill it */
397  free->buffer = buffer;
398  free->refcount = res->refcount;
399 
400  /* delete from hashtable */
402  Assert(found);
405 
406  return free;
407  }
408 }
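/*
 * [Editor's illustrative sketch, not part of bufmgr.c] The intended
 * reserve-then-fill protocol for the helpers above, roughly as the pinning
 * code uses it: reserve before any spinlock is taken, then look up or create
 * the entry once the pin is certain.  "buffer" is assumed to be a valid
 * shared buffer.
 */
static void
example_track_private_pin(Buffer buffer)
{
	PrivateRefCountEntry *ref;

	/* may displace an array entry into the hash table, so do it up front */
	ReservePrivateRefCountEntry();

	ref = GetPrivateRefCountEntry(buffer, true);
	if (ref == NULL)
		ref = NewPrivateRefCountEntry(buffer);

	ref->refcount++;
}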
409 
410 /*
411  * Returns how many times the passed buffer is pinned by this backend.
412  *
413  * Only works for shared memory buffers!
414  */
415 static inline int32
416 GetPrivateRefCount(Buffer buffer)
417 {
419 
422 
423  /*
424  * Not moving the entry - that's ok for the current users, but we might
425  * want to change this one day.
426  */
427  ref = GetPrivateRefCountEntry(buffer, false);
428 
429  if (ref == NULL)
430  return 0;
431  return ref->refcount;
432 }
433 
434 /*
435  * Release resources used to track the reference count of a buffer which we no
436  * longer have pinned and don't want to pin again immediately.
437  */
438 static void
439 ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref)
440 {
441  Assert(ref->refcount == 0);
442 
443  if (ref >= &PrivateRefCountArray[0] &&
445  {
446  ref->buffer = InvalidBuffer;
447 
448  /*
449  * Mark the just used entry as reserved - in many scenarios that
450  * allows us to avoid ever having to search the array/hash for free
451  * entries.
452  */
453  ReservedRefCountEntry = ref;
454  }
455  else
456  {
457  bool found;
458  Buffer buffer = ref->buffer;
459 
461  Assert(found);
464  }
465 }
466 
467 /*
468  * BufferIsPinned
469  * True iff the buffer is pinned (also checks for valid buffer number).
470  *
471  * NOTE: what we check here is that *this* backend holds a pin on
472  * the buffer. We do not care whether some other backend does.
473  */
474 #define BufferIsPinned(bufnum) \
475 ( \
476  !BufferIsValid(bufnum) ? \
477  false \
478  : \
479  BufferIsLocal(bufnum) ? \
480  (LocalRefCount[-(bufnum) - 1] > 0) \
481  : \
482  (GetPrivateRefCount(bufnum) > 0) \
483 )
484 
485 
487  SMgrRelation smgr, char smgr_persistence,
488  ForkNumber forkNum, BlockNumber blockNum,
491  ForkNumber fork,
492  BufferAccessStrategy strategy,
493  uint32 flags,
494  uint32 extend_by,
495  BlockNumber extend_upto,
496  Buffer *buffers,
497  uint32 *extended_by);
499  ForkNumber fork,
500  BufferAccessStrategy strategy,
501  uint32 flags,
502  uint32 extend_by,
503  BlockNumber extend_upto,
504  Buffer *buffers,
505  uint32 *extended_by);
506 static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy);
507 static void PinBuffer_Locked(BufferDesc *buf);
508 static void UnpinBuffer(BufferDesc *buf);
509 static void UnpinBufferNoOwner(BufferDesc *buf);
510 static void BufferSync(int flags);
512 static int SyncOneBuffer(int buf_id, bool skip_recently_used,
513  WritebackContext *wb_context);
514 static void WaitIO(BufferDesc *buf);
515 static bool StartBufferIO(BufferDesc *buf, bool forInput, bool nowait);
516 static void TerminateBufferIO(BufferDesc *buf, bool clear_dirty,
517  uint32 set_flag_bits, bool forget_owner);
518 static void AbortBufferIO(Buffer buffer);
519 static void shared_buffer_write_error_callback(void *arg);
520 static void local_buffer_write_error_callback(void *arg);
521 static inline BufferDesc *BufferAlloc(SMgrRelation smgr,
522  char relpersistence,
523  ForkNumber forkNum,
524  BlockNumber blockNum,
525  BufferAccessStrategy strategy,
526  bool *foundPtr, IOContext io_context);
527 static Buffer GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context);
528 static void FlushBuffer(BufferDesc *buf, SMgrRelation reln,
529  IOObject io_object, IOContext io_context);
530 static void FindAndDropRelationBuffers(RelFileLocator rlocator,
531  ForkNumber forkNum,
532  BlockNumber nForkBlock,
533  BlockNumber firstDelBlock);
534 static void RelationCopyStorageUsingBuffer(RelFileLocator srclocator,
535  RelFileLocator dstlocator,
536  ForkNumber forkNum, bool permanent);
537 static void AtProcExit_Buffers(int code, Datum arg);
538 static void CheckForBufferLeaks(void);
539 static int rlocator_comparator(const void *p1, const void *p2);
540 static inline int buffertag_comparator(const BufferTag *ba, const BufferTag *bb);
541 static inline int ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b);
542 static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg);
543 
544 
545 /*
546  * Implementation of PrefetchBuffer() for shared buffers.
547  */
550  ForkNumber forkNum,
551  BlockNumber blockNum)
552 {
553  PrefetchBufferResult result = {InvalidBuffer, false};
554  BufferTag newTag; /* identity of requested block */
555  uint32 newHash; /* hash value for newTag */
556  LWLock *newPartitionLock; /* buffer partition lock for it */
557  int buf_id;
558 
559  Assert(BlockNumberIsValid(blockNum));
560 
561  /* create a tag so we can lookup the buffer */
562  InitBufferTag(&newTag, &smgr_reln->smgr_rlocator.locator,
563  forkNum, blockNum);
564 
565  /* determine its hash code and partition lock ID */
566  newHash = BufTableHashCode(&newTag);
567  newPartitionLock = BufMappingPartitionLock(newHash);
568 
569  /* see if the block is in the buffer pool already */
570  LWLockAcquire(newPartitionLock, LW_SHARED);
571  buf_id = BufTableLookup(&newTag, newHash);
572  LWLockRelease(newPartitionLock);
573 
574  /* If not in buffers, initiate prefetch */
575  if (buf_id < 0)
576  {
577 #ifdef USE_PREFETCH
578  /*
579  * Try to initiate an asynchronous read. This returns false in
580  * recovery if the relation file doesn't exist.
581  */
582  if ((io_direct_flags & IO_DIRECT_DATA) == 0 &&
583  smgrprefetch(smgr_reln, forkNum, blockNum, 1))
584  {
585  result.initiated_io = true;
586  }
587 #endif /* USE_PREFETCH */
588  }
589  else
590  {
591  /*
592  * Report the buffer it was in at that time. The caller may be able
593  * to avoid a buffer table lookup, but it's not pinned and it must be
594  * rechecked!
595  */
596  result.recent_buffer = buf_id + 1;
597  }
598 
599  /*
600  * If the block *is* in buffers, we do nothing. This is not really ideal:
601  * the block might be just about to be evicted, which would be stupid
602  * since we know we are going to need it soon. But the only easy answer
603  * is to bump the usage_count, which does not seem like a great solution:
604  * when the caller does ultimately touch the block, usage_count would get
605  * bumped again, resulting in too much favoritism for blocks that are
606  * involved in a prefetch sequence. A real fix would involve some
607  * additional per-buffer state, and it's not clear that there's enough of
608  * a problem to justify that.
609  */
610 
611  return result;
612 }
613 
614 /*
615  * PrefetchBuffer -- initiate asynchronous read of a block of a relation
616  *
617  * This is named by analogy to ReadBuffer but doesn't actually allocate a
618  * buffer. Instead it tries to ensure that a future ReadBuffer for the given
619  * block will not be delayed by the I/O. Prefetching is optional.
620  *
621  * There are three possible outcomes:
622  *
623  * 1. If the block is already cached, the result includes a valid buffer that
624  * could be used by the caller to avoid the need for a later buffer lookup, but
625  * it's not pinned, so the caller must recheck it.
626  *
627  * 2. If the kernel has been asked to initiate I/O, the initiated_io member is
628  * true. Currently there is no way to know if the data was already cached by
629  * the kernel and therefore didn't really initiate I/O, and no way to know when
630  * the I/O completes other than using synchronous ReadBuffer().
631  *
632  * 3. Otherwise, the buffer wasn't already cached by PostgreSQL, and either
633  * USE_PREFETCH is not defined (this build doesn't support prefetching due to
634  * lack of a kernel facility), or direct I/O is enabled, or the underlying
635  * relation file wasn't found and we are in recovery. (If the relation file
636  * wasn't found and we are not in recovery, an error is raised).
637  */
638 PrefetchBufferResult
639 PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
640 {
641  Assert(RelationIsValid(reln));
642  Assert(BlockNumberIsValid(blockNum));
643 
644  if (RelationUsesLocalBuffers(reln))
645  {
646  /* see comments in ReadBufferExtended */
647  if (RELATION_IS_OTHER_TEMP(reln))
648  ereport(ERROR,
649  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
650  errmsg("cannot access temporary tables of other sessions")));
651 
652  /* pass it off to localbuf.c */
653  return PrefetchLocalBuffer(RelationGetSmgr(reln), forkNum, blockNum);
654  }
655  else
656  {
657  /* pass it to the shared buffer version */
658  return PrefetchSharedBuffer(RelationGetSmgr(reln), forkNum, blockNum);
659  }
660 }
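/*
 * [Editor's illustrative sketch, not part of bufmgr.c] Prefetching a fixed
 * distance ahead of a simple sequential pass.  The distance of 8 blocks is
 * an arbitrary assumption for the example; real callers derive it from
 * effective_io_concurrency or use the read stream machinery instead.
 */
static void
example_scan_with_prefetch(Relation rel)
{
	BlockNumber nblocks = RelationGetNumberOfBlocks(rel);

	for (BlockNumber blkno = 0; blkno < nblocks; blkno++)
	{
		Buffer		buf;

		/* hint that we will need this block soon; result is advisory only */
		if (blkno + 8 < nblocks)
			(void) PrefetchBuffer(rel, MAIN_FORKNUM, blkno + 8);

		buf = ReadBuffer(rel, blkno);
		/* ... examine the page under a content lock ... */
		ReleaseBuffer(buf);
	}
}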
661 
662 /*
663  * ReadRecentBuffer -- try to pin a block in a recently observed buffer
664  *
665  * Compared to ReadBuffer(), this avoids a buffer mapping lookup when it's
666  * successful. Return true if the buffer is valid and still has the expected
667  * tag. In that case, the buffer is pinned and the usage count is bumped.
668  */
669 bool
670 ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum,
671  Buffer recent_buffer)
672 {
673  BufferDesc *bufHdr;
674  BufferTag tag;
675  uint32 buf_state;
676  bool have_private_ref;
677 
678  Assert(BufferIsValid(recent_buffer));
679 
682  InitBufferTag(&tag, &rlocator, forkNum, blockNum);
683 
684  if (BufferIsLocal(recent_buffer))
685  {
686  int b = -recent_buffer - 1;
687 
688  bufHdr = GetLocalBufferDescriptor(b);
689  buf_state = pg_atomic_read_u32(&bufHdr->state);
690 
691  /* Is it still valid and holding the right tag? */
692  if ((buf_state & BM_VALID) && BufferTagsEqual(&tag, &bufHdr->tag))
693  {
694  PinLocalBuffer(bufHdr, true);
695 
697 
698  return true;
699  }
700  }
701  else
702  {
703  bufHdr = GetBufferDescriptor(recent_buffer - 1);
704  have_private_ref = GetPrivateRefCount(recent_buffer) > 0;
705 
706  /*
707  * Do we already have this buffer pinned with a private reference? If
708  * so, it must be valid and it is safe to check the tag without
709  * locking. If not, we have to lock the header first and then check.
710  */
711  if (have_private_ref)
712  buf_state = pg_atomic_read_u32(&bufHdr->state);
713  else
714  buf_state = LockBufHdr(bufHdr);
715 
716  if ((buf_state & BM_VALID) && BufferTagsEqual(&tag, &bufHdr->tag))
717  {
718  /*
719  * It's now safe to pin the buffer. We can't pin first and ask
720  * questions later, because it might confuse code paths like
721  * InvalidateBuffer() if we pinned a random non-matching buffer.
722  */
723  if (have_private_ref)
724  PinBuffer(bufHdr, NULL); /* bump pin count */
725  else
726  PinBuffer_Locked(bufHdr); /* pin for first time */
727 
729 
730  return true;
731  }
732 
733  /* If we locked the header above, now unlock. */
734  if (!have_private_ref)
735  UnlockBufHdr(bufHdr, buf_state);
736  }
737 
738  return false;
739 }
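/*
 * [Editor's illustrative sketch, not part of bufmgr.c] Re-reading a block
 * via a remembered buffer number, falling back to the normal lookup when the
 * buffer no longer holds the expected block.  "recent_buf" is assumed to
 * have been observed earlier, e.g. via BufferGetBlockNumber() or a
 * PrefetchBufferResult.
 */
static Buffer
example_reread_block(Relation rel, BlockNumber blkno, Buffer recent_buf)
{
	if (recent_buf != InvalidBuffer &&
		ReadRecentBuffer(rel->rd_locator, MAIN_FORKNUM, blkno, recent_buf))
		return recent_buf;		/* pinned, tag re-verified */

	return ReadBuffer(rel, blkno);	/* normal mapping-table lookup */
}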
740 
741 /*
742  * ReadBuffer -- a shorthand for ReadBufferExtended, for reading from main
743  * fork with RBM_NORMAL mode and default strategy.
744  */
745 Buffer
746 ReadBuffer(Relation reln, BlockNumber blockNum)
747 {
748  return ReadBufferExtended(reln, MAIN_FORKNUM, blockNum, RBM_NORMAL, NULL);
749 }
750 
751 /*
752  * ReadBufferExtended -- returns a buffer containing the requested
753  * block of the requested relation. If the blknum
754  * requested is P_NEW, extend the relation file and
755  * allocate a new block. (Caller is responsible for
756  * ensuring that only one backend tries to extend a
757  * relation at the same time!)
758  *
759  * Returns: the buffer number for the buffer containing
760  * the block read. The returned buffer has been pinned.
761  * Does not return on error --- elog's instead.
762  *
763  * Assume when this function is called, that reln has been opened already.
764  *
765  * In RBM_NORMAL mode, the page is read from disk, and the page header is
766  * validated. An error is thrown if the page header is not valid. (But
767  * note that an all-zero page is considered "valid"; see
768  * PageIsVerifiedExtended().)
769  *
770  * RBM_ZERO_ON_ERROR is like the normal mode, but if the page header is not
771  * valid, the page is zeroed instead of throwing an error. This is intended
772  * for non-critical data, where the caller is prepared to repair errors.
773  *
774  * In RBM_ZERO_AND_LOCK mode, if the page isn't in buffer cache already, it's
775  * filled with zeros instead of reading it from disk. Useful when the caller
776  * is going to fill the page from scratch, since this saves I/O and avoids
777  * unnecessary failure if the page-on-disk has corrupt page headers.
778  * The page is returned locked to ensure that the caller has a chance to
779  * initialize the page before it's made visible to others.
780  * Caution: do not use this mode to read a page that is beyond the relation's
781  * current physical EOF; that is likely to cause problems in md.c when
782  * the page is modified and written out. P_NEW is OK, though.
783  *
784  * RBM_ZERO_AND_CLEANUP_LOCK is the same as RBM_ZERO_AND_LOCK, but acquires
785  * a cleanup-strength lock on the page.
786  *
787  * RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here.
788  *
789  * If strategy is not NULL, a nondefault buffer access strategy is used.
790  * See buffer/README for details.
791  */
792 inline Buffer
793 ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum,
794  ReadBufferMode mode, BufferAccessStrategy strategy)
795 {
796  Buffer buf;
797 
798  /*
799  * Reject attempts to read non-local temporary relations; we would be
800  * likely to get wrong data since we have no visibility into the owning
801  * session's local buffers.
802  */
803  if (RELATION_IS_OTHER_TEMP(reln))
804  ereport(ERROR,
805  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
806  errmsg("cannot access temporary tables of other sessions")));
807 
808  /*
809  * Read the buffer, and update pgstat counters to reflect a cache hit or
810  * miss.
811  */
812  buf = ReadBuffer_common(reln, RelationGetSmgr(reln), 0,
813  forkNum, blockNum, mode, strategy);
814 
815  return buf;
816 }
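/*
 * [Editor's illustrative sketch, not part of bufmgr.c] A bulk pass over a
 * relation using a ring-buffer access strategy so it doesn't evict the whole
 * shared buffer pool; GetAccessStrategy()/BAS_BULKREAD come from
 * storage/bufmgr.h and freelist.c.
 */
static void
example_bulk_pass(Relation rel)
{
	BufferAccessStrategy strategy = GetAccessStrategy(BAS_BULKREAD);
	BlockNumber nblocks = RelationGetNumberOfBlocks(rel);

	for (BlockNumber blkno = 0; blkno < nblocks; blkno++)
	{
		Buffer		buf;

		buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
								 RBM_NORMAL, strategy);
		/* ... inspect the page ... */
		ReleaseBuffer(buf);
	}

	FreeAccessStrategy(strategy);
}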
817 
818 
819 /*
820  * ReadBufferWithoutRelcache -- like ReadBufferExtended, but doesn't require
821  * a relcache entry for the relation.
822  *
823  * Pass permanent = true for a RELPERSISTENCE_PERMANENT relation, and
824  * permanent = false for a RELPERSISTENCE_UNLOGGED relation. This function
825  * cannot be used for temporary relations (and making that work might be
826  * difficult, unless we only want to read temporary relations for our own
827  * ProcNumber).
828  */
829 Buffer
831  BlockNumber blockNum, ReadBufferMode mode,
832  BufferAccessStrategy strategy, bool permanent)
833 {
834  SMgrRelation smgr = smgropen(rlocator, INVALID_PROC_NUMBER);
835 
836  return ReadBuffer_common(NULL, smgr,
837  permanent ? RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED,
838  forkNum, blockNum,
839  mode, strategy);
840 }
841 
842 /*
843  * Convenience wrapper around ExtendBufferedRelBy() extending by one block.
844  */
845 Buffer
847  ForkNumber forkNum,
848  BufferAccessStrategy strategy,
849  uint32 flags)
850 {
851  Buffer buf;
852  uint32 extend_by = 1;
853 
854  ExtendBufferedRelBy(bmr, forkNum, strategy, flags, extend_by,
855  &buf, &extend_by);
856 
857  return buf;
858 }
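/*
 * [Editor's illustrative sketch, not part of bufmgr.c] Adding one new, empty
 * page at the end of a relation's main fork with the wrapper above.  The
 * returned buffer is pinned and, because of EB_LOCK_FIRST, exclusively
 * locked; WAL-logging of the initialized page is the caller's responsibility
 * and is omitted here.
 */
static Buffer
example_add_page(Relation rel)
{
	Buffer		buf;

	buf = ExtendBufferedRel(BMR_REL(rel), MAIN_FORKNUM, NULL, EB_LOCK_FIRST);

	PageInit(BufferGetPage(buf), BufferGetPageSize(buf), 0);
	MarkBufferDirty(buf);

	return buf;
}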
859 
860 /*
861  * Extend relation by multiple blocks.
862  *
863  * Tries to extend the relation by extend_by blocks. Depending on the
864  * availability of resources the relation may end up being extended by a
865  * smaller number of pages (unless an error is thrown, always by at least one
866  * page). *extended_by is updated to the number of pages the relation has been
867  * extended to.
868  *
869  * buffers needs to be an array that is at least extend_by long. Upon
870  * completion, the first extend_by array elements will point to a pinned
871  * buffer.
872  *
873  * If EB_LOCK_FIRST is part of flags, the first returned buffer is
874  * locked. This is useful for callers that want a buffer that is guaranteed to
875  * be empty.
876  */
879  ForkNumber fork,
880  BufferAccessStrategy strategy,
881  uint32 flags,
882  uint32 extend_by,
883  Buffer *buffers,
884  uint32 *extended_by)
885 {
886  Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
887  Assert(bmr.smgr == NULL || bmr.relpersistence != 0);
888  Assert(extend_by > 0);
889 
890  if (bmr.smgr == NULL)
891  {
892  bmr.smgr = RelationGetSmgr(bmr.rel);
893  bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
894  }
895 
896  return ExtendBufferedRelCommon(bmr, fork, strategy, flags,
897  extend_by, InvalidBlockNumber,
898  buffers, extended_by);
899 }
900 
901 /*
902  * Extend the relation so it is at least extend_to blocks large, return buffer
903  * (extend_to - 1).
904  *
905  * This is useful for callers that want to write a specific page, regardless
906  * of the current size of the relation (e.g. useful for visibilitymap and for
907  * crash recovery).
908  */
909 Buffer
911  ForkNumber fork,
912  BufferAccessStrategy strategy,
913  uint32 flags,
914  BlockNumber extend_to,
916 {
918  uint32 extended_by = 0;
920  Buffer buffers[64];
921 
922  Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
923  Assert(bmr.smgr == NULL || bmr.relpersistence != 0);
924  Assert(extend_to != InvalidBlockNumber && extend_to > 0);
925 
926  if (bmr.smgr == NULL)
927  {
928  bmr.smgr = RelationGetSmgr(bmr.rel);
929  bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
930  }
931 
932  /*
933  * If desired, create the file if it doesn't exist. If
934  * smgr_cached_nblocks[fork] is positive then it must exist, no need for
935  * an smgrexists call.
936  */
937  if ((flags & EB_CREATE_FORK_IF_NEEDED) &&
938  (bmr.smgr->smgr_cached_nblocks[fork] == 0 ||
940  !smgrexists(bmr.smgr, fork))
941  {
943 
944  /* recheck, fork might have been created concurrently */
945  if (!smgrexists(bmr.smgr, fork))
946  smgrcreate(bmr.smgr, fork, flags & EB_PERFORMING_RECOVERY);
947 
949  }
950 
951  /*
952  * If requested, invalidate size cache, so that smgrnblocks asks the
953  * kernel.
954  */
955  if (flags & EB_CLEAR_SIZE_CACHE)
957 
958  /*
959  * Estimate how many pages we'll need to extend by. This avoids acquiring
960  * unnecessarily many victim buffers.
961  */
962  current_size = smgrnblocks(bmr.smgr, fork);
963 
964  /*
965  * Since no-one else can be looking at the page contents yet, there is no
966  * difference between an exclusive lock and a cleanup-strength lock. Note
967  * that we pass the original mode to ReadBuffer_common() below, when
968  * falling back to reading the buffer due to a concurrent relation extension.
969  */
971  flags |= EB_LOCK_TARGET;
972 
973  while (current_size < extend_to)
974  {
975  uint32 num_pages = lengthof(buffers);
976  BlockNumber first_block;
977 
978  if ((uint64) current_size + num_pages > extend_to)
979  num_pages = extend_to - current_size;
980 
981  first_block = ExtendBufferedRelCommon(bmr, fork, strategy, flags,
982  num_pages, extend_to,
983  buffers, &extended_by);
984 
985  current_size = first_block + extended_by;
986  Assert(num_pages != 0 || current_size >= extend_to);
987 
988  for (uint32 i = 0; i < extended_by; i++)
989  {
990  if (first_block + i != extend_to - 1)
991  ReleaseBuffer(buffers[i]);
992  else
993  buffer = buffers[i];
994  }
995  }
996 
997  /*
998  * It's possible that another backend concurrently extended the relation.
999  * In that case read the buffer.
1000  *
1001  * XXX: Should we control this via a flag?
1002  */
1003  if (buffer == InvalidBuffer)
1004  {
1005  Assert(extended_by == 0);
1007  fork, extend_to - 1, mode, strategy);
1008  }
1009 
1010  return buffer;
1011 }
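/*
 * [Editor's illustrative sketch, not part of bufmgr.c] Making sure block
 * "blkno" of a fork exists and getting a buffer for it, in the spirit of the
 * visibility-map usage mentioned above.  The flag combination and
 * RBM_ZERO_ON_ERROR are assumptions chosen for the example.
 */
static Buffer
example_get_page_at(Relation rel, ForkNumber fork, BlockNumber blkno)
{
	return ExtendBufferedRelTo(BMR_REL(rel), fork, NULL,
							   EB_CREATE_FORK_IF_NEEDED | EB_CLEAR_SIZE_CACHE,
							   blkno + 1,	/* extend_to is a block count */
							   RBM_ZERO_ON_ERROR);
}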
1012 
1013 /*
1014  * Lock and optionally zero a buffer, as part of the implementation of
1015  * RBM_ZERO_AND_LOCK or RBM_ZERO_AND_CLEANUP_LOCK. The buffer must be already
1016  * pinned. If the buffer is not already valid, it is zeroed and made valid.
1017  */
1018 static void
1020 {
1021  BufferDesc *bufHdr;
1022  bool need_to_zero;
1023  bool isLocalBuf = BufferIsLocal(buffer);
1024 
1026 
1027  if (already_valid)
1028  {
1029  /*
1030  * If the caller already knew the buffer was valid, we can skip some
1031  * header interaction. The caller just wants to lock the buffer.
1032  */
1033  need_to_zero = false;
1034  }
1035  else if (isLocalBuf)
1036  {
1037  /* Simple case for non-shared buffers. */
1038  bufHdr = GetLocalBufferDescriptor(-buffer - 1);
1039  need_to_zero = (pg_atomic_read_u32(&bufHdr->state) & BM_VALID) == 0;
1040  }
1041  else
1042  {
1043  /*
1044  * Take BM_IO_IN_PROGRESS, or discover that BM_VALID has been set
1045  * concurrently. Even though we aren't doing I/O, that ensures that
1046  * we don't zero a page that someone else has pinned. An exclusive
1047  * content lock wouldn't be enough, because readers are allowed to
1048  * drop the content lock after determining that a tuple is visible
1049  * (see buffer access rules in README).
1050  */
1051  bufHdr = GetBufferDescriptor(buffer - 1);
1052  need_to_zero = StartBufferIO(bufHdr, true, false);
1053  }
1054 
1055  if (need_to_zero)
1056  {
1057  memset(BufferGetPage(buffer), 0, BLCKSZ);
1058 
1059  /*
1060  * Grab the buffer content lock before marking the page as valid, to
1061  * make sure that no other backend sees the zeroed page before the
1062  * caller has had a chance to initialize it.
1063  *
1064  * Since no-one else can be looking at the page contents yet, there is
1065  * no difference between an exclusive lock and a cleanup-strength
1066  * lock. (Note that we cannot use LockBuffer() or
1067  * LockBufferForCleanup() here, because they assert that the buffer is
1068  * already valid.)
1069  */
1070  if (!isLocalBuf)
1072 
1073  if (isLocalBuf)
1074  {
1075  /* Only need to adjust flags */
1076  uint32 buf_state = pg_atomic_read_u32(&bufHdr->state);
1077 
1078  buf_state |= BM_VALID;
1079  pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
1080  }
1081  else
1082  {
1083  /* Set BM_VALID, terminate IO, and wake up any waiters */
1084  TerminateBufferIO(bufHdr, false, BM_VALID, true);
1085  }
1086  }
1087  else if (!isLocalBuf)
1088  {
1089  /*
1090  * The buffer is valid, so we can't zero it. The caller still expects
1091  * the page to be locked on return.
1092  */
1093  if (mode == RBM_ZERO_AND_LOCK)
1095  else
1097  }
1098 }
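/*
 * [Editor's illustrative sketch, not part of bufmgr.c] The caller-side view
 * of RBM_ZERO_AND_LOCK as handled above: overwrite a page without reading it
 * from disk first.  The block is assumed to lie within the relation's
 * current physical EOF, per the caveat in ReadBufferExtended()'s header
 * comment; WAL-logging is omitted.
 */
static void
example_overwrite_page(Relation rel, BlockNumber blkno)
{
	Buffer		buf;

	/* pinned and exclusively locked; zero-filled unless already cached */
	buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
							 RBM_ZERO_AND_LOCK, NULL);

	PageInit(BufferGetPage(buf), BufferGetPageSize(buf), 0);
	MarkBufferDirty(buf);

	UnlockReleaseBuffer(buf);
}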
1099 
1100 /*
1101  * Pin a buffer for a given block. *foundPtr is set to true if the block was
1102  * already present, or false if more work is required to either read it in or
1103  * zero it.
1104  */
1107  SMgrRelation smgr,
1108  char persistence,
1109  ForkNumber forkNum,
1110  BlockNumber blockNum,
1111  BufferAccessStrategy strategy,
1112  bool *foundPtr)
1113 {
1114  BufferDesc *bufHdr;
1115  IOContext io_context;
1116  IOObject io_object;
1117 
1118  Assert(blockNum != P_NEW);
1119 
1120  /* Persistence should be set before */
1121  Assert((persistence == RELPERSISTENCE_TEMP ||
1122  persistence == RELPERSISTENCE_PERMANENT ||
1123  persistence == RELPERSISTENCE_UNLOGGED));
1124 
1125  if (persistence == RELPERSISTENCE_TEMP)
1126  {
1127  io_context = IOCONTEXT_NORMAL;
1128  io_object = IOOBJECT_TEMP_RELATION;
1129  }
1130  else
1131  {
1132  io_context = IOContextForStrategy(strategy);
1133  io_object = IOOBJECT_RELATION;
1134  }
1135 
1136  TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
1138  smgr->smgr_rlocator.locator.dbOid,
1140  smgr->smgr_rlocator.backend);
1141 
1142  if (persistence == RELPERSISTENCE_TEMP)
1143  {
1144  bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, foundPtr);
1145  if (*foundPtr)
1147  }
1148  else
1149  {
1150  bufHdr = BufferAlloc(smgr, persistence, forkNum, blockNum,
1151  strategy, foundPtr, io_context);
1152  if (*foundPtr)
1154  }
1155  if (rel)
1156  {
1157  /*
1158  * While pgBufferUsage's "read" counter isn't bumped unless we reach
1159  * WaitReadBuffers() (so, not for hits, and not for buffers that are
1160  * zeroed instead), the per-relation stats always count them.
1161  */
1163  if (*foundPtr)
1165  }
1166  if (*foundPtr)
1167  {
1168  pgstat_count_io_op(io_object, io_context, IOOP_HIT);
1169  if (VacuumCostActive)
1171 
1172  TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
1174  smgr->smgr_rlocator.locator.dbOid,
1176  smgr->smgr_rlocator.backend,
1177  true);
1178  }
1179 
1180  return BufferDescriptorGetBuffer(bufHdr);
1181 }
1182 
1183 /*
1184  * ReadBuffer_common -- common logic for all ReadBuffer variants
1185  *
1186  * smgr is required, rel is optional unless using P_NEW.
1187  */
1189 ReadBuffer_common(Relation rel, SMgrRelation smgr, char smgr_persistence,
1190  ForkNumber forkNum,
1191  BlockNumber blockNum, ReadBufferMode mode,
1192  BufferAccessStrategy strategy)
1193 {
1194  ReadBuffersOperation operation;
1195  Buffer buffer;
1196  int flags;
1197  char persistence;
1198 
1199  /*
1200  * Backward compatibility path, most code should use ExtendBufferedRel()
1201  * instead, as acquiring the extension lock inside ExtendBufferedRel()
1202  * scales a lot better.
1203  */
1204  if (unlikely(blockNum == P_NEW))
1205  {
1207 
1208  /*
1209  * Since no-one else can be looking at the page contents yet, there is
1210  * no difference between an exclusive lock and a cleanup-strength
1211  * lock.
1212  */
1214  flags |= EB_LOCK_FIRST;
1215 
1216  return ExtendBufferedRel(BMR_REL(rel), forkNum, strategy, flags);
1217  }
1218 
1219  if (rel)
1220  persistence = rel->rd_rel->relpersistence;
1221  else
1222  persistence = smgr_persistence;
1223 
1225  mode == RBM_ZERO_AND_LOCK))
1226  {
1227  bool found;
1228 
1229  buffer = PinBufferForBlock(rel, smgr, persistence,
1230  forkNum, blockNum, strategy, &found);
1231  ZeroAndLockBuffer(buffer, mode, found);
1232  return buffer;
1233  }
1234 
1235  if (mode == RBM_ZERO_ON_ERROR)
1237  else
1238  flags = 0;
1239  operation.smgr = smgr;
1240  operation.rel = rel;
1241  operation.persistence = persistence;
1242  operation.forknum = forkNum;
1243  operation.strategy = strategy;
1244  if (StartReadBuffer(&operation,
1245  &buffer,
1246  blockNum,
1247  flags))
1248  WaitReadBuffers(&operation);
1249 
1250  return buffer;
1251 }
1252 
1253 static pg_attribute_always_inline bool
1255  Buffer *buffers,
1256  BlockNumber blockNum,
1257  int *nblocks,
1258  int flags)
1259 {
1260  int actual_nblocks = *nblocks;
1261  int io_buffers_len = 0;
1262 
1263  Assert(*nblocks > 0);
1264  Assert(*nblocks <= MAX_IO_COMBINE_LIMIT);
1265 
1266  for (int i = 0; i < actual_nblocks; ++i)
1267  {
1268  bool found;
1269 
1270  buffers[i] = PinBufferForBlock(operation->rel,
1271  operation->smgr,
1272  operation->persistence,
1273  operation->forknum,
1274  blockNum + i,
1275  operation->strategy,
1276  &found);
1277 
1278  if (found)
1279  {
1280  /*
1281  * Terminate the read as soon as we get a hit. It could be a
1282  * single buffer hit, or it could be a hit that follows a readable
1283  * range. We don't want to create more than one readable range,
1284  * so we stop here.
1285  */
1286  actual_nblocks = i + 1;
1287  break;
1288  }
1289  else
1290  {
1291  /* Extend the readable range to cover this block. */
1292  io_buffers_len++;
1293  }
1294  }
1295  *nblocks = actual_nblocks;
1296 
1297  if (likely(io_buffers_len == 0))
1298  return false;
1299 
1300  /* Populate information needed for I/O. */
1301  operation->buffers = buffers;
1302  operation->blocknum = blockNum;
1303  operation->flags = flags;
1304  operation->nblocks = actual_nblocks;
1305  operation->io_buffers_len = io_buffers_len;
1306 
1307  if (flags & READ_BUFFERS_ISSUE_ADVICE)
1308  {
1309  /*
1310  * In theory we should only do this if PinBufferForBlock() had to
1311  * allocate new buffers above. That way, if two calls to
1312  * StartReadBuffers() were made for the same blocks before
1313  * WaitReadBuffers(), only the first would issue the advice. That'd be
1314  * a better simulation of true asynchronous I/O, which would only
1315  * start the I/O once, but isn't done here for simplicity. Note also
1316  * that the following call might actually issue two advice calls if we
1317  * cross a segment boundary; in a true asynchronous version we might
1318  * choose to process only one real I/O at a time in that case.
1319  */
1320  smgrprefetch(operation->smgr,
1321  operation->forknum,
1322  blockNum,
1323  operation->io_buffers_len);
1324  }
1325 
1326  /* Indicate that WaitReadBuffers() should be called. */
1327  return true;
1328 }
1329 
1330 /*
1331  * Begin reading a range of blocks beginning at blockNum and extending for
1332  * *nblocks. On return, up to *nblocks pinned buffers holding those blocks
1333  * are written into the buffers array, and *nblocks is updated to contain the
1334  * actual number, which may be fewer than requested. Caller sets some of the
1335  * members of operation; see struct definition.
1336  *
1337  * If false is returned, no I/O is necessary. If true is returned, one I/O
1338  * has been started, and WaitReadBuffers() must be called with the same
1339  * operation object before the buffers are accessed. Along with the operation
1340  * object, the caller-supplied array of buffers must remain valid until
1341  * WaitReadBuffers() is called.
1342  *
1343  * Currently the I/O is only started with optional operating system advice if
1344  * requested by the caller with READ_BUFFERS_ISSUE_ADVICE, and the real I/O
1345  * happens synchronously in WaitReadBuffers(). In future work, true I/O could
1346  * be initiated here.
1347  */
1348 bool
1350  Buffer *buffers,
1351  BlockNumber blockNum,
1352  int *nblocks,
1353  int flags)
1354 {
1355  return StartReadBuffersImpl(operation, buffers, blockNum, nblocks, flags);
1356 }
1357 
1358 /*
1359  * Single block version of the StartReadBuffers(). This might save a few
1360  * instructions when called from another translation unit, because it is
1361  * specialized for nblocks == 1.
1362  */
1363 bool
1365  Buffer *buffer,
1366  BlockNumber blocknum,
1367  int flags)
1368 {
1369  int nblocks = 1;
1370  bool result;
1371 
1372  result = StartReadBuffersImpl(operation, buffer, blocknum, &nblocks, flags);
1373  Assert(nblocks == 1); /* single block can't be short */
1374 
1375  return result;
1376 }
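/*
 * [Editor's illustrative sketch, not part of bufmgr.c] Driving the two-step
 * read API directly, filling in the operation fields the same way
 * ReadBuffer_common() does above.  "rel" is assumed to be an open, permanent
 * relation and "want" at least 1; real callers normally go through the read
 * stream layer instead.
 */
static void
example_read_range(Relation rel, BlockNumber start, int want)
{
	ReadBuffersOperation operation;
	Buffer		buffers[MAX_IO_COMBINE_LIMIT];
	int			nblocks = Min(want, MAX_IO_COMBINE_LIMIT);

	operation.smgr = RelationGetSmgr(rel);
	operation.rel = rel;
	operation.persistence = rel->rd_rel->relpersistence;
	operation.forknum = MAIN_FORKNUM;
	operation.strategy = NULL;

	/* Pin up to nblocks consecutive blocks; start I/O if needed. */
	if (StartReadBuffers(&operation, buffers, start, &nblocks, 0))
		WaitReadBuffers(&operation);

	/* nblocks was updated to the number of buffers actually pinned. */
	for (int i = 0; i < nblocks; i++)
		ReleaseBuffer(buffers[i]);
}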
1377 
1378 static inline bool
1380 {
1381  if (BufferIsLocal(buffer))
1382  {
1383  BufferDesc *bufHdr = GetLocalBufferDescriptor(-buffer - 1);
1384 
1385  return (pg_atomic_read_u32(&bufHdr->state) & BM_VALID) == 0;
1386  }
1387  else
1388  return StartBufferIO(GetBufferDescriptor(buffer - 1), true, nowait);
1389 }
1390 
1391 void
1393 {
1394  Buffer *buffers;
1395  int nblocks;
1396  BlockNumber blocknum;
1397  ForkNumber forknum;
1398  IOContext io_context;
1399  IOObject io_object;
1400  char persistence;
1401 
1402  /*
1403  * Currently operations are only allowed to include a read of some range,
1404  * with an optional extra buffer that is already pinned at the end. So
1405  * nblocks can be at most one more than io_buffers_len.
1406  */
1407  Assert((operation->nblocks == operation->io_buffers_len) ||
1408  (operation->nblocks == operation->io_buffers_len + 1));
1409 
1410  /* Find the range of the physical read we need to perform. */
1411  nblocks = operation->io_buffers_len;
1412  if (nblocks == 0)
1413  return; /* nothing to do */
1414 
1415  buffers = &operation->buffers[0];
1416  blocknum = operation->blocknum;
1417  forknum = operation->forknum;
1418  persistence = operation->persistence;
1419 
1420  if (persistence == RELPERSISTENCE_TEMP)
1421  {
1422  io_context = IOCONTEXT_NORMAL;
1423  io_object = IOOBJECT_TEMP_RELATION;
1424  }
1425  else
1426  {
1427  io_context = IOContextForStrategy(operation->strategy);
1428  io_object = IOOBJECT_RELATION;
1429  }
1430 
1431  /*
1432  * We count all these blocks as read by this backend. This is traditional
1433  * behavior, but might turn out to be not true if we find that someone
1434  * else has beaten us and completed the read of some of these blocks. In
1435  * that case the system globally double-counts, but we traditionally don't
1436  * count this as a "hit", and we don't have a separate counter for "miss,
1437  * but another backend completed the read".
1438  */
1439  if (persistence == RELPERSISTENCE_TEMP)
1440  pgBufferUsage.local_blks_read += nblocks;
1441  else
1442  pgBufferUsage.shared_blks_read += nblocks;
1443 
1444  for (int i = 0; i < nblocks; ++i)
1445  {
1446  int io_buffers_len;
1447  Buffer io_buffers[MAX_IO_COMBINE_LIMIT];
1448  void *io_pages[MAX_IO_COMBINE_LIMIT];
1449  instr_time io_start;
1450  BlockNumber io_first_block;
1451 
1452  /*
1453  * Skip this block if someone else has already completed it. If an
1454  * I/O is already in progress in another backend, this will wait for
1455  * the outcome: either done, or something went wrong and we will
1456  * retry.
1457  */
1458  if (!WaitReadBuffersCanStartIO(buffers[i], false))
1459  {
1460  /*
1461  * Report this as a 'hit' for this backend, even though it must
1462  * have started out as a miss in PinBufferForBlock().
1463  */
1464  TRACE_POSTGRESQL_BUFFER_READ_DONE(forknum, blocknum + i,
1465  operation->smgr->smgr_rlocator.locator.spcOid,
1466  operation->smgr->smgr_rlocator.locator.dbOid,
1467  operation->smgr->smgr_rlocator.locator.relNumber,
1468  operation->smgr->smgr_rlocator.backend,
1469  true);
1470  continue;
1471  }
1472 
1473  /* We found a buffer that we need to read in. */
1474  io_buffers[0] = buffers[i];
1475  io_pages[0] = BufferGetBlock(buffers[i]);
1476  io_first_block = blocknum + i;
1477  io_buffers_len = 1;
1478 
1479  /*
1480  * How many neighboring-on-disk blocks can we scatter-read into
1481  * other buffers at the same time? In this case we don't wait if we
1482  * see an I/O already in progress. We already hold BM_IO_IN_PROGRESS
1483  * for the head block, so we should get on with that I/O as soon as
1484  * possible. We'll come back to this block again, above.
1485  */
1486  while ((i + 1) < nblocks &&
1487  WaitReadBuffersCanStartIO(buffers[i + 1], true))
1488  {
1489  /* Must be consecutive block numbers. */
1490  Assert(BufferGetBlockNumber(buffers[i + 1]) ==
1491  BufferGetBlockNumber(buffers[i]) + 1);
1492 
1493  io_buffers[io_buffers_len] = buffers[++i];
1494  io_pages[io_buffers_len++] = BufferGetBlock(buffers[i]);
1495  }
1496 
1498  smgrreadv(operation->smgr, forknum, io_first_block, io_pages, io_buffers_len);
1499  pgstat_count_io_op_time(io_object, io_context, IOOP_READ, io_start,
1500  io_buffers_len);
1501 
1502  /* Verify each block we read, and terminate the I/O. */
1503  for (int j = 0; j < io_buffers_len; ++j)
1504  {
1505  BufferDesc *bufHdr;
1506  Block bufBlock;
1507 
1508  if (persistence == RELPERSISTENCE_TEMP)
1509  {
1510  bufHdr = GetLocalBufferDescriptor(-io_buffers[j] - 1);
1511  bufBlock = LocalBufHdrGetBlock(bufHdr);
1512  }
1513  else
1514  {
1515  bufHdr = GetBufferDescriptor(io_buffers[j] - 1);
1516  bufBlock = BufHdrGetBlock(bufHdr);
1517  }
1518 
1519  /* check for garbage data */
1520  if (!PageIsVerifiedExtended((Page) bufBlock, io_first_block + j,
1522  {
1523  if ((operation->flags & READ_BUFFERS_ZERO_ON_ERROR) || zero_damaged_pages)
1524  {
1525  ereport(WARNING,
1527  errmsg("invalid page in block %u of relation %s; zeroing out page",
1528  io_first_block + j,
1529  relpath(operation->smgr->smgr_rlocator, forknum))));
1530  memset(bufBlock, 0, BLCKSZ);
1531  }
1532  else
1533  ereport(ERROR,
1535  errmsg("invalid page in block %u of relation %s",
1536  io_first_block + j,
1537  relpath(operation->smgr->smgr_rlocator, forknum))));
1538  }
1539 
1540  /* Terminate I/O and set BM_VALID. */
1541  if (persistence == RELPERSISTENCE_TEMP)
1542  {
1543  uint32 buf_state = pg_atomic_read_u32(&bufHdr->state);
1544 
1545  buf_state |= BM_VALID;
1546  pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
1547  }
1548  else
1549  {
1550  /* Set BM_VALID, terminate IO, and wake up any waiters */
1551  TerminateBufferIO(bufHdr, false, BM_VALID, true);
1552  }
1553 
1554  /* Report I/Os as completing individually. */
1555  TRACE_POSTGRESQL_BUFFER_READ_DONE(forknum, io_first_block + j,
1556  operation->smgr->smgr_rlocator.locator.spcOid,
1557  operation->smgr->smgr_rlocator.locator.dbOid,
1558  operation->smgr->smgr_rlocator.locator.relNumber,
1559  operation->smgr->smgr_rlocator.backend,
1560  false);
1561  }
1562 
1563  if (VacuumCostActive)
1564  VacuumCostBalance += VacuumCostPageMiss * io_buffers_len;
1565  }
1566 }
1567 
1568 /*
1569  * BufferAlloc -- subroutine for PinBufferForBlock. Handles lookup of a shared
1570  * buffer. If no buffer exists already, selects a replacement victim and
1571  * evicts the old page, but does NOT read in new page.
1572  *
1573  * "strategy" can be a buffer replacement strategy object, or NULL for
1574  * the default strategy. The selected buffer's usage_count is advanced when
1575  * using the default strategy, but otherwise possibly not (see PinBuffer).
1576  *
1577  * The returned buffer is pinned and is already marked as holding the
1578  * desired page. If it already did have the desired page, *foundPtr is
1579  * set true. Otherwise, *foundPtr is set false.
1580  *
1581  * io_context is passed as an output parameter to avoid calling
1582  * IOContextForStrategy() when there is a shared buffers hit and no IO
1583  * statistics need be captured.
1584  *
1585  * No locks are held either at entry or exit.
1586  */
1588 BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
1589  BlockNumber blockNum,
1590  BufferAccessStrategy strategy,
1591  bool *foundPtr, IOContext io_context)
1592 {
1593  BufferTag newTag; /* identity of requested block */
1594  uint32 newHash; /* hash value for newTag */
1595  LWLock *newPartitionLock; /* buffer partition lock for it */
1596  int existing_buf_id;
1597  Buffer victim_buffer;
1598  BufferDesc *victim_buf_hdr;
1599  uint32 victim_buf_state;
1600 
1601  /* Make sure we will have room to remember the buffer pin */
1604 
1605  /* create a tag so we can lookup the buffer */
1606  InitBufferTag(&newTag, &smgr->smgr_rlocator.locator, forkNum, blockNum);
1607 
1608  /* determine its hash code and partition lock ID */
1609  newHash = BufTableHashCode(&newTag);
1610  newPartitionLock = BufMappingPartitionLock(newHash);
1611 
1612  /* see if the block is in the buffer pool already */
1613  LWLockAcquire(newPartitionLock, LW_SHARED);
1614  existing_buf_id = BufTableLookup(&newTag, newHash);
1615  if (existing_buf_id >= 0)
1616  {
1617  BufferDesc *buf;
1618  bool valid;
1619 
1620  /*
1621  * Found it. Now, pin the buffer so no one can steal it from the
1622  * buffer pool, and check to see if the correct data has been loaded
1623  * into the buffer.
1624  */
1625  buf = GetBufferDescriptor(existing_buf_id);
1626 
1627  valid = PinBuffer(buf, strategy);
1628 
1629  /* Can release the mapping lock as soon as we've pinned it */
1630  LWLockRelease(newPartitionLock);
1631 
1632  *foundPtr = true;
1633 
1634  if (!valid)
1635  {
1636  /*
1637  * We can only get here if (a) someone else is still reading in
1638  * the page, (b) a previous read attempt failed, or (c) someone
1639  * called StartReadBuffers() but not yet WaitReadBuffers().
1640  */
1641  *foundPtr = false;
1642  }
1643 
1644  return buf;
1645  }
1646 
1647  /*
1648  * Didn't find it in the buffer pool. We'll have to initialize a new
1649  * buffer. Remember to unlock the mapping lock while doing the work.
1650  */
1651  LWLockRelease(newPartitionLock);
1652 
1653  /*
1654  * Acquire a victim buffer. Somebody else might try to do the same, we
1655  * don't hold any conflicting locks. If so we'll have to undo our work
1656  * later.
1657  */
1658  victim_buffer = GetVictimBuffer(strategy, io_context);
1659  victim_buf_hdr = GetBufferDescriptor(victim_buffer - 1);
1660 
1661  /*
1662  * Try to make a hashtable entry for the buffer under its new tag. If
1663  * somebody else inserted another buffer for the tag, we'll release the
1664  * victim buffer we acquired and use the already inserted one.
1665  */
1666  LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1667  existing_buf_id = BufTableInsert(&newTag, newHash, victim_buf_hdr->buf_id);
1668  if (existing_buf_id >= 0)
1669  {
1670  BufferDesc *existing_buf_hdr;
1671  bool valid;
1672 
1673  /*
1674  * Got a collision. Someone has already done what we were about to do.
1675  * We'll just handle this as if it were found in the buffer pool in
1676  * the first place. First, give up the buffer we were planning to
1677  * use.
1678  *
1679  * We could do this after releasing the partition lock, but then we'd
1680  * have to call ResourceOwnerEnlarge() & ReservePrivateRefCountEntry()
1681  * before acquiring the lock, for the rare case of such a collision.
1682  */
1683  UnpinBuffer(victim_buf_hdr);
1684 
1685  /*
1686  * The victim buffer we acquired previously is clean and unused, let
1687  * it be found again quickly
1688  */
1689  StrategyFreeBuffer(victim_buf_hdr);
1690 
1691  /* remaining code should match code at top of routine */
1692 
1693  existing_buf_hdr = GetBufferDescriptor(existing_buf_id);
1694 
1695  valid = PinBuffer(existing_buf_hdr, strategy);
1696 
1697  /* Can release the mapping lock as soon as we've pinned it */
1698  LWLockRelease(newPartitionLock);
1699 
1700  *foundPtr = true;
1701 
1702  if (!valid)
1703  {
1704  /*
1705  * We can only get here if (a) someone else is still reading in
1706  * the page, (b) a previous read attempt failed, or (c) someone
1707  * called StartReadBuffers() but not yet WaitReadBuffers().
1708  */
1709  *foundPtr = false;
1710  }
1711 
1712  return existing_buf_hdr;
1713  }
1714 
1715  /*
1716  * Need to lock the buffer header too in order to change its tag.
1717  */
1718  victim_buf_state = LockBufHdr(victim_buf_hdr);
1719 
1720  /* some sanity checks while we hold the buffer header lock */
1721  Assert(BUF_STATE_GET_REFCOUNT(victim_buf_state) == 1);
1722  Assert(!(victim_buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY | BM_IO_IN_PROGRESS)));
1723 
1724  victim_buf_hdr->tag = newTag;
1725 
1726  /*
1727  * Make sure BM_PERMANENT is set for buffers that must be written at every
1728  * checkpoint. Unlogged buffers only need to be written at shutdown
1729  * checkpoints, except for their "init" forks, which need to be treated
1730  * just like permanent relations.
1731  */
1732  victim_buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
1733  if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM)
1734  victim_buf_state |= BM_PERMANENT;
1735 
1736  UnlockBufHdr(victim_buf_hdr, victim_buf_state);
1737 
1738  LWLockRelease(newPartitionLock);
1739 
1740  /*
1741  * Buffer contents are currently invalid.
1742  */
1743  *foundPtr = false;
1744 
1745  return victim_buf_hdr;
1746 }
1747 
1748 /*
1749  * InvalidateBuffer -- mark a shared buffer invalid and return it to the
1750  * freelist.
1751  *
1752  * The buffer header spinlock must be held at entry. We drop it before
1753  * returning. (This is sane because the caller must have locked the
1754  * buffer in order to be sure it should be dropped.)
1755  *
1756  * This is used only in contexts such as dropping a relation. We assume
1757  * that no other backend could possibly be interested in using the page,
1758  * so the only reason the buffer might be pinned is if someone else is
1759  * trying to write it out. We have to let them finish before we can
1760  * reclaim the buffer.
1761  *
1762  * The buffer could get reclaimed by someone else while we are waiting
1763  * to acquire the necessary locks; if so, don't mess it up.
1764  */
1765 static void
1767 {
1768  BufferTag oldTag;
1769  uint32 oldHash; /* hash value for oldTag */
1770  LWLock *oldPartitionLock; /* buffer partition lock for it */
1771  uint32 oldFlags;
1772  uint32 buf_state;
1773 
1774  /* Save the original buffer tag before dropping the spinlock */
1775  oldTag = buf->tag;
1776 
1777  buf_state = pg_atomic_read_u32(&buf->state);
1778  Assert(buf_state & BM_LOCKED);
1779  UnlockBufHdr(buf, buf_state);
1780 
1781  /*
1782  * Need to compute the old tag's hashcode and partition lock ID. XXX is it
1783  * worth storing the hashcode in BufferDesc so we need not recompute it
1784  * here? Probably not.
1785  */
1786  oldHash = BufTableHashCode(&oldTag);
1787  oldPartitionLock = BufMappingPartitionLock(oldHash);
1788 
1789 retry:
1790 
1791  /*
1792  * Acquire exclusive mapping lock in preparation for changing the buffer's
1793  * association.
1794  */
1795  LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
1796 
1797  /* Re-lock the buffer header */
1798  buf_state = LockBufHdr(buf);
1799 
1800  /* If it's changed while we were waiting for lock, do nothing */
1801  if (!BufferTagsEqual(&buf->tag, &oldTag))
1802  {
1803  UnlockBufHdr(buf, buf_state);
1804  LWLockRelease(oldPartitionLock);
1805  return;
1806  }
1807 
1808  /*
1809  * We assume the only reason for it to be pinned is that someone else is
1810  * flushing the page out. Wait for them to finish. (This could be an
1811  * infinite loop if the refcount is messed up... it would be nice to time
1812  * out after awhile, but there seems no way to be sure how many loops may
1813  * be needed. Note that if the other guy has pinned the buffer but not
1814  * yet done StartBufferIO, WaitIO will fall through and we'll effectively
1815  * be busy-looping here.)
1816  */
1817  if (BUF_STATE_GET_REFCOUNT(buf_state) != 0)
1818  {
1819  UnlockBufHdr(buf, buf_state);
1820  LWLockRelease(oldPartitionLock);
1821  /* safety check: should definitely not be our *own* pin */
1823  elog(ERROR, "buffer is pinned in InvalidateBuffer");
1824  WaitIO(buf);
1825  goto retry;
1826  }
1827 
1828  /*
1829  * Clear out the buffer's tag and flags. We must do this to ensure that
1830  * linear scans of the buffer array don't think the buffer is valid.
1831  */
1832  oldFlags = buf_state & BUF_FLAG_MASK;
1833  ClearBufferTag(&buf->tag);
1834  buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
1835  UnlockBufHdr(buf, buf_state);
1836 
1837  /*
1838  * Remove the buffer from the lookup hashtable, if it was in there.
1839  */
1840  if (oldFlags & BM_TAG_VALID)
1841  BufTableDelete(&oldTag, oldHash);
1842 
1843  /*
1844  * Done with mapping lock.
1845  */
1846  LWLockRelease(oldPartitionLock);
1847 
1848  /*
1849  * Insert the buffer at the head of the list of free buffers.
1850  */
1852 }
1853 
1854 /*
1855  * Helper routine for GetVictimBuffer()
1856  *
1857  * Needs to be called on a buffer with a valid tag, pinned, but without the
1858  * buffer header spinlock held.
1859  *
1860  * Returns true if the buffer can be reused, in which case the buffer is only
1861  * pinned by this backend and marked as invalid, false otherwise.
1862  */
1863 static bool
1864 InvalidateVictimBuffer(BufferDesc *buf_hdr)
1865 {
1866  uint32 buf_state;
1867  uint32 hash;
1868  LWLock *partition_lock;
1869  BufferTag tag;
1870 
1872 
1873  /* have buffer pinned, so it's safe to read tag without lock */
1874  tag = buf_hdr->tag;
1875 
1876  hash = BufTableHashCode(&tag);
1877  partition_lock = BufMappingPartitionLock(hash);
1878 
1879  LWLockAcquire(partition_lock, LW_EXCLUSIVE);
1880 
1881  /* lock the buffer header */
1882  buf_state = LockBufHdr(buf_hdr);
1883 
1884  /*
1885  * We have the buffer pinned, so nobody else should have been able to unset
1886  * this concurrently.
1887  */
1888  Assert(buf_state & BM_TAG_VALID);
1889  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
1890  Assert(BufferTagsEqual(&buf_hdr->tag, &tag));
1891 
1892  /*
1893  * If somebody else pinned the buffer since, or even worse, dirtied it,
1894  * give up on this buffer: It's clearly in use.
1895  */
1896  if (BUF_STATE_GET_REFCOUNT(buf_state) != 1 || (buf_state & BM_DIRTY))
1897  {
1898  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
1899 
1900  UnlockBufHdr(buf_hdr, buf_state);
1901  LWLockRelease(partition_lock);
1902 
1903  return false;
1904  }
1905 
1906  /*
1907  * Clear out the buffer's tag and flags and usagecount. This is not
1908  * strictly required, as BM_TAG_VALID/BM_VALID needs to be checked before
1909  * doing anything with the buffer. But currently it's beneficial, as the
1910  * cheaper pre-check used by several linear scans of shared buffers relies
1911  * on the tag (see e.g. FlushDatabaseBuffers()).
1912  */
1913  ClearBufferTag(&buf_hdr->tag);
1914  buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
1915  UnlockBufHdr(buf_hdr, buf_state);
1916 
1917  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
1918 
1919  /* finally delete buffer from the buffer mapping table */
1920  BufTableDelete(&tag, hash);
1921 
1922  LWLockRelease(partition_lock);
1923 
1924  Assert(!(buf_state & (BM_DIRTY | BM_VALID | BM_TAG_VALID)));
1925  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
1927 
1928  return true;
1929 }
1930 
1931 static Buffer
1932 GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
1933 {
1934  BufferDesc *buf_hdr;
1935  Buffer buf;
1936  uint32 buf_state;
1937  bool from_ring;
1938 
1939  /*
1940  * Ensure, while the spinlock's not yet held, that there's a free refcount
1941  * entry, and a resource owner slot for the pin.
1942  */
1943  ReservePrivateRefCountEntry();
1944  ResourceOwnerEnlarge(CurrentResourceOwner);
1945 
1946  /* we return here if a prospective victim buffer gets used concurrently */
1947 again:
1948 
1949  /*
1950  * Select a victim buffer. The buffer is returned with its header
1951  * spinlock still held!
1952  */
1953  buf_hdr = StrategyGetBuffer(strategy, &buf_state, &from_ring);
1954  buf = BufferDescriptorGetBuffer(buf_hdr);
1955 
1956  Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 0);
1957 
1958  /* Pin the buffer and then release the buffer spinlock */
1959  PinBuffer_Locked(buf_hdr);
1960 
1961  /*
1962  * We shouldn't have any other pins for this buffer.
1963  */
1964  CheckBufferIsPinnedOnce(buf);
1965 
1966  /*
1967  * If the buffer was dirty, try to write it out. There is a race
1968  * condition here, in that someone might dirty it after we released the
1969  * buffer header lock above, or even while we are writing it out (since
1970  * our share-lock won't prevent hint-bit updates). We will recheck the
1971  * dirty bit after re-locking the buffer header.
1972  */
1973  if (buf_state & BM_DIRTY)
1974  {
1975  LWLock *content_lock;
1976 
1977  Assert(buf_state & BM_TAG_VALID);
1978  Assert(buf_state & BM_VALID);
1979 
1980  /*
1981  * We need a share-lock on the buffer contents to write it out (else
1982  * we might write invalid data, eg because someone else is compacting
1983  * the page contents while we write). We must use a conditional lock
1984  * acquisition here to avoid deadlock. Even though the buffer was not
1985  * pinned (and therefore surely not locked) when StrategyGetBuffer
1986  * returned it, someone else could have pinned and exclusive-locked it
1987  * by the time we get here. If we try to get the lock unconditionally,
1988  * we'd block waiting for them; if they later block waiting for us,
1989  * deadlock ensues. (This has been observed to happen when two
1990  * backends are both trying to split btree index pages, and the second
1991  * one just happens to be trying to split the page the first one got
1992  * from StrategyGetBuffer.)
1993  */
1994  content_lock = BufferDescriptorGetContentLock(buf_hdr);
1995  if (!LWLockConditionalAcquire(content_lock, LW_SHARED))
1996  {
1997  /*
1998  * Someone else has locked the buffer, so give it up and loop back
1999  * to get another one.
2000  */
2001  UnpinBuffer(buf_hdr);
2002  goto again;
2003  }
2004 
2005  /*
2006  * If using a nondefault strategy, and writing the buffer would
2007  * require a WAL flush, let the strategy decide whether to go ahead
2008  * and write/reuse the buffer or to choose another victim. We need a
2009  * lock to inspect the page LSN, so this can't be done inside
2010  * StrategyGetBuffer.
2011  */
2012  if (strategy != NULL)
2013  {
2014  XLogRecPtr lsn;
2015 
2016  /* Read the LSN while holding buffer header lock */
2017  buf_state = LockBufHdr(buf_hdr);
2018  lsn = BufferGetLSN(buf_hdr);
2019  UnlockBufHdr(buf_hdr, buf_state);
2020 
2021  if (XLogNeedsFlush(lsn)
2022  && StrategyRejectBuffer(strategy, buf_hdr, from_ring))
2023  {
2024  LWLockRelease(content_lock);
2025  UnpinBuffer(buf_hdr);
2026  goto again;
2027  }
2028  }
2029 
2030  /* OK, do the I/O */
2031  FlushBuffer(buf_hdr, NULL, IOOBJECT_RELATION, io_context);
2032  LWLockRelease(content_lock);
2033 
2034  ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
2035  &buf_hdr->tag);
2036  }
2037 
2038 
2039  if (buf_state & BM_VALID)
2040  {
2041  /*
2042  * When a BufferAccessStrategy is in use, blocks evicted from shared
2043  * buffers are counted as IOOP_EVICT in the corresponding context
2044  * (e.g. IOCONTEXT_BULKWRITE). Shared buffers are evicted by a
2045  * strategy in two cases: 1) while initially claiming buffers for the
2046  * strategy ring 2) to replace an existing strategy ring buffer
2047  * because it is pinned or in use and cannot be reused.
2048  *
2049  * Blocks evicted from buffers already in the strategy ring are
2050  * counted as IOOP_REUSE in the corresponding strategy context.
2051  *
2052  * At this point, we can accurately count evictions and reuses,
2053  * because we have successfully claimed the valid buffer. Previously,
2054  * we may have been forced to release the buffer due to concurrent
2055  * pinners or erroring out.
2056  */
2057  pgstat_count_io_op(IOOBJECT_RELATION, io_context,
2058  from_ring ? IOOP_REUSE : IOOP_EVICT);
2059  }
2060 
2061  /*
2062  * If the buffer has an entry in the buffer mapping table, delete it. This
2063  * can fail because another backend could have pinned or dirtied the
2064  * buffer.
2065  */
2066  if ((buf_state & BM_TAG_VALID) && !InvalidateVictimBuffer(buf_hdr))
2067  {
2068  UnpinBuffer(buf_hdr);
2069  goto again;
2070  }
2071 
2072  /* a final set of sanity checks */
2073 #ifdef USE_ASSERT_CHECKING
2074  buf_state = pg_atomic_read_u32(&buf_hdr->state);
2075 
2076  Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1);
2077  Assert(!(buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY)));
2078 
2080 #endif
2081 
2082  return buf;
2083 }
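The conditional-acquire-then-retry step above is a generic deadlock-avoidance pattern: never block on a lock whose holder might, in turn, end up waiting on you; try the lock, and on failure walk away and pick a different victim. A stand-alone analogue in plain pthreads (illustration only; the lock array, loop bound and function names are invented and have no counterpart in the backend):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t victim_locks[4] = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER
};

/* Return the index of some victim we managed to lock without blocking. */
static int
acquire_some_victim(void)
{
	for (int i = 0;; i = (i + 1) % 4)
	{
		if (pthread_mutex_trylock(&victim_locks[i]) == 0)
			return i;			/* got it; no risk of a deadlock cycle */
		/* held by someone else: give this victim up and try the next one */
	}
}

int
main(void)
{
	int			v = acquire_some_victim();

	printf("locked victim %d\n", v);
	pthread_mutex_unlock(&victim_locks[v]);
	return 0;
}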
2084 
2085 /*
2086  * Limit the number of pins a batch operation may additionally acquire, to
2087  * avoid running out of pinnable buffers.
2088  *
2089  * One additional pin is always allowed, as otherwise the operation likely
2090  * cannot be performed at all.
2091  *
2092  * The number of allowed pins for a backend is computed based on
2093  * shared_buffers and the maximum number of connections possible. That's very
2094  * pessimistic, but outside of toy-sized shared_buffers it should allow
2095  * sufficient pins.
2096  */
2097 void
2098 LimitAdditionalPins(uint32 *additional_pins)
2099 {
2100  uint32 max_backends;
2101  int max_proportional_pins;
2102 
2103  if (*additional_pins <= 1)
2104  return;
2105 
2106  max_backends = MaxBackends + NUM_AUXILIARY_PROCS;
2107  max_proportional_pins = NBuffers / max_backends;
2108 
2109  /*
2110  * Subtract the approximate number of buffers already pinned by this
2111  * backend. We get the number of "overflowed" pins for free, but don't
2112  * know the number of pins in PrivateRefCountArray. The cost of
2113  * calculating that exactly doesn't seem worth it, so just assume the max.
2114  */
2115  max_proportional_pins -= PrivateRefCountOverflowed + REFCOUNT_ARRAY_ENTRIES;
2116 
2117  if (max_proportional_pins <= 0)
2118  max_proportional_pins = 1;
2119 
2120  if (*additional_pins > max_proportional_pins)
2121  *additional_pins = max_proportional_pins;
2122 }
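A back-of-the-envelope model of the clamp above (a simplified sketch, not the backend code; the function and parameter names are invented, and the real routine reads NBuffers, MaxBackends and the private refcount bookkeeping directly):

#include <stdint.h>
#include <stdio.h>

static void
limit_additional_pins_model(uint32_t *additional_pins, uint32_t nbuffers,
							uint32_t max_backends, uint32_t assumed_pinned)
{
	int64_t		max_proportional = nbuffers / max_backends;

	if (*additional_pins <= 1)
		return;					/* one pin is always allowed */
	max_proportional -= assumed_pinned;
	if (max_proportional <= 0)
		max_proportional = 1;
	if (*additional_pins > (uint32_t) max_proportional)
		*additional_pins = (uint32_t) max_proportional;
}

int
main(void)
{
	uint32_t	pins = 1000;

	/* 16384 buffers (128MB), ~100 backends, 8 pins assumed already held */
	limit_additional_pins_model(&pins, 16384, 100, 8);
	printf("clamped to %u pins\n", pins);	/* 16384/100 - 8 = 155 */
	return 0;
}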
2123 
2124 /*
2125  * Logic shared between ExtendBufferedRelBy(), ExtendBufferedRelTo(). Just to
2126  * avoid duplicating the tracing and relpersistence related logic.
2127  */
2128 static BlockNumber
2129 ExtendBufferedRelCommon(BufferManagerRelation bmr,
2130  ForkNumber fork,
2131  BufferAccessStrategy strategy,
2132  uint32 flags,
2133  uint32 extend_by,
2134  BlockNumber extend_upto,
2135  Buffer *buffers,
2136  uint32 *extended_by)
2137 {
2138  BlockNumber first_block;
2139 
2140  TRACE_POSTGRESQL_BUFFER_EXTEND_START(fork,
2141  bmr.smgr->smgr_rlocator.locator.spcOid,
2142  bmr.smgr->smgr_rlocator.locator.dbOid,
2143  bmr.smgr->smgr_rlocator.locator.relNumber,
2144  bmr.smgr->smgr_rlocator.backend,
2145  extend_by);
2146 
2147  if (bmr.relpersistence == RELPERSISTENCE_TEMP)
2148  first_block = ExtendBufferedRelLocal(bmr, fork, flags,
2149  extend_by, extend_upto,
2150  buffers, &extend_by);
2151  else
2152  first_block = ExtendBufferedRelShared(bmr, fork, strategy, flags,
2153  extend_by, extend_upto,
2154  buffers, &extend_by);
2155  *extended_by = extend_by;
2156 
2157  TRACE_POSTGRESQL_BUFFER_EXTEND_DONE(fork,
2158  bmr.smgr->smgr_rlocator.locator.spcOid,
2159  bmr.smgr->smgr_rlocator.locator.dbOid,
2160  bmr.smgr->smgr_rlocator.locator.relNumber,
2161  bmr.smgr->smgr_rlocator.backend,
2162  *extended_by,
2163  first_block);
2164 
2165  return first_block;
2166 }
2167 
2168 /*
2169  * Implementation of ExtendBufferedRelBy() and ExtendBufferedRelTo() for
2170  * shared buffers.
2171  */
2172 static BlockNumber
2173 ExtendBufferedRelShared(BufferManagerRelation bmr,
2174  ForkNumber fork,
2175  BufferAccessStrategy strategy,
2176  uint32 flags,
2177  uint32 extend_by,
2178  BlockNumber extend_upto,
2179  Buffer *buffers,
2180  uint32 *extended_by)
2181 {
2182  BlockNumber first_block;
2183  IOContext io_context = IOContextForStrategy(strategy);
2184  instr_time io_start;
2185 
2186  LimitAdditionalPins(&extend_by);
2187 
2188  /*
2189  * Acquire victim buffers for extension without holding extension lock.
2190  * Writing out victim buffers is the most expensive part of extending the
2191  * relation, particularly when doing so requires WAL flushes. Zeroing out
2192  * the buffers is also quite expensive, so do that before holding the
2193  * extension lock as well.
2194  *
2195  * These pages are pinned by us and not valid. While we hold the pin they
2196  * can't be acquired as victim buffers by another backend.
2197  */
2198  for (uint32 i = 0; i < extend_by; i++)
2199  {
2200  Block buf_block;
2201 
2202  buffers[i] = GetVictimBuffer(strategy, io_context);
2203  buf_block = BufHdrGetBlock(GetBufferDescriptor(buffers[i] - 1));
2204 
2205  /* new buffers are zero-filled */
2206  MemSet((char *) buf_block, 0, BLCKSZ);
2207  }
2208 
2209  /*
2210  * Lock relation against concurrent extensions, unless requested not to.
2211  *
2212  * We use the same extension lock for all forks. That's unnecessarily
2213  * restrictive, but currently extensions for forks don't happen often
2214  * enough to make it worth locking more granularly.
2215  *
2216  * Note that another backend might have extended the relation by the time
2217  * we get the lock.
2218  */
2219  if (!(flags & EB_SKIP_EXTENSION_LOCK))
2220  LockRelationForExtension(bmr.rel, ExclusiveLock);
2221 
2222  /*
2223  * If requested, invalidate size cache, so that smgrnblocks asks the
2224  * kernel.
2225  */
2226  if (flags & EB_CLEAR_SIZE_CACHE)
2227  bmr.smgr->smgr_cached_nblocks[fork] = InvalidBlockNumber;
2228 
2229  first_block = smgrnblocks(bmr.smgr, fork);
2230 
2231  /*
2232  * Now that we have the accurate relation size, check if the caller wants
2233  * us to extend to only up to a specific size. If there were concurrent
2234  * extensions, we might have acquired too many buffers and need to release
2235  * them.
2236  */
2237  if (extend_upto != InvalidBlockNumber)
2238  {
2239  uint32 orig_extend_by = extend_by;
2240 
2241  if (first_block > extend_upto)
2242  extend_by = 0;
2243  else if ((uint64) first_block + extend_by > extend_upto)
2244  extend_by = extend_upto - first_block;
2245 
2246  for (uint32 i = extend_by; i < orig_extend_by; i++)
2247  {
2248  BufferDesc *buf_hdr = GetBufferDescriptor(buffers[i] - 1);
2249 
2250  /*
2251  * The victim buffer we acquired previously is clean and unused,
2252  * let it be found again quickly
2253  */
2254  StrategyFreeBuffer(buf_hdr);
2255  UnpinBuffer(buf_hdr);
2256  }
2257 
2258  if (extend_by == 0)
2259  {
2260  if (!(flags & EB_SKIP_EXTENSION_LOCK))
2261  UnlockRelationForExtension(bmr.rel, ExclusiveLock);
2262  *extended_by = extend_by;
2263  return first_block;
2264  }
2265  }
2266 
2267  /* Fail if relation is already at maximum possible length */
2268  if ((uint64) first_block + extend_by >= MaxBlockNumber)
2269  ereport(ERROR,
2270  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
2271  errmsg("cannot extend relation %s beyond %u blocks",
2272  relpath(bmr.smgr->smgr_rlocator, fork),
2273  MaxBlockNumber)));
2274 
2275  /*
2276  * Insert buffers into buffer table, mark as IO_IN_PROGRESS.
2277  *
2278  * This needs to happen before we extend the relation, because as soon as
2279  * we do, other backends can start to read in those pages.
2280  */
2281  for (uint32 i = 0; i < extend_by; i++)
2282  {
2283  Buffer victim_buf = buffers[i];
2284  BufferDesc *victim_buf_hdr = GetBufferDescriptor(victim_buf - 1);
2285  BufferTag tag;
2286  uint32 hash;
2287  LWLock *partition_lock;
2288  int existing_id;
2289 
2290  /* in case we need to pin an existing buffer below */
2291  ResourceOwnerEnlarge(CurrentResourceOwner);
2292  ReservePrivateRefCountEntry();
2293 
2294  InitBufferTag(&tag, &bmr.smgr->smgr_rlocator.locator, fork, first_block + i);
2295  hash = BufTableHashCode(&tag);
2296  partition_lock = BufMappingPartitionLock(hash);
2297 
2298  LWLockAcquire(partition_lock, LW_EXCLUSIVE);
2299 
2300  existing_id = BufTableInsert(&tag, hash, victim_buf_hdr->buf_id);
2301 
2302  /*
2303  * We get here only in the corner case where we are trying to extend
2304  * the relation but we found a pre-existing buffer. This can happen
2305  * because a prior attempt at extending the relation failed, and
2306  * because mdread doesn't complain about reads beyond EOF (when
2307  * zero_damaged_pages is ON) and so a previous attempt to read a block
2308  * beyond EOF could have left a "valid" zero-filled buffer.
2309  * Unfortunately, we have also seen this case occurring because of
2310  * buggy Linux kernels that sometimes return an lseek(SEEK_END) result
2311  * that doesn't account for a recent write. In that situation, the
2312  * pre-existing buffer would contain valid data that we don't want to
2313  * overwrite. Since the legitimate cases should always have left a
2314  * zero-filled buffer, complain if not PageIsNew.
2315  */
2316  if (existing_id >= 0)
2317  {
2318  BufferDesc *existing_hdr = GetBufferDescriptor(existing_id);
2319  Block buf_block;
2320  bool valid;
2321 
2322  /*
2323  * Pin the existing buffer before releasing the partition lock,
2324  * preventing it from being evicted.
2325  */
2326  valid = PinBuffer(existing_hdr, strategy);
2327 
2328  LWLockRelease(partition_lock);
2329 
2330  /*
2331  * The victim buffer we acquired previously is clean and unused,
2332  * let it be found again quickly
2333  */
2334  StrategyFreeBuffer(victim_buf_hdr);
2335  UnpinBuffer(victim_buf_hdr);
2336 
2337  buffers[i] = BufferDescriptorGetBuffer(existing_hdr);
2338  buf_block = BufHdrGetBlock(existing_hdr);
2339 
2340  if (valid && !PageIsNew((Page) buf_block))
2341  ereport(ERROR,
2342  (errmsg("unexpected data beyond EOF in block %u of relation %s",
2343  existing_hdr->tag.blockNum, relpath(bmr.smgr->smgr_rlocator, fork)),
2344  errhint("This has been seen to occur with buggy kernels; consider updating your system.")));
2345 
2346  /*
2347  * We *must* do smgr[zero]extend before succeeding, else the page
2348  * will not be reserved by the kernel, and the next P_NEW call
2349  * will decide to return the same page. Clear the BM_VALID bit,
2350  * do StartBufferIO() and proceed.
2351  *
2352  * Loop to handle the very small possibility that someone re-sets
2353  * BM_VALID between our clearing it and StartBufferIO inspecting
2354  * it.
2355  */
2356  do
2357  {
2358  uint32 buf_state = LockBufHdr(existing_hdr);
2359 
2360  buf_state &= ~BM_VALID;
2361  UnlockBufHdr(existing_hdr, buf_state);
2362  } while (!StartBufferIO(existing_hdr, true, false));
2363  }
2364  else
2365  {
2366  uint32 buf_state;
2367 
2368  buf_state = LockBufHdr(victim_buf_hdr);
2369 
2370  /* some sanity checks while we hold the buffer header lock */
2371  Assert(!(buf_state & (BM_VALID | BM_TAG_VALID | BM_DIRTY | BM_JUST_DIRTIED)));
2372  Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1);
2373 
2374  victim_buf_hdr->tag = tag;
2375 
2376  buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
2377  if (bmr.relpersistence == RELPERSISTENCE_PERMANENT || fork == INIT_FORKNUM)
2378  buf_state |= BM_PERMANENT;
2379 
2380  UnlockBufHdr(victim_buf_hdr, buf_state);
2381 
2382  LWLockRelease(partition_lock);
2383 
2384  /* XXX: could combine the locked operations in it with the above */
2385  StartBufferIO(victim_buf_hdr, true, false);
2386  }
2387  }
2388 
2390 
2391  /*
2392  * Note: if smgrzeroextend fails, we will end up with buffers that are
2393  * allocated but not marked BM_VALID. The next relation extension will
2394  * still select the same block number (because the relation didn't get any
2395  * longer on disk) and so future attempts to extend the relation will find
2396  * the same buffers (if they have not been recycled) but come right back
2397  * here to try smgrzeroextend again.
2398  *
2399  * We don't need to set checksum for all-zero pages.
2400  */
2401  smgrzeroextend(bmr.smgr, fork, first_block, extend_by, false);
2402 
2403  /*
2404  * Release the file-extension lock; it's now OK for someone else to extend
2405  * the relation some more.
2406  *
2407  * We remove IO_IN_PROGRESS after this, as waking up waiting backends can
2408  * take noticeable time.
2409  */
2410  if (!(flags & EB_SKIP_EXTENSION_LOCK))
2411  UnlockRelationForExtension(bmr.rel, ExclusiveLock);
2412 
2413  pgstat_count_io_op_time(IOOBJECT_RELATION, io_context, IOOP_EXTEND,
2414  io_start, extend_by);
2415 
2416  /* Set BM_VALID, terminate IO, and wake up any waiters */
2417  for (uint32 i = 0; i < extend_by; i++)
2418  {
2419  Buffer buf = buffers[i];
2420  BufferDesc *buf_hdr = GetBufferDescriptor(buf - 1);
2421  bool lock = false;
2422 
2423  if (flags & EB_LOCK_FIRST && i == 0)
2424  lock = true;
2425  else if (flags & EB_LOCK_TARGET)
2426  {
2427  Assert(extend_upto != InvalidBlockNumber);
2428  if (first_block + i + 1 == extend_upto)
2429  lock = true;
2430  }
2431 
2432  if (lock)
2433  LWLockAcquire(BufferDescriptorGetContentLock(buf_hdr), LW_EXCLUSIVE);
2434 
2435  TerminateBufferIO(buf_hdr, false, BM_VALID, true);
2436  }
2437 
2438  pgBufferUsage.shared_blks_written += extend_by;
2439 
2440  *extended_by = extend_by;
2441 
2442  return first_block;
2443 }
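From a caller's perspective, the usual way into this code is ExtendBufferedRelBy(), which routes non-temporary relations to ExtendBufferedRelShared() above. A hedged sketch of such a call site (the helper name is hypothetical; the relation is assumed to be opened and locked appropriately by the caller):

#include "postgres.h"
#include "storage/bufmgr.h"
#include "utils/rel.h"

/* Hypothetical illustration: grow "rel" by up to eight blocks at once. */
static BlockNumber
extend_by_eight(Relation rel)
{
	Buffer		bufs[8];
	uint32		extended_by = 0;
	BlockNumber first_new;

	first_new = ExtendBufferedRelBy(BMR_REL(rel), MAIN_FORKNUM,
									NULL,	/* default strategy */
									EB_LOCK_FIRST,	/* lock first new block */
									8, bufs, &extended_by);

	/* bufs[0] comes back exclusive-locked because of EB_LOCK_FIRST */
	UnlockReleaseBuffer(bufs[0]);
	for (uint32 i = 1; i < extended_by; i++)
		ReleaseBuffer(bufs[i]);

	return first_new;
}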
2444 
2445 /*
2446  * BufferIsExclusiveLocked
2447  *
2448  * Checks if buffer is exclusive-locked.
2449  *
2450  * Buffer must be pinned.
2451  */
2452 bool
2453 BufferIsExclusiveLocked(Buffer buffer)
2454 {
2455  BufferDesc *bufHdr;
2456 
2457  if (BufferIsLocal(buffer))
2458  {
2459  int bufid = -buffer - 1;
2460 
2461  bufHdr = GetLocalBufferDescriptor(bufid);
2462  }
2463  else
2464  {
2465  bufHdr = GetBufferDescriptor(buffer - 1);
2466  }
2467 
2468  Assert(BufferIsPinned(buffer));
2469  return LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
2470  LW_EXCLUSIVE);
2471 }
2472 
2473 /*
2474  * BufferIsDirty
2475  *
2476  * Checks if buffer is already dirty.
2477  *
2478  * Buffer must be pinned and exclusive-locked. (Without an exclusive lock,
2479  * the result may be stale before it's returned.)
2480  */
2481 bool
2482 BufferIsDirty(Buffer buffer)
2483 {
2484  BufferDesc *bufHdr;
2485 
2486  if (BufferIsLocal(buffer))
2487  {
2488  int bufid = -buffer - 1;
2489 
2490  bufHdr = GetLocalBufferDescriptor(bufid);
2491  }
2492  else
2493  {
2494  bufHdr = GetBufferDescriptor(buffer - 1);
2495  }
2496 
2497  Assert(BufferIsPinned(buffer));
2498  Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
2499  LW_EXCLUSIVE));
2500 
2501  return pg_atomic_read_u32(&bufHdr->state) & BM_DIRTY;
2502 }
2503 
2504 /*
2505  * MarkBufferDirty
2506  *
2507  * Marks buffer contents as dirty (actual write happens later).
2508  *
2509  * Buffer must be pinned and exclusive-locked. (If caller does not hold
2510  * exclusive lock, then somebody could be in process of writing the buffer,
2511  * leading to risk of bad data written to disk.)
2512  */
2513 void
2514 MarkBufferDirty(Buffer buffer)
2515 {
2516  BufferDesc *bufHdr;
2517  uint32 buf_state;
2518  uint32 old_buf_state;
2519 
2520  if (!BufferIsValid(buffer))
2521  elog(ERROR, "bad buffer ID: %d", buffer);
2522 
2523  if (BufferIsLocal(buffer))
2524  {
2525  MarkLocalBufferDirty(buffer);
2526  return;
2527  }
2528 
2529  bufHdr = GetBufferDescriptor(buffer - 1);
2530 
2531  Assert(BufferIsPinned(buffer));
2532  Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
2533  LW_EXCLUSIVE));
2534 
2535  old_buf_state = pg_atomic_read_u32(&bufHdr->state);
2536  for (;;)
2537  {
2538  if (old_buf_state & BM_LOCKED)
2539  old_buf_state = WaitBufHdrUnlocked(bufHdr);
2540 
2541  buf_state = old_buf_state;
2542 
2543  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
2544  buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
2545 
2546  if (pg_atomic_compare_exchange_u32(&bufHdr->state, &old_buf_state,
2547  buf_state))
2548  break;
2549  }
2550 
2551  /*
2552  * If the buffer was not dirty already, do vacuum accounting.
2553  */
2554  if (!(old_buf_state & BM_DIRTY))
2555  {
2556  pgBufferUsage.shared_blks_dirtied++;
2557  if (VacuumCostActive)
2558  VacuumCostBalance += VacuumCostPageDirty;
2559  }
2560 }
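As the comment above says, MarkBufferDirty() must only be called with the buffer pinned and exclusive-locked. A hedged sketch of the typical caller pattern (the function name is hypothetical; WAL logging is only indicated in comments):

#include "postgres.h"
#include "miscadmin.h"
#include "storage/bufmgr.h"

/* Hypothetical illustration of a page modification. */
static void
modify_page_example(Buffer buffer)
{
	/* caller already holds a pin; take the content lock exclusively */
	LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);

	START_CRIT_SECTION();
	/* ... modify BufferGetPage(buffer) here ... */
	MarkBufferDirty(buffer);

	/*
	 * A WAL-logged change would also XLogInsert() its record here and then
	 * PageSetLSN(BufferGetPage(buffer), recptr), still inside the critical
	 * section.
	 */
	END_CRIT_SECTION();

	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
}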
2561 
2562 /*
2563  * ReleaseAndReadBuffer -- combine ReleaseBuffer() and ReadBuffer()
2564  *
2565  * Formerly, this saved one cycle of acquiring/releasing the BufMgrLock
2566  * compared to calling the two routines separately. Now it's mainly just
2567  * a convenience function. However, if the passed buffer is valid and
2568  * already contains the desired block, we just return it as-is; and that
2569  * does save considerable work compared to a full release and reacquire.
2570  *
2571  * Note: it is OK to pass buffer == InvalidBuffer, indicating that no old
2572  * buffer actually needs to be released. This case is the same as ReadBuffer,
2573  * but can save some tests in the caller.
2574  */
2575 Buffer
2576 ReleaseAndReadBuffer(Buffer buffer,
2577  Relation relation,
2578  BlockNumber blockNum)
2579 {
2580  ForkNumber forkNum = MAIN_FORKNUM;
2581  BufferDesc *bufHdr;
2582 
2583  if (BufferIsValid(buffer))
2584  {
2585  Assert(BufferIsPinned(buffer));
2586  if (BufferIsLocal(buffer))
2587  {
2588  bufHdr = GetLocalBufferDescriptor(-buffer - 1);
2589  if (bufHdr->tag.blockNum == blockNum &&
2590  BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
2591  BufTagGetForkNum(&bufHdr->tag) == forkNum)
2592  return buffer;
2593  UnpinLocalBuffer(buffer);
2594  }
2595  else
2596  {
2597  bufHdr = GetBufferDescriptor(buffer - 1);
2598  /* we have pin, so it's ok to examine tag without spinlock */
2599  if (bufHdr->tag.blockNum == blockNum &&
2600  BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
2601  BufTagGetForkNum(&bufHdr->tag) == forkNum)
2602  return buffer;
2603  UnpinBuffer(bufHdr);
2604  }
2605  }
2606 
2607  return ReadBuffer(relation, blockNum);
2608 }
2609 
2610 /*
2611  * PinBuffer -- make buffer unavailable for replacement.
2612  *
2613  * For the default access strategy, the buffer's usage_count is incremented
2614  * when we first pin it; for other strategies we just make sure the usage_count
2615  * isn't zero. (The idea of the latter is that we don't want synchronized
2616  * heap scans to inflate the count, but we need it to not be zero to discourage
2617  * other backends from stealing buffers from our ring. As long as we cycle
2618  * through the ring faster than the global clock-sweep cycles, buffers in
2619  * our ring won't be chosen as victims for replacement by other backends.)
2620  *
2621  * This should be applied only to shared buffers, never local ones.
2622  *
2623  * Since buffers are pinned/unpinned very frequently, pin buffers without
2624  * taking the buffer header lock; instead update the state variable in a loop
2625  * of CAS operations. Hopefully it's just a single CAS.
2626  *
2627  * Note that ResourceOwnerEnlarge() and ReservePrivateRefCountEntry()
2628  * must have been done already.
2629  *
2630  * Returns true if buffer is BM_VALID, else false. This provision allows
2631  * some callers to avoid an extra spinlock cycle.
2632  */
2633 static bool
2634 PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
2635 {
2636  Buffer b = BufferDescriptorGetBuffer(buf);
2637  bool result;
2638  PrivateRefCountEntry *ref;
2639 
2640  Assert(!BufferIsLocal(b));
2641  Assert(ReservedRefCountEntry != NULL);
2642 
2643  ref = GetPrivateRefCountEntry(b, true);
2644 
2645  if (ref == NULL)
2646  {
2647  uint32 buf_state;
2648  uint32 old_buf_state;
2649 
2650  ref = NewPrivateRefCountEntry(b);
2651 
2652  old_buf_state = pg_atomic_read_u32(&buf->state);
2653  for (;;)
2654  {
2655  if (old_buf_state & BM_LOCKED)
2656  old_buf_state = WaitBufHdrUnlocked(buf);
2657 
2658  buf_state = old_buf_state;
2659 
2660  /* increase refcount */
2661  buf_state += BUF_REFCOUNT_ONE;
2662 
2663  if (strategy == NULL)
2664  {
2665  /* Default case: increase usagecount unless already max. */
2666  if (BUF_STATE_GET_USAGECOUNT(buf_state) < BM_MAX_USAGE_COUNT)
2667  buf_state += BUF_USAGECOUNT_ONE;
2668  }
2669  else
2670  {
2671  /*
2672  * Ring buffers shouldn't evict others from the pool. Thus we
2673  * don't make usagecount more than 1.
2674  */
2675  if (BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
2676  buf_state += BUF_USAGECOUNT_ONE;
2677  }
2678 
2679  if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
2680  buf_state))
2681  {
2682  result = (buf_state & BM_VALID) != 0;
2683 
2684  /*
2685  * Assume that we acquired a buffer pin for the purposes of
2686  * Valgrind buffer client checks (even in !result case) to
2687  * keep things simple. Buffers that are unsafe to access are
2688  * not generally guaranteed to be marked undefined or
2689  * non-accessible in any case.
2690  */
2692  break;
2693  }
2694  }
2695  }
2696  else
2697  {
2698  /*
2699  * If we previously pinned the buffer, it is likely to be valid, but
2700  * it may not be if StartReadBuffers() was called and
2701  * WaitReadBuffers() hasn't been called yet. We'll check by loading
2702  * the flags without locking. This is racy, but it's OK to return
2703  * false spuriously: when WaitReadBuffers() calls StartBufferIO(),
2704  * it'll see that it's now valid.
2705  *
2706  * Note: We deliberately avoid a Valgrind client request here.
2707  * Individual access methods can optionally superimpose buffer page
2708  * client requests on top of our client requests to enforce that
2709  * buffers are only accessed while locked (and pinned). It's possible
2710  * that the buffer page is legitimately non-accessible here. We
2711  * cannot meddle with that.
2712  */
2713  result = (pg_atomic_read_u32(&buf->state) & BM_VALID) != 0;
2714  }
2715 
2716  ref->refcount++;
2717  Assert(ref->refcount > 0);
2718  ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
2719  return result;
2720 }
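The lock-free pin itself is just a compare-and-swap loop on the packed state word. A stand-alone analogue using C11 atomics (illustration only, not backend code; the mask value mimics the low-bits refcount layout but the names are invented):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define REFCOUNT_MASK	0x3FFFFu	/* refcount lives in the low 18 bits */

static _Atomic uint32_t buf_state;

static void
pin(void)
{
	uint32_t	old = atomic_load(&buf_state);

	for (;;)
	{
		uint32_t	newval = old + 1;	/* bump the low-bits refcount */

		/* on failure, 'old' is reloaded with the current value; retry */
		if (atomic_compare_exchange_weak(&buf_state, &old, newval))
			break;
	}
}

int
main(void)
{
	pin();
	pin();
	printf("refcount = %u\n", atomic_load(&buf_state) & REFCOUNT_MASK);	/* 2 */
	return 0;
}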
2721 
2722 /*
2723  * PinBuffer_Locked -- as above, but caller already locked the buffer header.
2724  * The spinlock is released before return.
2725  *
2726  * As this function is called with the spinlock held, the caller has to
2727  * previously call ReservePrivateRefCountEntry() and
2728  * ResourceOwnerEnlarge(CurrentResourceOwner);
2729  *
2730  * Currently, no callers of this function want to modify the buffer's
2731  * usage_count at all, so there's no need for a strategy parameter.
2732  * Also we don't bother with a BM_VALID test (the caller could check that for
2733  * itself).
2734  *
2735  * Also all callers only ever use this function when it's known that the
2736  * buffer can't have a preexisting pin by this backend. That allows us to skip
2737  * searching the private refcount array & hash, which is a boon, because the
2738  * spinlock is still held.
2739  *
2740  * Note: use of this routine is frequently mandatory, not just an optimization
2741  * to save a spin lock/unlock cycle, because we need to pin a buffer before
2742  * its state can change under us.
2743  */
2744 static void
2745 PinBuffer_Locked(BufferDesc *buf)
2746 {
2747  Buffer b;
2748  PrivateRefCountEntry *ref;
2749  uint32 buf_state;
2750 
2751  /*
2752  * As explained, we don't expect any preexisting pins. That allows us to
2753  * manipulate the PrivateRefCount after releasing the spinlock.
2754  */
2756 
2757  /*
2758  * Buffer can't have a preexisting pin, so mark its page as defined to
2759  * Valgrind (this is similar to the PinBuffer() case where the backend
2760  * doesn't already have a buffer pin)
2761  */
2763 
2764  /*
2765  * Since we hold the buffer spinlock, we can update the buffer state and
2766  * release the lock in one operation.
2767  */
2768  buf_state = pg_atomic_read_u32(&buf->state);
2769  Assert(buf_state & BM_LOCKED);
2770  buf_state += BUF_REFCOUNT_ONE;
2771  UnlockBufHdr(buf, buf_state);
2772 
2773  b = BufferDescriptorGetBuffer(buf);
2774 
2775  ref = NewPrivateRefCountEntry(b);
2776  ref->refcount++;
2777 
2778  ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
2779 }
2780 
2781 /*
2782  * UnpinBuffer -- make buffer available for replacement.
2783  *
2784  * This should be applied only to shared buffers, never local ones. This
2785  * always adjusts CurrentResourceOwner.
2786  */
2787 static void
2788 UnpinBuffer(BufferDesc *buf)
2789 {
2790  Buffer b = BufferDescriptorGetBuffer(buf);
2791 
2792  ResourceOwnerForgetBuffer(CurrentResourceOwner, b);
2793  UnpinBufferNoOwner(buf);
2794 }
2795 
2796 static void
2797 UnpinBufferNoOwner(BufferDesc *buf)
2798 {
2799  PrivateRefCountEntry *ref;
2800  Buffer b = BufferDescriptorGetBuffer(buf);
2801 
2802  Assert(!BufferIsLocal(b));
2803 
2804  /* not moving as we're likely deleting it soon anyway */
2805  ref = GetPrivateRefCountEntry(b, false);
2806  Assert(ref != NULL);
2807  Assert(ref->refcount > 0);
2808  ref->refcount--;
2809  if (ref->refcount == 0)
2810  {
2811  uint32 buf_state;
2812  uint32 old_buf_state;
2813 
2814  /*
2815  * Mark buffer non-accessible to Valgrind.
2816  *
2817  * Note that the buffer may have already been marked non-accessible
2818  * within access method code that enforces that buffers are only
2819  * accessed while a buffer lock is held.
2820  */
2822 
2823  /* I'd better not still hold the buffer content lock */
2824  Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf)));
2825 
2826  /*
2827  * Decrement the shared reference count.
2828  *
2829  * Since buffer spinlock holder can update status using just write,
2830  * it's not safe to use atomic decrement here; thus use a CAS loop.
2831  */
2832  old_buf_state = pg_atomic_read_u32(&buf->state);
2833  for (;;)
2834  {
2835  if (old_buf_state & BM_LOCKED)
2836  old_buf_state = WaitBufHdrUnlocked(buf);
2837 
2838  buf_state = old_buf_state;
2839 
2840  buf_state -= BUF_REFCOUNT_ONE;
2841 
2842  if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
2843  buf_state))
2844  break;
2845  }
2846 
2847  /* Support LockBufferForCleanup() */
2848  if (buf_state & BM_PIN_COUNT_WAITER)
2849  {
2850  /*
2851  * Acquire the buffer header lock, re-check that there's a waiter.
2852  * Another backend could have unpinned this buffer, and already
2853  * woken up the waiter. There's no danger of the buffer being
2854  * replaced after we unpinned it above, as it's pinned by the
2855  * waiter.
2856  */
2857  buf_state = LockBufHdr(buf);
2858 
2859  if ((buf_state & BM_PIN_COUNT_WAITER) &&
2860  BUF_STATE_GET_REFCOUNT(buf_state) == 1)
2861  {
2862  /* we just released the last pin other than the waiter's */
2863  int wait_backend_pgprocno = buf->wait_backend_pgprocno;
2864 
2865  buf_state &= ~BM_PIN_COUNT_WAITER;
2866  UnlockBufHdr(buf, buf_state);
2867  ProcSendSignal(wait_backend_pgprocno);
2868  }
2869  else
2870  UnlockBufHdr(buf, buf_state);
2871  }
2872  ForgetPrivateRefCountEntry(ref);
2873  }
2874 }
2875 
2876 #define ST_SORT sort_checkpoint_bufferids
2877 #define ST_ELEMENT_TYPE CkptSortItem
2878 #define ST_COMPARE(a, b) ckpt_buforder_comparator(a, b)
2879 #define ST_SCOPE static
2880 #define ST_DEFINE
2881 #include "lib/sort_template.h"
2882 
2883 /*
2884  * BufferSync -- Write out all dirty buffers in the pool.
2885  *
2886  * This is called at checkpoint time to write out all dirty shared buffers.
2887  * The checkpoint request flags should be passed in. If CHECKPOINT_IMMEDIATE
2888  * is set, we disable delays between writes; if CHECKPOINT_IS_SHUTDOWN,
2889  * CHECKPOINT_END_OF_RECOVERY or CHECKPOINT_FLUSH_ALL is set, we write even
2890  * unlogged buffers, which are otherwise skipped. The remaining flags
2891  * currently have no effect here.
2892  */
2893 static void
2894 BufferSync(int flags)
2895 {
2896  uint32 buf_state;
2897  int buf_id;
2898  int num_to_scan;
2899  int num_spaces;
2900  int num_processed;
2901  int num_written;
2902  CkptTsStatus *per_ts_stat = NULL;
2903  Oid last_tsid;
2904  binaryheap *ts_heap;
2905  int i;
2906  int mask = BM_DIRTY;
2907  WritebackContext wb_context;
2908 
2909  /*
2910  * Unless this is a shutdown checkpoint or we have been explicitly told,
2911  * we write only permanent, dirty buffers. But at shutdown or end of
2912  * recovery, we write all dirty buffers.
2913  */
2914  if (!((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
2915  CHECKPOINT_FLUSH_ALL))))
2916  mask |= BM_PERMANENT;
2917 
2918  /*
2919  * Loop over all buffers, and mark the ones that need to be written with
2920  * BM_CHECKPOINT_NEEDED. Count them as we go (num_to_scan), so that we
2921  * can estimate how much work needs to be done.
2922  *
2923  * This allows us to write only those pages that were dirty when the
2924  * checkpoint began, and not those that get dirtied while it proceeds.
2925  * Whenever a page with BM_CHECKPOINT_NEEDED is written out, either by us
2926  * later in this function, or by normal backends or the bgwriter cleaning
2927  * scan, the flag is cleared. Any buffer dirtied after this point won't
2928  * have the flag set.
2929  *
2930  * Note that if we fail to write some buffer, we may leave buffers with
2931  * BM_CHECKPOINT_NEEDED still set. This is OK since any such buffer would
2932  * certainly need to be written for the next checkpoint attempt, too.
2933  */
2934  num_to_scan = 0;
2935  for (buf_id = 0; buf_id < NBuffers; buf_id++)
2936  {
2937  BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
2938 
2939  /*
2940  * Header spinlock is enough to examine BM_DIRTY, see comment in
2941  * SyncOneBuffer.
2942  */
2943  buf_state = LockBufHdr(bufHdr);
2944 
2945  if ((buf_state & mask) == mask)
2946  {
2947  CkptSortItem *item;
2948 
2949  buf_state |= BM_CHECKPOINT_NEEDED;
2950 
2951  item = &CkptBufferIds[num_to_scan++];
2952  item->buf_id = buf_id;
2953  item->tsId = bufHdr->tag.spcOid;
2954  item->relNumber = BufTagGetRelNumber(&bufHdr->tag);
2955  item->forkNum = BufTagGetForkNum(&bufHdr->tag);
2956  item->blockNum = bufHdr->tag.blockNum;
2957  }
2958 
2959  UnlockBufHdr(bufHdr, buf_state);
2960 
2961  /* Check for barrier events in case NBuffers is large. */
2962  if (ProcSignalBarrierPending)
2963  ProcessProcSignalBarrier();
2964  }
2965 
2966  if (num_to_scan == 0)
2967  return; /* nothing to do */
2968 
2969  WritebackContextInit(&wb_context, &checkpoint_flush_after);
2970 
2971  TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_scan);
2972 
2973  /*
2974  * Sort buffers that need to be written to reduce the likelihood of random
2975  * IO. The sorting is also important for the implementation of balancing
2976  * writes between tablespaces. Without balancing writes we'd potentially
2977  * end up writing to the tablespaces one-by-one; possibly overloading the
2978  * underlying system.
2979  */
2980  sort_checkpoint_bufferids(CkptBufferIds, num_to_scan);
2981 
2982  num_spaces = 0;
2983 
2984  /*
2985  * Allocate progress status for each tablespace with buffers that need to
2986  * be flushed. This requires the to-be-flushed array to be sorted.
2987  */
2988  last_tsid = InvalidOid;
2989  for (i = 0; i < num_to_scan; i++)
2990  {
2991  CkptTsStatus *s;
2992  Oid cur_tsid;
2993 
2994  cur_tsid = CkptBufferIds[i].tsId;
2995 
2996  /*
2997  * Grow array of per-tablespace status structs, every time a new
2998  * tablespace is found.
2999  */
3000  if (last_tsid == InvalidOid || last_tsid != cur_tsid)
3001  {
3002  Size sz;
3003 
3004  num_spaces++;
3005 
3006  /*
3007  * Not worth adding grow-by-power-of-2 logic here - even with a
3008  * few hundred tablespaces this should be fine.
3009  */
3010  sz = sizeof(CkptTsStatus) * num_spaces;
3011 
3012  if (per_ts_stat == NULL)
3013  per_ts_stat = (CkptTsStatus *) palloc(sz);
3014  else
3015  per_ts_stat = (CkptTsStatus *) repalloc(per_ts_stat, sz);
3016 
3017  s = &per_ts_stat[num_spaces - 1];
3018  memset(s, 0, sizeof(*s));
3019  s->tsId = cur_tsid;
3020 
3021  /*
3022  * The first buffer in this tablespace. As CkptBufferIds is sorted
3023  * by tablespace all (s->num_to_scan) buffers in this tablespace
3024  * will follow afterwards.
3025  */
3026  s->index = i;
3027 
3028  /*
3029  * progress_slice will be determined once we know how many buffers
3030  * are in each tablespace, i.e. after this loop.
3031  */
3032 
3033  last_tsid = cur_tsid;
3034  }
3035  else
3036  {
3037  s = &per_ts_stat[num_spaces - 1];
3038  }
3039 
3040  s->num_to_scan++;
3041 
3042  /* Check for barrier events. */
3043  if (ProcSignalBarrierPending)
3044  ProcessProcSignalBarrier();
3045  }
3046 
3047  Assert(num_spaces > 0);
3048 
3049  /*
3050  * Build a min-heap over the write-progress in the individual tablespaces,
3051  * and compute how large a portion of the total progress a single
3052  * processed buffer is.
3053  */
3054  ts_heap = binaryheap_allocate(num_spaces,
3055  ts_ckpt_progress_comparator,
3056  NULL);
3057 
3058  for (i = 0; i < num_spaces; i++)
3059  {
3060  CkptTsStatus *ts_stat = &per_ts_stat[i];
3061 
3062  ts_stat->progress_slice = (float8) num_to_scan / ts_stat->num_to_scan;
3063 
3064  binaryheap_add_unordered(ts_heap, PointerGetDatum(ts_stat));
3065  }
3066 
3067  binaryheap_build(ts_heap);
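 /*
  * Worked example of the balancing arithmetic (illustrative numbers only):
  * with num_to_scan = 1000 buffers split as 900 in tablespace A and 100 in
  * tablespace B, progress_slice is 1000/900 ~= 1.11 for A and 1000/100 = 10
  * for B. Each buffer written from B advances B's progress about nine times
  * as far as one of A's buffers advances A, so the min-heap below hands out
  * roughly nine A-writes per B-write and both tablespaces reach their total
  * progress of 1000 at about the same time.
  */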
3068 
3069  /*
3070  * Iterate through to-be-checkpointed buffers and write the ones (still)
3071  * marked with BM_CHECKPOINT_NEEDED. The writes are balanced between
3072  * tablespaces; otherwise the sorting would lead to only one tablespace
3073  * receiving writes at a time, making inefficient use of the hardware.
3074  */
3075  num_processed = 0;
3076  num_written = 0;
3077  while (!binaryheap_empty(ts_heap))
3078  {
3079  BufferDesc *bufHdr = NULL;
3080  CkptTsStatus *ts_stat = (CkptTsStatus *)
3081  DatumGetPointer(binaryheap_first(ts_heap));
3082 
3083  buf_id = CkptBufferIds[ts_stat->index].buf_id;
3084  Assert(buf_id != -1);
3085 
3086  bufHdr = GetBufferDescriptor(buf_id);
3087 
3088  num_processed++;
3089 
3090  /*
3091  * We don't need to acquire the lock here, because we're only looking
3092  * at a single bit. It's possible that someone else writes the buffer
3093  * and clears the flag right after we check, but that doesn't matter
3094  * since SyncOneBuffer will then do nothing. However, there is a
3095  * further race condition: it's conceivable that between the time we
3096  * examine the bit here and the time SyncOneBuffer acquires the lock,
3097  * someone else not only wrote the buffer but replaced it with another
3098  * page and dirtied it. In that improbable case, SyncOneBuffer will
3099  * write the buffer though we didn't need to. It doesn't seem worth
3100  * guarding against this, though.
3101  */
3102  if (pg_atomic_read_u32(&bufHdr->state) & BM_CHECKPOINT_NEEDED)
3103  {
3104  if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
3105  {
3106  TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
3107  PendingCheckpointerStats.buffers_written++;
3108  num_written++;
3109  }
3110  }
3111 
3112  /*
3113  * Measure progress independent of actually having to flush the buffer
3114  * - otherwise writing becomes unbalanced.
3115  */
3116  ts_stat->progress += ts_stat->progress_slice;
3117  ts_stat->num_scanned++;
3118  ts_stat->index++;
3119 
3120  /* Have all the buffers from the tablespace been processed? */
3121  if (ts_stat->num_scanned == ts_stat->num_to_scan)
3122  {
3123  binaryheap_remove_first(ts_heap);
3124  }
3125  else
3126  {
3127  /* update heap with the new progress */
3128  binaryheap_replace_first(ts_heap, PointerGetDatum(ts_stat));
3129  }
3130 
3131  /*
3132  * Sleep to throttle our I/O rate.
3133  *
3134  * (This will check for barrier events even if it doesn't sleep.)
3135  */
3136  CheckpointWriteDelay(flags, (double) num_processed / num_to_scan);
3137  }
3138 
3139  /*
3140  * Issue all pending flushes. Only checkpointer calls BufferSync(), so
3141  * IOContext will always be IOCONTEXT_NORMAL.
3142  */
3143  IssuePendingWritebacks(&wb_context, IOCONTEXT_NORMAL);
3144 
3145  pfree(per_ts_stat);
3146  per_ts_stat = NULL;
3147  binaryheap_free(ts_heap);
3148 
3149  /*
3150  * Update checkpoint statistics. As noted above, this doesn't include
3151  * buffers written by other backends or bgwriter scan.
3152  */
3153  CheckpointStats.ckpt_bufs_written += num_written;
3154 
3155  TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_scan);
3156 }
3157 
3158 /*
3159  * BgBufferSync -- Write out some dirty buffers in the pool.
3160  *
3161  * This is called periodically by the background writer process.
3162  *
3163  * Returns true if it's appropriate for the bgwriter process to go into
3164  * low-power hibernation mode. (This happens if the strategy clock sweep
3165  * has been "lapped" and no buffer allocations have occurred recently,
3166  * or if the bgwriter has been effectively disabled by setting
3167  * bgwriter_lru_maxpages to 0.)
3168  */
3169 bool
3170 BgBufferSync(WritebackContext *wb_context)
3171 {
3172  /* info obtained from freelist.c */
3173  int strategy_buf_id;
3174  uint32 strategy_passes;
3175  uint32 recent_alloc;
3176 
3177  /*
3178  * Information saved between calls so we can determine the strategy
3179  * point's advance rate and avoid scanning already-cleaned buffers.
3180  */
3181  static bool saved_info_valid = false;
3182  static int prev_strategy_buf_id;
3183  static uint32 prev_strategy_passes;
3184  static int next_to_clean;
3185  static uint32 next_passes;
3186 
3187  /* Moving averages of allocation rate and clean-buffer density */
3188  static float smoothed_alloc = 0;
3189  static float smoothed_density = 10.0;
3190 
3191  /* Potentially these could be tunables, but for now, not */
3192  float smoothing_samples = 16;
3193  float scan_whole_pool_milliseconds = 120000.0;
3194 
3195  /* Used to compute how far we scan ahead */
3196  long strategy_delta;
3197  int bufs_to_lap;
3198  int bufs_ahead;
3199  float scans_per_alloc;
3200  int reusable_buffers_est;
3201  int upcoming_alloc_est;
3202  int min_scan_buffers;
3203 
3204  /* Variables for the scanning loop proper */
3205  int num_to_scan;
3206  int num_written;
3207  int reusable_buffers;
3208 
3209  /* Variables for final smoothed_density update */
3210  long new_strategy_delta;
3211  uint32 new_recent_alloc;
3212 
3213  /*
3214  * Find out where the freelist clock sweep currently is, and how many
3215  * buffer allocations have happened since our last call.
3216  */
3217  strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc);
3218 
3219  /* Report buffer alloc counts to pgstat */
3220  PendingBgWriterStats.buf_alloc += recent_alloc;
3221 
3222  /*
3223  * If we're not running the LRU scan, just stop after doing the stats
3224  * stuff. We mark the saved state invalid so that we can recover sanely
3225  * if LRU scan is turned back on later.
3226  */
3227  if (bgwriter_lru_maxpages <= 0)
3228  {
3229  saved_info_valid = false;
3230  return true;
3231  }
3232 
3233  /*
3234  * Compute strategy_delta = how many buffers have been scanned by the
3235  * clock sweep since last time. If first time through, assume none. Then
3236  * see if we are still ahead of the clock sweep, and if so, how many
3237  * buffers we could scan before we'd catch up with it and "lap" it. Note:
3238  * weird-looking coding of xxx_passes comparisons are to avoid bogus
3239  * behavior when the passes counts wrap around.
3240  */
3241  if (saved_info_valid)
3242  {
3243  int32 passes_delta = strategy_passes - prev_strategy_passes;
3244 
3245  strategy_delta = strategy_buf_id - prev_strategy_buf_id;
3246  strategy_delta += (long) passes_delta * NBuffers;
3247 
3248  Assert(strategy_delta >= 0);
3249 
3250  if ((int32) (next_passes - strategy_passes) > 0)
3251  {
3252  /* we're one pass ahead of the strategy point */
3253  bufs_to_lap = strategy_buf_id - next_to_clean;
3254 #ifdef BGW_DEBUG
3255  elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
3256  next_passes, next_to_clean,
3257  strategy_passes, strategy_buf_id,
3258  strategy_delta, bufs_to_lap);
3259 #endif
3260  }
3261  else if (next_passes == strategy_passes &&
3262  next_to_clean >= strategy_buf_id)
3263  {
3264  /* on same pass, but ahead or at least not behind */
3265  bufs_to_lap = NBuffers - (next_to_clean - strategy_buf_id);
3266 #ifdef BGW_DEBUG
3267  elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
3268  next_passes, next_to_clean,
3269  strategy_passes, strategy_buf_id,
3270  strategy_delta, bufs_to_lap);
3271 #endif
3272  }
3273  else
3274  {
3275  /*
3276  * We're behind, so skip forward to the strategy point and start
3277  * cleaning from there.
3278  */
3279 #ifdef BGW_DEBUG
3280  elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
3281  next_passes, next_to_clean,
3282  strategy_passes, strategy_buf_id,
3283  strategy_delta);
3284 #endif
3285  next_to_clean = strategy_buf_id;
3286  next_passes = strategy_passes;
3287  bufs_to_lap = NBuffers;
3288  }
3289  }
3290  else
3291  {
3292  /*
3293  * Initializing at startup or after LRU scanning had been off. Always
3294  * start at the strategy point.
3295  */
3296 #ifdef BGW_DEBUG
3297  elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
3298  strategy_passes, strategy_buf_id);
3299 #endif
3300  strategy_delta = 0;
3301  next_to_clean = strategy_buf_id;
3302  next_passes = strategy_passes;
3303  bufs_to_lap = NBuffers;
3304  }
3305 
3306  /* Update saved info for next time */
3307  prev_strategy_buf_id = strategy_buf_id;
3308  prev_strategy_passes = strategy_passes;
3309  saved_info_valid = true;
3310 
3311  /*
3312  * Compute how many buffers had to be scanned for each new allocation, ie,
3313  * 1/density of reusable buffers, and track a moving average of that.
3314  *
3315  * If the strategy point didn't move, we don't update the density estimate
3316  */
3317  if (strategy_delta > 0 && recent_alloc > 0)
3318  {
3319  scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
3320  smoothed_density += (scans_per_alloc - smoothed_density) /
3321  smoothing_samples;
3322  }
3323 
3324  /*
3325  * Estimate how many reusable buffers there are between the current
3326  * strategy point and where we've scanned ahead to, based on the smoothed
3327  * density estimate.
3328  */
3329  bufs_ahead = NBuffers - bufs_to_lap;
3330  reusable_buffers_est = (float) bufs_ahead / smoothed_density;
3331 
3332  /*
3333  * Track a moving average of recent buffer allocations. Here, rather than
3334  * a true average we want a fast-attack, slow-decline behavior: we
3335  * immediately follow any increase.
3336  */
3337  if (smoothed_alloc <= (float) recent_alloc)
3338  smoothed_alloc = recent_alloc;
3339  else
3340  smoothed_alloc += ((float) recent_alloc - smoothed_alloc) /
3341  smoothing_samples;
3342 
3343  /* Scale the estimate by a GUC to allow more aggressive tuning. */
3344  upcoming_alloc_est = (int) (smoothed_alloc * bgwriter_lru_multiplier);
3345 
3346  /*
3347  * If recent_alloc remains at zero for many cycles, smoothed_alloc will
3348  * eventually underflow to zero, and the underflows produce annoying
3349  * kernel warnings on some platforms. Once upcoming_alloc_est has gone to
3350  * zero, there's no point in tracking smaller and smaller values of
3351  * smoothed_alloc, so just reset it to exactly zero to avoid this
3352  * syndrome. It will pop back up as soon as recent_alloc increases.
3353  */
3354  if (upcoming_alloc_est == 0)
3355  smoothed_alloc = 0;
3356 
3357  /*
3358  * Even in cases where there's been little or no buffer allocation
3359  * activity, we want to make a small amount of progress through the buffer
3360  * cache so that as many reusable buffers as possible are clean after an
3361  * idle period.
3362  *
3363  * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many times
3364  * the BGW will be called during the scan_whole_pool time; slice the
3365  * buffer pool into that many sections.
3366  */
3367  min_scan_buffers = (int) (NBuffers / (scan_whole_pool_milliseconds / BgWriterDelay));
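 /*
  * Example with the defaults (illustrative only): NBuffers = 16384,
  * scan_whole_pool_milliseconds = 120000 and BgWriterDelay = 200 give
  * 120000 / 200 = 600 bgwriter rounds per sweep of the pool, so
  * min_scan_buffers = 16384 / 600 = 27; even with no recent allocations we
  * still look about 27 buffers ahead on every round.
  */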
3368 
3369  if (upcoming_alloc_est < (min_scan_buffers + reusable_buffers_est))
3370  {
3371 #ifdef BGW_DEBUG
3372  elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
3373  upcoming_alloc_est, min_scan_buffers, reusable_buffers_est);
3374 #endif
3375  upcoming_alloc_est = min_scan_buffers + reusable_buffers_est;
3376  }
3377 
3378  /*
3379  * Now write out dirty reusable buffers, working forward from the
3380  * next_to_clean point, until we have lapped the strategy scan, or cleaned
3381  * enough buffers to match our estimate of the next cycle's allocation
3382  * requirements, or hit the bgwriter_lru_maxpages limit.
3383  */
3384 
3385  num_to_scan = bufs_to_lap;
3386  num_written = 0;
3387  reusable_buffers = reusable_buffers_est;
3388 
3389  /* Execute the LRU scan */
3390  while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
3391  {
3392  int sync_state = SyncOneBuffer(next_to_clean, true,
3393  wb_context);
3394 
3395  if (++next_to_clean >= NBuffers)
3396  {
3397  next_to_clean = 0;
3398  next_passes++;
3399  }
3400  num_to_scan--;
3401 
3402  if (sync_state & BUF_WRITTEN)
3403  {
3404  reusable_buffers++;
3405  if (++num_written >= bgwriter_lru_maxpages)
3406  {
3407  PendingBgWriterStats.maxwritten_clean++;
3408  break;
3409  }
3410  }
3411  else if (sync_state & BUF_REUSABLE)
3412  reusable_buffers++;
3413  }
3414 
3415  PendingBgWriterStats.buf_written_clean += num_written;
3416 
3417 #ifdef BGW_DEBUG
3418  elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
3419  recent_alloc, smoothed_alloc, strategy_delta, bufs_ahead,
3420  smoothed_density, reusable_buffers_est, upcoming_alloc_est,
3421  bufs_to_lap - num_to_scan,
3422  num_written,
3423  reusable_buffers - reusable_buffers_est);
3424 #endif
3425 
3426  /*
3427  * Consider the above scan as being like a new allocation scan.
3428  * Characterize its density and update the smoothed one based on it. This
3429  * effectively halves the moving average period in cases where both the
3430  * strategy and the background writer are doing some useful scanning,
3431  * which is helpful because a long memory isn't as desirable on the
3432  * density estimates.
3433  */
3434  new_strategy_delta = bufs_to_lap - num_to_scan;
3435  new_recent_alloc = reusable_buffers - reusable_buffers_est;
3436  if (new_strategy_delta > 0 && new_recent_alloc > 0)
3437  {
3438  scans_per_alloc = (float) new_strategy_delta / (float) new_recent_alloc;
3439  smoothed_density += (scans_per_alloc - smoothed_density) /
3440  smoothing_samples;
3441 
3442 #ifdef BGW_DEBUG
3443  elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f",
3444  new_recent_alloc, new_strategy_delta,
3445  scans_per_alloc, smoothed_density);
3446 #endif
3447  }
3448 
3449  /* Return true if OK to hibernate */
3450  return (bufs_to_lap == 0 && recent_alloc == 0);
3451 }
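The smoothed_alloc bookkeeping above is a "fast attack, slow decline" moving average: increases are adopted immediately, while decreases bleed in over roughly smoothing_samples calls. A stand-alone sketch of just that update rule (not backend code; names are invented):

#include <stdio.h>

static float
smooth(float smoothed, float sample, float smoothing_samples)
{
	if (smoothed <= sample)
		return sample;			/* follow any increase at once */
	return smoothed + (sample - smoothed) / smoothing_samples;	/* decay */
}

int
main(void)
{
	float		s = 0.0f;

	s = smooth(s, 100.0f, 16.0f);	/* jumps straight to 100.00 */
	s = smooth(s, 0.0f, 16.0f);	/* eases down to 93.75 */
	printf("%.2f\n", s);
	return 0;
}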
3452 
3453 /*
3454  * SyncOneBuffer -- process a single buffer during syncing.
3455  *
3456  * If skip_recently_used is true, we don't write currently-pinned buffers, nor
3457  * buffers marked recently used, as these are not replacement candidates.
3458  *
3459  * Returns a bitmask containing the following flag bits:
3460  * BUF_WRITTEN: we wrote the buffer.
3461  * BUF_REUSABLE: buffer is available for replacement, ie, it has
3462  * pin count 0 and usage count 0.
3463  *
3464  * (BUF_WRITTEN could be set in error if FlushBuffer finds the buffer clean
3465  * after locking it, but we don't care all that much.)
3466  */
3467 static int
3468 SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
3469 {
3470  BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
3471  int result = 0;
3472  uint32 buf_state;
3473  BufferTag tag;
3474 
3475  /* Make sure we can handle the pin */
3476  ReservePrivateRefCountEntry();
3477  ResourceOwnerEnlarge(CurrentResourceOwner);
3478 
3479  /*
3480  * Check whether buffer needs writing.
3481  *
3482  * We can make this check without taking the buffer content lock so long
3483  * as we mark pages dirty in access methods *before* logging changes with
3484  * XLogInsert(): if someone marks the buffer dirty just after our check we
3485  * don't worry, because our checkpoint.redo points before the log record for
3486  * upcoming changes and so we are not required to write such a dirty buffer.
3487  */
3488  buf_state = LockBufHdr(bufHdr);
3489 
3490  if (BUF_STATE_GET_REFCOUNT(buf_state) == 0 &&
3491  BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
3492  {
3493  result |= BUF_REUSABLE;
3494  }
3495  else if (skip_recently_used)
3496  {
3497  /* Caller told us not to write recently-used buffers */
3498  UnlockBufHdr(bufHdr, buf_state);
3499  return result;
3500  }
3501 
3502  if (!(buf_state & BM_VALID) || !(buf_state & BM_DIRTY))
3503  {
3504  /* It's clean, so nothing to do */
3505  UnlockBufHdr(bufHdr, buf_state);
3506  return result;
3507  }
3508 
3509  /*
3510  * Pin it, share-lock it, write it. (FlushBuffer will do nothing if the
3511  * buffer is clean by the time we've locked it.)
3512  */
3513  PinBuffer_Locked(bufHdr);
3514  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
3515 
3516  FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
3517 
3518  LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
3519 
3520  tag = bufHdr->tag;
3521 
3522  UnpinBuffer(bufHdr);
3523 
3524  /*
3525  * SyncOneBuffer() is only called by checkpointer and bgwriter, so
3526  * IOContext will always be IOCONTEXT_NORMAL.
3527  */
3528  ScheduleBufferTagForWriteback(wb_context, IOCONTEXT_NORMAL, &tag);
3529 
3530  return result | BUF_WRITTEN;
3531 }
3532 
3533 /*
3534  * AtEOXact_Buffers - clean up at end of transaction.
3535  *
3536  * As of PostgreSQL 8.0, buffer pins should get released by the
3537  * ResourceOwner mechanism. This routine is just a debugging
3538  * cross-check that no pins remain.
3539  */
3540 void
3541 AtEOXact_Buffers(bool isCommit)
3542 {
3543  CheckForBufferLeaks();
3544 
3545  AtEOXact_LocalBuffers(isCommit);
3546 
3547  Assert(PrivateRefCountOverflowed == 0);
3548 }
3549 
3550 /*
3551  * Initialize access to shared buffer pool
3552  *
3553  * This is called during backend startup (whether standalone or under the
3554  * postmaster). It sets up for this backend's access to the already-existing
3555  * buffer pool.
3556  */
3557 void
3558 InitBufferPoolAccess(void)
3559 {
3560  HASHCTL hash_ctl;
3561 
3562  memset(&PrivateRefCountArray, 0, sizeof(PrivateRefCountArray));
3563 
3564  hash_ctl.keysize = sizeof(int32);
3565  hash_ctl.entrysize = sizeof(PrivateRefCountEntry);
3566 
3567  PrivateRefCountHash = hash_create("PrivateRefCount", 100, &hash_ctl,
3568  HASH_ELEM | HASH_BLOBS);
3569 
3570  /*
3571  * AtProcExit_Buffers needs LWLock access, and thereby has to be called at
3572  * the corresponding phase of backend shutdown.
3573  */
3574  Assert(MyProc != NULL);
3575  on_shmem_exit(AtProcExit_Buffers, 0);
3576 }
3577 
3578 /*
3579  * During backend exit, ensure that we released all shared-buffer locks and
3580  * assert that we have no remaining pins.
3581  */
3582 static void
3583 AtProcExit_Buffers(int code, Datum arg)
3584 {
3585  UnlockBuffers();
3586 
3587  CheckForBufferLeaks();
3588 
3589  /* localbuf.c needs a chance too */
3590  AtProcExit_LocalBuffers();
3591 }
3592 
3593 /*
3594  * CheckForBufferLeaks - ensure this backend holds no buffer pins
3595  *
3596  * As of PostgreSQL 8.0, buffer pins should get released by the
3597  * ResourceOwner mechanism. This routine is just a debugging
3598  * cross-check that no pins remain.
3599  */
3600 static void
3601 CheckForBufferLeaks(void)
3602 {
3603 #ifdef USE_ASSERT_CHECKING
3604  int RefCountErrors = 0;
3605  PrivateRefCountEntry *res;
3606  int i;
3607  char *s;
3608 
3609  /* check the array */
3610  for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
3611  {
3612  res = &PrivateRefCountArray[i];
3613 
3614  if (res->buffer != InvalidBuffer)
3615  {
3616  s = DebugPrintBufferRefcount(res->buffer);
3617  elog(WARNING, "buffer refcount leak: %s", s);
3618  pfree(s);
3619 
3620  RefCountErrors++;
3621  }
3622  }
3623 
3624  /* if necessary search the hash */
3625  if (PrivateRefCountOverflowed)
3626  {
3627  HASH_SEQ_STATUS hstat;
3628 
3629  hash_seq_init(&hstat, PrivateRefCountHash);
3630  while ((res = (PrivateRefCountEntry *) hash_seq_search(&hstat)) != NULL)
3631  {
3632  s = DebugPrintBufferRefcount(res->buffer);
3633  elog(WARNING, "buffer refcount leak: %s", s);
3634  pfree(s);
3635  RefCountErrors++;
3636  }
3637  }
3638 
3639  Assert(RefCountErrors == 0);
3640 #endif
3641 }
3642 
3643 /*
3644  * Helper routine to issue warnings when a buffer is unexpectedly pinned
3645  */
3646 char *
3647 DebugPrintBufferRefcount(Buffer buffer)
3648 {
3649  BufferDesc *buf;
3650  int32 loccount;
3651  char *path;
3652  char *result;
3653  ProcNumber backend;
3654  uint32 buf_state;
3655 
3656  Assert(BufferIsValid(buffer));
3657  if (BufferIsLocal(buffer))
3658  {
3659  buf = GetLocalBufferDescriptor(-buffer - 1);
3660  loccount = LocalRefCount[-buffer - 1];
3661  backend = MyProcNumber;
3662  }
3663  else
3664  {
3665  buf = GetBufferDescriptor(buffer - 1);
3666  loccount = GetPrivateRefCount(buffer);
3667  backend = INVALID_PROC_NUMBER;
3668  }
3669 
3670  /* theoretically we should lock the bufhdr here */
3671  path = relpathbackend(BufTagGetRelFileLocator(&buf->tag), backend,
3672  BufTagGetForkNum(&buf->tag));
3673  buf_state = pg_atomic_read_u32(&buf->state);
3674 
3675  result = psprintf("[%03d] (rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
3676  buffer, path,
3677  buf->tag.blockNum, buf_state & BUF_FLAG_MASK,
3678  BUF_STATE_GET_REFCOUNT(buf_state), loccount);
3679  pfree(path);
3680  return result;
3681 }
3682 
3683 /*
3684  * CheckPointBuffers
3685  *
3686  * Flush all dirty blocks in buffer pool to disk at checkpoint time.
3687  *
3688  * Note: temporary relations do not participate in checkpoints, so they don't
3689  * need to be flushed.
3690  */
3691 void
3692 CheckPointBuffers(int flags)
3693 {
3694  BufferSync(flags);
3695 }
3696 
3697 /*
3698  * BufferGetBlockNumber
3699  * Returns the block number associated with a buffer.
3700  *
3701  * Note:
3702  * Assumes that the buffer is valid and pinned, else the
3703  * value may be obsolete immediately...
3704  */
3705 BlockNumber
3706 BufferGetBlockNumber(Buffer buffer)
3707 {
3708  BufferDesc *bufHdr;
3709 
3711 
3712  if (BufferIsLocal(buffer))
3713  bufHdr = GetLocalBufferDescriptor(-buffer - 1);
3714  else
3715  bufHdr = GetBufferDescriptor(buffer - 1);
3716 
3717  /* pinned, so OK to read tag without spinlock */
3718  return bufHdr->tag.blockNum;
3719 }
3720 
3721 /*
3722  * BufferGetTag
3723  * Returns the relfilelocator, fork number and block number associated with
3724  * a buffer.
3725  */
3726 void
3727 BufferGetTag(Buffer buffer, RelFileLocator *rlocator, ForkNumber *forknum,
3728  BlockNumber *blknum)
3729 {
3730  BufferDesc *bufHdr;
3731 
3732  /* Do the same checks as BufferGetBlockNumber. */
3734 
3735  if (BufferIsLocal(buffer))
3736  bufHdr = GetLocalBufferDescriptor(-buffer - 1);
3737  else
3738  bufHdr = GetBufferDescriptor(buffer - 1);
3739 
3740  /* pinned, so OK to read tag without spinlock */
3741  *rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
3742  *forknum = BufTagGetForkNum(&bufHdr->tag);
3743  *blknum = bufHdr->tag.blockNum;
3744 }
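/*
 * [Editorial illustration, not part of bufmgr.c]  A minimal caller sketch for
 * the two lookups above; both only require that the buffer is valid and
 * pinned.  "rel" and "blkno" come from a hypothetical caller; the sketch
 * would need storage/bufmgr.h and utils/rel.h.
 */
static void
example_inspect_block(Relation rel, BlockNumber blkno)
{
	Buffer		buf = ReadBuffer(rel, blkno);	/* pins the buffer */
	RelFileLocator rlocator;
	ForkNumber	forknum;
	BlockNumber found_blkno;

	BufferGetTag(buf, &rlocator, &forknum, &found_blkno);
	Assert(found_blkno == BufferGetBlockNumber(buf));

	ReleaseBuffer(buf);			/* drop the pin again */
}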
3745 
3746 /*
3747  * FlushBuffer
3748  * Physically write out a shared buffer.
3749  *
3750  * NOTE: this actually just passes the buffer contents to the kernel; the
3751  * real write to disk won't happen until the kernel feels like it. This
3752  * is okay from our point of view since we can redo the changes from WAL.
3753  * However, we will need to force the changes to disk via fsync before
3754  * we can checkpoint WAL.
3755  *
3756  * The caller must hold a pin on the buffer and have share-locked the
3757  * buffer contents. (Note: a share-lock does not prevent updates of
3758  * hint bits in the buffer, so the page could change while the write
3759  * is in progress, but we assume that that will not invalidate the data
3760  * written.)
3761  *
3762  * If the caller has an smgr reference for the buffer's relation, pass it
3763  * as the second parameter. If not, pass NULL.
3764  */
3765 static void
3766 FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
3767  IOContext io_context)
3768 {
3769  XLogRecPtr recptr;
3770  ErrorContextCallback errcallback;
3771  instr_time io_start;
3772  Block bufBlock;
3773  char *bufToWrite;
3774  uint32 buf_state;
3775 
3776  /*
3777  * Try to start an I/O operation. If StartBufferIO returns false, then
3778  * someone else flushed the buffer before we could, so we need not do
3779  * anything.
3780  */
3781  if (!StartBufferIO(buf, false, false))
3782  return;
3783 
3784  /* Setup error traceback support for ereport() */
3786  errcallback.arg = (void *) buf;
3787  errcallback.previous = error_context_stack;
3788  error_context_stack = &errcallback;
3789 
3790  /* Find smgr relation for buffer */
3791  if (reln == NULL)
3793 
3794  TRACE_POSTGRESQL_BUFFER_FLUSH_START(BufTagGetForkNum(&buf->tag),
3795  buf->tag.blockNum,
3797  reln->smgr_rlocator.locator.dbOid,
3799 
3800  buf_state = LockBufHdr(buf);
3801 
3802  /*
3803  * Run PageGetLSN while holding header lock, since we don't have the
3804  * buffer locked exclusively in all cases.
3805  */
3806  recptr = BufferGetLSN(buf);
3807 
3808  /* To check if block content changes while flushing. - vadim 01/17/97 */
3809  buf_state &= ~BM_JUST_DIRTIED;
3810  UnlockBufHdr(buf, buf_state);
3811 
3812  /*
3813  * Force XLOG flush up to buffer's LSN. This implements the basic WAL
3814  * rule that log updates must hit disk before any of the data-file changes
3815  * they describe do.
3816  *
3817  * However, this rule does not apply to unlogged relations, which will be
3818  * lost after a crash anyway. Most unlogged relation pages do not bear
3819  * LSNs since we never emit WAL records for them, and therefore flushing
3820  * up through the buffer LSN would be useless, but harmless. However,
3821  * GiST indexes use LSNs internally to track page-splits, and therefore
3822  * unlogged GiST pages bear "fake" LSNs generated by
3823  * GetFakeLSNForUnloggedRel. It is unlikely but possible that the fake
3824  * LSN counter could advance past the WAL insertion point; and if it did
3825  * happen, attempting to flush WAL through that location would fail, with
3826  * disastrous system-wide consequences. To make sure that can't happen,
3827  * skip the flush if the buffer isn't permanent.
3828  */
3829  if (buf_state & BM_PERMANENT)
3830  XLogFlush(recptr);
3831 
3832  /*
3833  * Now it's safe to write buffer to disk. Note that no one else should
3834  * have been able to write it while we were busy with log flushing because
3835  * only one process at a time can set the BM_IO_IN_PROGRESS bit.
3836  */
3837  bufBlock = BufHdrGetBlock(buf);
3838 
3839  /*
3840  * Update page checksum if desired. Since we have only shared lock on the
3841  * buffer, other processes might be updating hint bits in it, so we must
3842  * copy the page to private storage if we do checksumming.
3843  */
3844  bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
3845 
3847 
3848  /*
3849  * bufToWrite is either the shared buffer or a copy, as appropriate.
3850  */
3851  smgrwrite(reln,
3852  BufTagGetForkNum(&buf->tag),
3853  buf->tag.blockNum,
3854  bufToWrite,
3855  false);
3856 
3857  /*
3858  * When a strategy is in use, only flushes of dirty buffers already in the
3859  * strategy ring are counted as strategy writes (IOCONTEXT
3860  * [BULKREAD|BULKWRITE|VACUUM] IOOP_WRITE) for the purpose of IO
3861  * statistics tracking.
3862  *
3863  * If a shared buffer initially added to the ring must be flushed before
3864  * being used, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE.
3865  *
3866  * If a shared buffer which was added to the ring later because the
3867  * current strategy buffer is pinned or in use or because all strategy
3868  * buffers were dirty and rejected (for BAS_BULKREAD operations only)
3869  * requires flushing, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE
3870  * (from_ring will be false).
3871  *
3872  * When a strategy is not in use, the write can only be a "regular" write
3873  * of a dirty shared buffer (IOCONTEXT_NORMAL IOOP_WRITE).
3874  */
3876  IOOP_WRITE, io_start, 1);
3877 
3879 
3880  /*
3881  * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
3882  * end the BM_IO_IN_PROGRESS state.
3883  */
3884  TerminateBufferIO(buf, true, 0, true);
3885 
3886  TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(BufTagGetForkNum(&buf->tag),
3887  buf->tag.blockNum,
3889  reln->smgr_rlocator.locator.dbOid,
3891 
3892  /* Pop the error context stack */
3893  error_context_stack = errcallback.previous;
3894 }
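/*
 * [Editorial illustration, not part of bufmgr.c]  The essential ordering that
 * FlushBuffer() enforces, reduced to two steps: flush WAL up to the page's
 * LSN first, hand the page to the kernel second.  This is only a sketch;
 * the real routine also handles BM_IO_IN_PROGRESS, checksums, hint-bit races
 * and I/O statistics.  Assumes the (hypothetical) caller holds a pin and a
 * content lock and that the page belongs to a permanent relation.
 */
static void
example_wal_before_data(SMgrRelation reln, ForkNumber forknum,
						BlockNumber blocknum, Page page)
{
	XLogFlush(PageGetLSN(page));		/* WAL must reach disk first... */
	smgrwrite(reln, forknum, blocknum, page, false);	/* ...then the data */
}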
3895 
3896 /*
3897  * RelationGetNumberOfBlocksInFork
3898  * Determines the current number of pages in the specified relation fork.
3899  *
3900  * Note that the accuracy of the result will depend on the details of the
3901  * relation's storage. For builtin AMs it'll be accurate, but for external AMs
3902  * it might not be.
3903  */
3904 BlockNumber
3905 RelationGetNumberOfBlocksInFork(Relation relation, ForkNumber forkNum)
3906 {
3907  if (RELKIND_HAS_TABLE_AM(relation->rd_rel->relkind))
3908  {
3909  /*
3910  * Not every table AM uses BLCKSZ wide fixed size blocks. Therefore
3911  * tableam returns the size in bytes - but for the purpose of this
3912  * routine, we want the number of blocks. Therefore divide, rounding
3913  * up.
3914  */
3915  uint64 szbytes;
3916 
3917  szbytes = table_relation_size(relation, forkNum);
3918 
3919  return (szbytes + (BLCKSZ - 1)) / BLCKSZ;
3920  }
3921  else if (RELKIND_HAS_STORAGE(relation->rd_rel->relkind))
3922  {
3923  return smgrnblocks(RelationGetSmgr(relation), forkNum);
3924  }
3925  else
3926  Assert(false);
3927 
3928  return 0; /* keep compiler quiet */
3929 }
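/*
 * [Editorial note]  Worked example of the round-up division above: with the
 * default BLCKSZ of 8192, a table AM reporting szbytes = 8193 occupies
 * (8193 + 8191) / 8192 = 2 blocks, while szbytes = 0 yields 0 blocks.
 */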
3930 
3931 /*
3932  * BufferIsPermanent
3933  * Determines whether a buffer will potentially still be around after
3934  * a crash. Caller must hold a buffer pin.
3935  */
3936 bool
3937 BufferIsPermanent(Buffer buffer)
3938 {
3939  BufferDesc *bufHdr;
3940 
3941  /* Local buffers are used only for temp relations. */
3942  if (BufferIsLocal(buffer))
3943  return false;
3944 
3945  /* Make sure we've got a real buffer, and that we hold a pin on it. */
3948 
3949  /*
3950  * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
3951  * need not bother with the buffer header spinlock. Even if someone else
3952  * changes the buffer header state while we're doing this, the state is
3953  * changed atomically, so we'll read the old value or the new value, but
3954  * not random garbage.
3955  */
3956  bufHdr = GetBufferDescriptor(buffer - 1);
3957  return (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT) != 0;
3958 }
3959 
3960 /*
3961  * BufferGetLSNAtomic
3962  * Retrieves the LSN of the buffer atomically using a buffer header lock.
3963  * This is necessary for some callers who may not have an exclusive lock
3964  * on the buffer.
3965  */
3966 XLogRecPtr
3967 BufferGetLSNAtomic(Buffer buffer)
3968 {
3969  BufferDesc *bufHdr = GetBufferDescriptor(buffer - 1);
3970  char *page = BufferGetPage(buffer);
3971  XLogRecPtr lsn;
3972  uint32 buf_state;
3973 
3974  /*
3975  * If we don't need locking for correctness, fastpath out.
3976  */
3978  return PageGetLSN(page);
3979 
3980  /* Make sure we've got a real buffer, and that we hold a pin on it. */
3983 
3984  buf_state = LockBufHdr(bufHdr);
3985  lsn = PageGetLSN(page);
3986  UnlockBufHdr(bufHdr, buf_state);
3987 
3988  return lsn;
3989 }
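/*
 * [Editorial illustration, not part of bufmgr.c]  When to prefer
 * BufferGetLSNAtomic() over a bare PageGetLSN(): with only a share lock and
 * data checksums (or wal_log_hints) enabled, the LSN may be changing under
 * us, so the header-locked variant must be used.  Assumes "buffer" is pinned
 * and share-locked by a hypothetical caller.
 */
static XLogRecPtr
example_read_lsn(Buffer buffer)
{
	/* Safe under a share lock; takes the buffer header lock if needed. */
	return BufferGetLSNAtomic(buffer);
}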
3990 
3991 /* ---------------------------------------------------------------------
3992  * DropRelationBuffers
3993  *
3994  * This function removes from the buffer pool all the pages of the
3995  * specified relation forks that have block numbers >= firstDelBlock.
3996  * (In particular, with firstDelBlock = 0, all pages are removed.)
3997  * Dirty pages are simply dropped, without bothering to write them
3998  * out first. Therefore, this is NOT rollback-able, and so should be
3999  * used only with extreme caution!
4000  *
4001  * Currently, this is called only from smgr.c when the underlying file
4002  * is about to be deleted or truncated (firstDelBlock is needed for
4003  * the truncation case). The data in the affected pages would therefore
4004  * be deleted momentarily anyway, and there is no point in writing it.
4005  * It is the responsibility of higher-level code to ensure that the
4006  * deletion or truncation does not lose any data that could be needed
4007  * later. It is also the responsibility of higher-level code to ensure
4008  * that no other process could be trying to load more pages of the
4009  * relation into buffers.
4010  * --------------------------------------------------------------------
4011  */
4012 void
4013 DropRelationBuffers(SMgrRelation smgr_reln, ForkNumber *forkNum,
4014  int nforks, BlockNumber *firstDelBlock)
4015 {
4016  int i;
4017  int j;
4018  RelFileLocatorBackend rlocator;
4019  BlockNumber nForkBlock[MAX_FORKNUM];
4020  uint64 nBlocksToInvalidate = 0;
4021 
4022  rlocator = smgr_reln->smgr_rlocator;
4023 
4024  /* If it's a local relation, it's localbuf.c's problem. */
4025  if (RelFileLocatorBackendIsTemp(rlocator))
4026  {
4027  if (rlocator.backend == MyProcNumber)
4028  {
4029  for (j = 0; j < nforks; j++)
4030  DropRelationLocalBuffers(rlocator.locator, forkNum[j],
4031  firstDelBlock[j]);
4032  }
4033  return;
4034  }
4035 
4036  /*
4037  * To remove all the pages of the specified relation forks from the buffer
4038  * pool, we need to scan the entire buffer pool but we can optimize it by
4039  * finding the buffers from BufMapping table provided we know the exact
4040  * size of each fork of the relation. The exact size is required to ensure
4041  * that we don't leave any buffer for the relation being dropped as
4042  * otherwise the background writer or checkpointer can lead to a PANIC
4043  * error while flushing buffers corresponding to files that don't exist.
4044  *
4045  * To know the exact size, we rely on the size we cached for each fork
4046  * during recovery, which limits the optimization to recovery and
4047  * standbys, but we can easily extend it once we have a shared cache for
4048  * relation sizes.
4049  *
4050  * In recovery, we cache the value returned by the first lseek(SEEK_END),
4051  * and subsequent writes keep the cached value up-to-date. See
4052  * smgrextend. It is possible that the value of the first lseek is smaller
4053  * than the actual number of existing blocks in the file due to buggy
4054  * Linux kernels that might not have accounted for the recent write. But
4055  * that should be fine because there must not be any buffers after that
4056  * file size.
4057  */
4058  for (i = 0; i < nforks; i++)
4059  {
4060  /* Get the number of blocks for a relation's fork */
4061  nForkBlock[i] = smgrnblocks_cached(smgr_reln, forkNum[i]);
4062 
4063  if (nForkBlock[i] == InvalidBlockNumber)
4064  {
4065  nBlocksToInvalidate = InvalidBlockNumber;
4066  break;
4067  }
4068 
4069  /* calculate the number of blocks to be invalidated */
4070  nBlocksToInvalidate += (nForkBlock[i] - firstDelBlock[i]);
4071  }
4072 
4073  /*
4074  * We apply the optimization iff the total number of blocks to invalidate
4075  * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
4076  */
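	/*
	 * [Editorial note]  For example, with shared_buffers = 128MB and the
	 * default 8kB BLCKSZ, NBuffers is 16384, so BUF_DROP_FULL_SCAN_THRESHOLD
	 * is 16384 / 32 = 512 blocks (4MB); larger drops fall through to the
	 * full buffer-pool scan below.
	 */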
4077  if (BlockNumberIsValid(nBlocksToInvalidate) &&
4078  nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
4079  {
4080  for (j = 0; j < nforks; j++)
4081  FindAndDropRelationBuffers(rlocator.locator, forkNum[j],
4082  nForkBlock[j], firstDelBlock[j]);
4083  return;
4084  }
4085 
4086  for (i = 0; i < NBuffers; i++)
4087  {
4088  BufferDesc *bufHdr = GetBufferDescriptor(i);
4089  uint32 buf_state;
4090 
4091  /*
4092  * We can make this a tad faster by prechecking the buffer tag before
4093  * we attempt to lock the buffer; this saves a lot of lock
4094  * acquisitions in typical cases. It should be safe because the
4095  * caller must have AccessExclusiveLock on the relation, or some other
4096  * reason to be certain that no one is loading new pages of the rel
4097  * into the buffer pool. (Otherwise we might well miss such pages
4098  * entirely.) Therefore, while the tag might be changing while we
4099  * look at it, it can't be changing *to* a value we care about, only
4100  * *away* from such a value. So false negatives are impossible, and
4101  * false positives are safe because we'll recheck after getting the
4102  * buffer lock.
4103  *
4104  * We could check forkNum and blockNum as well as the rlocator, but
4105  * the incremental win from doing so seems small.
4106  */
4107  if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator))
4108  continue;
4109 
4110  buf_state = LockBufHdr(bufHdr);
4111 
4112  for (j = 0; j < nforks; j++)
4113  {
4114  if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator) &&
4115  BufTagGetForkNum(&bufHdr->tag) == forkNum[j] &&
4116  bufHdr->tag.blockNum >= firstDelBlock[j])
4117  {
4118  InvalidateBuffer(bufHdr); /* releases spinlock */
4119  break;
4120  }
4121  }
4122  if (j >= nforks)
4123  UnlockBufHdr(bufHdr, buf_state);
4124  }
4125 }
4126 
4127 /* ---------------------------------------------------------------------
4128  * DropRelationsAllBuffers
4129  *
4130  * This function removes from the buffer pool all the pages of all
4131  * forks of the specified relations. It's equivalent to calling
4132  * DropRelationBuffers once per fork per relation with firstDelBlock = 0.
4133  * --------------------------------------------------------------------
4134  */
4135 void
4136 DropRelationsAllBuffers(SMgrRelation *smgr_reln, int nlocators)
4137 {
4138  int i;
4139  int n = 0;
4140  SMgrRelation *rels;
4141  BlockNumber (*block)[MAX_FORKNUM + 1];
4142  uint64 nBlocksToInvalidate = 0;
4143  RelFileLocator *locators;
4144  bool cached = true;
4145  bool use_bsearch;
4146 
4147  if (nlocators == 0)
4148  return;
4149 
4150  rels = palloc(sizeof(SMgrRelation) * nlocators); /* non-local relations */
4151 
4152  /* If it's a local relation, it's localbuf.c's problem. */
4153  for (i = 0; i < nlocators; i++)
4154  {
4155  if (RelFileLocatorBackendIsTemp(smgr_reln[i]->smgr_rlocator))
4156  {
4157  if (smgr_reln[i]->smgr_rlocator.backend == MyProcNumber)
4158  DropRelationAllLocalBuffers(smgr_reln[i]->smgr_rlocator.locator);
4159  }
4160  else
4161  rels[n++] = smgr_reln[i];
4162  }
4163 
4164  /*
4165  * If there are no non-local relations, then we're done. Release the
4166  * memory and return.
4167  */
4168  if (n == 0)
4169  {
4170  pfree(rels);
4171  return;
4172  }
4173 
4174  /*
4175  * This is used to remember the number of blocks for all the relations
4176  * forks.
4177  */
4178  block = (BlockNumber (*)[MAX_FORKNUM + 1])
4179  palloc(sizeof(BlockNumber) * n * (MAX_FORKNUM + 1));
4180 
4181  /*
4182  * We can avoid scanning the entire buffer pool if we know the exact size
4183  * of each of the given relation forks. See DropRelationBuffers.
4184  */
4185  for (i = 0; i < n && cached; i++)
4186  {
4187  for (int j = 0; j <= MAX_FORKNUM; j++)
4188  {
4189  /* Get the number of blocks for a relation's fork. */
4190  block[i][j] = smgrnblocks_cached(rels[i], j);
4191 
4192  /* We only need to consider the relation forks that exist. */
4193  if (block[i][j] == InvalidBlockNumber)
4194  {
4195  if (!smgrexists(rels[i], j))
4196  continue;
4197  cached = false;
4198  break;
4199  }
4200 
4201  /* calculate the total number of blocks to be invalidated */
4202  nBlocksToInvalidate += block[i][j];
4203  }
4204  }
4205 
4206  /*
4207  * We apply the optimization iff the total number of blocks to invalidate
4208  * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
4209  */
4210  if (cached && nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
4211  {
4212  for (i = 0; i < n; i++)
4213  {
4214  for (int j = 0; j <= MAX_FORKNUM; j++)
4215  {
4216  /* ignore relation forks that don't exist */
4217  if (!BlockNumberIsValid(block[i][j]))
4218  continue;
4219 
4220  /* drop all the buffers for a particular relation fork */
4221  FindAndDropRelationBuffers(rels[i]->smgr_rlocator.locator,
4222  j, block[i][j], 0);
4223  }
4224  }
4225 
4226  pfree(block);
4227  pfree(rels);
4228  return;
4229  }
4230 
4231  pfree(block);
4232  locators = palloc(sizeof(RelFileLocator) * n); /* non-local relations */
4233  for (i = 0; i < n; i++)
4234  locators[i] = rels[i]->smgr_rlocator.locator;
4235 
4236  /*
4237  * For low number of relations to drop just use a simple walk through, to
4238  * save the bsearch overhead. The threshold to use is rather a guess than
4239  * an exactly determined value, as it depends on many factors (CPU and RAM
4240  * speeds, amount of shared buffers etc.).
4241  */
4242  use_bsearch = n > RELS_BSEARCH_THRESHOLD;
4243 
4244  /* sort the list of rlocators if necessary */
4245  if (use_bsearch)
4246  qsort(locators, n, sizeof(RelFileLocator), rlocator_comparator);
4247 
4248  for (i = 0; i < NBuffers; i++)
4249  {
4250  RelFileLocator *rlocator = NULL;
4251  BufferDesc *bufHdr = GetBufferDescriptor(i);
4252  uint32 buf_state;
4253 
4254  /*
4255  * As in DropRelationBuffers, an unlocked precheck should be safe and
4256  * saves some cycles.
4257  */
4258 
4259  if (!use_bsearch)
4260  {
4261  int j;
4262 
4263  for (j = 0; j < n; j++)
4264  {
4265  if (BufTagMatchesRelFileLocator(&bufHdr->tag, &locators[j]))
4266  {
4267  rlocator = &locators[j];
4268  break;
4269  }
4270  }
4271  }
4272  else
4273  {
4274  RelFileLocator locator;
4275 
4276  locator = BufTagGetRelFileLocator(&bufHdr->tag);
4277  rlocator = bsearch((const void *) &(locator),
4278  locators, n, sizeof(RelFileLocator),
4280  }
4281 
4282  /* buffer doesn't belong to any of the given relfilelocators; skip it */
4283  if (rlocator == NULL)
4284  continue;
4285 
4286  buf_state = LockBufHdr(bufHdr);
4287  if (BufTagMatchesRelFileLocator(&bufHdr->tag, rlocator))
4288  InvalidateBuffer(bufHdr); /* releases spinlock */
4289  else
4290  UnlockBufHdr(bufHdr, buf_state);
4291  }
4292 
4293  pfree(locators);
4294  pfree(rels);
4295 }
4296 
4297 /* ---------------------------------------------------------------------
4298  * FindAndDropRelationBuffers
4299  *
4300  * This function performs look up in BufMapping table and removes from the
4301  * buffer pool all the pages of the specified relation fork that have block
4302  * numbers >= firstDelBlock. (In particular, with firstDelBlock = 0, all
4303  * pages are removed.)
4304  * --------------------------------------------------------------------
4305  */
4306 static void
4307 FindAndDropRelationBuffers(RelFileLocator rlocator, ForkNumber forkNum,
4308  BlockNumber nForkBlock,
4309  BlockNumber firstDelBlock)
4310 {
4311  BlockNumber curBlock;
4312 
4313  for (curBlock = firstDelBlock; curBlock < nForkBlock; curBlock++)
4314  {
4315  uint32 bufHash; /* hash value for tag */
4316  BufferTag bufTag; /* identity of requested block */
4317  LWLock *bufPartitionLock; /* buffer partition lock for it */
4318  int buf_id;
4319  BufferDesc *bufHdr;
4320  uint32 buf_state;
4321 
4322  /* create a tag so we can lookup the buffer */
4323  InitBufferTag(&bufTag, &rlocator, forkNum, curBlock);
4324 
4325  /* determine its hash code and partition lock ID */
4326  bufHash = BufTableHashCode(&bufTag);
4327  bufPartitionLock = BufMappingPartitionLock(bufHash);
4328 
4329  /* Check that it is in the buffer pool. If not, do nothing. */
4330  LWLockAcquire(bufPartitionLock, LW_SHARED);
4331  buf_id = BufTableLookup(&bufTag, bufHash);
4332  LWLockRelease(bufPartitionLock);
4333 
4334  if (buf_id < 0)
4335  continue;
4336 
4337  bufHdr = GetBufferDescriptor(buf_id);
4338 
4339  /*
4340  * We need to lock the buffer header and recheck if the buffer is
4341  * still associated with the same block because the buffer could be
4342  * evicted by some other backend loading blocks for a different
4343  * relation after we release lock on the BufMapping table.
4344  */
4345  buf_state = LockBufHdr(bufHdr);
4346 
4347  if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator) &&
4348  BufTagGetForkNum(&bufHdr->tag) == forkNum &&
4349  bufHdr->tag.blockNum >= firstDelBlock)
4350  InvalidateBuffer(bufHdr); /* releases spinlock */
4351  else
4352  UnlockBufHdr(bufHdr, buf_state);
4353  }
4354 }
4355 
4356 /* ---------------------------------------------------------------------
4357  * DropDatabaseBuffers
4358  *
4359  * This function removes all the buffers in the buffer cache for a
4360  * particular database. Dirty pages are simply dropped, without
4361  * bothering to write them out first. This is used when we destroy a
4362  * database, to avoid trying to flush data to disk when the directory
4363  * tree no longer exists. Implementation is pretty similar to
4364  * DropRelationBuffers() which is for destroying just one relation.
4365  * --------------------------------------------------------------------
4366  */
4367 void
4368 DropDatabaseBuffers(Oid dbid)
4369 {
4370  int i;
4371 
4372  /*
4373  * We needn't consider local buffers, since by assumption the target
4374  * database isn't our own.
4375  */
4376 
4377  for (i = 0; i < NBuffers; i++)
4378  {
4379  BufferDesc *bufHdr = GetBufferDescriptor(i);
4380  uint32 buf_state;
4381 
4382  /*
4383  * As in DropRelationBuffers, an unlocked precheck should be safe and
4384  * saves some cycles.
4385  */
4386  if (bufHdr->tag.dbOid != dbid)
4387  continue;
4388 
4389  buf_state = LockBufHdr(bufHdr);
4390  if (bufHdr->tag.dbOid == dbid)
4391  InvalidateBuffer(bufHdr); /* releases spinlock */
4392  else
4393  UnlockBufHdr(bufHdr, buf_state);
4394  }
4395 }
4396 
4397 /* -----------------------------------------------------------------
4398  * PrintBufferDescs
4399  *
4400  * this function prints all the buffer descriptors, for debugging
4401  * use only.
4402  * -----------------------------------------------------------------
4403  */
4404 #ifdef NOT_USED
4405 void
4406 PrintBufferDescs(void)
4407 {
4408  int i;
4409 
4410  for (i = 0; i < NBuffers; ++i)
4411  {
4414 
4415  /* theoretically we should lock the bufhdr here */
4416  elog(LOG,
4417  "[%02d] (freeNext=%d, rel=%s, "
4418  "blockNum=%u, flags=0x%x, refcount=%u %d)",
4419  i, buf->freeNext,
4422  buf->tag.blockNum, buf->flags,
4423  buf->refcount, GetPrivateRefCount(b));
4424  }
4425 }
4426 #endif
4427 
4428 #ifdef NOT_USED
4429 void
4430 PrintPinnedBufs(void)
4431 {
4432  int i;
4433 
4434  for (i = 0; i < NBuffers; ++i)
4435  {
4438 
4439  if (GetPrivateRefCount(b) > 0)
4440  {
4441  /* theoretically we should lock the bufhdr here */
4442  elog(LOG,
4443  "[%02d] (freeNext=%d, rel=%s, "
4444  "blockNum=%u, flags=0x%x, refcount=%u %d)",
4445  i, buf->freeNext,
4447  BufTagGetForkNum(&buf->tag)),
4448  buf->tag.blockNum, buf->flags,
4449  buf->refcount, GetPrivateRefCount(b));
4450  }
4451  }
4452 }
4453 #endif
4454 
4455 /* ---------------------------------------------------------------------
4456  * FlushRelationBuffers
4457  *
4458  * This function writes all dirty pages of a relation out to disk
4459  * (or more accurately, out to kernel disk buffers), ensuring that the
4460  * kernel has an up-to-date view of the relation.
4461  *
4462  * Generally, the caller should be holding AccessExclusiveLock on the
4463  * target relation to ensure that no other backend is busy dirtying
4464  * more blocks of the relation; the effects can't be expected to last
4465  * after the lock is released.
4466  *
4467  * XXX currently it sequentially searches the buffer pool; this should be
4468  * changed to a more clever way of searching. This routine is not
4469  * used in any performance-critical code paths, so it's not worth
4470  * adding additional overhead to normal paths to make it go faster.
4471  * --------------------------------------------------------------------
4472  */
4473 void
4474 FlushRelationBuffers(Relation rel)
4475 {
4476  int i;
4477  BufferDesc *bufHdr;
4478  SMgrRelation srel = RelationGetSmgr(rel);
4479 
4480  if (RelationUsesLocalBuffers(rel))
4481  {
4482  for (i = 0; i < NLocBuffer; i++)
4483  {
4484  uint32 buf_state;
4485  instr_time io_start;
4486 
4487  bufHdr = GetLocalBufferDescriptor(i);
4488  if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
4489  ((buf_state = pg_atomic_read_u32(&bufHdr->state)) &
4490  (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
4491  {
4492  ErrorContextCallback errcallback;
4493  Page localpage;
4494 
4495  localpage = (char *) LocalBufHdrGetBlock(bufHdr);
4496 
4497  /* Setup error traceback support for ereport() */
4499  errcallback.arg = (void *) bufHdr;
4500  errcallback.previous = error_context_stack;
4501  error_context_stack = &errcallback;
4502 
4503  PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
4504 
4506 
4507  smgrwrite(srel,
4508  BufTagGetForkNum(&bufHdr->tag),
4509  bufHdr->tag.blockNum,
4510  localpage,
4511  false);
4512 
4515  io_start, 1);
4516 
4517  buf_state &= ~(BM_DIRTY | BM_JUST_DIRTIED);
4518  pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
4519 
4521 
4522  /* Pop the error context stack */
4523  error_context_stack = errcallback.previous;
4524  }
4525  }
4526 
4527  return;
4528  }
4529 
4530  for (i = 0; i < NBuffers; i++)
4531  {
4532  uint32 buf_state;
4533 
4534  bufHdr = GetBufferDescriptor(i);
4535 
4536  /*
4537  * As in DropRelationBuffers, an unlocked precheck should be safe and
4538  * saves some cycles.
4539  */
4540  if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator))
4541  continue;
4542 
4543  /* Make sure we can handle the pin */
4546 
4547  buf_state = LockBufHdr(bufHdr);
4548  if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
4549  (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
4550  {
4551  PinBuffer_Locked(bufHdr);
4555  UnpinBuffer(bufHdr);
4556  }
4557  else
4558  UnlockBufHdr(bufHdr, buf_state);
4559  }
4560 }
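/*
 * [Editorial illustration, not part of bufmgr.c]  A hypothetical caller
 * following the contract described above: take AccessExclusiveLock so nobody
 * can dirty further blocks, then push the dirty pages to the kernel.  A
 * caller that needs durability must additionally sync the underlying files
 * at the smgr level.  Would need access/table.h; "relid" is assumed to be
 * supplied by the caller.
 */
static void
example_flush_relation(Oid relid)
{
	Relation	rel = table_open(relid, AccessExclusiveLock);

	FlushRelationBuffers(rel);	/* dirty pages -> kernel buffers */

	table_close(rel, AccessExclusiveLock);
}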
4561 
4562 /* ---------------------------------------------------------------------
4563  * FlushRelationsAllBuffers
4564  *
4565  * This function flushes out of the buffer pool all the pages of all
4566  * forks of the specified smgr relations. It's equivalent to calling
4567  * FlushRelationBuffers once per relation. The relations are assumed not
4568  * to use local buffers.
4569  * --------------------------------------------------------------------
4570  */
4571 void
4572 FlushRelationsAllBuffers(SMgrRelation *smgrs, int nrels)
4573 {
4574  int i;
4575  SMgrSortArray *srels;
4576  bool use_bsearch;
4577 
4578  if (nrels == 0)
4579  return;
4580 
4581  /* fill-in array for qsort */
4582  srels = palloc(sizeof(SMgrSortArray) * nrels);
4583 
4584  for (i = 0; i < nrels; i++)
4585  {
4586  Assert(!RelFileLocatorBackendIsTemp(smgrs[i]->smgr_rlocator));
4587 
4588  srels[i].rlocator = smgrs[i]->smgr_rlocator.locator;
4589  srels[i].srel = smgrs[i];
4590  }
4591 
4592  /*
4593  * Save the bsearch overhead for low number of relations to sync. See
4594  * DropRelationsAllBuffers for details.
4595  */
4596  use_bsearch = nrels > RELS_BSEARCH_THRESHOLD;
4597 
4598  /* sort the list of SMgrRelations if necessary */
4599  if (use_bsearch)
4600  qsort(srels, nrels, sizeof(SMgrSortArray), rlocator_comparator);
4601 
4602  for (i = 0; i < NBuffers; i++)
4603  {
4604  SMgrSortArray *srelent = NULL;
4605  BufferDesc *bufHdr = GetBufferDescriptor(i);
4606  uint32 buf_state;
4607 
4608  /*
4609  * As in DropRelationBuffers, an unlocked precheck should be safe and
4610  * saves some cycles.
4611  */
4612 
4613  if (!use_bsearch)
4614  {
4615  int j;
4616 
4617  for (j = 0; j < nrels; j++)
4618  {
4619  if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srels[j].rlocator))
4620  {
4621  srelent = &srels[j];
4622  break;
4623  }
4624  }
4625  }
4626  else
4627  {
4628  RelFileLocator rlocator;
4629 
4630  rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
4631  srelent = bsearch((const void *) &(rlocator),
4632  srels, nrels, sizeof(SMgrSortArray),
4634  }
4635 
4636  /* buffer doesn't belong to any of the given relfilelocators; skip it */
4637  if (srelent == NULL)
4638  continue;
4639 
4640  /* Make sure we can handle the pin */
4643 
4644  buf_state = LockBufHdr(bufHdr);
4645  if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srelent->rlocator) &&
4646  (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
4647  {
4648  PinBuffer_Locked(bufHdr);
4650  FlushBuffer(bufHdr, srelent->srel, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
4652  UnpinBuffer(bufHdr);
4653  }
4654  else
4655  UnlockBufHdr(bufHdr, buf_state);
4656  }
4657 
4658  pfree(srels);
4659 }
4660 
4661 /* ---------------------------------------------------------------------
4662  * RelationCopyStorageUsingBuffer
4663  *
4664  * Copy fork's data using bufmgr. Same as RelationCopyStorage but instead
4665  * of using smgrread and smgrextend this will copy using bufmgr APIs.
4666  *
4667  * Refer comments atop CreateAndCopyRelationData() for details about
4668  * 'permanent' parameter.
4669  * --------------------------------------------------------------------
4670  */
4671 static void
4672 RelationCopyStorageUsingBuffer(RelFileLocator srclocator,
4673  RelFileLocator dstlocator,
4674  ForkNumber forkNum, bool permanent)
4675 {
4676  Buffer srcBuf;
4677  Buffer dstBuf;
4678  Page srcPage;
4679  Page dstPage;
4680  bool use_wal;
4681  BlockNumber nblocks;
4682  BlockNumber blkno;
4684  BufferAccessStrategy bstrategy_src;
4685  BufferAccessStrategy bstrategy_dst;
4687  ReadStream *src_stream;
4688  SMgrRelation src_smgr;
4689 
4690  /*
4691  * In general, we want to write WAL whenever wal_level > 'minimal', but we
4692  * can skip it when copying any fork of an unlogged relation other than
4693  * the init fork.
4694  */
4695  use_wal = XLogIsNeeded() && (permanent || forkNum == INIT_FORKNUM);
4696 
4697  /* Get number of blocks in the source relation. */
4698  nblocks = smgrnblocks(smgropen(srclocator, INVALID_PROC_NUMBER),
4699  forkNum);
4700 
4701  /* Nothing to copy; just return. */
4702  if (nblocks == 0)
4703  return;
4704 
4705  /*
4706  * Bulk extend the destination relation to the same size as the source
4707  * relation before starting to copy block by block.
4708  */
4709  memset(buf.data, 0, BLCKSZ);
4710  smgrextend(smgropen(dstlocator, INVALID_PROC_NUMBER), forkNum, nblocks - 1,
4711  buf.data, true);
4712 
4713  /* This is a bulk operation, so use buffer access strategies. */
4714  bstrategy_src = GetAccessStrategy(BAS_BULKREAD);
4715  bstrategy_dst = GetAccessStrategy(BAS_BULKWRITE);
4716 
4717  /* Initialize streaming read */
4718  p.current_blocknum = 0;
4719  p.last_exclusive = nblocks;
4720  src_smgr = smgropen(srclocator, INVALID_PROC_NUMBER);
4722  bstrategy_src,
4723  src_smgr,
4724  permanent ? RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED,
4725  forkNum,
4727  &p,
4728  0);
4729 
4730  /* Iterate over each block of the source relation file. */
4731  for (blkno = 0; blkno < nblocks; blkno++)
4732  {
4734 
4735  /* Read block from source relation. */
4736  srcBuf = read_stream_next_buffer(src_stream, NULL);
4737  LockBuffer(srcBuf, BUFFER_LOCK_SHARE);
4738  srcPage = BufferGetPage(srcBuf);
4739 
4740  dstBuf = ReadBufferWithoutRelcache(dstlocator, forkNum,
4741  BufferGetBlockNumber(srcBuf),
4742  RBM_ZERO_AND_LOCK, bstrategy_dst,
4743  permanent);
4744  dstPage = BufferGetPage(dstBuf);
4745 
4747 
4748  /* Copy page data from the source to the destination. */
4749  memcpy(dstPage, srcPage, BLCKSZ);
4750  MarkBufferDirty(dstBuf);
4751 
4752  /* WAL-log the copied page. */
4753  if (use_wal)
4754  log_newpage_buffer(dstBuf, true);
4755 
4756  END_CRIT_SECTION();
4757 
4758  UnlockReleaseBuffer(dstBuf);
4759  UnlockReleaseBuffer(srcBuf);
4760  }
4761  Assert(read_stream_next_buffer(src_stream, NULL) == InvalidBuffer);
4762  read_stream_end(src_stream);
4763 
4764  FreeAccessStrategy(bstrategy_src);
4765  FreeAccessStrategy(bstrategy_dst);
4766 }
4767 
4768 /* ---------------------------------------------------------------------
4769  * CreateAndCopyRelationData
4770  *
4771  * Create destination relation storage and copy all forks from the
4772  * source relation to the destination.
4773  *
4774  * Pass permanent as true for permanent relations and false for
4775  * unlogged relations. Currently this API is not supported for
4776  * temporary relations.
4777  * --------------------------------------------------------------------
4778  */
4779 void
4780 CreateAndCopyRelationData(RelFileLocator src_rlocator,
4781  RelFileLocator dst_rlocator, bool permanent)
4782 {
4783  char relpersistence;
4784  SMgrRelation src_rel;
4785  SMgrRelation dst_rel;
4786 
4787  /* Set the relpersistence. */
4788  relpersistence = permanent ?
4789  RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED;
4790 
4791  src_rel = smgropen(src_rlocator, INVALID_PROC_NUMBER);
4792  dst_rel = smgropen(dst_rlocator, INVALID_PROC_NUMBER);
4793 
4794  /*
4795  * Create and copy all forks of the relation. During create database we
4796  * have a separate cleanup mechanism that deletes the complete database
4797  * directory. Therefore, each individual relation doesn't need to be
4798  * registered for cleanup.
4799  */
4800  RelationCreateStorage(dst_rlocator, relpersistence, false);
4801 
4802  /* copy main fork. */
4803  RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, MAIN_FORKNUM,
4804  permanent);
4805 
4806  /* copy those extra forks that exist */
4807  for (ForkNumber forkNum = MAIN_FORKNUM + 1;
4808  forkNum <= MAX_FORKNUM; forkNum++)
4809  {
4810  if (smgrexists(src_rel, forkNum))
4811  {
4812  smgrcreate(dst_rel, forkNum, false);
4813 
4814  /*
4815  * WAL log creation if the relation is persistent, or this is the
4816  * init fork of an unlogged relation.
4817  */
4818  if (permanent || forkNum == INIT_FORKNUM)
4819  log_smgrcreate(&dst_rlocator, forkNum);
4820 
4821  /* Copy a fork's data, block by block. */
4822  RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, forkNum,
4823  permanent);
4824  }
4825  }
4826 }
4827 
4828 /* ---------------------------------------------------------------------
4829  * FlushDatabaseBuffers
4830  *
4831  * This function writes all dirty pages of a database out to disk
4832  * (or more accurately, out to kernel disk buffers), ensuring that the
4833  * kernel has an up-to-date view of the database.
4834  *
4835  * Generally, the caller should be holding an appropriate lock to ensure
4836  * no other backend is active in the target database; otherwise more
4837  * pages could get dirtied.
4838  *
4839  * Note we don't worry about flushing any pages of temporary relations.
4840  * It's assumed these wouldn't be interesting.
4841  * --------------------------------------------------------------------
4842  */
4843 void
4844 FlushDatabaseBuffers(Oid dbid)
4845 {
4846  int i;
4847  BufferDesc *bufHdr;
4848 
4849  for (i = 0; i < NBuffers; i++)
4850  {
4851  uint32 buf_state;
4852 
4853  bufHdr = GetBufferDescriptor(i);
4854 
4855  /*
4856  * As in DropRelationBuffers, an unlocked precheck should be safe and
4857  * saves some cycles.
4858  */
4859  if (bufHdr->tag.dbOid != dbid)
4860  continue;
4861 
4862  /* Make sure we can handle the pin */
4865 
4866  buf_state = LockBufHdr(bufHdr);
4867  if (bufHdr->tag.dbOid == dbid &&
4868  (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
4869  {
4870  PinBuffer_Locked(bufHdr);
4874  UnpinBuffer(bufHdr);
4875  }
4876  else
4877  UnlockBufHdr(bufHdr, buf_state);
4878  }
4879 }
4880 
4881 /*
4882  * Flush a previously pinned buffer, locked in either shared or exclusive
4883  * mode, to the OS.
4884  */
4885 void
4886 FlushOneBuffer(Buffer buffer)
4887 {
4888  BufferDesc *bufHdr;
4889 
4890  /* currently not needed, but no fundamental reason not to support */
4892 
4894 
4895  bufHdr = GetBufferDescriptor(buffer - 1);
4896 
4898 
4900 }
4901 
4902 /*
4903  * ReleaseBuffer -- release the pin on a buffer
4904  */
4905 void
4906 ReleaseBuffer(Buffer buffer)
4907 {
4908  if (!BufferIsValid(buffer))
4909  elog(ERROR, "bad buffer ID: %d", buffer);
4910 
4911  if (BufferIsLocal(buffer))
4913  else
4915 }
4916 
4917 /*
4918  * UnlockReleaseBuffer -- release the content lock and pin on a buffer
4919  *
4920  * This is just a shorthand for a common combination.
4921  */
4922 void
4923 UnlockReleaseBuffer(Buffer buffer)
4924 {
4925  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
4926  ReleaseBuffer(buffer);
4927 }
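/*
 * [Editorial illustration, not part of bufmgr.c]  The canonical shorthand
 * usage: read and pin a page, lock it, modify it, mark it dirty, then drop
 * lock and pin in one call.  "rel", "blkno" and the page edit itself are
 * hypothetical; a real caller would also WAL-log the change.
 */
static void
example_modify_page(Relation rel, BlockNumber blkno)
{
	Buffer		buf = ReadBuffer(rel, blkno);

	LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
	/* ... modify BufferGetPage(buf) here ... */
	MarkBufferDirty(buf);
	UnlockReleaseBuffer(buf);	/* BUFFER_LOCK_UNLOCK + ReleaseBuffer */
}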
4928 
4929 /*
4930  * IncrBufferRefCount
4931  * Increment the pin count on a buffer that we have *already* pinned
4932  * at least once.
4933  *
4934  * This function cannot be used on a buffer we do not have pinned,
4935  * because it doesn't change the shared buffer state.
4936  */
4937 void
4938 IncrBufferRefCount(Buffer buffer)
4939 {
4942  if (BufferIsLocal(buffer))
4943  LocalRefCount[-buffer - 1]++;
4944  else
4945  {
4946  PrivateRefCountEntry *ref;
4947 
4948  ref = GetPrivateRefCountEntry(buffer, true);
4949  Assert(ref != NULL);
4950  ref->refcount++;
4951  }
4953 }
4954 
4955 /*
4956  * MarkBufferDirtyHint
4957  *
4958  * Mark a buffer dirty for non-critical changes.
4959  *
4960  * This is essentially the same as MarkBufferDirty, except:
4961  *
4962  * 1. The caller does not write WAL; so if checksums are enabled, we may need
4963  * to write an XLOG_FPI_FOR_HINT WAL record to protect against torn pages.
4964  * 2. The caller might have only share-lock instead of exclusive-lock on the
4965  * buffer's content lock.
4966  * 3. This function does not guarantee that the buffer is always marked dirty
4967  * (due to a race condition), so it cannot be used for important changes.
4968  */
4969 void
4970 MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
4971 {
4972  BufferDesc *bufHdr;
4973  Page page = BufferGetPage(buffer);
4974 
4975  if (!BufferIsValid(buffer))
4976  elog(ERROR, "bad buffer ID: %d", buffer);
4977 
4978  if (BufferIsLocal(buffer))
4979  {
4981  return;
4982  }
4983 
4984  bufHdr = GetBufferDescriptor(buffer - 1);
4985 
4987  /* here, either share or exclusive lock is OK */
4989 
4990  /*
4991  * This routine might get called many times on the same page, if we are
4992  * making the first scan after commit of an xact that added/deleted many
4993  * tuples. So, be as quick as we can if the buffer is already dirty. We
4994  * do this by not acquiring spinlock if it looks like the status bits are
4995  * already set. Since we make this test unlocked, there's a chance we
4996  * might fail to notice that the flags have just been cleared, and failed
4997  * to reset them, due to memory-ordering issues. But since this function
4998  * is only intended to be used in cases where failing to write out the
4999  * data would be harmless anyway, it doesn't really matter.
5000  */
5001  if ((pg_atomic_read_u32(&bufHdr->state) & (BM_DIRTY | BM_JUST_DIRTIED)) !=
5003  {
5005  bool dirtied = false;
5006  bool delayChkptFlags = false;
5007  uint32 buf_state;
5008 
5009  /*
5010  * If we need to protect hint bit updates from torn writes, WAL-log a
5011  * full page image of the page. This full page image is only necessary
5012  * if the hint bit update is the first change to the page since the
5013  * last checkpoint.
5014  *
5015  * We don't check full_page_writes here because that logic is included
5016  * when we call XLogInsert() since the value changes dynamically.
5017  */
5018  if (XLogHintBitIsNeeded() &&
5019  (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT))
5020  {
5021  /*
5022  * If we must not write WAL, due to a relfilelocator-specific
5023  * condition or being in recovery, don't dirty the page. We can
5024  * set the hint, just not dirty the page as a result so the hint
5025  * is lost when we evict the page or shutdown.
5026  *
5027  * See src/backend/storage/page/README for longer discussion.
5028  */
5029  if (RecoveryInProgress() ||
5031  return;
5032 
5033  /*
5034  * If the block is already dirty because we either made a change
5035  * or set a hint already, then we don't need to write a full page
5036  * image. Note that aggressive cleaning of blocks dirtied by hint
5037  * bit setting would increase the call rate. Bulk setting of hint
5038  * bits would reduce the call rate...
5039  *
5040  * We must issue the WAL record before we mark the buffer dirty.
5041  * Otherwise we might write the page before we write the WAL. That
5042  * causes a race condition, since a checkpoint might occur between
5043  * writing the WAL record and marking the buffer dirty. We solve
5044  * that with a kluge, but one that is already in use during
5045  * transaction commit to prevent race conditions. Basically, we
5046  * simply prevent the checkpoint WAL record from being written
5047  * until we have marked the buffer dirty. We don't start the
5048  * checkpoint flush until we have marked dirty, so our checkpoint
5049  * must flush the change to disk successfully or the checkpoint
5050  * never gets written, in which case crash recovery will fix things.
5051  *
5052  * It's possible we may enter here without an xid, so it is
5053  * essential that CreateCheckPoint waits for virtual transactions
5054  * rather than full transactionids.
5055  */
5058  delayChkptFlags = true;
5059  lsn = XLogSaveBufferForHint(buffer, buffer_std);
5060  }
5061 
5062  buf_state = LockBufHdr(bufHdr);
5063 
5064  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
5065 
5066  if (!(buf_state & BM_DIRTY))
5067  {
5068  dirtied = true; /* Means "will be dirtied by this action" */
5069 
5070  /*
5071  * Set the page LSN if we wrote a backup block. We aren't supposed
5072  * to set this when only holding a share lock but as long as we
5073  * serialise it somehow we're OK. We choose to set LSN while
5074  * holding the buffer header lock, which causes any reader of an
5075  * LSN who holds only a share lock to also obtain a buffer header
5076  * lock before using PageGetLSN(), which is enforced in
5077  * BufferGetLSNAtomic().
5078  *
5079  * If checksums are enabled, you might think we should reset the
5080  * checksum here. That will happen when the page is written
5081  * sometime later in this checkpoint cycle.
5082  */
5083  if (!XLogRecPtrIsInvalid(lsn))
5084  PageSetLSN(page, lsn);
5085  }
5086 
5087  buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
5088  UnlockBufHdr(bufHdr, buf_state);
5089 
5090  if (delayChkptFlags)
5092 
5093  if (dirtied)
5094  {
5096  if (VacuumCostActive)
5098  }
5099  }
5100 }
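/*
 * [Editorial illustration, not part of bufmgr.c]  Typical hint-bit usage, as
 * in heap visibility checks: the caller holds only a pin and a share lock,
 * sets an informational bit, and calls MarkBufferDirtyHint() rather than
 * MarkBufferDirty() because no WAL record is written for the change itself.
 * "tuple" is a hypothetical HeapTupleHeader on the page held in "buffer";
 * would need access/htup_details.h.
 */
static void
example_set_hint(HeapTupleHeader tuple, Buffer buffer)
{
	tuple->t_infomask |= HEAP_XMIN_COMMITTED;	/* non-critical hint */
	MarkBufferDirtyHint(buffer, true);	/* page has a standard layout */
}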
5101 
5102 /*
5103  * Release buffer content locks for shared buffers.
5104  *
5105  * Used to clean up after errors.
5106  *
5107  * Currently, we can expect that lwlock.c's LWLockReleaseAll() took care
5108  * of releasing buffer content locks per se; the only thing we need to deal
5109  * with here is clearing any PIN_COUNT request that was in progress.
5110  */
5111 void
5112 UnlockBuffers(void)
5113 {
5115 
5116  if (buf)
5117  {
5118  uint32 buf_state;
5119 
5120  buf_state = LockBufHdr(buf);
5121 
5122  /*
5123  * Don't complain if flag bit not set; it could have been reset but we
5124  * got a cancel/die interrupt before getting the signal.
5125  */
5126  if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
5127  buf->wait_backend_pgprocno == MyProcNumber)
5128  buf_state &= ~BM_PIN_COUNT_WAITER;
5129 
5130  UnlockBufHdr(buf, buf_state);
5131 
5132  PinCountWaitBuf = NULL;
5133  }
5134 }
5135 
5136 /*
5137  * Acquire or release the content_lock for the buffer.
5138  */
5139 void
5140 LockBuffer(Buffer buffer, int mode)
5141 {
5142  BufferDesc *buf;
5143 
5145  if (BufferIsLocal(buffer))
5146  return; /* local buffers need no lock */
5147 
5149 
5150  if (mode == BUFFER_LOCK_UNLOCK)
5152  else if (mode == BUFFER_LOCK_SHARE)
5154  else if (mode == BUFFER_LOCK_EXCLUSIVE)
5156  else
5157  elog(ERROR, "unrecognized buffer lock mode: %d", mode);
5158 }
5159 
5160 /*
5161  * Acquire the content_lock for the buffer, but only if we don't have to wait.
5162  *
5163  * This assumes the caller wants BUFFER_LOCK_EXCLUSIVE mode.
5164  */
5165 bool
5166 ConditionalLockBuffer(Buffer buffer)
5167 {
5168  BufferDesc *buf;
5169 
5171  if (BufferIsLocal(buffer))
5172  return true; /* act as though we got it */
5173 
5175 
5177  LW_EXCLUSIVE);
5178 }
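/*
 * [Editorial illustration, not part of bufmgr.c]  Opportunistic locking with
 * ConditionalLockBuffer(): if the exclusive content lock can't be had
 * immediately, skip the page instead of blocking.  The work done under the
 * lock is left hypothetical.
 */
static bool
example_try_exclusive(Buffer buffer)
{
	if (!ConditionalLockBuffer(buffer))
		return false;			/* somebody else holds the content lock */

	/* ... do exclusive-lock-only work on BufferGetPage(buffer) ... */

	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
	return true;
}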
5179 
5180 /*
5181  * Verify that this backend is pinning the buffer exactly once.
5182  *
5183  * NOTE: Like in BufferIsPinned(), what we check here is that *this* backend
5184  * holds a pin on the buffer. We do not care whether some other backend does.
5185  */
5186 void
5187 CheckBufferIsPinnedOnce(Buffer buffer)
5188 {
5189  if (BufferIsLocal(buffer))
5190  {
5191  if (LocalRefCount[-buffer - 1] != 1)
5192  elog(ERROR, "incorrect local pin count: %d",
5193  LocalRefCount[-buffer - 1]);
5194  }
5195  else
5196  {
5197  if (GetPrivateRefCount(buffer) != 1)
5198  elog(ERROR, "incorrect local pin count: %d",
5200  }
5201 }
5202 
5203 /*
5204  * LockBufferForCleanup - lock a buffer in preparation for deleting items
5205  *
5206  * Items may be deleted from a disk page only when the caller (a) holds an
5207  * exclusive lock on the buffer and (b) has observed that no other backend
5208  * holds a pin on the buffer. If there is a pin, then the other backend
5209  * might have a pointer into the buffer (for example, a heapscan reference
5210  * to an item --- see README for more details). It's OK if a pin is added
5211  * after the cleanup starts, however; the newly-arrived backend will be
5212  * unable to look at the page until we release the exclusive lock.
5213  *
5214  * To implement this protocol, a would-be deleter must pin the buffer and
5215  * then call LockBufferForCleanup(). LockBufferForCleanup() is similar to
5216  * LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE), except that it loops until
5217  * it has successfully observed pin count = 1.
5218  */
5219 void
5220 LockBufferForCleanup(Buffer buffer)
5221 {
5222  BufferDesc *bufHdr;
5223  TimestampTz waitStart = 0;
5224  bool waiting = false;
5225  bool logged_recovery_conflict = false;
5226 
5228  Assert(PinCountWaitBuf == NULL);
5229 
5231 
5232  /* Nobody else to wait for */
5233  if (BufferIsLocal(buffer))
5234  return;
5235 
5236  bufHdr = GetBufferDescriptor(buffer - 1);
5237 
5238  for (;;)
5239  {
5240  uint32 buf_state;
5241 
5242  /* Try to acquire lock */
5244  buf_state = LockBufHdr(bufHdr);
5245 
5246  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
5247  if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
5248  {
5249  /* Successfully acquired exclusive lock with pincount 1 */
5250  UnlockBufHdr(bufHdr, buf_state);
5251 
5252  /*
5253  * Emit the log message if recovery conflict on buffer pin was
5254  * resolved but the startup process waited longer than
5255  * deadlock_timeout for it.
5256  */
5257  if (logged_recovery_conflict)
5259  waitStart, GetCurrentTimestamp(),
5260  NULL, false);
5261 
5262  if (waiting)
5263  {
5264  /* reset ps display to remove the suffix if we added one */
5266  waiting = false;
5267  }
5268  return;
5269  }
5270  /* Failed, so mark myself as waiting for pincount 1 */
5271  if (buf_state & BM_PIN_COUNT_WAITER)
5272  {
5273  UnlockBufHdr(bufHdr, buf_state);
5275  elog(ERROR, "multiple backends attempting to wait for pincount 1");
5276  }
5278  PinCountWaitBuf = bufHdr;
5279  buf_state |= BM_PIN_COUNT_WAITER;
5280  UnlockBufHdr(bufHdr, buf_state);
5282 
5283  /* Wait to be signaled by UnpinBuffer() */
5284  if (InHotStandby)
5285  {
5286  if (!waiting)
5287  {
5288  /* adjust the process title to indicate that it's waiting */
5289  set_ps_display_suffix("waiting");
5290  waiting = true;
5291  }
5292 
5293  /*
5294  * Emit the log message if the startup process is waiting longer
5295  * than deadlock_timeout for recovery conflict on buffer pin.
5296  *
5297  * Skip this if first time through because the startup process has
5298  * not started waiting yet in this case. So, the wait start
5299  * timestamp is set after this logic.
5300  */
5301  if (waitStart != 0 && !logged_recovery_conflict)
5302  {
5304 
5305  if (TimestampDifferenceExceeds(waitStart, now,
5306  DeadlockTimeout))
5307  {
5309  waitStart, now, NULL, true);
5310  logged_recovery_conflict = true;
5311  }
5312  }
5313 
5314  /*
5315  * Set the wait start timestamp if logging is enabled and first
5316  * time through.
5317  */
5318  if (log_recovery_conflict_waits && waitStart == 0)
5319  waitStart = GetCurrentTimestamp();
5320 
5321  /* Publish the bufid that Startup process waits on */
5323  /* Set alarm and then wait to be signaled by UnpinBuffer() */
5325  /* Reset the published bufid */
5327  }
5328  else
5329  ProcWaitForSignal(WAIT_EVENT_BUFFER_PIN);
5330 
5331  /*
5332  * Remove flag marking us as waiter. Normally this will not be set
5333  * anymore, but ProcWaitForSignal() can return for other signals as
5334  * well. We take care to only reset the flag if we're the waiter, as
5335  * theoretically another backend could have started waiting. That's
5336  * impossible with the current usages due to table level locking, but
5337  * better be safe.
5338  */
5339  buf_state = LockBufHdr(bufHdr);
5340  if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
5342  buf_state &= ~BM_PIN_COUNT_WAITER;
5343  UnlockBufHdr(bufHdr, buf_state);
5344 
5345  PinCountWaitBuf = NULL;
5346  /* Loop back and try again */
5347  }
5348 }
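/*
 * [Editorial illustration, not part of bufmgr.c]  The would-be deleter
 * protocol described above: pin first, then wait for a cleanup lock (an
 * exclusive lock observed with pin count 1).  "rel", "blkno" and "strategy"
 * come from a hypothetical caller.
 */
static void
example_cleanup_page(Relation rel, BlockNumber blkno,
					 BufferAccessStrategy strategy)
{
	Buffer		buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
										 RBM_NORMAL, strategy);

	LockBufferForCleanup(buf);	/* may wait for other pins to go away */
	/* ... now safe to delete or defragment items on the page ... */
	UnlockReleaseBuffer(buf);
}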
5349 
5350 /*
5351  * Check called from ProcessRecoveryConflictInterrupts() when Startup process
5352  * requests cancellation of all pin holders that are blocking it.
5353  */
5354 bool
5355 HoldingBufferPinThatDelaysRecovery(void)
5356 {
5357  int bufid = GetStartupBufferPinWaitBufId();
5358 
5359  /*
5360  * If we get woken slowly then it's possible that the Startup process was
5361  * already woken by other backends before we got here. Also possible that
5362  * we get here by multiple interrupts or interrupts at inappropriate
5363  * times, so make sure we do nothing if the bufid is not set.
5364  */
5365  if (bufid < 0)
5366  return false;
5367 
5368  if (GetPrivateRefCount(bufid + 1) > 0)
5369  return true;
5370 
5371  return false;
5372 }
5373 
5374 /*
5375  * ConditionalLockBufferForCleanup - as above, but don't wait to get the lock
5376  *
5377  * We won't loop, but just check once to see if the pin count is OK. If
5378  * not, return false with no lock held.
5379  */
5380 bool
5381 ConditionalLockBufferForCleanup(Buffer buffer)
5382 {
5383  BufferDesc *bufHdr;
5384  uint32 buf_state,
5385  refcount;
5386 
5388 
5389  if (BufferIsLocal(buffer))
5390  {
5391  refcount = LocalRefCount[-buffer - 1];
5392  /* There should be exactly one pin */
5393  Assert(refcount > 0);
5394  if (refcount != 1)
5395  return false;
5396  /* Nobody else to wait for */
5397  return true;
5398  }
5399 
5400  /* There should be exactly one local pin */
5402  Assert(refcount);
5403  if (refcount != 1)
5404  return false;
5405 
5406  /* Try to acquire lock */
5408  return false;
5409 
5410  bufHdr = GetBufferDescriptor(buffer - 1);
5411  buf_state = LockBufHdr(bufHdr);
5412  refcount = BUF_STATE_GET_REFCOUNT(buf_state);
5413 
5414  Assert(refcount > 0);
5415  if (refcount == 1)
5416  {
5417  /* Successfully acquired exclusive lock with pincount 1 */
5418  UnlockBufHdr(bufHdr, buf_state);
5419  return true;
5420  }
5421 
5422  /* Failed, so release the lock */
5423  UnlockBufHdr(bufHdr, buf_state);
5425  return false;
5426 }
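/*
 * [Editorial illustration, not part of bufmgr.c]  The non-blocking variant
 * as used by opportunistic cleanup (e.g. a vacuum-style caller): if a
 * cleanup lock isn't available right now, release the pin and move on.
 */
static bool
example_try_cleanup(Buffer buf)
{
	if (!ConditionalLockBufferForCleanup(buf))
	{
		ReleaseBuffer(buf);		/* give up; caller can retry the block later */
		return false;
	}

	/* ... cleanup work here; pin count was observed to be exactly 1 ... */

	UnlockReleaseBuffer(buf);
	return true;
}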
5427 
5428 /*
5429  * IsBufferCleanupOK - as above, but we already have the lock
5430  *
5431  * Check whether it's OK to perform cleanup on a buffer we've already
5432  * locked. If we observe that the pin count is 1, our exclusive lock
5433  * happens to be a cleanup lock, and we can proceed with anything that
5434  * would have been allowable had we sought a cleanup lock originally.
5435  */
5436 bool
5437 IsBufferCleanupOK(Buffer buffer)
5438 {
5439  BufferDesc *bufHdr;
5440  uint32 buf_state;
5441 
5443 
5444  if (BufferIsLocal(buffer))
5445  {
5446  /* There should be exactly one pin */
5447  if (LocalRefCount[-buffer - 1] != 1)
5448  return false;
5449  /* Nobody else to wait for */
5450  return true;
5451  }
5452 
5453  /* There should be exactly one local pin */
5454  if (GetPrivateRefCount(buffer) != 1)
5455  return false;
5456 
5457  bufHdr = GetBufferDescriptor(buffer - 1);
5458 
5459  /* caller must hold exclusive lock on buffer */
5461  LW_EXCLUSIVE));
5462 
5463  buf_state = LockBufHdr(bufHdr);
5464 
5465  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
5466  if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
5467  {
5468  /* pincount is OK. */
5469  UnlockBufHdr(bufHdr, buf_state);
5470  return true;
5471  }
5472 
5473  UnlockBufHdr(bufHdr, buf_state);
5474  return false;
5475 }
5476 
5477 
5478 /*
5479  * Functions for buffer I/O handling
5480  *
5481  * Note: We assume that nested buffer I/O never occurs.
5482  * i.e., at most one BM_IO_IN_PROGRESS bit is set per proc.
5483  *
5484  * Also note that these are used only for shared buffers, not local ones.
5485  */
5486 
5487 /*
5488  * WaitIO -- Block until the IO_IN_PROGRESS flag on 'buf' is cleared.
5489  */
5490 static void
5491 WaitIO(BufferDesc *buf)
5492 {
5494 
5496  for (;;)
5497  {
5498  uint32 buf_state;
5499 
5500  /*
5501  * It may not be necessary to acquire the spinlock to check the flag
5502  * here, but since this test is essential for correctness, we'd better
5503  * play it safe.
5504  */
5505  buf_state = LockBufHdr(buf);
5506  UnlockBufHdr(buf, buf_state);
5507 
5508  if (!(buf_state & BM_IO_IN_PROGRESS))
5509  break;
5510  ConditionVariableSleep(cv, WAIT_EVENT_BUFFER_IO);
5511  }
5513 }
5514 
5515 /*
5516  * StartBufferIO: begin I/O on this buffer
5517  * (Assumptions)
5518  * My process is executing no IO
5519  * The buffer is Pinned
5520  *
5521  * In some scenarios there are race conditions in which multiple backends
5522  * could attempt the same I/O operation concurrently. If someone else
5523  * has already started I/O on this buffer then we will block on the
5524  * I/O condition variable until they're done.
5525  *
5526  * Input operations are only attempted on buffers that are not BM_VALID,
5527  * and output operations only on buffers that are BM_VALID and BM_DIRTY,
5528  * so we can always tell if the work is already done.
5529  *
5530  * Returns true if we successfully marked the buffer as I/O busy,
5531  * false if someone else already did the work.
5532  *
5533  * If nowait is true, then we don't wait for an I/O to be finished by another
5534  * backend. In that case, false indicates either that the I/O was already
5535  * finished, or is still in progress. This is useful for callers that want to
5536  * find out if they can perform the I/O as part of a larger operation, without
5537  * waiting for the answer or distinguishing the reasons why not.
5538  */
5539 static bool
5540 StartBufferIO(BufferDesc *buf, bool forInput, bool nowait)
5541 {
5542  uint32 buf_state;
5543 
5545 
5546  for (;;)
5547  {
5548  buf_state = LockBufHdr(buf);
5549 
5550  if (!(buf_state & BM_IO_IN_PROGRESS))
5551  break;
5552  UnlockBufHdr(buf, buf_state);
5553  if (nowait)
5554  return false;
5555  WaitIO(buf);
5556  }
5557 
5558  /* Once we get here, there is definitely no I/O active on this buffer */
5559 
5560  if (forInput ? (buf_state & BM_VALID) : !(buf_state & BM_DIRTY))
5561  {
5562  /* someone else already did the I/O */
5563  UnlockBufHdr(buf, buf_state);
5564  return false;
5565  }
5566 
5567  buf_state |= BM_IO_IN_PROGRESS;
5568  UnlockBufHdr(buf, buf_state);
5569 
5572 
5573  return true;
5574 }
5575 
5576 /*
5577  * TerminateBufferIO: release a buffer we were doing I/O on
5578  * (Assumptions)
5579  * My process is executing IO for the buffer
5580  * BM_IO_IN_PROGRESS bit is set for the buffer
5581  * The buffer is Pinned
5582  *
5583  * If clear_dirty is true and BM_JUST_DIRTIED is not set, we clear the
5584  * buffer's BM_DIRTY flag. This is appropriate when terminating a
5585  * successful write. The check on BM_JUST_DIRTIED is necessary to avoid
5586  * marking the buffer clean if it was re-dirtied while we were writing.
5587  *
5588  * set_flag_bits gets ORed into the buffer's flags. It must include
5589  * BM_IO_ERROR in a failure case. For successful completion it could
5590  * be 0, or BM_VALID if we just finished reading in the page.
5591  *
5592  * If forget_owner is true, we release the buffer I/O from the current
5593  * resource owner. (forget_owner=false is used when the resource owner itself
5594  * is being released)
5595  */
5596 static void
5597 TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits,
5598  bool forget_owner)
5599 {
5600  uint32 buf_state;
5601 
5602  buf_state = LockBufHdr(buf);
5603 
5604  Assert(buf_state & BM_IO_IN_PROGRESS);
5605 
5606  buf_state &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR);
5607  if (clear_dirty && !(buf_state & BM_JUST_DIRTIED))
5608  buf_state &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
5609 
5610  buf_state |= set_flag_bits;
5611  UnlockBufHdr(buf, buf_state);
5612 
5613  if (forget_owner)
5614  ResourceOwnerForgetBufferIO(CurrentResourceOwner,
5615  BufferDescriptorGetBuffer(buf));
5616 
5617  ConditionVariableBroadcast(BufferDescriptorGetIOCV(buf));
5618 }
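/*
 * Editor's note: extending the hypothetical FakeBuffer sketch above, this
 * shows why the BM_JUST_DIRTIED check matters.  The writer clears the
 * "just dirtied" bit before it copies the page (bufmgr.c does this in
 * FlushBuffer, under the header spinlock); any concurrent modification
 * sets it again, so the terminate step clears DIRTY only when the write
 * is known to have covered every change.
 */
#define FAKE_JUST_DIRTIED 0x08

/* Writer: about to snapshot the page for write-out. */
static void
fake_begin_output(FakeBuffer *buf)
{
	pthread_mutex_lock(&buf->lock);
	buf->flags |= FAKE_IO_IN_PROGRESS;
	buf->flags &= ~FAKE_JUST_DIRTIED;	/* re-dirtiers will set it again */
	pthread_mutex_unlock(&buf->lock);
	/* ... copy the page and issue the write here ... */
}

/* Writer: the write finished; keep DIRTY if the page changed meanwhile. */
static void
fake_terminate_output(FakeBuffer *buf)
{
	pthread_mutex_lock(&buf->lock);
	buf->flags &= ~FAKE_IO_IN_PROGRESS;
	if (!(buf->flags & FAKE_JUST_DIRTIED))
		buf->flags &= ~FAKE_DIRTY;		/* write covered every change */
	pthread_cond_broadcast(&buf->io_done);
	pthread_mutex_unlock(&buf->lock);
}

/* Concurrent modifier (compare MarkBufferDirty): set both dirty bits. */
static void
fake_mark_dirty(FakeBuffer *buf)
{
	pthread_mutex_lock(&buf->lock);
	buf->flags |= FAKE_DIRTY | FAKE_JUST_DIRTIED;
	pthread_mutex_unlock(&buf->lock);
}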
5619 
5620 /*
5621  * AbortBufferIO: Clean up active buffer I/O after an error.
5622  *
5623  * All LWLocks we might have held have been released,
5624  * but we haven't yet released buffer pins, so the buffer is still pinned.
5625  *
5626  * If I/O was in progress, we always set BM_IO_ERROR, even though it's
5627  * possible the error condition wasn't related to the I/O.
5628  *
5629  * Note: this does not remove the buffer I/O from the resource owner.
5630  * That's correct when we're releasing the whole resource owner, but
5631  * beware if you use this in other contexts.
5632  */
5633 static void
5634 AbortBufferIO(Buffer buffer)
5635 {
5636  BufferDesc *buf_hdr = GetBufferDescriptor(buffer - 1);
5637  uint32 buf_state;
5638 
5639  buf_state = LockBufHdr(buf_hdr);
5640  Assert(buf_state & (BM_IO_IN_PROGRESS | BM_TAG_VALID));
5641 
5642  if (!(buf_state & BM_VALID))
5643  {
5644  Assert(!(buf_state & BM_DIRTY));
5645  UnlockBufHdr(buf_hdr, buf_state);
5646  }
5647  else
5648  {
5649  Assert(buf_state & BM_DIRTY);
5650  UnlockBufHdr(buf_hdr, buf_state);
5651 
5652  /* Issue notice if this is not the first failure... */
5653  if (buf_state & BM_IO_ERROR)
5654  {
5655  /* Buffer is pinned, so we can read tag without spinlock */
5656  char *path;
5657 
5658  path = relpathperm(BufTagGetRelFileLocator(&buf_hdr->tag),
5659  BufTagGetForkNum(&buf_hdr->tag));
5660  ereport(WARNING,
5661  (errcode(ERRCODE_IO_ERROR),
5662  errmsg("could not write block %u of %s",
5663  buf_hdr->tag.blockNum, path),
5664  errdetail("Multiple failures --- write error might be permanent.")));
5665  pfree(path);
5666  }
5667  }
5668 
5669  TerminateBufferIO(buf_hdr, false, BM_IO_ERROR, false);
5670 }
5671 
5672 /*
5673  * Error context callback for errors occurring during shared buffer writes.
5674  */
5675 static void
5676 shared_buffer_write_error_callback(void *arg)
5677 {
5678  BufferDesc *bufHdr = (BufferDesc *) arg;
5679 
5680  /* Buffer is pinned, so we can read the tag without locking the spinlock */
5681  if (bufHdr != NULL)
5682  {
5683  char *path = relpathperm(BufTagGetRelFileLocator(&bufHdr->tag),
5684  BufTagGetForkNum(&bufHdr->tag));
5685 
5686  errcontext("writing block %u of relation %s",
5687  bufHdr->tag.blockNum, path);
5688  pfree(path);
5689  }
5690 }
5691 
5692 /*
5693  * Error context callback for errors occurring during local buffer writes.
5694  */
5695 static void
5696 local_buffer_write_error_callback(void *arg)
5697 {
5698  BufferDesc *bufHdr = (BufferDesc *) arg;
5699 
5700  if (bufHdr != NULL)
5701  {
5702  char *path = relpathbackend(BufTagGetRelFileLocator(&bufHdr->tag),
5703  MyProcNumber,
5704  BufTagGetForkNum(&bufHdr->tag));
5705 
5706  errcontext("writing block %u of relation %s",
5707  bufHdr->tag.blockNum, path);
5708  pfree(path);
5709  }
5710 }
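/*
 * Editor's note: a condensed sketch of how the two callbacks above are
 * installed.  This mirrors the push/pop pattern that FlushBuffer uses
 * around the actual write; the function name here is hypothetical and
 * everything between push and pop is elided.
 */
static void
flush_one_buffer_sketch(BufferDesc *buf)
{
	ErrorContextCallback errcallback;

	/* Push: any ereport() during the write will mention this block */
	errcallback.callback = shared_buffer_write_error_callback;
	errcallback.arg = (void *) buf;
	errcallback.previous = error_context_stack;
	error_context_stack = &errcallback;

	/* ... write the page out via smgr here ... */

	/* Pop the error context callback */
	error_context_stack = errcallback.previous;
}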
5711 
5712 /*
5713  * RelFileLocator qsort/bsearch comparator; see RelFileLocatorEquals.
5714  */
5715 static int
5716 rlocator_comparator(const void *p1, const void *p2)
5717 {
5718  RelFileLocator n1 = *(const RelFileLocator *) p1;
5719  RelFileLocator n2 = *(const RelFileLocator *) p2;
5720 
5721  if (n1.relNumber < n2.relNumber)
5722  return -1;
5723  else if (n1.relNumber > n2.relNumber)
5724  return 1;
5725 
5726  if (n1.dbOid < n2.dbOid)
5727  return -1;
5728  else if (n1.dbOid > n2.dbOid)
5729  return 1;
5730 
5731  if (n1.spcOid < n2.spcOid)
5732  return -1;
5733  else if (n1.spcOid > n2.spcOid)
5734  return 1;
5735  else
5736  return 0;
5737 }
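/*
 * Editor's note: a usage sketch for the comparator.  Callers such as
 * DropRelationsAllBuffers sort their RelFileLocator array once and then
 * probe it with bsearch() while scanning the buffer pool; the function
 * name below is hypothetical, and the per-buffer header locking, tag
 * re-check and invalidation steps are elided.
 */
#include <stdlib.h>

static void
scan_buffers_for_locators_sketch(RelFileLocator *locators, int nlocators)
{
	qsort(locators, nlocators, sizeof(RelFileLocator), rlocator_comparator);

	for (int i = 0; i < NBuffers; i++)
	{
		BufferDesc *bufHdr = GetBufferDescriptor(i);
		RelFileLocator tag_locator = BufTagGetRelFileLocator(&bufHdr->tag);

		if (bsearch(&tag_locator, locators, nlocators,
					sizeof(RelFileLocator), rlocator_comparator) != NULL)
		{
			/* ... lock the header, re-check the tag, invalidate ... */
		}
	}
}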
5738 
5739 /*
5740  * Lock buffer header - set BM_LOCKED in buffer state.
5741  */
5742 uint32
5743 LockBufHdr(BufferDesc *desc)
5744 {
5745  SpinDelayStatus delayStatus;
5746  uint32 old_buf_state;
5747 
5748  Assert(!BufferIsLocal(BufferDescriptorGetBuffer(desc)));
5749 
5750  init_local_spin_delay(&delayStatus);
5751 
5752  while (true)
5753  {
5754  /* set BM_LOCKED flag */
5755  old_buf_state = pg_atomic_fetch_or_u32(&desc->state, BM_LOCKED);
5756  /* if it wasn't set before we're OK */
5757  if (!(old_buf_state & BM_LOCKED))
5758  break;
5759  perform_spin_delay(&delayStatus);
5760  }
5761  finish_spin_delay(&delayStatus);
5762  return old_buf_state | BM_LOCKED;
5763 }
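/*
 * Editor's note: a standalone model of the locking idiom above, using C11
 * <stdatomic.h> in place of pg_atomic_fetch_or_u32() and sched_yield() in
 * place of perform_spin_delay()'s adaptive backoff.  The my_* names and
 * the bit chosen for the lock flag are hypothetical.
 */
#include <stdatomic.h>
#include <stdint.h>
#include <sched.h>

#define MY_LOCKED	(1u << 31)

/* Spin until we are the thread that flipped the lock bit from 0 to 1. */
static uint32_t
my_lock_hdr(_Atomic uint32_t *state)
{
	uint32_t old_state;

	for (;;)
	{
		old_state = atomic_fetch_or(state, MY_LOCKED);
		if (!(old_state & MY_LOCKED))
			break;				/* bit was clear before: we own the lock */
		sched_yield();
	}
	return old_state | MY_LOCKED;
}

/* Release by publishing a new state value with the lock bit cleared. */
static void
my_unlock_hdr(_Atomic uint32_t *state, uint32_t new_state)
{
	atomic_store(state, new_state & ~MY_LOCKED);
}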
5764 
5765 /*
5766  * Wait until the BM_LOCKED flag isn't set anymore and return the buffer's
5767  * state at that point.
5768  *
5769  * Obviously the buffer could be locked by the time the value is returned, so
5770  * this is primarily useful in CAS style loops.
5771  */
5772 static uint32
5773 WaitBufHdrUnlocked(BufferDesc *buf)
5774 {
5775  SpinDelayStatus delayStatus;
5776  uint32 buf_state;
5777 
5778  init_local_spin_delay(&delayStatus);
5779 
5780  buf_state = pg_atomic_read_u32(&buf->state);
5781 
5782  while (buf_state & BM_LOCKED)
5783  {
5784  perform_spin_delay(&delayStatus);
5785  buf_state = pg_atomic_read_u32(&buf->state);