bufmgr.c
1 /*-------------------------------------------------------------------------
2  *
3  * bufmgr.c
4  * buffer manager interface routines
5  *
6  * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/storage/buffer/bufmgr.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 /*
16  * Principal entry points:
17  *
18  * ReadBuffer() -- find or create a buffer holding the requested page,
19  * and pin it so that no one can destroy it while this process
20  * is using it.
21  *
22  * ReleaseBuffer() -- unpin a buffer
23  *
24  * MarkBufferDirty() -- mark a pinned buffer's contents as "dirty".
25  * The disk write is delayed until buffer replacement or checkpoint.
26  *
27  * See also these files:
28  * freelist.c -- chooses victim for buffer replacement
29  * buf_table.c -- manages the buffer lookup table
30  */
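
/*
 * [Editor's note: illustrative sketch, not part of bufmgr.c; the example_*
 * helper names are hypothetical.]  A typical read-only caller of the entry
 * points listed above pins the page, takes a shared content lock while
 * examining it, and then unpins it.  LockBuffer(), BufferGetPage() and
 * UnlockReleaseBuffer() come from storage/bufmgr.h.
 */
static void
example_examine_block(Relation rel, BlockNumber blkno)
{
	Buffer		buf = ReadBuffer(rel, blkno);	/* pins the page */
	Page		page;

	LockBuffer(buf, BUFFER_LOCK_SHARE);
	page = BufferGetPage(buf);
	/* ... read-only inspection of the page contents ... */
	(void) page;

	UnlockReleaseBuffer(buf);	/* drops the content lock and the pin */
}
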
31 #include "postgres.h"
32 
33 #include <sys/file.h>
34 #include <unistd.h>
35 
36 #include "access/tableam.h"
37 #include "access/xlog.h"
38 #include "catalog/catalog.h"
39 #include "catalog/storage.h"
40 #include "executor/instrument.h"
41 #include "lib/binaryheap.h"
42 #include "miscadmin.h"
43 #include "pg_trace.h"
44 #include "pgstat.h"
45 #include "postmaster/bgwriter.h"
46 #include "storage/buf_internals.h"
47 #include "storage/bufmgr.h"
48 #include "storage/ipc.h"
49 #include "storage/proc.h"
50 #include "storage/smgr.h"
51 #include "storage/standby.h"
52 #include "utils/memdebug.h"
53 #include "utils/ps_status.h"
54 #include "utils/rel.h"
55 #include "utils/resowner_private.h"
56 #include "utils/timestamp.h"
57 
58 
59 /* Note: these two macros only work on shared buffers, not local ones! */
60 #define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
61 #define BufferGetLSN(bufHdr) (PageGetLSN(BufHdrGetBlock(bufHdr)))
62 
63 /* Note: this macro only works on local buffers, not shared ones! */
64 #define LocalBufHdrGetBlock(bufHdr) \
65  LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
66 
67 /* Bits in SyncOneBuffer's return value */
68 #define BUF_WRITTEN 0x01
69 #define BUF_REUSABLE 0x02
70 
71 #define RELS_BSEARCH_THRESHOLD 20
72 
73 typedef struct PrivateRefCountEntry
74 {
75  Buffer buffer;
76  int32 refcount;
77 } PrivateRefCountEntry;
78 
79 /* 64 bytes, about the size of a cache line on common systems */
80 #define REFCOUNT_ARRAY_ENTRIES 8
81 
82 /*
83  * Status of buffers to checkpoint for a particular tablespace, used
84  * internally in BufferSync.
85  */
86 typedef struct CkptTsStatus
87 {
88  /* oid of the tablespace */
89  Oid tsId;
90 
91  /*
92  * Checkpoint progress for this tablespace. To make progress comparable
93  * between tablespaces the progress is, for each tablespace, measured as a
94  * number between 0 and the total number of to-be-checkpointed pages. Each
95  * page checkpointed in this tablespace increments this space's progress
96  * by progress_slice.
97  */
98  float8 progress;
99  float8 progress_slice;
100 
101  /* number of to-be checkpointed pages in this tablespace */
102  int num_to_scan;
103  /* already processed pages in this tablespace */
104  int num_scanned;
105 
106  /* current offset in CkptBufferIds for this tablespace */
107  int index;
108 } CkptTsStatus;
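
/*
 * [Editor's note: illustrative sketch, not part of bufmgr.c.]  Worked example
 * of the progress accounting described above: if the checkpoint has 1000
 * pages to write in total and 250 of them live in this tablespace, then
 * progress_slice = 1000 / 250 = 4.0, so writing all 250 pages advances this
 * tablespace's progress to 1000, directly comparable with the progress of
 * every other tablespace.
 */
static inline float8
example_progress_slice(int total_to_scan, int ts_num_to_scan)
{
	return (float8) total_to_scan / ts_num_to_scan;
}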
109 
110 /*
111  * Type for array used to sort SMgrRelations
112  *
113  * FlushRelationsAllBuffers shares the same comparator function with
114  * DropRelFileNodesAllBuffers. Pointer to this struct and RelFileNode must be
115  * compatible.
116  */
117 typedef struct SMgrSortArray
118 {
119  RelFileNode rnode; /* This must be the first member */
120  SMgrRelation srel;
121 } SMgrSortArray;
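
/*
 * [Editor's note: illustrative sketch, not part of bufmgr.c.]  The comparator
 * sharing mentioned above relies on a pointer to SMgrSortArray being usable
 * as a pointer to its leading RelFileNode; that layout assumption could be
 * spelled out with a compile-time check like this hypothetical one.
 */
static inline void
example_check_smgr_sort_layout(void)
{
	StaticAssertStmt(offsetof(SMgrSortArray, rnode) == 0,
					 "rnode must be the first member of SMgrSortArray");
}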
122 
123 /* GUC variables */
124 bool zero_damaged_pages = false;
125 int bgwriter_lru_maxpages = 100;
126 double bgwriter_lru_multiplier = 2.0;
127 bool track_io_timing = false;
128 
129 /*
130  * How many buffers PrefetchBuffer callers should try to stay ahead of their
131  * ReadBuffer calls by. Zero means "never prefetch". This value is only used
132  * for buffers not belonging to tablespaces that have their
133  * effective_io_concurrency parameter set.
134  */
136 
137 /*
138  * Like effective_io_concurrency, but used by maintenance code paths that might
139  * benefit from a higher setting because they work on behalf of many sessions.
140  * Overridden by the tablespace setting of the same name.
141  */
143 
144 /*
145  * GUC variables about triggering kernel writeback for buffers written; OS
146  * dependent defaults are set via the GUC mechanism.
147  */
151 
152 /* local state for StartBufferIO and related functions */
153 static BufferDesc *InProgressBuf = NULL;
154 static bool IsForInput;
155 
156 /* local state for LockBufferForCleanup */
157 static BufferDesc *PinCountWaitBuf = NULL;
158 
159 /*
160  * Backend-Private refcount management:
161  *
162  * Each buffer also has a private refcount that keeps track of the number of
163  * times the buffer is pinned in the current process. This is so that the
164  * shared refcount needs to be modified only once if a buffer is pinned more
165  * than once by an individual backend. It's also used to check that no buffers
166  * are still pinned at the end of transactions and when exiting.
167  *
168  *
169  * To avoid - as we used to - requiring an array with NBuffers entries to keep
170  * track of local buffers, we use a small sequentially searched array
171  * (PrivateRefCountArray) and an overflow hash table (PrivateRefCountHash) to
172  * keep track of backend local pins.
173  *
174  * As long as no more than REFCOUNT_ARRAY_ENTRIES buffers are pinned at once,
175  * all refcounts are tracked in the array; after that, new array entries
176  * displace old ones into the hash table. That way a frequently used entry
177  * can't get "stuck" in the hashtable while infrequent ones clog the array.
178  *
179  * Note that in most scenarios the number of pinned buffers will not exceed
180  * REFCOUNT_ARRAY_ENTRIES.
181  *
182  *
183  * To enter a buffer into the refcount tracking mechanism first reserve a free
184  * entry using ReservePrivateRefCountEntry() and then later, if necessary,
185  * fill it with NewPrivateRefCountEntry(). That split lets us avoid doing
186  * memory allocations in NewPrivateRefCountEntry() which can be important
187  * because in some scenarios it's called with a spinlock held...
188  */
189 static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES];
190 static HTAB *PrivateRefCountHash = NULL;
191 static int32 PrivateRefCountOverflowed = 0;
192 static uint32 PrivateRefCountClock = 0;
193 static PrivateRefCountEntry *ReservedRefCountEntry = NULL;
194 
195 static void ReservePrivateRefCountEntry(void);
196 static PrivateRefCountEntry *NewPrivateRefCountEntry(Buffer buffer);
197 static PrivateRefCountEntry *GetPrivateRefCountEntry(Buffer buffer, bool do_move);
198 static inline int32 GetPrivateRefCount(Buffer buffer);
199 static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref);
200 
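/*
 * [Editor's note: illustrative sketch, not part of bufmgr.c.]  The
 * reserve-then-fill split described above exists so that no memory allocation
 * ever happens while a buffer header spinlock is held: the reservation (which
 * may allocate) comes first, and the entry is only filled once the spinlock
 * has been taken.  Simplified calling pattern (see PinBuffer_Locked() below
 * for the real usage):
 */
static void
example_reserve_then_fill(BufferDesc *buf)
{
	uint32		buf_state;

	ReservePrivateRefCountEntry();	/* may grow the hash table; no lock held */

	buf_state = LockBufHdr(buf);	/* header spinlock acquired */
	buf_state += BUF_REFCOUNT_ONE;
	UnlockBufHdr(buf, buf_state);

	/* safe: guaranteed not to allocate while the spinlock was held */
	NewPrivateRefCountEntry(BufferDescriptorGetBuffer(buf))->refcount++;
}
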
201 /*
202  * Ensure that the PrivateRefCountArray has sufficient space to store one more
203  * entry. This has to be called before using NewPrivateRefCountEntry() to fill
204  * a new entry - but it's perfectly fine to not use a reserved entry.
205  */
206 static void
207 ReservePrivateRefCountEntry(void)
208 {
209  /* Already reserved (or freed), nothing to do */
210  if (ReservedRefCountEntry != NULL)
211  return;
212 
213  /*
214  * First search for a free entry in the array; that'll be sufficient in the
215  * majority of cases.
216  */
217  {
218  int i;
219 
220  for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
221  {
222  PrivateRefCountEntry *res;
223 
224  res = &PrivateRefCountArray[i];
225 
226  if (res->buffer == InvalidBuffer)
227  {
228  ReservedRefCountEntry = res;
229  return;
230  }
231  }
232  }
233 
234  /*
235  * No luck. All array entries are full. Move one array entry into the hash
236  * table.
237  */
238  {
239  /*
240  * Move entry from the current clock position in the array into the
241  * hashtable. Use that slot.
242  */
243  PrivateRefCountEntry *hashent;
244  bool found;
245 
246  /* select victim slot */
247  ReservedRefCountEntry =
249 
250  /* Better be used, otherwise we shouldn't get here. */
251  Assert(ReservedRefCountEntry->buffer != InvalidBuffer);
252 
253  /* enter victim array entry into hashtable */
254  hashent = hash_search(PrivateRefCountHash,
255  (void *) &(ReservedRefCountEntry->buffer),
256  HASH_ENTER,
257  &found);
258  Assert(!found);
259  hashent->refcount = ReservedRefCountEntry->refcount;
260 
261  /* clear the now free array slot */
262  ReservedRefCountEntry->buffer = InvalidBuffer;
263  ReservedRefCountEntry->refcount = 0;
264 
265  PrivateRefCountOverflowed++;
266  }
267 }
268 
269 /*
270  * Fill a previously reserved refcount entry.
271  */
272 static PrivateRefCountEntry *
273 NewPrivateRefCountEntry(Buffer buffer)
274 {
275  PrivateRefCountEntry *res;
276 
277  /* only allowed to be called when a reservation has been made */
278  Assert(ReservedRefCountEntry != NULL);
279 
280  /* use up the reserved entry */
281  res = ReservedRefCountEntry;
282  ReservedRefCountEntry = NULL;
283 
284  /* and fill it */
285  res->buffer = buffer;
286  res->refcount = 0;
287 
288  return res;
289 }
290 
291 /*
292  * Return the PrivateRefCount entry for the passed buffer.
293  *
294  * Returns NULL if the buffer doesn't have a refcount entry. Otherwise, if
295  * do_move is true and the entry resides in the hashtable, the entry is
296  * moved into the array to optimize it for frequent access.
297  */
298 static PrivateRefCountEntry *
299 GetPrivateRefCountEntry(Buffer buffer, bool do_move)
300 {
301  PrivateRefCountEntry *res;
302  int i;
303 
304  Assert(BufferIsValid(buffer));
305  Assert(!BufferIsLocal(buffer));
306 
307  /*
308  * First search for references in the array; that'll be sufficient in the
309  * majority of cases.
310  */
311  for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
312  {
313  res = &PrivateRefCountArray[i];
314 
315  if (res->buffer == buffer)
316  return res;
317  }
318 
319  /*
320  * By here we know that the buffer, if already pinned, isn't residing in
321  * the array.
322  *
323  * Only look up the buffer in the hashtable if we've previously overflowed
324  * into it.
325  */
326  if (PrivateRefCountOverflowed == 0)
327  return NULL;
328 
329  res = hash_search(PrivateRefCountHash,
330  (void *) &buffer,
331  HASH_FIND,
332  NULL);
333 
334  if (res == NULL)
335  return NULL;
336  else if (!do_move)
337  {
338  /* caller doesn't want us to move the hash entry into the array */
339  return res;
340  }
341  else
342  {
343  /* move buffer from hashtable into the free array slot */
344  bool found;
345  PrivateRefCountEntry *free;
346 
347  /* Ensure there's a free array slot */
348  ReservePrivateRefCountEntry();
349 
350  /* Use up the reserved slot */
351  Assert(ReservedRefCountEntry != NULL);
352  free = ReservedRefCountEntry;
353  ReservedRefCountEntry = NULL;
354  Assert(free->buffer == InvalidBuffer);
355 
356  /* and fill it */
357  free->buffer = buffer;
358  free->refcount = res->refcount;
359 
360  /* delete from hashtable */
361  hash_search(PrivateRefCountHash,
362  (void *) &buffer,
363  HASH_REMOVE,
364  &found);
365  Assert(found);
366  Assert(PrivateRefCountOverflowed > 0);
367  PrivateRefCountOverflowed--;
368 
369  return free;
370  }
371 }
372 
373 /*
374  * Returns how many times the passed buffer is pinned by this backend.
375  *
376  * Only works for shared memory buffers!
377  */
378 static inline int32
379 GetPrivateRefCount(Buffer buffer)
380 {
381  PrivateRefCountEntry *ref;
382 
383  Assert(BufferIsValid(buffer));
384  Assert(!BufferIsLocal(buffer));
385 
386  /*
387  * Not moving the entry - that's ok for the current users, but we might
388  * want to change this one day.
389  */
390  ref = GetPrivateRefCountEntry(buffer, false);
391 
392  if (ref == NULL)
393  return 0;
394  return ref->refcount;
395 }
396 
397 /*
398  * Release resources used to track the reference count of a buffer which we no
399  * longer have pinned and don't want to pin again immediately.
400  */
401 static void
402 ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref)
403 {
404  Assert(ref->refcount == 0);
405 
406  if (ref >= &PrivateRefCountArray[0] &&
407  ref < &PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES])
408  {
409  ref->buffer = InvalidBuffer;
410 
411  /*
412  * Mark the just used entry as reserved - in many scenarios that
413  * allows us to avoid ever having to search the array/hash for free
414  * entries.
415  */
416  ReservedRefCountEntry = ref;
417  }
418  else
419  {
420  bool found;
421  Buffer buffer = ref->buffer;
422 
423  hash_search(PrivateRefCountHash,
424  (void *) &buffer,
425  HASH_REMOVE,
426  &found);
427  Assert(found);
428  Assert(PrivateRefCountOverflowed > 0);
429  PrivateRefCountOverflowed--;
430  }
431 }
432 
433 /*
434  * BufferIsPinned
435  * True iff the buffer is pinned (also checks for valid buffer number).
436  *
437  * NOTE: what we check here is that *this* backend holds a pin on
438  * the buffer. We do not care whether some other backend does.
439  */
440 #define BufferIsPinned(bufnum) \
441 ( \
442  !BufferIsValid(bufnum) ? \
443  false \
444  : \
445  BufferIsLocal(bufnum) ? \
446  (LocalRefCount[-(bufnum) - 1] > 0) \
447  : \
448  (GetPrivateRefCount(bufnum) > 0) \
449 )
450 
451 
452 static Buffer ReadBuffer_common(SMgrRelation reln, char relpersistence,
453  ForkNumber forkNum, BlockNumber blockNum,
455  bool *hit);
456 static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy);
457 static void PinBuffer_Locked(BufferDesc *buf);
458 static void UnpinBuffer(BufferDesc *buf, bool fixOwner);
459 static void BufferSync(int flags);
461 static int SyncOneBuffer(int buf_id, bool skip_recently_used,
462  WritebackContext *wb_context);
463 static void WaitIO(BufferDesc *buf);
464 static bool StartBufferIO(BufferDesc *buf, bool forInput);
465 static void TerminateBufferIO(BufferDesc *buf, bool clear_dirty,
466  uint32 set_flag_bits);
467 static void shared_buffer_write_error_callback(void *arg);
468 static void local_buffer_write_error_callback(void *arg);
469 static BufferDesc *BufferAlloc(SMgrRelation smgr,
470  char relpersistence,
471  ForkNumber forkNum,
472  BlockNumber blockNum,
473  BufferAccessStrategy strategy,
474  bool *foundPtr);
475 static void FlushBuffer(BufferDesc *buf, SMgrRelation reln);
476 static void AtProcExit_Buffers(int code, Datum arg);
477 static void CheckForBufferLeaks(void);
478 static int rnode_comparator(const void *p1, const void *p2);
479 static int buffertag_comparator(const void *p1, const void *p2);
480 static int ckpt_buforder_comparator(const void *pa, const void *pb);
481 static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg);
482 
483 
484 /*
485  * Implementation of PrefetchBuffer() for shared buffers.
486  */
487 PrefetchBufferResult
488 PrefetchSharedBuffer(SMgrRelation smgr_reln,
489  ForkNumber forkNum,
490  BlockNumber blockNum)
491 {
492  PrefetchBufferResult result = {InvalidBuffer, false};
493  BufferTag newTag; /* identity of requested block */
494  uint32 newHash; /* hash value for newTag */
495  LWLock *newPartitionLock; /* buffer partition lock for it */
496  int buf_id;
497 
498  Assert(BlockNumberIsValid(blockNum));
499 
500  /* create a tag so we can lookup the buffer */
501  INIT_BUFFERTAG(newTag, smgr_reln->smgr_rnode.node,
502  forkNum, blockNum);
503 
504  /* determine its hash code and partition lock ID */
505  newHash = BufTableHashCode(&newTag);
506  newPartitionLock = BufMappingPartitionLock(newHash);
507 
508  /* see if the block is in the buffer pool already */
509  LWLockAcquire(newPartitionLock, LW_SHARED);
510  buf_id = BufTableLookup(&newTag, newHash);
511  LWLockRelease(newPartitionLock);
512 
513  /* If not in buffers, initiate prefetch */
514  if (buf_id < 0)
515  {
516 #ifdef USE_PREFETCH
517  /*
518  * Try to initiate an asynchronous read. This returns false in
519  * recovery if the relation file doesn't exist.
520  */
521  if (smgrprefetch(smgr_reln, forkNum, blockNum))
522  result.initiated_io = true;
523 #endif /* USE_PREFETCH */
524  }
525  else
526  {
527  /*
528  * Report the buffer it was in at that time. The caller may be able
529  * to avoid a buffer table lookup, but it's not pinned and it must be
530  * rechecked!
531  */
532  result.recent_buffer = buf_id + 1;
533  }
534 
535  /*
536  * If the block *is* in buffers, we do nothing. This is not really ideal:
537  * the block might be just about to be evicted, which would be stupid
538  * since we know we are going to need it soon. But the only easy answer
539  * is to bump the usage_count, which does not seem like a great solution:
540  * when the caller does ultimately touch the block, usage_count would get
541  * bumped again, resulting in too much favoritism for blocks that are
542  * involved in a prefetch sequence. A real fix would involve some
543  * additional per-buffer state, and it's not clear that there's enough of
544  * a problem to justify that.
545  */
546 
547  return result;
548 }
549 
550 /*
551  * PrefetchBuffer -- initiate asynchronous read of a block of a relation
552  *
553  * This is named by analogy to ReadBuffer but doesn't actually allocate a
554  * buffer. Instead it tries to ensure that a future ReadBuffer for the given
555  * block will not be delayed by the I/O. Prefetching is optional.
556  *
557  * There are three possible outcomes:
558  *
559  * 1. If the block is already cached, the result includes a valid buffer that
560  * could be used by the caller to avoid the need for a later buffer lookup, but
561  * it's not pinned, so the caller must recheck it.
562  *
563  * 2. If the kernel has been asked to initiate I/O, the initiated_io member is
564  * true. Currently there is no way to know if the data was already cached by
565  * the kernel and therefore didn't really initiate I/O, and no way to know when
566  * the I/O completes other than using synchronous ReadBuffer().
567  *
568  * 3. Otherwise, the buffer wasn't already cached by PostgreSQL, and either
569  * USE_PREFETCH is not defined (this build doesn't support prefetching due to
570  * lack of a kernel facility), or the underlying relation file wasn't found and
571  * we are in recovery. (If the relation file wasn't found and we are not in
572  * recovery, an error is raised).
573  */
574 PrefetchBufferResult
575 PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
576 {
577  Assert(RelationIsValid(reln));
578  Assert(BlockNumberIsValid(blockNum));
579 
580  /* Open it at the smgr level if not already done */
581  RelationOpenSmgr(reln);
582 
583  if (RelationUsesLocalBuffers(reln))
584  {
585  /* see comments in ReadBufferExtended */
586  if (RELATION_IS_OTHER_TEMP(reln))
587  ereport(ERROR,
588  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
589  errmsg("cannot access temporary tables of other sessions")));
590 
591  /* pass it off to localbuf.c */
592  return PrefetchLocalBuffer(reln->rd_smgr, forkNum, blockNum);
593  }
594  else
595  {
596  /* pass it to the shared buffer version */
597  return PrefetchSharedBuffer(reln->rd_smgr, forkNum, blockNum);
598  }
599 }
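
/*
 * [Editor's note: illustrative sketch, not part of bufmgr.c.]  A caller that
 * scans a relation sequentially might issue prefetch requests a few blocks
 * ahead of the block it is about to read.  The result is advisory only:
 * recent_buffer is unpinned and must be revalidated before use, and
 * initiated_io merely reports that an asynchronous read was requested.  The
 * look-ahead distance below is a hypothetical caller-chosen constant.
 */
static Buffer
example_read_with_prefetch(Relation rel, BlockNumber blkno, BlockNumber nblocks)
{
	const BlockNumber lookahead = 8;	/* hypothetical prefetch distance */

	if (blkno + lookahead < nblocks)
		(void) PrefetchBuffer(rel, MAIN_FORKNUM, blkno + lookahead);

	/* The actual read still goes through ReadBuffer*; prefetch is only a hint. */
	return ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, NULL);
}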
600 
601 
602 /*
603  * ReadBuffer -- a shorthand for ReadBufferExtended, for reading from main
604  * fork with RBM_NORMAL mode and default strategy.
605  */
606 Buffer
607 ReadBuffer(Relation reln, BlockNumber blockNum)
608 {
609  return ReadBufferExtended(reln, MAIN_FORKNUM, blockNum, RBM_NORMAL, NULL);
610 }
611 
612 /*
613  * ReadBufferExtended -- returns a buffer containing the requested
614  * block of the requested relation. If the blknum
615  * requested is P_NEW, extend the relation file and
616  * allocate a new block. (Caller is responsible for
617  * ensuring that only one backend tries to extend a
618  * relation at the same time!)
619  *
620  * Returns: the buffer number for the buffer containing
621  * the block read. The returned buffer has been pinned.
622  * Does not return on error --- elog's instead.
623  *
624  * Assumes that reln has already been opened when this function is called.
625  *
626  * In RBM_NORMAL mode, the page is read from disk, and the page header is
627  * validated. An error is thrown if the page header is not valid. (But
628  * note that an all-zero page is considered "valid"; see PageIsVerified().)
629  *
630  * RBM_ZERO_ON_ERROR is like the normal mode, but if the page header is not
631  * valid, the page is zeroed instead of throwing an error. This is intended
632  * for non-critical data, where the caller is prepared to repair errors.
633  *
634  * In RBM_ZERO_AND_LOCK mode, if the page isn't in buffer cache already, it's
635  * filled with zeros instead of reading it from disk. Useful when the caller
636  * is going to fill the page from scratch, since this saves I/O and avoids
637  * unnecessary failure if the page-on-disk has corrupt page headers.
638  * The page is returned locked to ensure that the caller has a chance to
639  * initialize the page before it's made visible to others.
640  * Caution: do not use this mode to read a page that is beyond the relation's
641  * current physical EOF; that is likely to cause problems in md.c when
642  * the page is modified and written out. P_NEW is OK, though.
643  *
644  * RBM_ZERO_AND_CLEANUP_LOCK is the same as RBM_ZERO_AND_LOCK, but acquires
645  * a cleanup-strength lock on the page.
646  *
647  * RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here.
648  *
649  * If strategy is not NULL, a nondefault buffer access strategy is used.
650  * See buffer/README for details.
651  */
652 Buffer
653 ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum,
654  ReadBufferMode mode, BufferAccessStrategy strategy)
655 {
656  bool hit;
657  Buffer buf;
658 
659  /* Open it at the smgr level if not already done */
660  RelationOpenSmgr(reln);
661 
662  /*
663  * Reject attempts to read non-local temporary relations; we would be
664  * likely to get wrong data since we have no visibility into the owning
665  * session's local buffers.
666  */
667  if (RELATION_IS_OTHER_TEMP(reln))
668  ereport(ERROR,
669  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
670  errmsg("cannot access temporary tables of other sessions")));
671 
672  /*
673  * Read the buffer, and update pgstat counters to reflect a cache hit or
674  * miss.
675  */
677  buf = ReadBuffer_common(reln->rd_smgr, reln->rd_rel->relpersistence,
678  forkNum, blockNum, mode, strategy, &hit);
679  if (hit)
681  return buf;
682 }
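
/*
 * [Editor's note: illustrative sketch, not part of bufmgr.c.]  Typical use of
 * RBM_ZERO_AND_LOCK when extending a relation: the new page comes back
 * zero-filled and exclusively locked, so it can be initialized before any
 * other backend can see it.  WAL-logging is omitted here; a real caller on a
 * logged relation must not skip it.
 */
static Buffer
example_extend_relation(Relation rel)
{
	Buffer		buf;
	Page		page;

	buf = ReadBufferExtended(rel, MAIN_FORKNUM, P_NEW, RBM_ZERO_AND_LOCK, NULL);
	page = BufferGetPage(buf);
	PageInit(page, BufferGetPageSize(buf), 0);	/* caller-specific layout */
	MarkBufferDirty(buf);
	LockBuffer(buf, BUFFER_LOCK_UNLOCK);

	return buf;			/* still pinned; caller must eventually ReleaseBuffer() */
}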
683 
684 
685 /*
686  * ReadBufferWithoutRelcache -- like ReadBufferExtended, but doesn't require
687  * a relcache entry for the relation.
688  *
689  * NB: At present, this function may only be used on permanent relations, which
690  * is OK, because we only use it during XLOG replay. If in the future we
691  * want to use it on temporary or unlogged relations, we could pass additional
692  * parameters.
693  */
694 Buffer
695 ReadBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum,
696  BlockNumber blockNum, ReadBufferMode mode,
697  BufferAccessStrategy strategy)
698 {
699  bool hit;
700 
701  SMgrRelation smgr = smgropen(rnode, InvalidBackendId);
702 
704 
705  return ReadBuffer_common(smgr, RELPERSISTENCE_PERMANENT, forkNum, blockNum,
706  mode, strategy, &hit);
707 }
708 
709 
710 /*
711  * ReadBuffer_common -- common logic for all ReadBuffer variants
712  *
713  * *hit is set to true if the request was satisfied from shared buffer cache.
714  */
715 static Buffer
716 ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
717  BlockNumber blockNum, ReadBufferMode mode,
718  BufferAccessStrategy strategy, bool *hit)
719 {
720  BufferDesc *bufHdr;
721  Block bufBlock;
722  bool found;
723  bool isExtend;
724  bool isLocalBuf = SmgrIsTemp(smgr);
725 
726  *hit = false;
727 
728  /* Make sure we will have room to remember the buffer pin */
730 
731  isExtend = (blockNum == P_NEW);
732 
733  TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
734  smgr->smgr_rnode.node.spcNode,
735  smgr->smgr_rnode.node.dbNode,
736  smgr->smgr_rnode.node.relNode,
737  smgr->smgr_rnode.backend,
738  isExtend);
739 
740  /* Substitute proper block number if caller asked for P_NEW */
741  if (isExtend)
742  blockNum = smgrnblocks(smgr, forkNum);
743 
744  if (isLocalBuf)
745  {
746  bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, &found);
747  if (found)
749  else if (isExtend)
751  else if (mode == RBM_NORMAL || mode == RBM_NORMAL_NO_LOG ||
752  mode == RBM_ZERO_ON_ERROR)
754  }
755  else
756  {
757  /*
758  * lookup the buffer. IO_IN_PROGRESS is set if the requested block is
759  * not currently in memory.
760  */
761  bufHdr = BufferAlloc(smgr, relpersistence, forkNum, blockNum,
762  strategy, &found);
763  if (found)
765  else if (isExtend)
767  else if (mode == RBM_NORMAL || mode == RBM_NORMAL_NO_LOG ||
768  mode == RBM_ZERO_ON_ERROR)
770  }
771 
772  /* At this point we do NOT hold any locks. */
773 
774  /* if it was already in the buffer pool, we're done */
775  if (found)
776  {
777  if (!isExtend)
778  {
779  /* Just need to update stats before we exit */
780  *hit = true;
781  VacuumPageHit++;
782 
783  if (VacuumCostActive)
785 
786  TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
787  smgr->smgr_rnode.node.spcNode,
788  smgr->smgr_rnode.node.dbNode,
789  smgr->smgr_rnode.node.relNode,
790  smgr->smgr_rnode.backend,
791  isExtend,
792  found);
793 
794  /*
795  * In RBM_ZERO_AND_LOCK mode the caller expects the page to be
796  * locked on return.
797  */
798  if (!isLocalBuf)
799  {
800  if (mode == RBM_ZERO_AND_LOCK)
802  LW_EXCLUSIVE);
803  else if (mode == RBM_ZERO_AND_CLEANUP_LOCK)
805  }
806 
807  return BufferDescriptorGetBuffer(bufHdr);
808  }
809 
810  /*
811  * We get here only in the corner case where we are trying to extend
812  * the relation but we found a pre-existing buffer marked BM_VALID.
813  * This can happen because mdread doesn't complain about reads beyond
814  * EOF (when zero_damaged_pages is ON) and so a previous attempt to
815  * read a block beyond EOF could have left a "valid" zero-filled
816  * buffer. Unfortunately, we have also seen this case occurring
817  * because of buggy Linux kernels that sometimes return an
818  * lseek(SEEK_END) result that doesn't account for a recent write. In
819  * that situation, the pre-existing buffer would contain valid data
820  * that we don't want to overwrite. Since the legitimate case should
821  * always have left a zero-filled buffer, complain if not PageIsNew.
822  */
823  bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
824  if (!PageIsNew((Page) bufBlock))
825  ereport(ERROR,
826  (errmsg("unexpected data beyond EOF in block %u of relation %s",
827  blockNum, relpath(smgr->smgr_rnode, forkNum)),
828  errhint("This has been seen to occur with buggy kernels; consider updating your system.")));
829 
830  /*
831  * We *must* do smgrextend before succeeding, else the page will not
832  * be reserved by the kernel, and the next P_NEW call will decide to
833  * return the same page. Clear the BM_VALID bit, do the StartBufferIO
834  * call that BufferAlloc didn't, and proceed.
835  */
836  if (isLocalBuf)
837  {
838  /* Only need to adjust flags */
839  uint32 buf_state = pg_atomic_read_u32(&bufHdr->state);
840 
841  Assert(buf_state & BM_VALID);
842  buf_state &= ~BM_VALID;
843  pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
844  }
845  else
846  {
847  /*
848  * Loop to handle the very small possibility that someone re-sets
849  * BM_VALID between our clearing it and StartBufferIO inspecting
850  * it.
851  */
852  do
853  {
854  uint32 buf_state = LockBufHdr(bufHdr);
855 
856  Assert(buf_state & BM_VALID);
857  buf_state &= ~BM_VALID;
858  UnlockBufHdr(bufHdr, buf_state);
859  } while (!StartBufferIO(bufHdr, true));
860  }
861  }
862 
863  /*
864  * if we have gotten to this point, we have allocated a buffer for the
865  * page but its contents are not yet valid. IO_IN_PROGRESS is set for it,
866  * if it's a shared buffer.
867  *
868  * Note: if smgrextend fails, we will end up with a buffer that is
869  * allocated but not marked BM_VALID. P_NEW will still select the same
870  * block number (because the relation didn't get any longer on disk) and
871  * so future attempts to extend the relation will find the same buffer (if
872  * it's not been recycled) but come right back here to try smgrextend
873  * again.
874  */
875  Assert(!(pg_atomic_read_u32(&bufHdr->state) & BM_VALID)); /* spinlock not needed */
876 
877  bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
878 
879  if (isExtend)
880  {
881  /* new buffers are zero-filled */
882  MemSet((char *) bufBlock, 0, BLCKSZ);
883  /* don't set checksum for all-zero page */
884  smgrextend(smgr, forkNum, blockNum, (char *) bufBlock, false);
885 
886  /*
887  * NB: we're *not* doing a ScheduleBufferTagForWriteback here, even
888  * though we're essentially performing a write. At least on Linux,
889  * doing so defeats the 'delayed allocation' mechanism, leading to
890  * increased file fragmentation.
891  */
892  }
893  else
894  {
895  /*
896  * Read in the page, unless the caller intends to overwrite it and
897  * just wants us to allocate a buffer.
898  */
899  if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
900  MemSet((char *) bufBlock, 0, BLCKSZ);
901  else
902  {
903  instr_time io_start,
904  io_time;
905 
906  if (track_io_timing)
907  INSTR_TIME_SET_CURRENT(io_start);
908 
909  smgrread(smgr, forkNum, blockNum, (char *) bufBlock);
910 
911  if (track_io_timing)
912  {
913  INSTR_TIME_SET_CURRENT(io_time);
914  INSTR_TIME_SUBTRACT(io_time, io_start);
917  }
918 
919  /* check for garbage data */
920  if (!PageIsVerified((Page) bufBlock, blockNum))
921  {
922  if (mode == RBM_ZERO_ON_ERROR || zero_damaged_pages)
923  {
926  errmsg("invalid page in block %u of relation %s; zeroing out page",
927  blockNum,
928  relpath(smgr->smgr_rnode, forkNum))));
929  MemSet((char *) bufBlock, 0, BLCKSZ);
930  }
931  else
932  ereport(ERROR,
934  errmsg("invalid page in block %u of relation %s",
935  blockNum,
936  relpath(smgr->smgr_rnode, forkNum))));
937  }
938  }
939  }
940 
941  /*
942  * In RBM_ZERO_AND_LOCK mode, grab the buffer content lock before marking
943  * the page as valid, to make sure that no other backend sees the zeroed
944  * page before the caller has had a chance to initialize it.
945  *
946  * Since no-one else can be looking at the page contents yet, there is no
947  * difference between an exclusive lock and a cleanup-strength lock. (Note
948  * that we cannot use LockBuffer() or LockBufferForCleanup() here, because
949  * they assert that the buffer is already valid.)
950  */
951  if ((mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) &&
952  !isLocalBuf)
953  {
955  }
956 
957  if (isLocalBuf)
958  {
959  /* Only need to adjust flags */
960  uint32 buf_state = pg_atomic_read_u32(&bufHdr->state);
961 
962  buf_state |= BM_VALID;
963  pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
964  }
965  else
966  {
967  /* Set BM_VALID, terminate IO, and wake up any waiters */
968  TerminateBufferIO(bufHdr, false, BM_VALID);
969  }
970 
971  VacuumPageMiss++;
972  if (VacuumCostActive)
974 
975  TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
976  smgr->smgr_rnode.node.spcNode,
977  smgr->smgr_rnode.node.dbNode,
978  smgr->smgr_rnode.node.relNode,
979  smgr->smgr_rnode.backend,
980  isExtend,
981  found);
982 
983  return BufferDescriptorGetBuffer(bufHdr);
984 }
985 
986 /*
987  * BufferAlloc -- subroutine for ReadBuffer. Handles lookup of a shared
988  * buffer. If no buffer exists already, selects a replacement
989  * victim and evicts the old page, but does NOT read in new page.
990  *
991  * "strategy" can be a buffer replacement strategy object, or NULL for
992  * the default strategy. The selected buffer's usage_count is advanced when
993  * using the default strategy, but otherwise possibly not (see PinBuffer).
994  *
995  * The returned buffer is pinned and is already marked as holding the
996  * desired page. If it already did have the desired page, *foundPtr is
997  * set true. Otherwise, *foundPtr is set false and the buffer is marked
998  * as IO_IN_PROGRESS; ReadBuffer will now need to do I/O to fill it.
999  *
1000  * *foundPtr is actually redundant with the buffer's BM_VALID flag, but
1001  * we keep it for simplicity in ReadBuffer.
1002  *
1003  * No locks are held either at entry or exit.
1004  */
1005 static BufferDesc *
1006 BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
1007  BlockNumber blockNum,
1008  BufferAccessStrategy strategy,
1009  bool *foundPtr)
1010 {
1011  BufferTag newTag; /* identity of requested block */
1012  uint32 newHash; /* hash value for newTag */
1013  LWLock *newPartitionLock; /* buffer partition lock for it */
1014  BufferTag oldTag; /* previous identity of selected buffer */
1015  uint32 oldHash; /* hash value for oldTag */
1016  LWLock *oldPartitionLock; /* buffer partition lock for it */
1017  uint32 oldFlags;
1018  int buf_id;
1019  BufferDesc *buf;
1020  bool valid;
1021  uint32 buf_state;
1022 
1023  /* create a tag so we can lookup the buffer */
1024  INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
1025 
1026  /* determine its hash code and partition lock ID */
1027  newHash = BufTableHashCode(&newTag);
1028  newPartitionLock = BufMappingPartitionLock(newHash);
1029 
1030  /* see if the block is in the buffer pool already */
1031  LWLockAcquire(newPartitionLock, LW_SHARED);
1032  buf_id = BufTableLookup(&newTag, newHash);
1033  if (buf_id >= 0)
1034  {
1035  /*
1036  * Found it. Now, pin the buffer so no one can steal it from the
1037  * buffer pool, and check to see if the correct data has been loaded
1038  * into the buffer.
1039  */
1040  buf = GetBufferDescriptor(buf_id);
1041 
1042  valid = PinBuffer(buf, strategy);
1043 
1044  /* Can release the mapping lock as soon as we've pinned it */
1045  LWLockRelease(newPartitionLock);
1046 
1047  *foundPtr = true;
1048 
1049  if (!valid)
1050  {
1051  /*
1052  * We can only get here if (a) someone else is still reading in
1053  * the page, or (b) a previous read attempt failed. We have to
1054  * wait for any active read attempt to finish, and then set up our
1055  * own read attempt if the page is still not BM_VALID.
1056  * StartBufferIO does it all.
1057  */
1058  if (StartBufferIO(buf, true))
1059  {
1060  /*
1061  * If we get here, previous attempts to read the buffer must
1062  * have failed ... but we shall bravely try again.
1063  */
1064  *foundPtr = false;
1065  }
1066  }
1067 
1068  return buf;
1069  }
1070 
1071  /*
1072  * Didn't find it in the buffer pool. We'll have to initialize a new
1073  * buffer. Remember to unlock the mapping lock while doing the work.
1074  */
1075  LWLockRelease(newPartitionLock);
1076 
1077  /* Loop here in case we have to try another victim buffer */
1078  for (;;)
1079  {
1080  /*
1081  * Ensure, while the spinlock's not yet held, that there's a free
1082  * refcount entry.
1083  */
1085 
1086  /*
1087  * Select a victim buffer. The buffer is returned with its header
1088  * spinlock still held!
1089  */
1090  buf = StrategyGetBuffer(strategy, &buf_state);
1091 
1092  Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 0);
1093 
1094  /* Must copy buffer flags while we still hold the spinlock */
1095  oldFlags = buf_state & BUF_FLAG_MASK;
1096 
1097  /* Pin the buffer and then release the buffer spinlock */
1098  PinBuffer_Locked(buf);
1099 
1100  /*
1101  * If the buffer was dirty, try to write it out. There is a race
1102  * condition here, in that someone might dirty it after we released it
1103  * above, or even while we are writing it out (since our share-lock
1104  * won't prevent hint-bit updates). We will recheck the dirty bit
1105  * after re-locking the buffer header.
1106  */
1107  if (oldFlags & BM_DIRTY)
1108  {
1109  /*
1110  * We need a share-lock on the buffer contents to write it out
1111  * (else we might write invalid data, eg because someone else is
1112  * compacting the page contents while we write). We must use a
1113  * conditional lock acquisition here to avoid deadlock. Even
1114  * though the buffer was not pinned (and therefore surely not
1115  * locked) when StrategyGetBuffer returned it, someone else could
1116  * have pinned and exclusive-locked it by the time we get here. If
1117  * we tried to get the lock unconditionally, we'd block waiting for
1118  * them; if they later block waiting for us, deadlock ensues.
1119  * (This has been observed to happen when two backends are both
1120  * trying to split btree index pages, and the second one just
1121  * happens to be trying to split the page the first one got from
1122  * StrategyGetBuffer.)
1123  */
1125  LW_SHARED))
1126  {
1127  /*
1128  * If using a nondefault strategy, and writing the buffer
1129  * would require a WAL flush, let the strategy decide whether
1130  * to go ahead and write/reuse the buffer or to choose another
1131  * victim. We need lock to inspect the page LSN, so this
1132  * can't be done inside StrategyGetBuffer.
1133  */
1134  if (strategy != NULL)
1135  {
1136  XLogRecPtr lsn;
1137 
1138  /* Read the LSN while holding buffer header lock */
1139  buf_state = LockBufHdr(buf);
1140  lsn = BufferGetLSN(buf);
1141  UnlockBufHdr(buf, buf_state);
1142 
1143  if (XLogNeedsFlush(lsn) &&
1144  StrategyRejectBuffer(strategy, buf))
1145  {
1146  /* Drop lock/pin and loop around for another buffer */
1148  UnpinBuffer(buf, true);
1149  continue;
1150  }
1151  }
1152 
1153  /* OK, do the I/O */
1154  TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_START(forkNum, blockNum,
1155  smgr->smgr_rnode.node.spcNode,
1156  smgr->smgr_rnode.node.dbNode,
1157  smgr->smgr_rnode.node.relNode);
1158 
1159  FlushBuffer(buf, NULL);
1161 
1163  &buf->tag);
1164 
1165  TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_DONE(forkNum, blockNum,
1166  smgr->smgr_rnode.node.spcNode,
1167  smgr->smgr_rnode.node.dbNode,
1168  smgr->smgr_rnode.node.relNode);
1169  }
1170  else
1171  {
1172  /*
1173  * Someone else has locked the buffer, so give it up and loop
1174  * back to get another one.
1175  */
1176  UnpinBuffer(buf, true);
1177  continue;
1178  }
1179  }
1180 
1181  /*
1182  * To change the association of a valid buffer, we'll need to have
1183  * exclusive lock on both the old and new mapping partitions.
1184  */
1185  if (oldFlags & BM_TAG_VALID)
1186  {
1187  /*
1188  * Need to compute the old tag's hashcode and partition lock ID.
1189  * XXX is it worth storing the hashcode in BufferDesc so we need
1190  * not recompute it here? Probably not.
1191  */
1192  oldTag = buf->tag;
1193  oldHash = BufTableHashCode(&oldTag);
1194  oldPartitionLock = BufMappingPartitionLock(oldHash);
1195 
1196  /*
1197  * Must lock the lower-numbered partition first to avoid
1198  * deadlocks.
1199  */
1200  if (oldPartitionLock < newPartitionLock)
1201  {
1202  LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
1203  LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1204  }
1205  else if (oldPartitionLock > newPartitionLock)
1206  {
1207  LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1208  LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
1209  }
1210  else
1211  {
1212  /* only one partition, only one lock */
1213  LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1214  }
1215  }
1216  else
1217  {
1218  /* if it wasn't valid, we need only the new partition */
1219  LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1220  /* remember we have no old-partition lock or tag */
1221  oldPartitionLock = NULL;
1222  /* keep the compiler quiet about uninitialized variables */
1223  oldHash = 0;
1224  }
1225 
1226  /*
1227  * Try to make a hashtable entry for the buffer under its new tag.
1228  * This could fail because while we were writing someone else
1229  * allocated another buffer for the same block we want to read in.
1230  * Note that we have not yet removed the hashtable entry for the old
1231  * tag.
1232  */
1233  buf_id = BufTableInsert(&newTag, newHash, buf->buf_id);
1234 
1235  if (buf_id >= 0)
1236  {
1237  /*
1238  * Got a collision. Someone has already done what we were about to
1239  * do. We'll just handle this as if it were found in the buffer
1240  * pool in the first place. First, give up the buffer we were
1241  * planning to use.
1242  */
1243  UnpinBuffer(buf, true);
1244 
1245  /* Can give up that buffer's mapping partition lock now */
1246  if (oldPartitionLock != NULL &&
1247  oldPartitionLock != newPartitionLock)
1248  LWLockRelease(oldPartitionLock);
1249 
1250  /* remaining code should match code at top of routine */
1251 
1252  buf = GetBufferDescriptor(buf_id);
1253 
1254  valid = PinBuffer(buf, strategy);
1255 
1256  /* Can release the mapping lock as soon as we've pinned it */
1257  LWLockRelease(newPartitionLock);
1258 
1259  *foundPtr = true;
1260 
1261  if (!valid)
1262  {
1263  /*
1264  * We can only get here if (a) someone else is still reading
1265  * in the page, or (b) a previous read attempt failed. We
1266  * have to wait for any active read attempt to finish, and
1267  * then set up our own read attempt if the page is still not
1268  * BM_VALID. StartBufferIO does it all.
1269  */
1270  if (StartBufferIO(buf, true))
1271  {
1272  /*
1273  * If we get here, previous attempts to read the buffer
1274  * must have failed ... but we shall bravely try again.
1275  */
1276  *foundPtr = false;
1277  }
1278  }
1279 
1280  return buf;
1281  }
1282 
1283  /*
1284  * Need to lock the buffer header too in order to change its tag.
1285  */
1286  buf_state = LockBufHdr(buf);
1287 
1288  /*
1289  * Somebody could have pinned or re-dirtied the buffer while we were
1290  * doing the I/O and making the new hashtable entry. If so, we can't
1291  * recycle this buffer; we must undo everything we've done and start
1292  * over with a new victim buffer.
1293  */
1294  oldFlags = buf_state & BUF_FLAG_MASK;
1295  if (BUF_STATE_GET_REFCOUNT(buf_state) == 1 && !(oldFlags & BM_DIRTY))
1296  break;
1297 
1298  UnlockBufHdr(buf, buf_state);
1299  BufTableDelete(&newTag, newHash);
1300  if (oldPartitionLock != NULL &&
1301  oldPartitionLock != newPartitionLock)
1302  LWLockRelease(oldPartitionLock);
1303  LWLockRelease(newPartitionLock);
1304  UnpinBuffer(buf, true);
1305  }
1306 
1307  /*
1308  * Okay, it's finally safe to rename the buffer.
1309  *
1310  * Clearing BM_VALID here is necessary, clearing the dirtybits is just
1311  * paranoia. We also reset the usage_count since any recency of use of
1312  * the old content is no longer relevant. (The usage_count starts out at
1313  * 1 so that the buffer can survive one clock-sweep pass.)
1314  *
1315  * Make sure BM_PERMANENT is set for buffers that must be written at every
1316  * checkpoint. Unlogged buffers only need to be written at shutdown
1317  * checkpoints, except for their "init" forks, which need to be treated
1318  * just like permanent relations.
1319  */
1320  buf->tag = newTag;
1321  buf_state &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED |
1324  if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM)
1325  buf_state |= BM_TAG_VALID | BM_PERMANENT | BUF_USAGECOUNT_ONE;
1326  else
1327  buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
1328 
1329  UnlockBufHdr(buf, buf_state);
1330 
1331  if (oldPartitionLock != NULL)
1332  {
1333  BufTableDelete(&oldTag, oldHash);
1334  if (oldPartitionLock != newPartitionLock)
1335  LWLockRelease(oldPartitionLock);
1336  }
1337 
1338  LWLockRelease(newPartitionLock);
1339 
1340  /*
1341  * Buffer contents are currently invalid. Try to get the io_in_progress
1342  * lock. If StartBufferIO returns false, then someone else managed to
1343  * read it before we did, so there's nothing left for BufferAlloc() to do.
1344  */
1345  if (StartBufferIO(buf, true))
1346  *foundPtr = false;
1347  else
1348  *foundPtr = true;
1349 
1350  return buf;
1351 }
1352 
1353 /*
1354  * InvalidateBuffer -- mark a shared buffer invalid and return it to the
1355  * freelist.
1356  *
1357  * The buffer header spinlock must be held at entry. We drop it before
1358  * returning. (This is sane because the caller must have locked the
1359  * buffer in order to be sure it should be dropped.)
1360  *
1361  * This is used only in contexts such as dropping a relation. We assume
1362  * that no other backend could possibly be interested in using the page,
1363  * so the only reason the buffer might be pinned is if someone else is
1364  * trying to write it out. We have to let them finish before we can
1365  * reclaim the buffer.
1366  *
1367  * The buffer could get reclaimed by someone else while we are waiting
1368  * to acquire the necessary locks; if so, don't mess it up.
1369  */
1370 static void
1371 InvalidateBuffer(BufferDesc *buf)
1372 {
1373  BufferTag oldTag;
1374  uint32 oldHash; /* hash value for oldTag */
1375  LWLock *oldPartitionLock; /* buffer partition lock for it */
1376  uint32 oldFlags;
1377  uint32 buf_state;
1378 
1379  /* Save the original buffer tag before dropping the spinlock */
1380  oldTag = buf->tag;
1381 
1382  buf_state = pg_atomic_read_u32(&buf->state);
1383  Assert(buf_state & BM_LOCKED);
1384  UnlockBufHdr(buf, buf_state);
1385 
1386  /*
1387  * Need to compute the old tag's hashcode and partition lock ID. XXX is it
1388  * worth storing the hashcode in BufferDesc so we need not recompute it
1389  * here? Probably not.
1390  */
1391  oldHash = BufTableHashCode(&oldTag);
1392  oldPartitionLock = BufMappingPartitionLock(oldHash);
1393 
1394 retry:
1395 
1396  /*
1397  * Acquire exclusive mapping lock in preparation for changing the buffer's
1398  * association.
1399  */
1400  LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
1401 
1402  /* Re-lock the buffer header */
1403  buf_state = LockBufHdr(buf);
1404 
1405  /* If it's changed while we were waiting for lock, do nothing */
1406  if (!BUFFERTAGS_EQUAL(buf->tag, oldTag))
1407  {
1408  UnlockBufHdr(buf, buf_state);
1409  LWLockRelease(oldPartitionLock);
1410  return;
1411  }
1412 
1413  /*
1414  * We assume the only reason for it to be pinned is that someone else is
1415  * flushing the page out. Wait for them to finish. (This could be an
1416  * infinite loop if the refcount is messed up... it would be nice to time
1417  * out after a while, but there seems no way to be sure how many loops may
1418  * be needed. Note that if the other guy has pinned the buffer but not
1419  * yet done StartBufferIO, WaitIO will fall through and we'll effectively
1420  * be busy-looping here.)
1421  */
1422  if (BUF_STATE_GET_REFCOUNT(buf_state) != 0)
1423  {
1424  UnlockBufHdr(buf, buf_state);
1425  LWLockRelease(oldPartitionLock);
1426  /* safety check: should definitely not be our *own* pin */
1428  elog(ERROR, "buffer is pinned in InvalidateBuffer");
1429  WaitIO(buf);
1430  goto retry;
1431  }
1432 
1433  /*
1434  * Clear out the buffer's tag and flags. We must do this to ensure that
1435  * linear scans of the buffer array don't think the buffer is valid.
1436  */
1437  oldFlags = buf_state & BUF_FLAG_MASK;
1438  CLEAR_BUFFERTAG(buf->tag);
1439  buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
1440  UnlockBufHdr(buf, buf_state);
1441 
1442  /*
1443  * Remove the buffer from the lookup hashtable, if it was in there.
1444  */
1445  if (oldFlags & BM_TAG_VALID)
1446  BufTableDelete(&oldTag, oldHash);
1447 
1448  /*
1449  * Done with mapping lock.
1450  */
1451  LWLockRelease(oldPartitionLock);
1452 
1453  /*
1454  * Insert the buffer at the head of the list of free buffers.
1455  */
1456  StrategyFreeBuffer(buf);
1457 }
1458 
1459 /*
1460  * MarkBufferDirty
1461  *
1462  * Marks buffer contents as dirty (actual write happens later).
1463  *
1464  * Buffer must be pinned and exclusive-locked. (If caller does not hold
1465  * exclusive lock, then somebody could be in process of writing the buffer,
1466  * leading to risk of bad data written to disk.)
1467  */
1468 void
1469 MarkBufferDirty(Buffer buffer)
1470 {
1471  BufferDesc *bufHdr;
1472  uint32 buf_state;
1473  uint32 old_buf_state;
1474 
1475  if (!BufferIsValid(buffer))
1476  elog(ERROR, "bad buffer ID: %d", buffer);
1477 
1478  if (BufferIsLocal(buffer))
1479  {
1480  MarkLocalBufferDirty(buffer);
1481  return;
1482  }
1483 
1484  bufHdr = GetBufferDescriptor(buffer - 1);
1485 
1486  Assert(BufferIsPinned(buffer));
1488  LW_EXCLUSIVE));
1489 
1490  old_buf_state = pg_atomic_read_u32(&bufHdr->state);
1491  for (;;)
1492  {
1493  if (old_buf_state & BM_LOCKED)
1494  old_buf_state = WaitBufHdrUnlocked(bufHdr);
1495 
1496  buf_state = old_buf_state;
1497 
1498  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
1499  buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
1500 
1501  if (pg_atomic_compare_exchange_u32(&bufHdr->state, &old_buf_state,
1502  buf_state))
1503  break;
1504  }
1505 
1506  /*
1507  * If the buffer was not dirty already, do vacuum accounting.
1508  */
1509  if (!(old_buf_state & BM_DIRTY))
1510  {
1511  VacuumPageDirty++;
1513  if (VacuumCostActive)
1515  }
1516 }
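
/*
 * [Editor's note: illustrative sketch, not part of bufmgr.c.]  The pin plus
 * exclusive-lock requirement stated above means a page modification normally
 * has this shape (again omitting the WAL record that a logged relation would
 * require).
 */
static void
example_modify_block(Relation rel, BlockNumber blkno)
{
	Buffer		buf = ReadBuffer(rel, blkno);

	LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
	/* ... modify the page obtained via BufferGetPage(buf) ... */
	MarkBufferDirty(buf);
	UnlockReleaseBuffer(buf);
}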
1517 
1518 /*
1519  * ReleaseAndReadBuffer -- combine ReleaseBuffer() and ReadBuffer()
1520  *
1521  * Formerly, this saved one cycle of acquiring/releasing the BufMgrLock
1522  * compared to calling the two routines separately. Now it's mainly just
1523  * a convenience function. However, if the passed buffer is valid and
1524  * already contains the desired block, we just return it as-is; and that
1525  * does save considerable work compared to a full release and reacquire.
1526  *
1527  * Note: it is OK to pass buffer == InvalidBuffer, indicating that no old
1528  * buffer actually needs to be released. This case is the same as ReadBuffer,
1529  * but can save some tests in the caller.
1530  */
1531 Buffer
1532 ReleaseAndReadBuffer(Buffer buffer,
1533  Relation relation,
1534  BlockNumber blockNum)
1535 {
1536  ForkNumber forkNum = MAIN_FORKNUM;
1537  BufferDesc *bufHdr;
1538 
1539  if (BufferIsValid(buffer))
1540  {
1541  Assert(BufferIsPinned(buffer));
1542  if (BufferIsLocal(buffer))
1543  {
1544  bufHdr = GetLocalBufferDescriptor(-buffer - 1);
1545  if (bufHdr->tag.blockNum == blockNum &&
1546  RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node) &&
1547  bufHdr->tag.forkNum == forkNum)
1548  return buffer;
1550  LocalRefCount[-buffer - 1]--;
1551  }
1552  else
1553  {
1554  bufHdr = GetBufferDescriptor(buffer - 1);
1555  /* we have pin, so it's ok to examine tag without spinlock */
1556  if (bufHdr->tag.blockNum == blockNum &&
1557  RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node) &&
1558  bufHdr->tag.forkNum == forkNum)
1559  return buffer;
1560  UnpinBuffer(bufHdr, true);
1561  }
1562  }
1563 
1564  return ReadBuffer(relation, blockNum);
1565 }
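
/*
 * [Editor's note: illustrative sketch, not part of bufmgr.c.]  The typical
 * caller is a scan that walks a relation block by block: passing the buffer
 * returned by the previous iteration lets the call short-circuit when the
 * same block is requested again, and InvalidBuffer is fine for the first
 * iteration.
 */
static void
example_walk_blocks(Relation rel, BlockNumber nblocks)
{
	Buffer		buf = InvalidBuffer;
	BlockNumber blkno;

	for (blkno = 0; blkno < nblocks; blkno++)
	{
		buf = ReleaseAndReadBuffer(buf, rel, blkno);
		/* ... examine the page under a suitable content lock ... */
	}

	if (BufferIsValid(buf))
		ReleaseBuffer(buf);
}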
1566 
1567 /*
1568  * PinBuffer -- make buffer unavailable for replacement.
1569  *
1570  * For the default access strategy, the buffer's usage_count is incremented
1571  * when we first pin it; for other strategies we just make sure the usage_count
1572  * isn't zero. (The idea of the latter is that we don't want synchronized
1573  * heap scans to inflate the count, but we need it to not be zero to discourage
1574  * other backends from stealing buffers from our ring. As long as we cycle
1575  * through the ring faster than the global clock-sweep cycles, buffers in
1576  * our ring won't be chosen as victims for replacement by other backends.)
1577  *
1578  * This should be applied only to shared buffers, never local ones.
1579  *
1580  * Since buffers are pinned/unpinned very frequently, pin buffers without
1581  * taking the buffer header lock; instead update the state variable in a loop of
1582  * CAS operations. Hopefully it's just a single CAS.
1583  *
1584  * Note that ResourceOwnerEnlargeBuffers must have been done already.
1585  *
1586  * Returns true if buffer is BM_VALID, else false. This provision allows
1587  * some callers to avoid an extra spinlock cycle.
1588  */
1589 static bool
1590 PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
1591 {
1592  Buffer b = BufferDescriptorGetBuffer(buf);
1593  bool result;
1594  PrivateRefCountEntry *ref;
1595 
1596  ref = GetPrivateRefCountEntry(b, true);
1597 
1598  if (ref == NULL)
1599  {
1600  uint32 buf_state;
1601  uint32 old_buf_state;
1602 
1604  ref = NewPrivateRefCountEntry(b);
1605 
1606  old_buf_state = pg_atomic_read_u32(&buf->state);
1607  for (;;)
1608  {
1609  if (old_buf_state & BM_LOCKED)
1610  old_buf_state = WaitBufHdrUnlocked(buf);
1611 
1612  buf_state = old_buf_state;
1613 
1614  /* increase refcount */
1615  buf_state += BUF_REFCOUNT_ONE;
1616 
1617  if (strategy == NULL)
1618  {
1619  /* Default case: increase usagecount unless already max. */
1621  buf_state += BUF_USAGECOUNT_ONE;
1622  }
1623  else
1624  {
1625  /*
1626  * Ring buffers shouldn't evict others from pool. Thus we
1627  * don't make usagecount more than 1.
1628  */
1629  if (BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
1630  buf_state += BUF_USAGECOUNT_ONE;
1631  }
1632 
1633  if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
1634  buf_state))
1635  {
1636  result = (buf_state & BM_VALID) != 0;
1637 
1638  /*
1639  * Assume that we acquired a buffer pin for the purposes of
1640  * Valgrind buffer client checks (even in !result case) to
1641  * keep things simple. Buffers that are unsafe to access are
1642  * not generally guaranteed to be marked undefined or
1643  * non-accessible in any case.
1644  */
1646  break;
1647  }
1648  }
1649  }
1650  else
1651  {
1652  /*
1653  * If we previously pinned the buffer, it must surely be valid.
1654  *
1655  * Note: We deliberately avoid a Valgrind client request here.
1656  * Individual access methods can optionally superimpose buffer page
1657  * client requests on top of our client requests to enforce that
1658  * buffers are only accessed while locked (and pinned). It's possible
1659  * that the buffer page is legitimately non-accessible here. We
1660  * cannot meddle with that.
1661  */
1662  result = true;
1663  }
1664 
1665  ref->refcount++;
1666  Assert(ref->refcount > 0);
1668  return result;
1669 }
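
/*
 * [Editor's note: illustrative sketch, not part of bufmgr.c.]  Callers opt
 * into the ring behavior discussed above by obtaining a strategy object from
 * freelist.c and passing it to ReadBufferExtended(); with the default NULL
 * strategy each pin can bump usage_count, while a ring strategy only keeps it
 * nonzero.
 */
static void
example_bulk_read(Relation rel, BlockNumber nblocks)
{
	BufferAccessStrategy strategy = GetAccessStrategy(BAS_BULKREAD);
	BlockNumber blkno;

	for (blkno = 0; blkno < nblocks; blkno++)
	{
		Buffer		buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
											 RBM_NORMAL, strategy);

		/* ... process the page ... */
		ReleaseBuffer(buf);
	}

	FreeAccessStrategy(strategy);
}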
1670 
1671 /*
1672  * PinBuffer_Locked -- as above, but caller already locked the buffer header.
1673  * The spinlock is released before return.
1674  *
1675  * As this function is called with the spinlock held, the caller has to
1676  * previously call ReservePrivateRefCountEntry().
1677  *
1678  * Currently, no callers of this function want to modify the buffer's
1679  * usage_count at all, so there's no need for a strategy parameter.
1680  * Also we don't bother with a BM_VALID test (the caller could check that for
1681  * itself).
1682  *
1683  * Also all callers only ever use this function when it's known that the
1684  * buffer can't have a preexisting pin by this backend. That allows us to skip
1685  * searching the private refcount array & hash, which is a boon, because the
1686  * spinlock is still held.
1687  *
1688  * Note: use of this routine is frequently mandatory, not just an optimization
1689  * to save a spin lock/unlock cycle, because we need to pin a buffer before
1690  * its state can change under us.
1691  */
1692 static void
1693 PinBuffer_Locked(BufferDesc *buf)
1694 {
1695  Buffer b;
1696  PrivateRefCountEntry *ref;
1697  uint32 buf_state;
1698 
1699  /*
1700  * As explained, we don't expect any preexisting pins. That allows us to
1701  * manipulate the PrivateRefCount entry after releasing the spinlock.
1702  */
1704 
1705  /*
1706  * Buffer can't have a preexisting pin, so mark its page as defined to
1707  * Valgrind (this is similar to the PinBuffer() case where the backend
1708  * doesn't already have a buffer pin)
1709  */
1711 
1712  /*
1713  * Since we hold the buffer spinlock, we can update the buffer state and
1714  * release the lock in one operation.
1715  */
1716  buf_state = pg_atomic_read_u32(&buf->state);
1717  Assert(buf_state & BM_LOCKED);
1718  buf_state += BUF_REFCOUNT_ONE;
1719  UnlockBufHdr(buf, buf_state);
1720 
1721  b = BufferDescriptorGetBuffer(buf);
1722 
1723  ref = NewPrivateRefCountEntry(b);
1724  ref->refcount++;
1725 
1727 }
1728 
1729 /*
1730  * UnpinBuffer -- make buffer available for replacement.
1731  *
1732  * This should be applied only to shared buffers, never local ones.
1733  *
1734  * Most but not all callers want CurrentResourceOwner to be adjusted.
1735  * Those that don't should pass fixOwner = false.
1736  */
1737 static void
1738 UnpinBuffer(BufferDesc *buf, bool fixOwner)
1739 {
1740  PrivateRefCountEntry *ref;
1741  Buffer b = BufferDescriptorGetBuffer(buf);
1742 
1743  /* not moving as we're likely deleting it soon anyway */
1744  ref = GetPrivateRefCountEntry(b, false);
1745  Assert(ref != NULL);
1746 
1747  if (fixOwner)
1749 
1750  Assert(ref->refcount > 0);
1751  ref->refcount--;
1752  if (ref->refcount == 0)
1753  {
1754  uint32 buf_state;
1755  uint32 old_buf_state;
1756 
1757  /*
1758  * Mark buffer non-accessible to Valgrind.
1759  *
1760  * Note that the buffer may have already been marked non-accessible
1761  * within access method code that enforces that buffers are only
1762  * accessed while a buffer lock is held.
1763  */
1765 
1766  /* I'd better not still hold any locks on the buffer */
1769 
1770  /*
1771  * Decrement the shared reference count.
1772  *
1773  * Since buffer spinlock holder can update status using just write,
1774  * it's not safe to use atomic decrement here; thus use a CAS loop.
1775  */
1776  old_buf_state = pg_atomic_read_u32(&buf->state);
1777  for (;;)
1778  {
1779  if (old_buf_state & BM_LOCKED)
1780  old_buf_state = WaitBufHdrUnlocked(buf);
1781 
1782  buf_state = old_buf_state;
1783 
1784  buf_state -= BUF_REFCOUNT_ONE;
1785 
1786  if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
1787  buf_state))
1788  break;
1789  }
1790 
1791  /* Support LockBufferForCleanup() */
1792  if (buf_state & BM_PIN_COUNT_WAITER)
1793  {
1794  /*
1795  * Acquire the buffer header lock, re-check that there's a waiter.
1796  * Another backend could have unpinned this buffer, and already
1797  * woken up the waiter. There's no danger of the buffer being
1798  * replaced after we unpinned it above, as it's pinned by the
1799  * waiter.
1800  */
1801  buf_state = LockBufHdr(buf);
1802 
1803  if ((buf_state & BM_PIN_COUNT_WAITER) &&
1804  BUF_STATE_GET_REFCOUNT(buf_state) == 1)
1805  {
1806  /* we just released the last pin other than the waiter's */
1807  int wait_backend_pid = buf->wait_backend_pid;
1808 
1809  buf_state &= ~BM_PIN_COUNT_WAITER;
1810  UnlockBufHdr(buf, buf_state);
1811  ProcSendSignal(wait_backend_pid);
1812  }
1813  else
1814  UnlockBufHdr(buf, buf_state);
1815  }
1817  }
1818 }
1819 
1820 /*
1821  * BufferSync -- Write out all dirty buffers in the pool.
1822  *
1823  * This is called at checkpoint time to write out all dirty shared buffers.
1824  * The checkpoint request flags should be passed in. If CHECKPOINT_IMMEDIATE
1825  * is set, we disable delays between writes; if CHECKPOINT_IS_SHUTDOWN,
1826  * CHECKPOINT_END_OF_RECOVERY or CHECKPOINT_FLUSH_ALL is set, we write even
1827  * unlogged buffers, which are otherwise skipped. The remaining flags
1828  * currently have no effect here.
1829  */
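/*
 * For illustration only (a rough sketch of the usual entry path, not part of
 * the checkpoint code itself): a shutdown checkpoint reaches this function
 * roughly as
 *
 *		flags = CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE;
 *		CheckPointBuffers(flags);		- ends up in BufferSync(flags)
 *
 * so unlogged buffers are written as well, and CheckpointWriteDelay() skips
 * its inter-write sleeps.
 */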
1830 static void
1831 BufferSync(int flags)
1832 {
1833  uint32 buf_state;
1834  int buf_id;
1835  int num_to_scan;
1836  int num_spaces;
1837  int num_processed;
1838  int num_written;
1839  CkptTsStatus *per_ts_stat = NULL;
1840  Oid last_tsid;
1841  binaryheap *ts_heap;
1842  int i;
1843  int mask = BM_DIRTY;
1844  WritebackContext wb_context;
1845 
1846  /* Make sure we can handle the pin inside SyncOneBuffer */
1848 
1849  /*
1850  * Unless this is a shutdown checkpoint or we have been explicitly told to
1851  * write all buffers, we write only permanent, dirty buffers. But at
1852  * shutdown or end of recovery, we write all dirty buffers.
1853  */
1854  if (!((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
1855  CHECKPOINT_FLUSH_ALL))))
1856  mask |= BM_PERMANENT;
1857 
1858  /*
1859  * Loop over all buffers, and mark the ones that need to be written with
1860  * BM_CHECKPOINT_NEEDED. Count them as we go (num_to_scan), so that we
1861  * can estimate how much work needs to be done.
1862  *
1863  * This allows us to write only those pages that were dirty when the
1864  * checkpoint began, and not those that get dirtied while it proceeds.
1865  * Whenever a page with BM_CHECKPOINT_NEEDED is written out, either by us
1866  * later in this function, or by normal backends or the bgwriter cleaning
1867  * scan, the flag is cleared. Any buffer dirtied after this point won't
1868  * have the flag set.
1869  *
1870  * Note that if we fail to write some buffer, we may leave buffers with
1871  * BM_CHECKPOINT_NEEDED still set. This is OK since any such buffer would
1872  * certainly need to be written for the next checkpoint attempt, too.
1873  */
1874  num_to_scan = 0;
1875  for (buf_id = 0; buf_id < NBuffers; buf_id++)
1876  {
1877  BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
1878 
1879  /*
1880  * Header spinlock is enough to examine BM_DIRTY, see comment in
1881  * SyncOneBuffer.
1882  */
1883  buf_state = LockBufHdr(bufHdr);
1884 
1885  if ((buf_state & mask) == mask)
1886  {
1887  CkptSortItem *item;
1888 
1889  buf_state |= BM_CHECKPOINT_NEEDED;
1890 
1891  item = &CkptBufferIds[num_to_scan++];
1892  item->buf_id = buf_id;
1893  item->tsId = bufHdr->tag.rnode.spcNode;
1894  item->relNode = bufHdr->tag.rnode.relNode;
1895  item->forkNum = bufHdr->tag.forkNum;
1896  item->blockNum = bufHdr->tag.blockNum;
1897  }
1898 
1899  UnlockBufHdr(bufHdr, buf_state);
1900 
1901  /* Check for barrier events in case NBuffers is large. */
1904  }
1905 
1906  if (num_to_scan == 0)
1907  return; /* nothing to do */
1908 
1909  WritebackContextInit(&wb_context, &checkpoint_flush_after);
1910 
1911  TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_scan);
1912 
1913  /*
1914  * Sort buffers that need to be written to reduce the likelihood of random
1915  * IO. The sorting is also important for the implementation of balancing
1916  * writes between tablespaces. Without balancing writes we'd potentially
1917  * end up writing to the tablespaces one-by-one; possibly overloading the
1918  * underlying system.
1919  */
1920  qsort(CkptBufferIds, num_to_scan, sizeof(CkptSortItem),
1921  ckpt_buforder_comparator);
1922 
1923  num_spaces = 0;
1924 
1925  /*
1926  * Allocate progress status for each tablespace with buffers that need to
1927  * be flushed. This requires the to-be-flushed array to be sorted.
1928  */
1929  last_tsid = InvalidOid;
1930  for (i = 0; i < num_to_scan; i++)
1931  {
1932  CkptTsStatus *s;
1933  Oid cur_tsid;
1934 
1935  cur_tsid = CkptBufferIds[i].tsId;
1936 
1937  /*
1938  * Grow array of per-tablespace status structs, every time a new
1939  * tablespace is found.
1940  */
1941  if (last_tsid == InvalidOid || last_tsid != cur_tsid)
1942  {
1943  Size sz;
1944 
1945  num_spaces++;
1946 
1947  /*
1948  * Not worth adding grow-by-power-of-2 logic here - even with a
1949  * few hundred tablespaces this should be fine.
1950  */
1951  sz = sizeof(CkptTsStatus) * num_spaces;
1952 
1953  if (per_ts_stat == NULL)
1954  per_ts_stat = (CkptTsStatus *) palloc(sz);
1955  else
1956  per_ts_stat = (CkptTsStatus *) repalloc(per_ts_stat, sz);
1957 
1958  s = &per_ts_stat[num_spaces - 1];
1959  memset(s, 0, sizeof(*s));
1960  s->tsId = cur_tsid;
1961 
1962  /*
1963  * The first buffer in this tablespace. As CkptBufferIds is sorted
1964  * by tablespace all (s->num_to_scan) buffers in this tablespace
1965  * will follow afterwards.
1966  */
1967  s->index = i;
1968 
1969  /*
1970  * progress_slice will be determined once we know how many buffers
1971  * are in each tablespace, i.e. after this loop.
1972  */
1973 
1974  last_tsid = cur_tsid;
1975  }
1976  else
1977  {
1978  s = &per_ts_stat[num_spaces - 1];
1979  }
1980 
1981  s->num_to_scan++;
1982 
1983  /* Check for barrier events. */
1986  }
1987 
1988  Assert(num_spaces > 0);
1989 
1990  /*
1991  * Build a min-heap over the write-progress in the individual tablespaces,
1992  * and compute how large a portion of the total progress a single
1993  * processed buffer is.
1994  */
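/*
 * Worked example (illustrative numbers only): with num_to_scan = 1200 split
 * as 1000 buffers in tablespace A and 200 in tablespace B, A gets
 * progress_slice = 1200/1000 = 1.2 and B gets 1200/200 = 6.0. Every write
 * advances that tablespace's progress by its slice, so always picking the
 * minimum-progress tablespace from the heap interleaves roughly five writes
 * to A for each write to B, and both reach progress 1200 together.
 */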
1995  ts_heap = binaryheap_allocate(num_spaces,
1996  ts_ckpt_progress_comparator,
1997  NULL);
1998 
1999  for (i = 0; i < num_spaces; i++)
2000  {
2001  CkptTsStatus *ts_stat = &per_ts_stat[i];
2002 
2003  ts_stat->progress_slice = (float8) num_to_scan / ts_stat->num_to_scan;
2004 
2005  binaryheap_add_unordered(ts_heap, PointerGetDatum(ts_stat));
2006  }
2007 
2008  binaryheap_build(ts_heap);
2009 
2010  /*
2011  * Iterate through to-be-checkpointed buffers and write the ones (still)
2012  * marked with BM_CHECKPOINT_NEEDED. The writes are balanced between
2013  * tablespaces; otherwise the sorting would lead to only one tablespace
2014  * receiving writes at a time, making inefficient use of the hardware.
2015  */
2016  num_processed = 0;
2017  num_written = 0;
2018  while (!binaryheap_empty(ts_heap))
2019  {
2020  BufferDesc *bufHdr = NULL;
2021  CkptTsStatus *ts_stat = (CkptTsStatus *)
2022  DatumGetPointer(binaryheap_first(ts_heap));
2023 
2024  buf_id = CkptBufferIds[ts_stat->index].buf_id;
2025  Assert(buf_id != -1);
2026 
2027  bufHdr = GetBufferDescriptor(buf_id);
2028 
2029  num_processed++;
2030 
2031  /*
2032  * We don't need to acquire the lock here, because we're only looking
2033  * at a single bit. It's possible that someone else writes the buffer
2034  * and clears the flag right after we check, but that doesn't matter
2035  * since SyncOneBuffer will then do nothing. However, there is a
2036  * further race condition: it's conceivable that between the time we
2037  * examine the bit here and the time SyncOneBuffer acquires the lock,
2038  * someone else not only wrote the buffer but replaced it with another
2039  * page and dirtied it. In that improbable case, SyncOneBuffer will
2040  * write the buffer though we didn't need to. It doesn't seem worth
2041  * guarding against this, though.
2042  */
2043  if (pg_atomic_read_u32(&bufHdr->state) & BM_CHECKPOINT_NEEDED)
2044  {
2045  if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
2046  {
2047  TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
2049  num_written++;
2050  }
2051  }
2052 
2053  /*
2054  * Measure progress independently of whether we actually had to flush the
2055  * buffer - otherwise the writes would become unbalanced.
2056  */
2057  ts_stat->progress += ts_stat->progress_slice;
2058  ts_stat->num_scanned++;
2059  ts_stat->index++;
2060 
2061  /* Have all the buffers from the tablespace been processed? */
2062  if (ts_stat->num_scanned == ts_stat->num_to_scan)
2063  {
2064  binaryheap_remove_first(ts_heap);
2065  }
2066  else
2067  {
2068  /* update heap with the new progress */
2069  binaryheap_replace_first(ts_heap, PointerGetDatum(ts_stat));
2070  }
2071 
2072  /*
2073  * Sleep to throttle our I/O rate.
2074  *
2075  * (This will check for barrier events even if it doesn't sleep.)
2076  */
2077  CheckpointWriteDelay(flags, (double) num_processed / num_to_scan);
2078  }
2079 
2080  /* issue all pending flushes */
2081  IssuePendingWritebacks(&wb_context);
2082 
2083  pfree(per_ts_stat);
2084  per_ts_stat = NULL;
2085  binaryheap_free(ts_heap);
2086 
2087  /*
2088  * Update checkpoint statistics. As noted above, this doesn't include
2089  * buffers written by other backends or bgwriter scan.
2090  */
2091  CheckpointStats.ckpt_bufs_written += num_written;
2092 
2093  TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_scan);
2094 }
2095 
2096 /*
2097  * BgBufferSync -- Write out some dirty buffers in the pool.
2098  *
2099  * This is called periodically by the background writer process.
2100  *
2101  * Returns true if it's appropriate for the bgwriter process to go into
2102  * low-power hibernation mode. (This happens if the strategy clock sweep
2103  * has been "lapped" and no buffer allocations have occurred recently,
2104  * or if the bgwriter has been effectively disabled by setting
2105  * bgwriter_lru_maxpages to 0.)
2106  */
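/*
 * Illustrative caller sketch (an assumption: this mirrors, but is not copied
 * from, the background writer's main loop in bgwriter.c):
 *
 *		WritebackContext wb_context;
 *
 *		WritebackContextInit(&wb_context, &bgwriter_flush_after);
 *		for (;;)
 *		{
 *			bool	can_hibernate = BgBufferSync(&wb_context);
 *
 *			- sleep for bgwriter_delay ms, or much longer if can_hibernate -
 *		}
 */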
2107 bool
2108 BgBufferSync(WritebackContext *wb_context)
2109 {
2110  /* info obtained from freelist.c */
2111  int strategy_buf_id;
2112  uint32 strategy_passes;
2113  uint32 recent_alloc;
2114 
2115  /*
2116  * Information saved between calls so we can determine the strategy
2117  * point's advance rate and avoid scanning already-cleaned buffers.
2118  */
2119  static bool saved_info_valid = false;
2120  static int prev_strategy_buf_id;
2121  static uint32 prev_strategy_passes;
2122  static int next_to_clean;
2123  static uint32 next_passes;
2124 
2125  /* Moving averages of allocation rate and clean-buffer density */
2126  static float smoothed_alloc = 0;
2127  static float smoothed_density = 10.0;
2128 
2129  /* Potentially these could be tunables, but for now, not */
2130  float smoothing_samples = 16;
2131  float scan_whole_pool_milliseconds = 120000.0;
2132 
2133  /* Used to compute how far we scan ahead */
2134  long strategy_delta;
2135  int bufs_to_lap;
2136  int bufs_ahead;
2137  float scans_per_alloc;
2138  int reusable_buffers_est;
2139  int upcoming_alloc_est;
2140  int min_scan_buffers;
2141 
2142  /* Variables for the scanning loop proper */
2143  int num_to_scan;
2144  int num_written;
2145  int reusable_buffers;
2146 
2147  /* Variables for final smoothed_density update */
2148  long new_strategy_delta;
2149  uint32 new_recent_alloc;
2150 
2151  /*
2152  * Find out where the freelist clock sweep currently is, and how many
2153  * buffer allocations have happened since our last call.
2154  */
2155  strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc);
2156 
2157  /* Report buffer alloc counts to pgstat */
2158  BgWriterStats.m_buf_alloc += recent_alloc;
2159 
2160  /*
2161  * If we're not running the LRU scan, just stop after doing the stats
2162  * stuff. We mark the saved state invalid so that we can recover sanely
2163  * if LRU scan is turned back on later.
2164  */
2165  if (bgwriter_lru_maxpages <= 0)
2166  {
2167  saved_info_valid = false;
2168  return true;
2169  }
2170 
2171  /*
2172  * Compute strategy_delta = how many buffers have been scanned by the
2173  * clock sweep since last time. If first time through, assume none. Then
2174  * see if we are still ahead of the clock sweep, and if so, how many
2175  * buffers we could scan before we'd catch up with it and "lap" it. Note:
2176  * the weird-looking coding of the xxx_passes comparisons is to avoid bogus
2177  * behavior when the passes counts wrap around.
2178  */
2179  if (saved_info_valid)
2180  {
2181  int32 passes_delta = strategy_passes - prev_strategy_passes;
2182 
2183  strategy_delta = strategy_buf_id - prev_strategy_buf_id;
2184  strategy_delta += (long) passes_delta * NBuffers;
2185 
2186  Assert(strategy_delta >= 0);
2187 
2188  if ((int32) (next_passes - strategy_passes) > 0)
2189  {
2190  /* we're one pass ahead of the strategy point */
2191  bufs_to_lap = strategy_buf_id - next_to_clean;
2192 #ifdef BGW_DEBUG
2193  elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
2194  next_passes, next_to_clean,
2195  strategy_passes, strategy_buf_id,
2196  strategy_delta, bufs_to_lap);
2197 #endif
2198  }
2199  else if (next_passes == strategy_passes &&
2200  next_to_clean >= strategy_buf_id)
2201  {
2202  /* on same pass, but ahead or at least not behind */
2203  bufs_to_lap = NBuffers - (next_to_clean - strategy_buf_id);
2204 #ifdef BGW_DEBUG
2205  elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
2206  next_passes, next_to_clean,
2207  strategy_passes, strategy_buf_id,
2208  strategy_delta, bufs_to_lap);
2209 #endif
2210  }
2211  else
2212  {
2213  /*
2214  * We're behind, so skip forward to the strategy point and start
2215  * cleaning from there.
2216  */
2217 #ifdef BGW_DEBUG
2218  elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
2219  next_passes, next_to_clean,
2220  strategy_passes, strategy_buf_id,
2221  strategy_delta);
2222 #endif
2223  next_to_clean = strategy_buf_id;
2224  next_passes = strategy_passes;
2225  bufs_to_lap = NBuffers;
2226  }
2227  }
2228  else
2229  {
2230  /*
2231  * Initializing at startup or after LRU scanning had been off. Always
2232  * start at the strategy point.
2233  */
2234 #ifdef BGW_DEBUG
2235  elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
2236  strategy_passes, strategy_buf_id);
2237 #endif
2238  strategy_delta = 0;
2239  next_to_clean = strategy_buf_id;
2240  next_passes = strategy_passes;
2241  bufs_to_lap = NBuffers;
2242  }
2243 
2244  /* Update saved info for next time */
2245  prev_strategy_buf_id = strategy_buf_id;
2246  prev_strategy_passes = strategy_passes;
2247  saved_info_valid = true;
2248 
2249  /*
2250  * Compute how many buffers had to be scanned for each new allocation, ie,
2251  * 1/density of reusable buffers, and track a moving average of that.
2252  *
2253  * If the strategy point didn't move, we don't update the density estimate
2254  */
2255  if (strategy_delta > 0 && recent_alloc > 0)
2256  {
2257  scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
2258  smoothed_density += (scans_per_alloc - smoothed_density) /
2259  smoothing_samples;
2260  }
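 /*
  * Worked example of the update above (illustrative numbers): if the clock
  * sweep advanced strategy_delta = 500 buffers while recent_alloc = 100
  * buffers were allocated, then scans_per_alloc = 5.0, i.e. on average one
  * buffer in five was reusable. With smoothing_samples = 16,
  * smoothed_density moves 1/16th of the way toward 5.0, e.g. from its
  * initial 10.0 to about 9.69.
  */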
2261 
2262  /*
2263  * Estimate how many reusable buffers there are between the current
2264  * strategy point and where we've scanned ahead to, based on the smoothed
2265  * density estimate.
2266  */
2267  bufs_ahead = NBuffers - bufs_to_lap;
2268  reusable_buffers_est = (float) bufs_ahead / smoothed_density;
2269 
2270  /*
2271  * Track a moving average of recent buffer allocations. Here, rather than
2272  * a true average we want a fast-attack, slow-decline behavior: we
2273  * immediately follow any increase.
2274  */
2275  if (smoothed_alloc <= (float) recent_alloc)
2276  smoothed_alloc = recent_alloc;
2277  else
2278  smoothed_alloc += ((float) recent_alloc - smoothed_alloc) /
2279  smoothing_samples;
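 /*
  * Illustrative numbers for the fast-attack, slow-decline behavior above: if
  * smoothed_alloc is 80 and recent_alloc spikes to 200, smoothed_alloc jumps
  * to 200 immediately; once recent_alloc drops again, each call closes only
  * 1/16th of the remaining gap, so it takes roughly 11 calls to decay
  * halfway back ((15/16)^11 ~= 0.49).
  */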
2280 
2281  /* Scale the estimate by a GUC to allow more aggressive tuning. */
2282  upcoming_alloc_est = (int) (smoothed_alloc * bgwriter_lru_multiplier);
2283 
2284  /*
2285  * If recent_alloc remains at zero for many cycles, smoothed_alloc will
2286  * eventually underflow to zero, and the underflows produce annoying
2287  * kernel warnings on some platforms. Once upcoming_alloc_est has gone to
2288  * zero, there's no point in tracking smaller and smaller values of
2289  * smoothed_alloc, so just reset it to exactly zero to avoid this
2290  * syndrome. It will pop back up as soon as recent_alloc increases.
2291  */
2292  if (upcoming_alloc_est == 0)
2293  smoothed_alloc = 0;
2294 
2295  /*
2296  * Even in cases where there's been little or no buffer allocation
2297  * activity, we want to make a small amount of progress through the buffer
2298  * cache so that as many reusable buffers as possible are clean after an
2299  * idle period.
2300  *
2301  * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many times
2302  * the BGW will be called during the scan_whole_pool time; slice the
2303  * buffer pool into that many sections.
2304  */
2305  min_scan_buffers = (int) (NBuffers / (scan_whole_pool_milliseconds / BgWriterDelay));
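 /*
  * Worked example (assuming the default bgwriter_delay of 200 ms and
  * shared_buffers = 128MB, i.e. NBuffers = 16384): 120000 / 200 = 600
  * bgwriter rounds per sweep, so min_scan_buffers = 16384 / 600 = 27
  * buffers per round.
  */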
2306 
2307  if (upcoming_alloc_est < (min_scan_buffers + reusable_buffers_est))
2308  {
2309 #ifdef BGW_DEBUG
2310  elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
2311  upcoming_alloc_est, min_scan_buffers, reusable_buffers_est);
2312 #endif
2313  upcoming_alloc_est = min_scan_buffers + reusable_buffers_est;
2314  }
2315 
2316  /*
2317  * Now write out dirty reusable buffers, working forward from the
2318  * next_to_clean point, until we have lapped the strategy scan, or cleaned
2319  * enough buffers to match our estimate of the next cycle's allocation
2320  * requirements, or hit the bgwriter_lru_maxpages limit.
2321  */
2322 
2323  /* Make sure we can handle the pin inside SyncOneBuffer */
2325 
2326  num_to_scan = bufs_to_lap;
2327  num_written = 0;
2328  reusable_buffers = reusable_buffers_est;
2329 
2330  /* Execute the LRU scan */
2331  while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
2332  {
2333  int sync_state = SyncOneBuffer(next_to_clean, true,
2334  wb_context);
2335 
2336  if (++next_to_clean >= NBuffers)
2337  {
2338  next_to_clean = 0;
2339  next_passes++;
2340  }
2341  num_to_scan--;
2342 
2343  if (sync_state & BUF_WRITTEN)
2344  {
2345  reusable_buffers++;
2346  if (++num_written >= bgwriter_lru_maxpages)
2347  {
2349  break;
2350  }
2351  }
2352  else if (sync_state & BUF_REUSABLE)
2353  reusable_buffers++;
2354  }
2355 
2356  BgWriterStats.m_buf_written_clean += num_written;
2357 
2358 #ifdef BGW_DEBUG
2359  elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
2360  recent_alloc, smoothed_alloc, strategy_delta, bufs_ahead,
2361  smoothed_density, reusable_buffers_est, upcoming_alloc_est,
2362  bufs_to_lap - num_to_scan,
2363  num_written,
2364  reusable_buffers - reusable_buffers_est);
2365 #endif
2366 
2367  /*
2368  * Consider the above scan as being like a new allocation scan.
2369  * Characterize its density and update the smoothed one based on it. This
2370  * effectively halves the moving average period in cases where both the
2371  * strategy and the background writer are doing some useful scanning,
2372  * which is helpful because a long memory isn't as desirable on the
2373  * density estimates.
2374  */
2375  new_strategy_delta = bufs_to_lap - num_to_scan;
2376  new_recent_alloc = reusable_buffers - reusable_buffers_est;
2377  if (new_strategy_delta > 0 && new_recent_alloc > 0)
2378  {
2379  scans_per_alloc = (float) new_strategy_delta / (float) new_recent_alloc;
2380  smoothed_density += (scans_per_alloc - smoothed_density) /
2381  smoothing_samples;
2382 
2383 #ifdef BGW_DEBUG
2384  elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f",
2385  new_recent_alloc, new_strategy_delta,
2386  scans_per_alloc, smoothed_density);
2387 #endif
2388  }
2389 
2390  /* Return true if OK to hibernate */
2391  return (bufs_to_lap == 0 && recent_alloc == 0);
2392 }
2393 
2394 /*
2395  * SyncOneBuffer -- process a single buffer during syncing.
2396  *
2397  * If skip_recently_used is true, we don't write currently-pinned buffers, nor
2398  * buffers marked recently used, as these are not replacement candidates.
2399  *
2400  * Returns a bitmask containing the following flag bits:
2401  * BUF_WRITTEN: we wrote the buffer.
2402  * BUF_REUSABLE: buffer is available for replacement, ie, it has
2403  * pin count 0 and usage count 0.
2404  *
2405  * (BUF_WRITTEN could be set in error if FlushBuffer finds the buffer clean
2406  * after locking it, but we don't care all that much.)
2407  *
2408  * Note: caller must have done ResourceOwnerEnlargeBuffers.
2409  */
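/*
 * Illustrative use of the result bits (a sketch; the real callers are
 * BufferSync and BgBufferSync above):
 *
 *		int		res = SyncOneBuffer(buf_id, true, &wb_context);
 *
 *		if (res & BUF_WRITTEN)
 *			num_written++;
 *		if (res & BUF_REUSABLE)
 *			reusable_buffers++;
 */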
2410 static int
2411 SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
2412 {
2413  BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
2414  int result = 0;
2415  uint32 buf_state;
2416  BufferTag tag;
2417 
2419 
2420  /*
2421  * Check whether buffer needs writing.
2422  *
2423  * We can make this check without taking the buffer content lock so long
2424  * as we mark pages dirty in access methods *before* logging changes with
2425  * XLogInsert(): if someone marks the buffer dirty just after our check we
2426  * don't worry because our checkpoint.redo points before the log record for
2427  * the upcoming changes, so we are not required to write such a dirty buffer.
2428  */
2429  buf_state = LockBufHdr(bufHdr);
2430 
2431  if (BUF_STATE_GET_REFCOUNT(buf_state) == 0 &&
2432  BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
2433  {
2434  result |= BUF_REUSABLE;
2435  }
2436  else if (skip_recently_used)
2437  {
2438  /* Caller told us not to write recently-used buffers */
2439  UnlockBufHdr(bufHdr, buf_state);
2440  return result;
2441  }
2442 
2443  if (!(buf_state & BM_VALID) || !(buf_state & BM_DIRTY))
2444  {
2445  /* It's clean, so nothing to do */
2446  UnlockBufHdr(bufHdr, buf_state);
2447  return result;
2448  }
2449 
2450  /*
2451  * Pin it, share-lock it, write it. (FlushBuffer will do nothing if the
2452  * buffer is clean by the time we've locked it.)
2453  */
2454  PinBuffer_Locked(bufHdr);
2456 
2457  FlushBuffer(bufHdr, NULL);
2458 
2460 
2461  tag = bufHdr->tag;
2462 
2463  UnpinBuffer(bufHdr, true);
2464 
2465  ScheduleBufferTagForWriteback(wb_context, &tag);
2466 
2467  return result | BUF_WRITTEN;
2468 }
2469 
2470 /*
2471  * AtEOXact_Buffers - clean up at end of transaction.
2472  *
2473  * As of PostgreSQL 8.0, buffer pins should get released by the
2474  * ResourceOwner mechanism. This routine is just a debugging
2475  * cross-check that no pins remain.
2476  */
2477 void
2478 AtEOXact_Buffers(bool isCommit)
2479 {
2481 
2482  AtEOXact_LocalBuffers(isCommit);
2483 
2485 }
2486 
2487 /*
2488  * Initialize access to shared buffer pool
2489  *
2490  * This is called during backend startup (whether standalone or under the
2491  * postmaster). It sets up for this backend's access to the already-existing
2492  * buffer pool.
2493  *
2494  * NB: this is called before InitProcess(), so we do not have a PGPROC and
2495  * cannot do LWLockAcquire; hence we can't actually access stuff in
2496  * shared memory yet. We are only initializing local data here.
2497  * (See also InitBufferPoolBackend)
2498  */
2499 void
2500 InitBufferPoolAccess(void)
2501 {
2502  HASHCTL hash_ctl;
2503 
2504  memset(&PrivateRefCountArray, 0, sizeof(PrivateRefCountArray));
2505 
2506  MemSet(&hash_ctl, 0, sizeof(hash_ctl));
2507  hash_ctl.keysize = sizeof(int32);
2508  hash_ctl.entrysize = sizeof(PrivateRefCountEntry);
2509 
2510  PrivateRefCountHash = hash_create("PrivateRefCount", 100, &hash_ctl,
2511  HASH_ELEM | HASH_BLOBS);
2512 }
2513 
2514 /*
2515  * InitBufferPoolBackend --- second-stage initialization of a new backend
2516  *
2517  * This is called after we have acquired a PGPROC and so can safely get
2518  * LWLocks. We don't currently need to do anything at this stage ...
2519  * except register a shmem-exit callback. AtProcExit_Buffers needs LWLock
2520  * access, and thereby has to be called at the corresponding phase of
2521  * backend shutdown.
2522  */
2523 void
2524 InitBufferPoolBackend(void)
2525 {
2527 }
2528 
2529 /*
2530  * During backend exit, ensure that we released all shared-buffer locks and
2531  * assert that we have no remaining pins.
2532  */
2533 static void
2534 AtProcExit_Buffers(int code, Datum arg)
2535 {
2536  AbortBufferIO();
2537  UnlockBuffers();
2538 
2540 
2541  /* localbuf.c needs a chance too */
2543 }
2544 
2545 /*
2546  * CheckForBufferLeaks - ensure this backend holds no buffer pins
2547  *
2548  * As of PostgreSQL 8.0, buffer pins should get released by the
2549  * ResourceOwner mechanism. This routine is just a debugging
2550  * cross-check that no pins remain.
2551  */
2552 static void
2553 CheckForBufferLeaks(void)
2554 {
2555 #ifdef USE_ASSERT_CHECKING
2556  int RefCountErrors = 0;
2557  PrivateRefCountEntry *res;
2558  int i;
2559 
2560  /* check the array */
2561  for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
2562  {
2563  res = &PrivateRefCountArray[i];
2564 
2565  if (res->buffer != InvalidBuffer)
2566  {
2568  RefCountErrors++;
2569  }
2570  }
2571 
2572  /* if necessary search the hash */
2573  if (PrivateRefCountOverflowed)
2574  {
2575  HASH_SEQ_STATUS hstat;
2576 
2577  hash_seq_init(&hstat, PrivateRefCountHash);
2578  while ((res = (PrivateRefCountEntry *) hash_seq_search(&hstat)) != NULL)
2579  {
2581  RefCountErrors++;
2582  }
2583 
2584  }
2585 
2586  Assert(RefCountErrors == 0);
2587 #endif
2588 }
2589 
2590 /*
2591  * Helper routine to issue warnings when a buffer is unexpectedly pinned
2592  */
2593 void
2594 PrintBufferLeakWarning(Buffer buffer)
2595 {
2596  BufferDesc *buf;
2597  int32 loccount;
2598  char *path;
2599  BackendId backend;
2600  uint32 buf_state;
2601 
2602  Assert(BufferIsValid(buffer));
2603  if (BufferIsLocal(buffer))
2604  {
2605  buf = GetLocalBufferDescriptor(-buffer - 1);
2606  loccount = LocalRefCount[-buffer - 1];
2607  backend = MyBackendId;
2608  }
2609  else
2610  {
2611  buf = GetBufferDescriptor(buffer - 1);
2612  loccount = GetPrivateRefCount(buffer);
2613  backend = InvalidBackendId;
2614  }
2615 
2616  /* theoretically we should lock the bufhdr here */
2617  path = relpathbackend(buf->tag.rnode, backend, buf->tag.forkNum);
2618  buf_state = pg_atomic_read_u32(&buf->state);
2619  elog(WARNING,
2620  "buffer refcount leak: [%03d] "
2621  "(rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
2622  buffer, path,
2623  buf->tag.blockNum, buf_state & BUF_FLAG_MASK,
2624  BUF_STATE_GET_REFCOUNT(buf_state), loccount);
2625  pfree(path);
2626 }
2627 
2628 /*
2629  * CheckPointBuffers
2630  *
2631  * Flush all dirty blocks in buffer pool to disk at checkpoint time.
2632  *
2633  * Note: temporary relations do not participate in checkpoints, so they don't
2634  * need to be flushed.
2635  */
2636 void
2637 CheckPointBuffers(int flags)
2638 {
2639  BufferSync(flags);
2640 }
2641 
2642 
2643 /*
2644  * Do whatever is needed to prepare for commit at the bufmgr and smgr levels
2645  */
2646 void
2647 BufmgrCommit(void)
2648 {
2649  /* Nothing to do in bufmgr anymore... */
2650 }
2651 
2652 /*
2653  * BufferGetBlockNumber
2654  * Returns the block number associated with a buffer.
2655  *
2656  * Note:
2657  * Assumes that the buffer is valid and pinned, else the
2658  * value may be obsolete immediately...
2659  */
2662 {
2663  BufferDesc *bufHdr;
2664 
2665  Assert(BufferIsPinned(buffer));
2666 
2667  if (BufferIsLocal(buffer))
2668  bufHdr = GetLocalBufferDescriptor(-buffer - 1);
2669  else
2670  bufHdr = GetBufferDescriptor(buffer - 1);
2671 
2672  /* pinned, so OK to read tag without spinlock */
2673  return bufHdr->tag.blockNum;
2674 }
2675 
2676 /*
2677  * BufferGetTag
2678  * Returns the relfilenode, fork number and block number associated with
2679  * a buffer.
2680  */
2681 void
2682 BufferGetTag(Buffer buffer, RelFileNode *rnode, ForkNumber *forknum,
2683  BlockNumber *blknum)
2684 {
2685  BufferDesc *bufHdr;
2686 
2687  /* Do the same checks as BufferGetBlockNumber. */
2688  Assert(BufferIsPinned(buffer));
2689 
2690  if (BufferIsLocal(buffer))
2691  bufHdr = GetLocalBufferDescriptor(-buffer - 1);
2692  else
2693  bufHdr = GetBufferDescriptor(buffer - 1);
2694 
2695  /* pinned, so OK to read tag without spinlock */
2696  *rnode = bufHdr->tag.rnode;
2697  *forknum = bufHdr->tag.forkNum;
2698  *blknum = bufHdr->tag.blockNum;
2699 }
2700 
2701 /*
2702  * FlushBuffer
2703  * Physically write out a shared buffer.
2704  *
2705  * NOTE: this actually just passes the buffer contents to the kernel; the
2706  * real write to disk won't happen until the kernel feels like it. This
2707  * is okay from our point of view since we can redo the changes from WAL.
2708  * However, we will need to force the changes to disk via fsync before
2709  * we can checkpoint WAL.
2710  *
2711  * The caller must hold a pin on the buffer and have share-locked the
2712  * buffer contents. (Note: a share-lock does not prevent updates of
2713  * hint bits in the buffer, so the page could change while the write
2714  * is in progress, but we assume that that will not invalidate the data
2715  * written.)
2716  *
2717  * If the caller has an smgr reference for the buffer's relation, pass it
2718  * as the second parameter. If not, pass NULL.
2719  */
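/*
 * For illustration, the two call patterns used elsewhere in this file:
 *
 *		FlushBuffer(bufHdr, NULL);				- look up the SMgrRelation here
 *		FlushBuffer(bufHdr, rel->rd_smgr);		- reuse the caller's reference
 */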
2720 static void
2721 FlushBuffer(BufferDesc *buf, SMgrRelation reln)
2722 {
2723  XLogRecPtr recptr;
2724  ErrorContextCallback errcallback;
2725  instr_time io_start,
2726  io_time;
2727  Block bufBlock;
2728  char *bufToWrite;
2729  uint32 buf_state;
2730 
2731  /*
2732  * Acquire the buffer's io_in_progress lock. If StartBufferIO returns
2733  * false, then someone else flushed the buffer before we could, so we need
2734  * not do anything.
2735  */
2736  if (!StartBufferIO(buf, false))
2737  return;
2738 
2739  /* Setup error traceback support for ereport() */
2741  errcallback.arg = (void *) buf;
2742  errcallback.previous = error_context_stack;
2743  error_context_stack = &errcallback;
2744 
2745  /* Find smgr relation for buffer */
2746  if (reln == NULL)
2747  reln = smgropen(buf->tag.rnode, InvalidBackendId);
2748 
2749  TRACE_POSTGRESQL_BUFFER_FLUSH_START(buf->tag.forkNum,
2750  buf->tag.blockNum,
2751  reln->smgr_rnode.node.spcNode,
2752  reln->smgr_rnode.node.dbNode,
2753  reln->smgr_rnode.node.relNode);
2754 
2755  buf_state = LockBufHdr(buf);
2756 
2757  /*
2758  * Run PageGetLSN while holding header lock, since we don't have the
2759  * buffer locked exclusively in all cases.
2760  */
2761  recptr = BufferGetLSN(buf);
2762 
2763  /* To check if block content changes while flushing. - vadim 01/17/97 */
2764  buf_state &= ~BM_JUST_DIRTIED;
2765  UnlockBufHdr(buf, buf_state);
2766 
2767  /*
2768  * Force XLOG flush up to buffer's LSN. This implements the basic WAL
2769  * rule that log updates must hit disk before any of the data-file changes
2770  * they describe do.
2771  *
2772  * However, this rule does not apply to unlogged relations, which will be
2773  * lost after a crash anyway. Most unlogged relation pages do not bear
2774  * LSNs since we never emit WAL records for them, and therefore flushing
2775  * up through the buffer LSN would be useless, but harmless. However,
2776  * GiST indexes use LSNs internally to track page-splits, and therefore
2777  * unlogged GiST pages bear "fake" LSNs generated by
2778  * GetFakeLSNForUnloggedRel. It is unlikely but possible that the fake
2779  * LSN counter could advance past the WAL insertion point; and if it did
2780  * happen, attempting to flush WAL through that location would fail, with
2781  * disastrous system-wide consequences. To make sure that can't happen,
2782  * skip the flush if the buffer isn't permanent.
2783  */
2784  if (buf_state & BM_PERMANENT)
2785  XLogFlush(recptr);
2786 
2787  /*
2788  * Now it's safe to write buffer to disk. Note that no one else should
2789  * have been able to write it while we were busy with log flushing because
2790  * we have the io_in_progress lock.
2791  */
2792  bufBlock = BufHdrGetBlock(buf);
2793 
2794  /*
2795  * Update page checksum if desired. Since we have only shared lock on the
2796  * buffer, other processes might be updating hint bits in it, so we must
2797  * copy the page to private storage if we do checksumming.
2798  */
2799  bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
2800 
2801  if (track_io_timing)
2802  INSTR_TIME_SET_CURRENT(io_start);
2803 
2804  /*
2805  * bufToWrite is either the shared buffer or a copy, as appropriate.
2806  */
2807  smgrwrite(reln,
2808  buf->tag.forkNum,
2809  buf->tag.blockNum,
2810  bufToWrite,
2811  false);
2812 
2813  if (track_io_timing)
2814  {
2815  INSTR_TIME_SET_CURRENT(io_time);
2816  INSTR_TIME_SUBTRACT(io_time, io_start);
2819  }
2820 
2822 
2823  /*
2824  * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
2825  * end the io_in_progress state.
2826  */
2827  TerminateBufferIO(buf, true, 0);
2828 
2829  TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(buf->tag.forkNum,
2830  buf->tag.blockNum,
2831  reln->smgr_rnode.node.spcNode,
2832  reln->smgr_rnode.node.dbNode,
2833  reln->smgr_rnode.node.relNode);
2834 
2835  /* Pop the error context stack */
2836  error_context_stack = errcallback.previous;
2837 }
2838 
2839 /*
2840  * RelationGetNumberOfBlocksInFork
2841  * Determines the current number of pages in the specified relation fork.
2842  *
2843  * Note that the accuracy of the result will depend on the details of the
2844  * relation's storage. For builtin AMs it'll be accurate, but for external AMs
2845  * it might not be.
2846  */
2847 BlockNumber
2848 RelationGetNumberOfBlocksInFork(Relation relation, ForkNumber forkNum)
2849 {
2850  switch (relation->rd_rel->relkind)
2851  {
2852  case RELKIND_SEQUENCE:
2853  case RELKIND_INDEX:
2854  case RELKIND_PARTITIONED_INDEX:
2855  /* Open it at the smgr level if not already done */
2856  RelationOpenSmgr(relation);
2857 
2858  return smgrnblocks(relation->rd_smgr, forkNum);
2859 
2860  case RELKIND_RELATION:
2861  case RELKIND_TOASTVALUE:
2862  case RELKIND_MATVIEW:
2863  {
2864  /*
2865  * Not every table AM uses BLCKSZ wide fixed size blocks.
2866  * The table AM API therefore returns the size in bytes - but for
2867  * the purpose of this routine we want the number of blocks, so
2868  * divide, rounding up.
2869  */
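 /*
  * Worked example (illustrative): a 24577-byte relation fork with
  * BLCKSZ = 8192 yields (24577 + 8191) / 8192 = 4 blocks.
  */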
2870  uint64 szbytes;
2871 
2872  szbytes = table_relation_size(relation, forkNum);
2873 
2874  return (szbytes + (BLCKSZ - 1)) / BLCKSZ;
2875  }
2876  case RELKIND_VIEW:
2877  case RELKIND_COMPOSITE_TYPE:
2878  case RELKIND_FOREIGN_TABLE:
2879  case RELKIND_PARTITIONED_TABLE:
2880  default:
2881  Assert(false);
2882  break;
2883  }
2884 
2885  return 0; /* keep compiler quiet */
2886 }
2887 
2888 /*
2889  * BufferIsPermanent
2890  * Determines whether a buffer will potentially still be around after
2891  * a crash. Caller must hold a buffer pin.
2892  */
2893 bool
2894 BufferIsPermanent(Buffer buffer)
2895 {
2896  BufferDesc *bufHdr;
2897 
2898  /* Local buffers are used only for temp relations. */
2899  if (BufferIsLocal(buffer))
2900  return false;
2901 
2902  /* Make sure we've got a real buffer, and that we hold a pin on it. */
2903  Assert(BufferIsValid(buffer));
2904  Assert(BufferIsPinned(buffer));
2905 
2906  /*
2907  * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
2908  * need not bother with the buffer header spinlock. Even if someone else
2909  * changes the buffer header state while we're doing this, the state is
2910  * changed atomically, so we'll read the old value or the new value, but
2911  * not random garbage.
2912  */
2913  bufHdr = GetBufferDescriptor(buffer - 1);
2914  return (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT) != 0;
2915 }
2916 
2917 /*
2918  * BufferGetLSNAtomic
2919  * Retrieves the LSN of the buffer atomically using a buffer header lock.
2920  * This is necessary for some callers who may not have an exclusive lock
2921  * on the buffer.
2922  */
2923 XLogRecPtr
2924 BufferGetLSNAtomic(Buffer buffer)
2925 {
2926  BufferDesc *bufHdr = GetBufferDescriptor(buffer - 1);
2927  char *page = BufferGetPage(buffer);
2928  XLogRecPtr lsn;
2929  uint32 buf_state;
2930 
2931  /*
2932  * If we don't need locking for correctness, fastpath out.
2933  */
2934  if (!XLogHintBitIsNeeded() || BufferIsLocal(buffer))
2935  return PageGetLSN(page);
2936 
2937  /* Make sure we've got a real buffer, and that we hold a pin on it. */
2938  Assert(BufferIsValid(buffer));
2939  Assert(BufferIsPinned(buffer));
2940 
2941  buf_state = LockBufHdr(bufHdr);
2942  lsn = PageGetLSN(page);
2943  UnlockBufHdr(bufHdr, buf_state);
2944 
2945  return lsn;
2946 }
2947 
2948 /* ---------------------------------------------------------------------
2949  * DropRelFileNodeBuffers
2950  *
2951  * This function removes from the buffer pool all the pages of the
2952  * specified relation forks that have block numbers >= firstDelBlock.
2953  * (In particular, with firstDelBlock = 0, all pages are removed.)
2954  * Dirty pages are simply dropped, without bothering to write them
2955  * out first. Therefore, this is NOT rollback-able, and so should be
2956  * used only with extreme caution!
2957  *
2958  * Currently, this is called only from smgr.c when the underlying file
2959  * is about to be deleted or truncated (firstDelBlock is needed for
2960  * the truncation case). The data in the affected pages would therefore
2961  * be deleted momentarily anyway, and there is no point in writing it.
2962  * It is the responsibility of higher-level code to ensure that the
2963  * deletion or truncation does not lose any data that could be needed
2964  * later. It is also the responsibility of higher-level code to ensure
2965  * that no other process could be trying to load more pages of the
2966  * relation into buffers.
2967  *
2968  * XXX currently it sequentially searches the buffer pool, should be
2969  * changed to more clever ways of searching. However, this routine
2970  * is used only in code paths that aren't very performance-critical,
2971  * and we shouldn't slow down the hot paths to make it faster ...
2972  * --------------------------------------------------------------------
2973  */
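/*
 * Illustrative call for a truncation (a sketch; the real caller is
 * smgrtruncate(), and "new_nblocks" here is just a placeholder):
 *
 *		ForkNumber	forks[1] = {MAIN_FORKNUM};
 *		BlockNumber	firstDelBlock[1] = {new_nblocks};
 *
 *		DropRelFileNodeBuffers(reln->smgr_rnode, forks, 1, firstDelBlock);
 *
 * which discards every MAIN_FORKNUM buffer at or beyond new_nblocks.
 */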
2974 void
2975 DropRelFileNodeBuffers(RelFileNodeBackend rnode, ForkNumber *forkNum,
2976  int nforks, BlockNumber *firstDelBlock)
2977 {
2978  int i;
2979  int j;
2980 
2981  /* If it's a local relation, it's localbuf.c's problem. */
2982  if (RelFileNodeBackendIsTemp(rnode))
2983  {
2984  if (rnode.backend == MyBackendId)
2985  {
2986  for (j = 0; j < nforks; j++)
2987  DropRelFileNodeLocalBuffers(rnode.node, forkNum[j],
2988  firstDelBlock[j]);
2989  }
2990  return;
2991  }
2992 
2993  for (i = 0; i < NBuffers; i++)
2994  {
2995  BufferDesc *bufHdr = GetBufferDescriptor(i);
2996  uint32 buf_state;
2997 
2998  /*
2999  * We can make this a tad faster by prechecking the buffer tag before
3000  * we attempt to lock the buffer; this saves a lot of lock
3001  * acquisitions in typical cases. It should be safe because the
3002  * caller must have AccessExclusiveLock on the relation, or some other
3003  * reason to be certain that no one is loading new pages of the rel
3004  * into the buffer pool. (Otherwise we might well miss such pages
3005  * entirely.) Therefore, while the tag might be changing while we
3006  * look at it, it can't be changing *to* a value we care about, only
3007  * *away* from such a value. So false negatives are impossible, and
3008  * false positives are safe because we'll recheck after getting the
3009  * buffer lock.
3010  *
3011  * We could check forkNum and blockNum as well as the rnode, but the
3012  * incremental win from doing so seems small.
3013  */
3014  if (!RelFileNodeEquals(bufHdr->tag.rnode, rnode.node))
3015  continue;
3016 
3017  buf_state = LockBufHdr(bufHdr);
3018 
3019  for (j = 0; j < nforks; j++)
3020  {
3021  if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node) &&
3022  bufHdr->tag.forkNum == forkNum[j] &&
3023  bufHdr->tag.blockNum >= firstDelBlock[j])
3024  {
3025  InvalidateBuffer(bufHdr); /* releases spinlock */
3026  break;
3027  }
3028  }
3029  if (j >= nforks)
3030  UnlockBufHdr(bufHdr, buf_state);
3031  }
3032 }
3033 
3034 /* ---------------------------------------------------------------------
3035  * DropRelFileNodesAllBuffers
3036  *
3037  * This function removes from the buffer pool all the pages of all
3038  * forks of the specified relations. It's equivalent to calling
3039  * DropRelFileNodeBuffers once per fork per relation with
3040  * firstDelBlock = 0.
3041  * --------------------------------------------------------------------
3042  */
3043 void
3044 DropRelFileNodesAllBuffers(RelFileNodeBackend *rnodes, int nnodes)
3045 {
3046  int i,
3047  n = 0;
3048  RelFileNode *nodes;
3049  bool use_bsearch;
3050 
3051  if (nnodes == 0)
3052  return;
3053 
3054  nodes = palloc(sizeof(RelFileNode) * nnodes); /* non-local relations */
3055 
3056  /* If it's a local relation, it's localbuf.c's problem. */
3057  for (i = 0; i < nnodes; i++)
3058  {
3059  if (RelFileNodeBackendIsTemp(rnodes[i]))
3060  {
3061  if (rnodes[i].backend == MyBackendId)
3062  DropRelFileNodeAllLocalBuffers(rnodes[i].node);
3063  }
3064  else
3065  nodes[n++] = rnodes[i].node;
3066  }
3067 
3068  /*
3069  * If there are no non-local relations, then we're done. Release the
3070  * memory and return.
3071  */
3072  if (n == 0)
3073  {
3074  pfree(nodes);
3075  return;
3076  }
3077 
3078  /*
3079  * For low number of relations to drop just use a simple walk through, to
3080  * save the bsearch overhead. The threshold to use is rather a guess than
3081  * an exactly determined value, as it depends on many factors (CPU and RAM
3082  * speeds, amount of shared buffers etc.).
3083  */
3084  use_bsearch = n > RELS_BSEARCH_THRESHOLD;
3085 
3086  /* sort the list of rnodes if necessary */
3087  if (use_bsearch)
3088  pg_qsort(nodes, n, sizeof(RelFileNode), rnode_comparator);
3089 
3090  for (i = 0; i < NBuffers; i++)
3091  {
3092  RelFileNode *rnode = NULL;
3093  BufferDesc *bufHdr = GetBufferDescriptor(i);
3094  uint32 buf_state;
3095 
3096  /*
3097  * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
3098  * and saves some cycles.
3099  */
3100 
3101  if (!use_bsearch)
3102  {
3103  int j;
3104 
3105  for (j = 0; j < n; j++)
3106  {
3107  if (RelFileNodeEquals(bufHdr->tag.rnode, nodes[j]))
3108  {
3109  rnode = &nodes[j];
3110  break;
3111  }
3112  }
3113  }
3114  else
3115  {
3116  rnode = bsearch((const void *) &(bufHdr->tag.rnode),
3117  nodes, n, sizeof(RelFileNode),
3118  rnode_comparator);
3119 
3120 
3121  /* buffer doesn't belong to any of the given relfilenodes; skip it */
3122  if (rnode == NULL)
3123  continue;
3124 
3125  buf_state = LockBufHdr(bufHdr);
3126  if (RelFileNodeEquals(bufHdr->tag.rnode, (*rnode)))
3127  InvalidateBuffer(bufHdr); /* releases spinlock */
3128  else
3129  UnlockBufHdr(bufHdr, buf_state);
3130  }
3131 
3132  pfree(nodes);
3133 }
3134 
3135 /* ---------------------------------------------------------------------
3136  * DropDatabaseBuffers
3137  *
3138  * This function removes all the buffers in the buffer cache for a
3139  * particular database. Dirty pages are simply dropped, without
3140  * bothering to write them out first. This is used when we destroy a
3141  * database, to avoid trying to flush data to disk when the directory
3142  * tree no longer exists. Implementation is pretty similar to
3143  * DropRelFileNodeBuffers() which is for destroying just one relation.
3144  * --------------------------------------------------------------------
3145  */
3146 void
3147 DropDatabaseBuffers(Oid dbid)
3148 {
3149  int i;
3150 
3151  /*
3152  * We needn't consider local buffers, since by assumption the target
3153  * database isn't our own.
3154  */
3155 
3156  for (i = 0; i < NBuffers; i++)
3157  {
3158  BufferDesc *bufHdr = GetBufferDescriptor(i);
3159  uint32 buf_state;
3160 
3161  /*
3162  * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
3163  * and saves some cycles.
3164  */
3165  if (bufHdr->tag.rnode.dbNode != dbid)
3166  continue;
3167 
3168  buf_state = LockBufHdr(bufHdr);
3169  if (bufHdr->tag.rnode.dbNode == dbid)
3170  InvalidateBuffer(bufHdr); /* releases spinlock */
3171  else
3172  UnlockBufHdr(bufHdr, buf_state);
3173  }
3174 }
3175 
3176 /* -----------------------------------------------------------------
3177  * PrintBufferDescs
3178  *
3179  * this function prints all the buffer descriptors, for debugging
3180  * use only.
3181  * -----------------------------------------------------------------
3182  */
3183 #ifdef NOT_USED
3184 void
3185 PrintBufferDescs(void)
3186 {
3187  int i;
3188 
3189  for (i = 0; i < NBuffers; ++i)
3190  {
3193 
3194  /* theoretically we should lock the bufhdr here */
3195  elog(LOG,
3196  "[%02d] (freeNext=%d, rel=%s, "
3197  "blockNum=%u, flags=0x%x, refcount=%u %d)",
3198  i, buf->freeNext,
3200  buf->tag.blockNum, buf->flags,
3201  buf->refcount, GetPrivateRefCount(b));
3202  }
3203 }
3204 #endif
3205 
3206 #ifdef NOT_USED
3207 void
3208 PrintPinnedBufs(void)
3209 {
3210  int i;
3211 
3212  for (i = 0; i < NBuffers; ++i)
3213  {
3216 
3217  if (GetPrivateRefCount(b) > 0)
3218  {
3219  /* theoretically we should lock the bufhdr here */
3220  elog(LOG,
3221  "[%02d] (freeNext=%d, rel=%s, "
3222  "blockNum=%u, flags=0x%x, refcount=%u %d)",
3223  i, buf->freeNext,
3224  relpathperm(buf->tag.rnode, buf->tag.forkNum),
3225  buf->tag.blockNum, buf->flags,
3226  buf->refcount, GetPrivateRefCount(b));
3227  }
3228  }
3229 }
3230 #endif
3231 
3232 /* ---------------------------------------------------------------------
3233  * FlushRelationBuffers
3234  *
3235  * This function writes all dirty pages of a relation out to disk
3236  * (or more accurately, out to kernel disk buffers), ensuring that the
3237  * kernel has an up-to-date view of the relation.
3238  *
3239  * Generally, the caller should be holding AccessExclusiveLock on the
3240  * target relation to ensure that no other backend is busy dirtying
3241  * more blocks of the relation; the effects can't be expected to last
3242  * after the lock is released.
3243  *
3244  * XXX currently it sequentially searches the buffer pool, should be
3245  * changed to more clever ways of searching. This routine is not
3246  * used in any performance-critical code paths, so it's not worth
3247  * adding additional overhead to normal paths to make it go faster;
3248  * but see also DropRelFileNodeBuffers.
3249  * --------------------------------------------------------------------
3250  */
3251 void
3252 FlushRelationBuffers(Relation rel)
3253 {
3254  int i;
3255  BufferDesc *bufHdr;
3256 
3257  /* Open rel at the smgr level if not already done */
3258  RelationOpenSmgr(rel);
3259 
3260  if (RelationUsesLocalBuffers(rel))
3261  {
3262  for (i = 0; i < NLocBuffer; i++)
3263  {
3264  uint32 buf_state;
3265 
3266  bufHdr = GetLocalBufferDescriptor(i);
3267  if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
3268  ((buf_state = pg_atomic_read_u32(&bufHdr->state)) &
3269  (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
3270  {
3271  ErrorContextCallback errcallback;
3272  Page localpage;
3273 
3274  localpage = (char *) LocalBufHdrGetBlock(bufHdr);
3275 
3276  /* Setup error traceback support for ereport() */
3278  errcallback.arg = (void *) bufHdr;
3279  errcallback.previous = error_context_stack;
3280  error_context_stack = &errcallback;
3281 
3282  PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
3283 
3284  smgrwrite(rel->rd_smgr,
3285  bufHdr->tag.forkNum,
3286  bufHdr->tag.blockNum,
3287  localpage,
3288  false);
3289 
3290  buf_state &= ~(BM_DIRTY | BM_JUST_DIRTIED);
3291  pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
3292 
3293  /* Pop the error context stack */
3294  error_context_stack = errcallback.previous;
3295  }
3296  }
3297 
3298  return;
3299  }
3300 
3301  /* Make sure we can handle the pin inside the loop */
3303 
3304  for (i = 0; i < NBuffers; i++)
3305  {
3306  uint32 buf_state;
3307 
3308  bufHdr = GetBufferDescriptor(i);
3309 
3310  /*
3311  * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
3312  * and saves some cycles.
3313  */
3314  if (!RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node))
3315  continue;
3316 
3318 
3319  buf_state = LockBufHdr(bufHdr);
3320  if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
3321  (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
3322  {
3323  PinBuffer_Locked(bufHdr);
3325  FlushBuffer(bufHdr, rel->rd_smgr);
3327  UnpinBuffer(bufHdr, true);
3328  }
3329  else
3330  UnlockBufHdr(bufHdr, buf_state);
3331  }
3332 }
3333 
3334 /* ---------------------------------------------------------------------
3335  * FlushRelationsAllBuffers
3336  *
3337  * This function flushes out of the buffer pool all the pages of all
3338  * forks of the specified smgr relations. It's equivalent to calling
3339  * FlushRelationBuffers once per fork per relation. The relations are
3340  * assumed not to use local buffers.
3341  * --------------------------------------------------------------------
3342  */
3343 void
3344 FlushRelationsAllBuffers(SMgrRelation *smgrs, int nrels)
3345 {
3346  int i;
3347  SMgrSortArray *srels;
3348  bool use_bsearch;
3349 
3350  if (nrels == 0)
3351  return;
3352 
3353  /* fill-in array for qsort */
3354  srels = palloc(sizeof(SMgrSortArray) * nrels);
3355 
3356  for (i = 0; i < nrels; i++)
3357  {
3358  Assert(!RelFileNodeBackendIsTemp(smgrs[i]->smgr_rnode));
3359 
3360  srels[i].rnode = smgrs[i]->smgr_rnode.node;
3361  srels[i].srel = smgrs[i];
3362  }
3363 
3364  /*
3365  * Save the bsearch overhead for low number of relations to sync. See
3366  * DropRelFileNodesAllBuffers for details.
3367  */
3368  use_bsearch = nrels > RELS_BSEARCH_THRESHOLD;
3369 
3370  /* sort the list of SMgrRelations if necessary */
3371  if (use_bsearch)
3372  pg_qsort(srels, nrels, sizeof(SMgrSortArray), rnode_comparator);
3373 
3374  /* Make sure we can handle the pin inside the loop */
3376 
3377  for (i = 0; i < NBuffers; i++)
3378  {
3379  SMgrSortArray *srelent = NULL;
3380  BufferDesc *bufHdr = GetBufferDescriptor(i);
3381  uint32 buf_state;
3382 
3383  /*
3384  * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
3385  * and saves some cycles.
3386  */
3387 
3388  if (!use_bsearch)
3389  {
3390  int j;
3391 
3392  for (j = 0; j < nrels; j++)
3393  {
3394  if (RelFileNodeEquals(bufHdr->tag.rnode, srels[j].rnode))
3395  {
3396  srelent = &srels[j];
3397  break;
3398  }
3399  }
3400 
3401  }
3402  else
3403  {
3404  srelent = bsearch((const void *) &(bufHdr->tag.rnode),
3405  srels, nrels, sizeof(SMgrSortArray),
3406  rnode_comparator);
3407 
3408 
3409  /* buffer doesn't belong to any of the given relfilenodes; skip it */
3410  if (srelent == NULL)
3411  continue;
3412 
3414 
3415  buf_state = LockBufHdr(bufHdr);
3416  if (RelFileNodeEquals(bufHdr->tag.rnode, srelent->rnode) &&
3417  (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
3418  {
3419  PinBuffer_Locked(bufHdr);
3421  FlushBuffer(bufHdr, srelent->srel);
3423  UnpinBuffer(bufHdr, true);
3424  }
3425  else
3426  UnlockBufHdr(bufHdr, buf_state);
3427  }
3428 
3429  pfree(srels);
3430 }
3431 
3432 /* ---------------------------------------------------------------------
3433  * FlushDatabaseBuffers
3434  *
3435  * This function writes all dirty pages of a database out to disk
3436  * (or more accurately, out to kernel disk buffers), ensuring that the
3437  * kernel has an up-to-date view of the database.
3438  *
3439  * Generally, the caller should be holding an appropriate lock to ensure
3440  * no other backend is active in the target database; otherwise more
3441  * pages could get dirtied.
3442  *
3443  * Note we don't worry about flushing any pages of temporary relations.
3444  * It's assumed these wouldn't be interesting.
3445  * --------------------------------------------------------------------
3446  */
3447 void
3448 FlushDatabaseBuffers(Oid dbid)
3449 {
3450  int i;
3451  BufferDesc *bufHdr;
3452 
3453  /* Make sure we can handle the pin inside the loop */
3455 
3456  for (i = 0; i < NBuffers; i++)
3457  {
3458  uint32 buf_state;
3459 
3460  bufHdr = GetBufferDescriptor(i);
3461 
3462  /*
3463  * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
3464  * and saves some cycles.
3465  */
3466  if (bufHdr->tag.rnode.dbNode != dbid)
3467  continue;
3468 
3470 
3471  buf_state = LockBufHdr(bufHdr);
3472  if (bufHdr->tag.rnode.dbNode == dbid &&
3473  (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
3474  {
3475  PinBuffer_Locked(bufHdr);
3477  FlushBuffer(bufHdr, NULL);
3479  UnpinBuffer(bufHdr, true);
3480  }
3481  else
3482  UnlockBufHdr(bufHdr, buf_state);
3483  }
3484 }
3485 
3486 /*
3487  * Flush a previously locked (either shared or exclusively) and pinned buffer
3488  * out to the OS.
3489  */
3490 void
3491 FlushOneBuffer(Buffer buffer)
3492 {
3493  BufferDesc *bufHdr;
3494 
3495  /* currently not needed, but no fundamental reason not to support */
3496  Assert(!BufferIsLocal(buffer));
3497 
3498  Assert(BufferIsPinned(buffer));
3499 
3500  bufHdr = GetBufferDescriptor(buffer - 1);
3501 
3503 
3504  FlushBuffer(bufHdr, NULL);
3505 }
3506 
3507 /*
3508  * ReleaseBuffer -- release the pin on a buffer
3509  */
3510 void
3511 ReleaseBuffer(Buffer buffer)
3512 {
3513  if (!BufferIsValid(buffer))
3514  elog(ERROR, "bad buffer ID: %d", buffer);
3515 
3516  if (BufferIsLocal(buffer))
3517  {
3519 
3520  Assert(LocalRefCount[-buffer - 1] > 0);
3521  LocalRefCount[-buffer - 1]--;
3522  return;
3523  }
3524 
3525  UnpinBuffer(GetBufferDescriptor(buffer - 1), true);
3526 }
3527 
3528 /*
3529  * UnlockReleaseBuffer -- release the content lock and pin on a buffer
3530  *
3531  * This is just a shorthand for a common combination.
3532  */
3533 void
3534 UnlockReleaseBuffer(Buffer buffer)
3535 {
3536  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3537  ReleaseBuffer(buffer);
3538 }
3539 
3540 /*
3541  * IncrBufferRefCount
3542  * Increment the pin count on a buffer that we have *already* pinned
3543  * at least once.
3544  *
3545  * This function cannot be used on a buffer we do not have pinned,
3546  * because it doesn't change the shared buffer state.
3547  */
3548 void
3549 IncrBufferRefCount(Buffer buffer)
3550 {
3551  Assert(BufferIsPinned(buffer));
3553  if (BufferIsLocal(buffer))
3554  LocalRefCount[-buffer - 1]++;
3555  else
3556  {
3557  PrivateRefCountEntry *ref;
3558 
3559  ref = GetPrivateRefCountEntry(buffer, true);
3560  Assert(ref != NULL);
3561  ref->refcount++;
3562  }
3564 }
3565 
3566 /*
3567  * MarkBufferDirtyHint
3568  *
3569  * Mark a buffer dirty for non-critical changes.
3570  *
3571  * This is essentially the same as MarkBufferDirty, except:
3572  *
3573  * 1. The caller does not write WAL; so if checksums are enabled, we may need
3574  * to write an XLOG_FPI_FOR_HINT WAL record to protect against torn pages.
3575  * 2. The caller might have only share-lock instead of exclusive-lock on the
3576  * buffer's content lock.
3577  * 3. This function does not guarantee that the buffer is always marked dirty
3578  * (due to a race condition), so it cannot be used for important changes.
3579  */
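/*
 * Typical use (a sketch, not taken from a specific caller): after setting a
 * tuple hint bit such as HEAP_XMIN_COMMITTED while holding only a pin and a
 * share lock on the buffer, the access method calls
 *
 *		MarkBufferDirtyHint(buffer, true);
 *
 * where "true" means the page has a standard layout, so any full-page image
 * written for torn-page protection can omit the hole between pd_lower and
 * pd_upper.
 */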
3580 void
3581 MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
3582 {
3583  BufferDesc *bufHdr;
3584  Page page = BufferGetPage(buffer);
3585 
3586  if (!BufferIsValid(buffer))
3587  elog(ERROR, "bad buffer ID: %d", buffer);
3588 
3589  if (BufferIsLocal(buffer))
3590  {
3591  MarkLocalBufferDirty(buffer);
3592  return;
3593  }
3594 
3595  bufHdr = GetBufferDescriptor(buffer - 1);
3596 
3597  Assert(GetPrivateRefCount(buffer) > 0);
3598  /* here, either share or exclusive lock is OK */
3600 
3601  /*
3602  * This routine might get called many times on the same page, if we are
3603  * making the first scan after commit of an xact that added/deleted many
3604  * tuples. So, be as quick as we can if the buffer is already dirty. We
3605  * do this by not acquiring spinlock if it looks like the status bits are
3606  * already set. Since we make this test unlocked, there's a chance we
3607  * might fail to notice that the flags have just been cleared, and failed
3608  * to reset them, due to memory-ordering issues. But since this function
3609  * is only intended to be used in cases where failing to write out the
3610  * data would be harmless anyway, it doesn't really matter.
3611  */
3612  if ((pg_atomic_read_u32(&bufHdr->state) & (BM_DIRTY | BM_JUST_DIRTIED)) !=
3613  (BM_DIRTY | BM_JUST_DIRTIED))
3614  {
3615  XLogRecPtr lsn = InvalidXLogRecPtr;
3616  bool dirtied = false;
3617  bool delayChkpt = false;
3618  uint32 buf_state;
3619 
3620  /*
3621  * If we need to protect hint bit updates from torn writes, WAL-log a
3622  * full page image of the page. This full page image is only necessary
3623  * if the hint bit update is the first change to the page since the
3624  * last checkpoint.
3625  *
3626  * We don't check full_page_writes here because that logic is included
3627  * when we call XLogInsert() since the value changes dynamically.
3628  */
3629  if (XLogHintBitIsNeeded() &&
3630  (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT))
3631  {
3632  /*
3633  * If we must not write WAL, due to a relfilenode-specific
3634  * condition or being in recovery, don't dirty the page. We can still
3635  * set the hint bit, we just don't dirty the page as a result, so the
3636  * hint is lost when we evict the page or shut down.
3637  *
3638  * See src/backend/storage/page/README for longer discussion.
3639  */
3640  if (RecoveryInProgress() ||
3641  RelFileNodeSkippingWAL(bufHdr->tag.rnode))
3642  return;
3643 
3644  /*
3645  * If the block is already dirty because we either made a change
3646  * or set a hint already, then we don't need to write a full page
3647  * image. Note that aggressive cleaning of blocks dirtied by hint
3648  * bit setting would increase the call rate. Bulk setting of hint
3649  * bits would reduce the call rate...
3650  *
3651  * We must issue the WAL record before we mark the buffer dirty.
3652  * Otherwise we might write the page before we write the WAL. That
3653  * causes a race condition, since a checkpoint might occur between
3654  * writing the WAL record and marking the buffer dirty. We solve
3655  * that with a kluge, but one that is already in use during
3656  * transaction commit to prevent race conditions. Basically, we
3657  * simply prevent the checkpoint WAL record from being written
3658  * until we have marked the buffer dirty. We don't start the
3659  * checkpoint flush until we have marked dirty, so our checkpoint
3660  * must flush the change to disk successfully or the checkpoint
3661  * never gets written, in which case crash recovery will fix things up.
3662  *
3663  * It's possible we may enter here without an xid, so it is
3664  * essential that CreateCheckpoint waits for virtual transactions
3665  * rather than full transactionids.
3666  */
3667  MyProc->delayChkpt = delayChkpt = true;
3668  lsn = XLogSaveBufferForHint(buffer, buffer_std);
3669  }
3670 
3671  buf_state = LockBufHdr(bufHdr);
3672 
3673  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
3674 
3675  if (!(buf_state & BM_DIRTY))
3676  {
3677  dirtied = true; /* Means "will be dirtied by this action" */
3678 
3679  /*
3680  * Set the page LSN if we wrote a backup block. We aren't supposed
3681  * to set this when only holding a share lock but as long as we
3682  * serialise it somehow we're OK. We choose to set LSN while
3683  * holding the buffer header lock, which causes any reader of an
3684  * LSN who holds only a share lock to also obtain a buffer header
3685  * lock before using PageGetLSN(), which is enforced in
3686  * BufferGetLSNAtomic().
3687  *
3688  * If checksums are enabled, you might think we should reset the
3689  * checksum here. That will happen when the page is written
3690  * sometime later in this checkpoint cycle.
3691  */
3692  if (!XLogRecPtrIsInvalid(lsn))
3693  PageSetLSN(page, lsn);
3694  }
3695 
3696  buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
3697  UnlockBufHdr(bufHdr, buf_state);
3698 
3699  if (delayChkpt)
3700  MyProc->delayChkpt = false;
3701 
3702  if (dirtied)
3703  {
3704  VacuumPageDirty++;
3705  pgBufferUsage.shared_blks_dirtied++;
3706  if (VacuumCostActive)
3707  VacuumCostBalance += VacuumCostPageDirty;
3708  }
3709  }
3710 }
3711 
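/*
 * Illustrative sketch, not part of bufmgr.c: the typical calling pattern for
 * MarkBufferDirtyHint().  The caller holds a pin and at least a share content
 * lock, updates only hint information on the page, and then reports the
 * change; losing the hint at eviction or crash is acceptable.
 */
static void
set_hint_example(Buffer buffer)
{
	Page		page = BufferGetPage(buffer);

	/* ... update a hint bit on "page", e.g. a tuple's infomask ... */
	(void) page;

	/* buffer_std = true: the page has a standard layout */
	MarkBufferDirtyHint(buffer, true);
}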
3712 /*
3713  * Release buffer content locks for shared buffers.
3714  *
3715  * Used to clean up after errors.
3716  *
3717  * Currently, we can expect that lwlock.c's LWLockReleaseAll() took care
3718  * of releasing buffer content locks per se; the only thing we need to deal
3719  * with here is clearing any PIN_COUNT request that was in progress.
3720  */
3721 void
3722 UnlockBuffers(void)
3723 {
3724  BufferDesc *buf = PinCountWaitBuf;
3725 
3726  if (buf)
3727  {
3728  uint32 buf_state;
3729 
3730  buf_state = LockBufHdr(buf);
3731 
3732  /*
3733  * Don't complain if flag bit not set; it could have been reset but we
3734  * got a cancel/die interrupt before getting the signal.
3735  */
3736  if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
3737  buf->wait_backend_pid == MyProcPid)
3738  buf_state &= ~BM_PIN_COUNT_WAITER;
3739 
3740  UnlockBufHdr(buf, buf_state);
3741 
3742  PinCountWaitBuf = NULL;
3743  }
3744 }
3745 
3746 /*
3747  * Acquire or release the content_lock for the buffer.
3748  */
3749 void
3750 LockBuffer(Buffer buffer, int mode)
3751 {
3752  BufferDesc *buf;
3753 
3754  Assert(BufferIsPinned(buffer));
3755  if (BufferIsLocal(buffer))
3756  return; /* local buffers need no lock */
3757 
3758  buf = GetBufferDescriptor(buffer - 1);
3759 
3760  if (mode == BUFFER_LOCK_UNLOCK)
3761  LWLockRelease(BufferDescriptorGetContentLock(buf));
3762  else if (mode == BUFFER_LOCK_SHARE)
3763  LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_SHARED);
3764  else if (mode == BUFFER_LOCK_EXCLUSIVE)
3765  LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_EXCLUSIVE);
3766  else
3767  elog(ERROR, "unrecognized buffer lock mode: %d", mode);
3768 }
3769 
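/*
 * Illustrative sketch, not part of bufmgr.c: the usual pin-then-lock pattern
 * seen in callers of LockBuffer().  ReadBuffer() pins the page, the content
 * lock protects the page contents while they are examined, and the pin is
 * released last.
 */
static void
read_page_example(Relation rel, BlockNumber blkno)
{
	Buffer		buf = ReadBuffer(rel, blkno);

	LockBuffer(buf, BUFFER_LOCK_SHARE);
	/* ... inspect BufferGetPage(buf) while holding the share lock ... */
	LockBuffer(buf, BUFFER_LOCK_UNLOCK);
	ReleaseBuffer(buf);
}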
3770 /*
3771  * Acquire the content_lock for the buffer, but only if we don't have to wait.
3772  *
3773  * This assumes the caller wants BUFFER_LOCK_EXCLUSIVE mode.
3774  */
3775 bool
3776 ConditionalLockBuffer(Buffer buffer)
3777 {
3778  BufferDesc *buf;
3779 
3780  Assert(BufferIsPinned(buffer));
3781  if (BufferIsLocal(buffer))
3782  return true; /* act as though we got it */
3783 
3784  buf = GetBufferDescriptor(buffer - 1);
3785 
3786  return LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
3787  LW_EXCLUSIVE);
3788 }
3789 
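/*
 * Illustrative sketch, not part of bufmgr.c: opportunistic use of
 * ConditionalLockBuffer(), where the caller retries later rather than
 * blocking for the exclusive content lock.
 */
static bool
try_exclusive_example(Buffer buffer)
{
	if (!ConditionalLockBuffer(buffer))
		return false;			/* lock not available; caller tries again */

	/* ... modify the page while holding the exclusive lock ... */

	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
	return true;
}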
3790 /*
3791  * LockBufferForCleanup - lock a buffer in preparation for deleting items
3792  *
3793  * Items may be deleted from a disk page only when the caller (a) holds an
3794  * exclusive lock on the buffer and (b) has observed that no other backend
3795  * holds a pin on the buffer. If there is a pin, then the other backend
3796  * might have a pointer into the buffer (for example, a heapscan reference
3797  * to an item --- see README for more details). It's OK if a pin is added
3798  * after the cleanup starts, however; the newly-arrived backend will be
3799  * unable to look at the page until we release the exclusive lock.
3800  *
3801  * To implement this protocol, a would-be deleter must pin the buffer and
3802  * then call LockBufferForCleanup(). LockBufferForCleanup() is similar to
3803  * LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE), except that it loops until
3804  * it has successfully observed pin count = 1.
3805  */
3806 void
3807 LockBufferForCleanup(Buffer buffer)
3808 {
3809  BufferDesc *bufHdr;
3810  char *new_status = NULL;
3811 
3812  Assert(BufferIsPinned(buffer));
3813  Assert(PinCountWaitBuf == NULL);
3814 
3815  if (BufferIsLocal(buffer))
3816  {
3817  /* There should be exactly one pin */
3818  if (LocalRefCount[-buffer - 1] != 1)
3819  elog(ERROR, "incorrect local pin count: %d",
3820  LocalRefCount[-buffer - 1]);
3821  /* Nobody else to wait for */
3822  return;
3823  }
3824 
3825  /* There should be exactly one local pin */
3826  if (GetPrivateRefCount(buffer) != 1)
3827  elog(ERROR, "incorrect local pin count: %d",
3828  GetPrivateRefCount(buffer));
3829 
3830  bufHdr = GetBufferDescriptor(buffer - 1);
3831 
3832  for (;;)
3833  {
3834  uint32 buf_state;
3835 
3836  /* Try to acquire lock */
3837  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3838  buf_state = LockBufHdr(bufHdr);
3839 
3840  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
3841  if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
3842  {
3843  /* Successfully acquired exclusive lock with pincount 1 */
3844  UnlockBufHdr(bufHdr, buf_state);
3845 
3846  /* Report change to non-waiting status */
3847  if (new_status)
3848  {
3849  set_ps_display(new_status);
3850  pfree(new_status);
3851  }
3852  return;
3853  }
3854  /* Failed, so mark myself as waiting for pincount 1 */
3855  if (buf_state & BM_PIN_COUNT_WAITER)
3856  {
3857  UnlockBufHdr(bufHdr, buf_state);
3858  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3859  elog(ERROR, "multiple backends attempting to wait for pincount 1");
3860  }
3861  bufHdr->wait_backend_pid = MyProcPid;
3862  PinCountWaitBuf = bufHdr;
3863  buf_state |= BM_PIN_COUNT_WAITER;
3864  UnlockBufHdr(bufHdr, buf_state);
3865  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3866 
3867  /* Wait to be signaled by UnpinBuffer() */
3868  if (InHotStandby)
3869  {
3870  /* Report change to waiting status */
3871  if (update_process_title && new_status == NULL)
3872  {
3873  const char *old_status;
3874  int len;
3875 
3876  old_status = get_ps_display(&len);
3877  new_status = (char *) palloc(len + 8 + 1);
3878  memcpy(new_status, old_status, len);
3879  strcpy(new_status + len, " waiting");
3880  set_ps_display(new_status);
3881  new_status[len] = '\0'; /* truncate off " waiting" */
3882  }
3883 
3884  /* Publish the bufid that Startup process waits on */
3885  SetStartupBufferPinWaitBufId(buffer - 1);
3886  /* Set alarm and then wait to be signaled by UnpinBuffer() */
3887  ResolveRecoveryConflictWithBufferPin();
3888  /* Reset the published bufid */
3889  SetStartupBufferPinWaitBufId(-1);
3890  }
3891  else
3892  ProcWaitForSignal(PG_WAIT_BUFFER_PIN);
3893 
3894  /*
3895  * Remove flag marking us as waiter. Normally this will not be set
3896  * anymore, but ProcWaitForSignal() can return for other signals as
3897  * well. We take care to only reset the flag if we're the waiter, as
3898  * theoretically another backend could have started waiting. That's
3899  * impossible with the current usages due to table level locking, but
3900  * better be safe.
3901  */
3902  buf_state = LockBufHdr(bufHdr);
3903  if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
3904  bufHdr->wait_backend_pid == MyProcPid)
3905  buf_state &= ~BM_PIN_COUNT_WAITER;
3906  UnlockBufHdr(bufHdr, buf_state);
3907 
3908  PinCountWaitBuf = NULL;
3909  /* Loop back and try again */
3910  }
3911 }
3912 
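/*
 * Illustrative sketch, not part of bufmgr.c: the would-be deleter protocol
 * described above.  The buffer is pinned first; LockBufferForCleanup() then
 * blocks until this backend holds the only pin, after which items may safely
 * be removed from the page.
 */
static void
cleanup_page_example(Relation rel, BlockNumber blkno)
{
	Buffer		buf = ReadBuffer(rel, blkno);

	LockBufferForCleanup(buf);
	/* ... delete items from BufferGetPage(buf) here ... */
	UnlockReleaseBuffer(buf);
}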
3913 /*
3914  * Check called from RecoveryConflictInterrupt handler when Startup
3915  * process requests cancellation of all pin holders that are blocking it.
3916  */
3917 bool
3918 HoldingBufferPinThatDelaysRecovery(void)
3919 {
3920  int bufid = GetStartupBufferPinWaitBufId();
3921 
3922  /*
3923  * If we get woken slowly then it's possible that the Startup process was
3924  * already woken by other backends before we got here. Also possible that
3925  * we get here by multiple interrupts or interrupts at inappropriate
3926  * times, so make sure we do nothing if the bufid is not set.
3927  */
3928  if (bufid < 0)
3929  return false;
3930 
3931  if (GetPrivateRefCount(bufid + 1) > 0)
3932  return true;
3933 
3934  return false;
3935 }
3936 
3937 /*
3938  * ConditionalLockBufferForCleanup - as above, but don't wait to get the lock
3939  *
3940  * We won't loop, but just check once to see if the pin count is OK. If
3941  * not, return false with no lock held.
3942  */
3943 bool
3944 ConditionalLockBufferForCleanup(Buffer buffer)
3945 {
3946  BufferDesc *bufHdr;
3947  uint32 buf_state,
3948  refcount;
3949 
3950  Assert(BufferIsValid(buffer));
3951 
3952  if (BufferIsLocal(buffer))
3953  {
3954  refcount = LocalRefCount[-buffer - 1];
3955  /* There should be exactly one pin */
3956  Assert(refcount > 0);
3957  if (refcount != 1)
3958  return false;
3959  /* Nobody else to wait for */
3960  return true;
3961  }
3962 
3963  /* There should be exactly one local pin */
3964  refcount = GetPrivateRefCount(buffer);
3965  Assert(refcount);
3966  if (refcount != 1)
3967  return false;
3968 
3969  /* Try to acquire lock */
3970  if (!ConditionalLockBuffer(buffer))
3971  return false;
3972 
3973  bufHdr = GetBufferDescriptor(buffer - 1);
3974  buf_state = LockBufHdr(bufHdr);
3975  refcount = BUF_STATE_GET_REFCOUNT(buf_state);
3976 
3977  Assert(refcount > 0);
3978  if (refcount == 1)
3979  {
3980  /* Successfully acquired exclusive lock with pincount 1 */
3981  UnlockBufHdr(bufHdr, buf_state);
3982  return true;
3983  }
3984 
3985  /* Failed, so release the lock */
3986  UnlockBufHdr(bufHdr, buf_state);
3987  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3988  return false;
3989 }
3990 
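/*
 * Illustrative sketch, not part of bufmgr.c: the non-blocking variant, as
 * used by opportunistic cleanup that simply skips a page somebody else has
 * pinned.
 */
static void
maybe_cleanup_example(Buffer buffer)
{
	if (!ConditionalLockBufferForCleanup(buffer))
		return;					/* page is busy; skip it */

	/* ... prune the page under the cleanup lock ... */

	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
}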
3991 /*
3992  * IsBufferCleanupOK - as above, but we already have the lock
3993  *
3994  * Check whether it's OK to perform cleanup on a buffer we've already
3995  * locked. If we observe that the pin count is 1, our exclusive lock
3996  * happens to be a cleanup lock, and we can proceed with anything that
3997  * would have been allowable had we sought a cleanup lock originally.
3998  */
3999 bool
4000 IsBufferCleanupOK(Buffer buffer)
4001 {
4002  BufferDesc *bufHdr;
4003  uint32 buf_state;
4004 
4005  Assert(BufferIsValid(buffer));
4006 
4007  if (BufferIsLocal(buffer))
4008  {
4009  /* There should be exactly one pin */
4010  if (LocalRefCount[-buffer - 1] != 1)
4011  return false;
4012  /* Nobody else to wait for */
4013  return true;
4014  }
4015 
4016  /* There should be exactly one local pin */
4017  if (GetPrivateRefCount(buffer) != 1)
4018  return false;
4019 
4020  bufHdr = GetBufferDescriptor(buffer - 1);
4021 
4022  /* caller must hold exclusive lock on buffer */
4023  Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
4024  LW_EXCLUSIVE));
4025 
4026  buf_state = LockBufHdr(bufHdr);
4027 
4028  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
4029  if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
4030  {
4031  /* pincount is OK. */
4032  UnlockBufHdr(bufHdr, buf_state);
4033  return true;
4034  }
4035 
4036  UnlockBufHdr(bufHdr, buf_state);
4037  return false;
4038 }
4039 
4040 
4041 /*
4042  * Functions for buffer I/O handling
4043  *
4044  * Note: We assume that nested buffer I/O never occurs.
4045  * i.e. at most one io_in_progress lock is held per proc.
4046  *
4047  * Also note that these are used only for shared buffers, not local ones.
4048  */
4049 
4050 /*
4051  * WaitIO -- Block until the IO_IN_PROGRESS flag on 'buf' is cleared.
4052  */
4053 static void
4054 WaitIO(BufferDesc *buf)
4055 {
4056  /*
4057  * Changed to wait until there's no IO - Inoue 01/13/2000
4058  *
4059  * Note this is *necessary* because an error abort in the process doing
4060  * I/O could release the io_in_progress_lock prematurely. See
4061  * AbortBufferIO.
4062  */
4063  for (;;)
4064  {
4065  uint32 buf_state;
4066 
4067  /*
4068  * It may not be necessary to acquire the spinlock to check the flag
4069  * here, but since this test is essential for correctness, we'd better
4070  * play it safe.
4071  */
4072  buf_state = LockBufHdr(buf);
4073  UnlockBufHdr(buf, buf_state);
4074 
4075  if (!(buf_state & BM_IO_IN_PROGRESS))
4076  break;
4077  LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_SHARED);
4078  LWLockRelease(BufferDescriptorGetIOLock(buf));
4079  }
4080 }
4081 
4082 /*
4083  * StartBufferIO: begin I/O on this buffer
4084  * (Assumptions)
4085  * My process is executing no IO
4086  * The buffer is Pinned
4087  *
4088  * In some scenarios there are race conditions in which multiple backends
4089  * could attempt the same I/O operation concurrently. If someone else
4090  * has already started I/O on this buffer then we will block on the
4091  * io_in_progress lock until he's done.
4092  *
4093  * Input operations are only attempted on buffers that are not BM_VALID,
4094  * and output operations only on buffers that are BM_VALID and BM_DIRTY,
4095  * so we can always tell if the work is already done.
4096  *
4097  * Returns true if we successfully marked the buffer as I/O busy,
4098  * false if someone else already did the work.
4099  */
4100 static bool
4101 StartBufferIO(BufferDesc *buf, bool forInput)
4102 {
4103  uint32 buf_state;
4104 
4105  Assert(!InProgressBuf);
4106 
4107  for (;;)
4108  {
4109  /*
4110  * Grab the io_in_progress lock so that other processes can wait for
4111  * me to finish the I/O.
4112  */
4113  LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
4114 
4115  buf_state = LockBufHdr(buf);
4116 
4117  if (!(buf_state & BM_IO_IN_PROGRESS))
4118  break;
4119 
4120  /*
4121  * The only way BM_IO_IN_PROGRESS could be set when the io_in_progress
4122  * lock isn't held is if the process doing the I/O is recovering from
4123  * an error (see AbortBufferIO). If that's the case, we must wait for
4124  * him to get unwedged.
4125  */
4126  UnlockBufHdr(buf, buf_state);
4127  LWLockRelease(BufferDescriptorGetIOLock(buf));
4128  WaitIO(buf);
4129  }
4130 
4131  /* Once we get here, there is definitely no I/O active on this buffer */
4132 
4133  if (forInput ? (buf_state & BM_VALID) : !(buf_state & BM_DIRTY))
4134  {
4135  /* someone else already did the I/O */
4136  UnlockBufHdr(buf, buf_state);
4137  LWLockRelease(BufferDescriptorGetIOLock(buf));
4138  return false;
4139  }
4140 
4141  buf_state |= BM_IO_IN_PROGRESS;
4142  UnlockBufHdr(buf, buf_state);
4143 
4144  InProgressBuf = buf;
4145  IsForInput = forInput;
4146 
4147  return true;
4148 }
4149 
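/*
 * Illustrative sketch, not part of the real read path: how the
 * StartBufferIO()/TerminateBufferIO() pair brackets a physical read, in the
 * spirit of ReadBuffer_common().  Error handling and statistics are omitted.
 */
static void
read_block_io_example(SMgrRelation smgr, BufferDesc *bufHdr)
{
	if (!StartBufferIO(bufHdr, true))
		return;					/* someone else already read the page in */

	smgrread(smgr, bufHdr->tag.forkNum, bufHdr->tag.blockNum,
			 (char *) BufHdrGetBlock(bufHdr));

	/* clears BM_IO_IN_PROGRESS, sets BM_VALID, releases the I/O lock */
	TerminateBufferIO(bufHdr, false, BM_VALID);
}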
4150 /*
4151  * TerminateBufferIO: release a buffer we were doing I/O on
4152  * (Assumptions)
4153  * My process is executing IO for the buffer
4154  * BM_IO_IN_PROGRESS bit is set for the buffer
4155  * We hold the buffer's io_in_progress lock
4156  * The buffer is Pinned
4157  *
4158  * If clear_dirty is true and BM_JUST_DIRTIED is not set, we clear the
4159  * buffer's BM_DIRTY flag. This is appropriate when terminating a
4160  * successful write. The check on BM_JUST_DIRTIED is necessary to avoid
4161  * marking the buffer clean if it was re-dirtied while we were writing.
4162  *
4163  * set_flag_bits gets ORed into the buffer's flags. It must include
4164  * BM_IO_ERROR in a failure case. For successful completion it could
4165  * be 0, or BM_VALID if we just finished reading in the page.
4166  */
4167 static void
4168 TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits)
4169 {
4170  uint32 buf_state;
4171 
4172  Assert(buf == InProgressBuf);
4173 
4174  buf_state = LockBufHdr(buf);
4175 
4176  Assert(buf_state & BM_IO_IN_PROGRESS);
4177 
4178  buf_state &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR);
4179  if (clear_dirty && !(buf_state & BM_JUST_DIRTIED))
4180  buf_state &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
4181 
4182  buf_state |= set_flag_bits;
4183  UnlockBufHdr(buf, buf_state);
4184 
4185  InProgressBuf = NULL;
4186 
4187  LWLockRelease(BufferDescriptorGetIOLock(buf));
4188 }
4189 
4190 /*
4191  * AbortBufferIO: Clean up any active buffer I/O after an error.
4192  *
4193  * All LWLocks we might have held have been released,
4194  * but we haven't yet released buffer pins, so the buffer is still pinned.
4195  *
4196  * If I/O was in progress, we always set BM_IO_ERROR, even though it's
4197  * possible the error condition wasn't related to the I/O.
4198  */
4199 void
4200 AbortBufferIO(void)
4201 {
4202  BufferDesc *buf = InProgressBuf;
4203 
4204  if (buf)
4205  {
4206  uint32 buf_state;
4207 
4208  /*
4209  * Since LWLockReleaseAll has already been called, we're not holding
4210  * the buffer's io_in_progress_lock. We have to re-acquire it so that
4211  * we can use TerminateBufferIO. Anyone who's executing WaitIO on the
4212  * buffer will be in a busy spin until we succeed in doing this.
4213  */
4214  LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
4215 
4216  buf_state = LockBufHdr(buf);
4217  Assert(buf_state & BM_IO_IN_PROGRESS);
4218  if (IsForInput)
4219  {
4220  Assert(!(buf_state & BM_DIRTY));
4221 
4222  /* We'd better not think buffer is valid yet */
4223  Assert(!(buf_state & BM_VALID));
4224  UnlockBufHdr(buf, buf_state);
4225  }
4226  else
4227  {
4228  Assert(buf_state & BM_DIRTY);
4229  UnlockBufHdr(buf, buf_state);
4230  /* Issue notice if this is not the first failure... */
4231  if (buf_state & BM_IO_ERROR)
4232  {
4233  /* Buffer is pinned, so we can read tag without spinlock */
4234  char *path;
4235 
4236  path = relpathperm(buf->tag.rnode, buf->tag.forkNum);
4237  ereport(WARNING,
4238  (errcode(ERRCODE_IO_ERROR),
4239  errmsg("could not write block %u of %s",
4240  buf->tag.blockNum, path),
4241  errdetail("Multiple failures --- write error might be permanent.")));
4242  pfree(path);
4243  }
4244  }
4245  TerminateBufferIO(buf, false, BM_IO_ERROR);
4246  }
4247 }
4248 
4249 /*
4250  * Error context callback for errors occurring during shared buffer writes.
4251  */
4252 static void
4253 shared_buffer_write_error_callback(void *arg)
4254 {
4255  BufferDesc *bufHdr = (BufferDesc *) arg;
4256 
4257  /* Buffer is pinned, so we can read the tag without locking the spinlock */
4258  if (bufHdr != NULL)
4259  {
4260  char *path = relpathperm(bufHdr->tag.rnode, bufHdr->tag.forkNum);
4261 
4262  errcontext("writing block %u of relation %s",
4263  bufHdr->tag.blockNum, path);
4264  pfree(path);
4265  }
4266 }
4267 
4268 /*
4269  * Error context callback for errors occurring during local buffer writes.
4270  */
4271 static void
4272 local_buffer_write_error_callback(void *arg)
4273 {
4274  BufferDesc *bufHdr = (BufferDesc *) arg;
4275 
4276  if (bufHdr != NULL)
4277  {
4278  char *path = relpathbackend(bufHdr->tag.rnode, MyBackendId,
4279  bufHdr->tag.forkNum);
4280 
4281  errcontext("writing block %u of relation %s",
4282  bufHdr->tag.blockNum, path);
4283  pfree(path);
4284  }
4285 }
4286 
4287 /*
4288  * RelFileNode qsort/bsearch comparator; see RelFileNodeEquals.
4289  */
4290 static int
4291 rnode_comparator(const void *p1, const void *p2)
4292 {
4293  RelFileNode n1 = *(const RelFileNode *) p1;
4294  RelFileNode n2 = *(const RelFileNode *) p2;
4295 
4296  if (n1.relNode < n2.relNode)
4297  return -1;
4298  else if (n1.relNode > n2.relNode)
4299  return 1;
4300 
4301  if (n1.dbNode < n2.dbNode)
4302  return -1;
4303  else if (n1.dbNode > n2.dbNode)
4304  return 1;
4305 
4306  if (n1.spcNode < n2.spcNode)
4307  return -1;
4308  else if (n1.spcNode > n2.spcNode)
4309  return 1;
4310  else
4311  return 0;
4312 }
4313 
4314 /*
4315  * Lock buffer header - set BM_LOCKED in buffer state.
4316  */
4317 uint32
4318 LockBufHdr(BufferDesc *desc)
4319 {
4320  SpinDelayStatus delayStatus;
4321  uint32 old_buf_state;
4322 
4323  init_local_spin_delay(&delayStatus);
4324 
4325  while (true)
4326  {
4327  /* set BM_LOCKED flag */
4328  old_buf_state = pg_atomic_fetch_or_u32(&desc->state, BM_LOCKED);
4329  /* if it wasn't set before we're OK */
4330  if (!(old_buf_state & BM_LOCKED))
4331  break;
4332  perform_spin_delay(&delayStatus);
4333  }
4334  finish_spin_delay(&delayStatus);
4335  return old_buf_state | BM_LOCKED;
4336 }
4337 
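/*
 * Illustrative sketch, not part of bufmgr.c: the standard read-modify-write
 * pattern under the buffer header spinlock.  LockBufHdr() returns the state
 * with BM_LOCKED set; the updated value passed to UnlockBufHdr() is stored
 * with BM_LOCKED cleared again.
 */
static void
set_state_flag_example(BufferDesc *bufHdr, uint32 flag)
{
	uint32		buf_state;

	buf_state = LockBufHdr(bufHdr);
	buf_state |= flag;
	UnlockBufHdr(bufHdr, buf_state);
}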
4338 /*
4339  * Wait until the BM_LOCKED flag isn't set anymore and return the buffer's
4340  * state at that point.
4341  *
4342  * Obviously the buffer could be locked by the time the value is returned, so
4343  * this is primarily useful in CAS style loops.
4344  */
4345 static uint32
4346 WaitBufHdrUnlocked(BufferDesc *buf)
4347 {
4348  SpinDelayStatus delayStatus;
4349  uint32 buf_state;
4350 
4351  init_local_spin_delay(&delayStatus);
4352 
4353  buf_state = pg_atomic_read_u32(&buf->state);
4354 
4355  while (buf_state & BM_LOCKED)
4356  {
4357  perform_spin_delay(&delayStatus);
4358  buf_state = pg_atomic_read_u32(&buf->state);
4359  }
4360 
4361  finish_spin_delay(&delayStatus);
4362 
4363  return buf_state;
4364 }
4365 
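/*
 * Illustrative sketch, not part of bufmgr.c: the CAS-style loop the comment
 * above refers to, modelled on PinBuffer().  When the header is found locked,
 * WaitBufHdrUnlocked() fetches a fresh unlocked value; the compare-and-swap
 * retries until it installs the updated state.
 */
static void
cas_set_flags_example(BufferDesc *bufHdr, uint32 add_flags)
{
	uint32		old_buf_state = pg_atomic_read_u32(&bufHdr->state);

	for (;;)
	{
		uint32		buf_state;

		if (old_buf_state & BM_LOCKED)
			old_buf_state = WaitBufHdrUnlocked(bufHdr);

		buf_state = old_buf_state | add_flags;

		/* on failure, old_buf_state is refreshed with the current value */
		if (pg_atomic_compare_exchange_u32(&bufHdr->state, &old_buf_state,
										   buf_state))
			break;
	}
}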
4366 /*
4367  * BufferTag comparator.
4368  */
4369 static int
4370 buffertag_comparator(const void *a, const void *b)
4371 {
4372  const BufferTag *ba = (const BufferTag *) a;
4373  const BufferTag *bb = (const BufferTag *) b;
4374  int ret;
4375 
4376  ret = rnode_comparator(&ba->rnode, &bb->rnode);
4377 
4378  if (ret != 0)
4379  return ret;
4380 
4381  if (ba->forkNum < bb->forkNum)
4382  return -1;
4383  if (ba->forkNum > bb->forkNum)
4384  return 1;
4385 
4386  if (ba->blockNum < bb->blockNum)
4387  return -1;
4388  if (ba->blockNum > bb->blockNum)
4389  return 1;
4390 
4391  return 0;
4392 }
4393 
4394 /*
4395  * Comparator determining the writeout order in a checkpoint.
4396  *
4397  * It is important that tablespaces are compared first, the logic balancing
4398  * writes between tablespaces relies on it.
4399  */
4400 static int
4401 ckpt_buforder_comparator(const void *pa, const void *pb)
4402 {
4403  const CkptSortItem *a = (const CkptSortItem *) pa;
4404  const CkptSortItem *b = (const CkptSortItem *) pb;
4405 
4406  /* compare tablespace */
4407  if (a->tsId < b->tsId)
4408  return -1;
4409  else if (a->tsId > b->tsId)
4410  return 1;
4411  /* compare relation */
4412  if (a->relNode < b->relNode)
4413  return -1;
4414  else if (a->relNode > b->relNode)
4415  return 1;
4416  /* compare fork */
4417  else if (a->forkNum < b->forkNum)
4418  return -1;
4419  else if (a->forkNum > b->forkNum)
4420  return 1;
4421  /* compare block number */
4422  else if (a->blockNum < b->blockNum)
4423  return -1;
4424  else if (a->blockNum > b->blockNum)
4425  return 1;
4426  /* equal page IDs are unlikely, but not impossible */
4427  return 0;
4428 }
4429 
4430 /*
4431  * Comparator for a Min-Heap over the per-tablespace checkpoint completion
4432  * progress.
4433  */
4434 static int
4435 ts_ckpt_progress_comparator(Datum a, Datum b, void *arg)
4436 {
4437  CkptTsStatus *sa = (CkptTsStatus *) a;
4438  CkptTsStatus *sb = (CkptTsStatus *) b;
4439 
4440  /* we want a min-heap, so return 1 when a < b */
4441  if (sa->progress < sb->progress)
4442  return 1;
4443  else if (sa->progress == sb->progress)
4444  return 0;
4445  else
4446  return -1;
4447 }
4448 
4449 /*
4450  * Initialize a writeback context, discarding potential previous state.
4451  *
4452  * *max_pending is a pointer instead of an immediate value, so the coalesce
4453  * limits can easily be changed by the GUC mechanism, and so calling code does
4454  * not have to check the current configuration. A value of 0 means that no
4455  * writeback control will be performed.
4456  */
4457 void
4458 WritebackContextInit(WritebackContext *context, int *max_pending)
4459 {
4460  Assert(*max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
4461 
4462  context->max_pending = max_pending;
4463  context->nr_pending = 0;
4464 }
4465 
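/*
 * Illustrative sketch, not part of bufmgr.c: how a flushing process such as
 * the checkpointer drives a WritebackContext.  checkpoint_flush_after is the
 * real GUC-backed limit; the tag is assumed to identify a block that was just
 * written out with smgrwrite().
 */
static void
writeback_usage_example(BufferTag *just_written)
{
	WritebackContext wb_context;

	WritebackContextInit(&wb_context, &checkpoint_flush_after);

	/* queue the block; a flush is issued automatically once the limit hits */
	ScheduleBufferTagForWriteback(&wb_context, just_written);

	/* at the end of the write pass, hand any remaining requests to the OS */
	IssuePendingWritebacks(&wb_context);
}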
4466 /*
4467  * Add buffer to list of pending writeback requests.
4468  */
4469 void
4470 ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag)
4471 {
4472  PendingWriteback *pending;
4473 
4474  /*
4475  * Add buffer to the pending writeback array, unless writeback control is
4476  * disabled.
4477  */
4478  if (*context->max_pending > 0)
4479  {
4480  Assert(*context->max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
4481 
4482  pending = &context->pending_writebacks[context->nr_pending++];
4483 
4484  pending->tag = *tag;
4485  }
4486 
4487  /*
4488  * Perform pending flushes if the writeback limit is exceeded. This
4489  * includes the case where previously an item has been added, but control
4490  * is now disabled.
4491  */
4492  if (context->nr_pending >= *context->max_pending)
4493  IssuePendingWritebacks(context);
4494 }
4495 
4496 /*
4497  * Issue all pending writeback requests, previously scheduled with
4498  * ScheduleBufferTagForWriteback, to the OS.
4499  *
4500  * Because this is only used to improve the OS's I/O scheduling, we try never
4501  * to error out - it's just a hint.
4502  */
4503 void
4504 IssuePendingWritebacks(WritebackContext *context)
4505 {
4506  int i;
4507 
4508  if (context->nr_pending == 0)
4509  return;
4510 
4511  /*
4512  * Executing the writes in-order can make them a lot faster, and allows us to
4513  * merge writeback requests to consecutive blocks into larger writebacks.
4514  */
4515  qsort(&context->pending_writebacks, context->nr_pending,
4516  sizeof(PendingWriteback), buffertag_comparator);
4517 
4518  /*
4519  * Coalesce neighbouring writes, but nothing else. For that we iterate
4520  * through the, now sorted, array of pending flushes, and look forward to
4521  * find all neighbouring (or identical) writes.
4522  */
4523  for (i = 0; i < context->nr_pending; i++)
4524  {
4525  PendingWriteback *cur;
4526  PendingWriteback *next;
4527  SMgrRelation reln;
4528  int ahead;
4529  BufferTag tag;
4530  Size nblocks = 1;
4531 
4532  cur = &context->pending_writebacks[i];
4533  tag = cur->tag;
4534 
4535  /*
4536  * Peek ahead, into following writeback requests, to see if they can
4537  * be combined with the current one.
4538  */
4539  for (ahead = 0; i + ahead + 1 < context->nr_pending; ahead++)
4540  {
4541  next = &context->pending_writebacks[i + ahead + 1];
4542 
4543  /* different file, stop */
4544  if (!RelFileNodeEquals(cur->tag.rnode, next->tag.rnode) ||
4545  cur->tag.forkNum != next->tag.forkNum)
4546  break;
4547 
4548  /* ok, block queued twice, skip */
4549  if (cur->tag.blockNum == next->tag.blockNum)
4550  continue;
4551 
4552  /* only merge consecutive writes */
4553  if (cur->tag.blockNum + 1 != next->tag.blockNum)
4554  break;
4555 
4556  nblocks++;
4557  cur = next;
4558  }
4559 
4560  i += ahead;
4561 
4562  /* and finally tell the kernel to write the data to storage */
4563  reln = smgropen(tag.rnode, InvalidBackendId);
4564  smgrwriteback(reln, tag.forkNum, tag.blockNum, nblocks);
4565  }
4566 
4567  context->nr_pending = 0;
4568 }
4569 
4570 
4571 /*
4572  * Implement slower/larger portions of TestForOldSnapshot
4573  *
4574  * Smaller/faster portions are put inline, but the entire set of logic is too
4575  * big for that.
4576  */
4577 void
4578 TestForOldSnapshot_impl(Snapshot snapshot, Relation relation)
4579 {
4580  if (RelationAllowsEarlyPruning(relation)
4581  && (snapshot)->whenTaken < GetOldSnapshotThresholdTimestamp())
4582  ereport(ERROR,
4583  (errcode(ERRCODE_SNAPSHOT_TOO_OLD),
4584  errmsg("snapshot too old")));
4585 }