bufmgr.c (PostgreSQL Source Code, git master)
1 /*-------------------------------------------------------------------------
2  *
3  * bufmgr.c
4  * buffer manager interface routines
5  *
6  * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/storage/buffer/bufmgr.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 /*
16  * Principal entry points:
17  *
18  * ReadBuffer() -- find or create a buffer holding the requested page,
19  * and pin it so that no one can destroy it while this process
20  * is using it.
21  *
22  * ReleaseBuffer() -- unpin a buffer
23  *
24  * MarkBufferDirty() -- mark a pinned buffer's contents as "dirty".
25  * The disk write is delayed until buffer replacement or checkpoint.
26  *
27  * See also these files:
28  * freelist.c -- chooses victim for buffer replacement
29  * buf_table.c -- manages the buffer lookup table
30  */
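/*
 * Illustrative sketch (not part of bufmgr.c): the typical caller pattern for
 * the entry points above, assuming the declarations in storage/bufmgr.h and
 * storage/bufpage.h; "rel" and "blkno" are placeholder variables.  Real
 * callers normally also emit WAL inside a critical section before releasing
 * the content lock; that is omitted here.
 *
 *	Buffer	buf = ReadBuffer(rel, blkno);	// find or read the page, pinned
 *	LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);	// content lock for modification
 *	Page	page = BufferGetPage(buf);
 *	// ... modify the page contents ...
 *	MarkBufferDirty(buf);			// actual disk write happens later
 *	LockBuffer(buf, BUFFER_LOCK_UNLOCK);
 *	ReleaseBuffer(buf);			// drop the pin
 */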
31 #include "postgres.h"
32 
33 #include <sys/file.h>
34 #include <unistd.h>
35 
36 #include "access/tableam.h"
37 #include "access/xlog.h"
38 #include "catalog/catalog.h"
39 #include "catalog/storage.h"
40 #include "executor/instrument.h"
41 #include "lib/binaryheap.h"
42 #include "miscadmin.h"
43 #include "pg_trace.h"
44 #include "pgstat.h"
45 #include "postmaster/bgwriter.h"
46 #include "storage/buf_internals.h"
47 #include "storage/bufmgr.h"
48 #include "storage/ipc.h"
49 #include "storage/proc.h"
50 #include "storage/smgr.h"
51 #include "storage/standby.h"
52 #include "utils/rel.h"
53 #include "utils/resowner_private.h"
54 #include "utils/timestamp.h"
55 
56 
57 /* Note: these two macros only work on shared buffers, not local ones! */
58 #define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
59 #define BufferGetLSN(bufHdr) (PageGetLSN(BufHdrGetBlock(bufHdr)))
60 
61 /* Note: this macro only works on local buffers, not shared ones! */
62 #define LocalBufHdrGetBlock(bufHdr) \
63  LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
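/*
 * For illustration (not part of the original file): shared buffers are
 * numbered 1..NBuffers, so Buffer n uses descriptor slot n - 1, while local
 * buffers use negative Buffer numbers; Buffer -n corresponds to local slot
 * n - 1 and its descriptor carries buf_id = -(n + 1), which is why
 * -((bufHdr)->buf_id + 2) above recovers the local array index.  E.g. local
 * Buffer -1 has buf_id -2 and maps to LocalBufferBlockPointers[0].
 */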
64 
65 /* Bits in SyncOneBuffer's return value */
66 #define BUF_WRITTEN 0x01
67 #define BUF_REUSABLE 0x02
68 
69 #define DROP_RELS_BSEARCH_THRESHOLD 20
70 
71 typedef struct PrivateRefCountEntry
72 {
73  Buffer buffer;
74  int32 refcount;
75 } PrivateRefCountEntry;
76
77 /* 64 bytes, about the size of a cache line on common systems */
78 #define REFCOUNT_ARRAY_ENTRIES 8
79 
80 /*
81  * Status of buffers to checkpoint for a particular tablespace, used
82  * internally in BufferSync.
83  */
84 typedef struct CkptTsStatus
85 {
86  /* oid of the tablespace */
87  Oid tsId;
88
89  /*
90  * Checkpoint progress for this tablespace. To make progress comparable
91  * between tablespaces the progress is, for each tablespace, measured as a
92  * number between 0 and the total number of to-be-checkpointed pages. Each
93  * page checkpointed in this tablespace increments this space's progress
94  * by progress_slice.
95  */
96  float8 progress;
97  float8 progress_slice;
98
99  /* number of to-be checkpointed pages in this tablespace */
100  int num_to_scan;
101  /* already processed pages in this tablespace */
102  int num_scanned;
103
104  /* current offset in CkptBufferIds for this tablespace */
105  int index;
106 } CkptTsStatus;
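/*
 * Worked example (for illustration only): with 1000 to-be-checkpointed pages
 * in total, 800 of them in tablespace A and 200 in tablespace B, A gets
 * progress_slice = 1000/800 = 1.25 and B gets 1000/200 = 5.  After A has
 * written 400 pages and B has written 100, both report progress 500 out of
 * 1000, so the balancing logic in BufferSync() keeps the per-tablespace
 * writes interleaved in proportion to their share of the work.
 */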
107 
108 /* GUC variables */
109 bool zero_damaged_pages = false;
112 bool track_io_timing = false;
114 
115 /*
116  * GUC variables about triggering kernel writeback for buffers written; OS
117  * dependent defaults are set via the GUC mechanism.
118  */
122 
123 /*
124  * How many buffers PrefetchBuffer callers should try to stay ahead of their
125  * ReadBuffer calls by. This is maintained by the assign hook for
126  * effective_io_concurrency. Zero means "never prefetch". This value is
127  * only used for buffers not belonging to tablespaces that have their
128  * effective_io_concurrency parameter set.
129  */
131 
132 /* local state for StartBufferIO and related functions */
133 static BufferDesc *InProgressBuf = NULL;
134 static bool IsForInput;
135 
136 /* local state for LockBufferForCleanup */
138 
139 /*
140  * Backend-Private refcount management:
141  *
142  * Each buffer also has a private refcount that keeps track of the number of
143  * times the buffer is pinned in the current process. This is so that the
144  * shared refcount needs to be modified only once if a buffer is pinned more
145  * than once by an individual backend. It's also used to check that no buffers
146  * are still pinned at the end of transactions and when exiting.
147  *
148  *
149  * To avoid - as we used to - requiring an array with NBuffers entries to keep
150  * track of local buffers, we use a small sequentially searched array
151  * (PrivateRefCountArray) and an overflow hash table (PrivateRefCountHash) to
152  * keep track of backend local pins.
153  *
154  * Until no more than REFCOUNT_ARRAY_ENTRIES buffers are pinned at once, all
155  * refcounts are kept track of in the array; after that, new array entries
156  * displace old ones into the hash table. That way a frequently used entry
157  * can't get "stuck" in the hashtable while infrequent ones clog the array.
158  *
159  * Note that in most scenarios the number of pinned buffers will not exceed
160  * REFCOUNT_ARRAY_ENTRIES.
161  *
162  *
163  * To enter a buffer into the refcount tracking mechanism first reserve a free
164  * entry using ReservePrivateRefCountEntry() and then later, if necessary,
165  * fill it with NewPrivateRefCountEntry(). That split lets us avoid doing
166  * memory allocations in NewPrivateRefCountEntry() which can be important
167  * because in some scenarios it's called with a spinlock held...
168  */
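/*
 * Illustrative sketch (not part of the original file) of the reserve-then-fill
 * protocol described above, as used for example by PinBuffer_Locked() further
 * down in this file:
 *
 *	ReservePrivateRefCountEntry();	// may search/evict; no spinlock held yet
 *	buf_state = LockBufHdr(buf);	// header spinlock held from here on
 *	// ... manipulate the shared buffer state ...
 *	UnlockBufHdr(buf, buf_state);
 *	ref = NewPrivateRefCountEntry(BufferDescriptorGetBuffer(buf));	// no allocation
 *	ref->refcount++;
 */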
169 static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES];
170 static HTAB *PrivateRefCountHash = NULL;
171 static int32 PrivateRefCountOverflowed = 0;
172 static uint32 PrivateRefCountClock = 0;
173 static PrivateRefCountEntry *ReservedRefCountEntry = NULL;
174
175 static void ReservePrivateRefCountEntry(void);
176 static PrivateRefCountEntry *NewPrivateRefCountEntry(Buffer buffer);
177 static PrivateRefCountEntry *GetPrivateRefCountEntry(Buffer buffer, bool do_move);
178 static inline int32 GetPrivateRefCount(Buffer buffer);
179 static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref);
180
181 /*
182  * Ensure that the PrivateRefCountArray has sufficient space to store one more
183  * entry. This has to be called before using NewPrivateRefCountEntry() to fill
184  * a new entry - but it's perfectly fine to not use a reserved entry.
185  */
186 static void
187 ReservePrivateRefCountEntry(void)
188 {
189  /* Already reserved (or freed), nothing to do */
190  if (ReservedRefCountEntry != NULL)
191  return;
192 
193  /*
194  * First search for a free entry in the array; that'll be sufficient in the
195  * majority of cases.
196  */
197  {
198  int i;
199 
200  for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
201  {
202  PrivateRefCountEntry *res;
203
204  res = &PrivateRefCountArray[i];
205 
206  if (res->buffer == InvalidBuffer)
207  {
208  ReservedRefCountEntry = res;
209  return;
210  }
211  }
212  }
213 
214  /*
215  * No luck. All array entries are full. Move one array entry into the hash
216  * table.
217  */
218  {
219  /*
220  * Move entry from the current clock position in the array into the
221  * hashtable. Use that slot.
222  */
223  PrivateRefCountEntry *hashent;
224  bool found;
225 
226  /* select victim slot */
227  ReservedRefCountEntry =
228  &PrivateRefCountArray[PrivateRefCountClock++ % REFCOUNT_ARRAY_ENTRIES];
229
230  /* Better be used, otherwise we shouldn't get here. */
231  Assert(ReservedRefCountEntry->buffer != InvalidBuffer);
232 
233  /* enter victim array entry into hashtable */
234  hashent = hash_search(PrivateRefCountHash,
235  (void *) &(ReservedRefCountEntry->buffer),
236  HASH_ENTER,
237  &found);
238  Assert(!found);
239  hashent->refcount = ReservedRefCountEntry->refcount;
240 
241  /* clear the now free array slot */
242  ReservedRefCountEntry->buffer = InvalidBuffer;
243  ReservedRefCountEntry->refcount = 0;
244 
245  PrivateRefCountOverflowed++;
246  }
247 }
248 
249 /*
250  * Fill a previously reserved refcount entry.
251  */
252 static PrivateRefCountEntry *
253 NewPrivateRefCountEntry(Buffer buffer)
254 {
255  PrivateRefCountEntry *res;
256
257  /* only allowed to be called when a reservation has been made */
258  Assert(ReservedRefCountEntry != NULL);
259 
260  /* use up the reserved entry */
261  res = ReservedRefCountEntry;
262  ReservedRefCountEntry = NULL;
263 
264  /* and fill it */
265  res->buffer = buffer;
266  res->refcount = 0;
267 
268  return res;
269 }
270 
271 /*
272  * Return the PrivateRefCount entry for the passed buffer.
273  *
274  * Returns NULL if a buffer doesn't have a refcount entry. Otherwise, if
275  * do_move is true and the entry resides in the hashtable, the entry is
276  * optimized for frequent access by moving it to the array.
277  */
278 static PrivateRefCountEntry *
279 GetPrivateRefCountEntry(Buffer buffer, bool do_move)
280 {
281  PrivateRefCountEntry *res;
282  int i;
283 
284  Assert(BufferIsValid(buffer));
285  Assert(!BufferIsLocal(buffer));
286 
287  /*
288  * First search for references in the array, that'll be sufficient in the
289  * majority of cases.
290  */
291  for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
292  {
293  res = &PrivateRefCountArray[i];
294 
295  if (res->buffer == buffer)
296  return res;
297  }
298 
299  /*
300  * By here we know that the buffer, if already pinned, isn't residing in
301  * the array.
302  *
303  * Only look up the buffer in the hashtable if we've previously overflowed
304  * into it.
305  */
306  if (PrivateRefCountOverflowed == 0)
307  return NULL;
308 
309  res = hash_search(PrivateRefCountHash,
310  (void *) &buffer,
311  HASH_FIND,
312  NULL);
313 
314  if (res == NULL)
315  return NULL;
316  else if (!do_move)
317  {
318  /* caller doesn't want us to move the hash entry into the array */
319  return res;
320  }
321  else
322  {
323  /* move buffer from hashtable into the free array slot */
324  bool found;
325  PrivateRefCountEntry *free;
326
327  /* Ensure there's a free array slot */
328  ReservePrivateRefCountEntry();
329
330  /* Use up the reserved slot */
331  Assert(ReservedRefCountEntry != NULL);
332  free = ReservedRefCountEntry;
333  ReservedRefCountEntry = NULL;
334  Assert(free->buffer == InvalidBuffer);
335 
336  /* and fill it */
337  free->buffer = buffer;
338  free->refcount = res->refcount;
339 
340  /* delete from hashtable */
341  hash_search(PrivateRefCountHash,
342  (void *) &buffer,
343  HASH_REMOVE,
344  &found);
345  Assert(found);
346  Assert(PrivateRefCountOverflowed > 0);
347  PrivateRefCountOverflowed--;
348
349  return free;
350  }
351 }
352 
353 /*
354  * Returns how many times the passed buffer is pinned by this backend.
355  *
356  * Only works for shared memory buffers!
357  */
358 static inline int32
359 GetPrivateRefCount(Buffer buffer)
360 {
361  PrivateRefCountEntry *ref;
362
363  Assert(BufferIsValid(buffer));
364  Assert(!BufferIsLocal(buffer));
365 
366  /*
367  * Not moving the entry - that's ok for the current users, but we might
368  * want to change this one day.
369  */
370  ref = GetPrivateRefCountEntry(buffer, false);
371 
372  if (ref == NULL)
373  return 0;
374  return ref->refcount;
375 }
376 
377 /*
378  * Release resources used to track the reference count of a buffer which we no
379  * longer have pinned and don't want to pin again immediately.
380  */
381 static void
382 ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref)
383 {
384  Assert(ref->refcount == 0);
385 
386  if (ref >= &PrivateRefCountArray[0] &&
387  ref < &PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES])
388  {
389  ref->buffer = InvalidBuffer;
390 
391  /*
392  * Mark the just used entry as reserved - in many scenarios that
393  * allows us to avoid ever having to search the array/hash for free
394  * entries.
395  */
396  ReservedRefCountEntry = ref;
397  }
398  else
399  {
400  bool found;
401  Buffer buffer = ref->buffer;
402 
403  hash_search(PrivateRefCountHash,
404  (void *) &buffer,
405  HASH_REMOVE,
406  &found);
407  Assert(found);
408  Assert(PrivateRefCountOverflowed > 0);
409  PrivateRefCountOverflowed--;
410  }
411 }
412 
413 /*
414  * BufferIsPinned
415  * True iff the buffer is pinned (also checks for valid buffer number).
416  *
417  * NOTE: what we check here is that *this* backend holds a pin on
418  * the buffer. We do not care whether some other backend does.
419  */
420 #define BufferIsPinned(bufnum) \
421 ( \
422  !BufferIsValid(bufnum) ? \
423  false \
424  : \
425  BufferIsLocal(bufnum) ? \
426  (LocalRefCount[-(bufnum) - 1] > 0) \
427  : \
428  (GetPrivateRefCount(bufnum) > 0) \
429 )
430 
431 
432 static Buffer ReadBuffer_common(SMgrRelation smgr, char relpersistence,
433  ForkNumber forkNum, BlockNumber blockNum,
434  ReadBufferMode mode, BufferAccessStrategy strategy,
435  bool *hit);
436 static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy);
437 static void PinBuffer_Locked(BufferDesc *buf);
438 static void UnpinBuffer(BufferDesc *buf, bool fixOwner);
439 static void BufferSync(int flags);
440 static uint32 WaitBufHdrUnlocked(BufferDesc *buf);
441 static int SyncOneBuffer(int buf_id, bool skip_recently_used,
442  WritebackContext *wb_context);
443 static void WaitIO(BufferDesc *buf);
444 static bool StartBufferIO(BufferDesc *buf, bool forInput);
445 static void TerminateBufferIO(BufferDesc *buf, bool clear_dirty,
446  uint32 set_flag_bits);
447 static void shared_buffer_write_error_callback(void *arg);
448 static void local_buffer_write_error_callback(void *arg);
449 static BufferDesc *BufferAlloc(SMgrRelation smgr,
450  char relpersistence,
451  ForkNumber forkNum,
452  BlockNumber blockNum,
453  BufferAccessStrategy strategy,
454  bool *foundPtr);
455 static void FlushBuffer(BufferDesc *buf, SMgrRelation reln);
456 static void AtProcExit_Buffers(int code, Datum arg);
457 static void CheckForBufferLeaks(void);
458 static int rnode_comparator(const void *p1, const void *p2);
459 static int buffertag_comparator(const void *p1, const void *p2);
460 static int ckpt_buforder_comparator(const void *pa, const void *pb);
461 static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg);
462 
463 
464 /*
465  * ComputeIoConcurrency -- get the number of pages to prefetch for a given
466  * number of spindles.
467  */
468 bool
469 ComputeIoConcurrency(int io_concurrency, double *target)
470 {
471  double new_prefetch_pages = 0.0;
472  int i;
473 
474  /*
475  * Make sure the io_concurrency value is within valid range; it may have
476  * been forced with a manual pg_tablespace update.
477  */
478  io_concurrency = Min(Max(io_concurrency, 0), MAX_IO_CONCURRENCY);
479 
480  /*----------
481  * The user-visible GUC parameter is the number of drives (spindles),
482  * which we need to translate to a number-of-pages-to-prefetch target.
483  * The target value is stashed in *extra and then assigned to the actual
484  * variable by assign_effective_io_concurrency.
485  *
486  * The expected number of prefetch pages needed to keep N drives busy is:
487  *
488  * drives | I/O requests
489  * -------+----------------
490  * 1 | 1
491  * 2 | 2/1 + 2/2 = 3
492  * 3 | 3/1 + 3/2 + 3/3 = 5 1/2
493  * 4 | 4/1 + 4/2 + 4/3 + 4/4 = 8 1/3
494  * n | n * H(n)
495  *
496  * This is called the "coupon collector problem" and H(n) is called the
497  * harmonic series. This could be approximated by n * ln(n), but for
498  * reasonable numbers of drives we might as well just compute the series.
499  *
500  * Alternatively we could set the target to the number of pages necessary
501  * so that the expected number of active spindles is some arbitrary
502  * percentage of the total. This sounds the same but is actually slightly
503  * different. The result ends up being ln(1-P)/ln((n-1)/n) where P is
504  * that desired fraction.
505  *
506  * Experimental results show that both of these formulas aren't aggressive
507  * enough, but we don't really have any better proposals.
508  *
509  * Note that if io_concurrency = 0 (disabled), we must set target = 0.
510  *----------
511  */
512 
513  for (i = 1; i <= io_concurrency; i++)
514  new_prefetch_pages += (double) io_concurrency / (double) i;
515 
516  *target = new_prefetch_pages;
517 
518  /* This range check shouldn't fail, but let's be paranoid */
519  return (new_prefetch_pages >= 0.0 && new_prefetch_pages < (double) INT_MAX);
520 }
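/*
 * Worked example (for illustration only): for io_concurrency = 4 the loop
 * above computes 4/1 + 4/2 + 4/3 + 4/4 = 8.33 prefetch pages, matching the
 * "8 1/3" row of the table in the comment; for 10 drives it gives
 * 10 * H(10) = 29.29.  A standalone equivalent of the computation:
 *
 *	double target = 0.0;
 *	for (int i = 1; i <= io_concurrency; i++)
 *		target += (double) io_concurrency / (double) i;
 */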
521 
522 /*
523  * PrefetchBuffer -- initiate asynchronous read of a block of a relation
524  *
525  * This is named by analogy to ReadBuffer but doesn't actually allocate a
526  * buffer. Instead it tries to ensure that a future ReadBuffer for the given
527  * block will not be delayed by the I/O. Prefetching is optional.
528  * No-op if prefetching isn't compiled in.
529  */
530 void
531 PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
532 {
533 #ifdef USE_PREFETCH
534  Assert(RelationIsValid(reln));
535  Assert(BlockNumberIsValid(blockNum));
536 
537  /* Open it at the smgr level if not already done */
538  RelationOpenSmgr(reln);
539 
540  if (RelationUsesLocalBuffers(reln))
541  {
542  /* see comments in ReadBufferExtended */
543  if (RELATION_IS_OTHER_TEMP(reln))
544  ereport(ERROR,
545  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
546  errmsg("cannot access temporary tables of other sessions")));
547 
548  /* pass it off to localbuf.c */
549  LocalPrefetchBuffer(reln->rd_smgr, forkNum, blockNum);
550  }
551  else
552  {
553  BufferTag newTag; /* identity of requested block */
554  uint32 newHash; /* hash value for newTag */
555  LWLock *newPartitionLock; /* buffer partition lock for it */
556  int buf_id;
557 
558  /* create a tag so we can lookup the buffer */
559  INIT_BUFFERTAG(newTag, reln->rd_smgr->smgr_rnode.node,
560  forkNum, blockNum);
561 
562  /* determine its hash code and partition lock ID */
563  newHash = BufTableHashCode(&newTag);
564  newPartitionLock = BufMappingPartitionLock(newHash);
565 
566  /* see if the block is in the buffer pool already */
567  LWLockAcquire(newPartitionLock, LW_SHARED);
568  buf_id = BufTableLookup(&newTag, newHash);
569  LWLockRelease(newPartitionLock);
570 
571  /* If not in buffers, initiate prefetch */
572  if (buf_id < 0)
573  smgrprefetch(reln->rd_smgr, forkNum, blockNum);
574 
575  /*
576  * If the block *is* in buffers, we do nothing. This is not really
577  * ideal: the block might be just about to be evicted, which would be
578  * stupid since we know we are going to need it soon. But the only
579  * easy answer is to bump the usage_count, which does not seem like a
580  * great solution: when the caller does ultimately touch the block,
581  * usage_count would get bumped again, resulting in too much
582  * favoritism for blocks that are involved in a prefetch sequence. A
583  * real fix would involve some additional per-buffer state, and it's
584  * not clear that there's enough of a problem to justify that.
585  */
586  }
587 #endif /* USE_PREFETCH */
588 }
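/*
 * Illustrative sketch (not part of the original file): a sequential reader
 * keeping a few blocks of prefetch distance ahead of its ReadBuffer calls,
 * which is the intended use of PrefetchBuffer().  "rel", "blkno" and
 * "nblocks" are placeholders, and the distance of 8 blocks is arbitrary for
 * the example.
 *
 *	for (blkno = 0; blkno < nblocks; blkno++)
 *	{
 *		if (blkno + 8 < nblocks)
 *			PrefetchBuffer(rel, MAIN_FORKNUM, blkno + 8);
 *		buf = ReadBuffer(rel, blkno);
 *		// ... process the page ...
 *		ReleaseBuffer(buf);
 *	}
 */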
589 
590 
591 /*
592  * ReadBuffer -- a shorthand for ReadBufferExtended, for reading from main
593  * fork with RBM_NORMAL mode and default strategy.
594  */
595 Buffer
596 ReadBuffer(Relation reln, BlockNumber blockNum)
597 {
598  return ReadBufferExtended(reln, MAIN_FORKNUM, blockNum, RBM_NORMAL, NULL);
599 }
600 
601 /*
602  * ReadBufferExtended -- returns a buffer containing the requested
603  * block of the requested relation. If the blknum
604  * requested is P_NEW, extend the relation file and
605  * allocate a new block. (Caller is responsible for
606  * ensuring that only one backend tries to extend a
607  * relation at the same time!)
608  *
609  * Returns: the buffer number for the buffer containing
610  * the block read. The returned buffer has been pinned.
611  * Does not return on error --- elog's instead.
612  *
613  * Assume when this function is called, that reln has been opened already.
614  *
615  * In RBM_NORMAL mode, the page is read from disk, and the page header is
616  * validated. An error is thrown if the page header is not valid. (But
617  * note that an all-zero page is considered "valid"; see PageIsVerified().)
618  *
619  * RBM_ZERO_ON_ERROR is like the normal mode, but if the page header is not
620  * valid, the page is zeroed instead of throwing an error. This is intended
621  * for non-critical data, where the caller is prepared to repair errors.
622  *
623  * In RBM_ZERO_AND_LOCK mode, if the page isn't in buffer cache already, it's
624  * filled with zeros instead of reading it from disk. Useful when the caller
625  * is going to fill the page from scratch, since this saves I/O and avoids
626  * unnecessary failure if the page-on-disk has corrupt page headers.
627  * The page is returned locked to ensure that the caller has a chance to
628  * initialize the page before it's made visible to others.
629  * Caution: do not use this mode to read a page that is beyond the relation's
630  * current physical EOF; that is likely to cause problems in md.c when
631  * the page is modified and written out. P_NEW is OK, though.
632  *
633  * RBM_ZERO_AND_CLEANUP_LOCK is the same as RBM_ZERO_AND_LOCK, but acquires
634  * a cleanup-strength lock on the page.
635  *
636  * RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here.
637  *
638  * If strategy is not NULL, a nondefault buffer access strategy is used.
639  * See buffer/README for details.
640  */
641 Buffer
642 ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum,
643  ReadBufferMode mode, BufferAccessStrategy strategy)
644 {
645  bool hit;
646  Buffer buf;
647 
648  /* Open it at the smgr level if not already done */
649  RelationOpenSmgr(reln);
650 
651  /*
652  * Reject attempts to read non-local temporary relations; we would be
653  * likely to get wrong data since we have no visibility into the owning
654  * session's local buffers.
655  */
656  if (RELATION_IS_OTHER_TEMP(reln))
657  ereport(ERROR,
658  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
659  errmsg("cannot access temporary tables of other sessions")));
660 
661  /*
662  * Read the buffer, and update pgstat counters to reflect a cache hit or
663  * miss.
664  */
665  pgstat_count_buffer_read(reln);
666  buf = ReadBuffer_common(reln->rd_smgr, reln->rd_rel->relpersistence,
667  forkNum, blockNum, mode, strategy, &hit);
668  if (hit)
669  pgstat_count_buffer_hit(reln);
670  return buf;
671 }
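/*
 * Illustrative sketch (not part of the original file): using the modes
 * described above to add a brand-new page, so nothing is read from disk and
 * no page-header validation is attempted.  WAL-logging and the critical
 * section are omitted; "rel" is a placeholder.
 *
 *	buf = ReadBufferExtended(rel, MAIN_FORKNUM, P_NEW, RBM_ZERO_AND_LOCK, NULL);
 *	page = BufferGetPage(buf);
 *	PageInit(page, BufferGetPageSize(buf), 0);	// caller initializes the zeroed page
 *	MarkBufferDirty(buf);
 *	UnlockReleaseBuffer(buf);
 */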
672 
673 
674 /*
675  * ReadBufferWithoutRelcache -- like ReadBufferExtended, but doesn't require
676  * a relcache entry for the relation.
677  *
678  * NB: At present, this function may only be used on permanent relations, which
679  * is OK, because we only use it during XLOG replay. If in the future we
680  * want to use it on temporary or unlogged relations, we could pass additional
681  * parameters.
682  */
683 Buffer
684 ReadBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum,
685  BlockNumber blockNum, ReadBufferMode mode,
686  BufferAccessStrategy strategy)
687 {
688  bool hit;
689 
690  SMgrRelation smgr = smgropen(rnode, InvalidBackendId);
691 
692  Assert(InRecovery);
693
694  return ReadBuffer_common(smgr, RELPERSISTENCE_PERMANENT, forkNum, blockNum,
695  mode, strategy, &hit);
696 }
697 
698 
699 /*
700  * ReadBuffer_common -- common logic for all ReadBuffer variants
701  *
702  * *hit is set to true if the request was satisfied from shared buffer cache.
703  */
704 static Buffer
705 ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
706  BlockNumber blockNum, ReadBufferMode mode,
707  BufferAccessStrategy strategy, bool *hit)
708 {
709  BufferDesc *bufHdr;
710  Block bufBlock;
711  bool found;
712  bool isExtend;
713  bool isLocalBuf = SmgrIsTemp(smgr);
714 
715  *hit = false;
716 
717  /* Make sure we will have room to remember the buffer pin */
718  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
719
720  isExtend = (blockNum == P_NEW);
721 
722  TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
723  smgr->smgr_rnode.node.spcNode,
724  smgr->smgr_rnode.node.dbNode,
725  smgr->smgr_rnode.node.relNode,
726  smgr->smgr_rnode.backend,
727  isExtend);
728 
729  /* Substitute proper block number if caller asked for P_NEW */
730  if (isExtend)
731  blockNum = smgrnblocks(smgr, forkNum);
732 
733  if (isLocalBuf)
734  {
735  bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, &found);
736  if (found)
737  pgBufferUsage.local_blks_hit++;
738  else if (isExtend)
739  pgBufferUsage.local_blks_written++;
740  else if (mode == RBM_NORMAL || mode == RBM_NORMAL_NO_LOG ||
741  mode == RBM_ZERO_ON_ERROR)
742  pgBufferUsage.local_blks_read++;
743  }
744  else
745  {
746  /*
747  * lookup the buffer. IO_IN_PROGRESS is set if the requested block is
748  * not currently in memory.
749  */
750  bufHdr = BufferAlloc(smgr, relpersistence, forkNum, blockNum,
751  strategy, &found);
752  if (found)
753  pgBufferUsage.shared_blks_hit++;
754  else if (isExtend)
755  pgBufferUsage.shared_blks_written++;
756  else if (mode == RBM_NORMAL || mode == RBM_NORMAL_NO_LOG ||
757  mode == RBM_ZERO_ON_ERROR)
758  pgBufferUsage.shared_blks_read++;
759  }
760 
761  /* At this point we do NOT hold any locks. */
762 
763  /* if it was already in the buffer pool, we're done */
764  if (found)
765  {
766  if (!isExtend)
767  {
768  /* Just need to update stats before we exit */
769  *hit = true;
770  VacuumPageHit++;
771 
772  if (VacuumCostActive)
774 
775  TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
776  smgr->smgr_rnode.node.spcNode,
777  smgr->smgr_rnode.node.dbNode,
778  smgr->smgr_rnode.node.relNode,
779  smgr->smgr_rnode.backend,
780  isExtend,
781  found);
782 
783  /*
784  * In RBM_ZERO_AND_LOCK mode the caller expects the page to be
785  * locked on return.
786  */
787  if (!isLocalBuf)
788  {
789  if (mode == RBM_ZERO_AND_LOCK)
791  LW_EXCLUSIVE);
792  else if (mode == RBM_ZERO_AND_CLEANUP_LOCK)
794  }
795 
796  return BufferDescriptorGetBuffer(bufHdr);
797  }
798 
799  /*
800  * We get here only in the corner case where we are trying to extend
801  * the relation but we found a pre-existing buffer marked BM_VALID.
802  * This can happen because mdread doesn't complain about reads beyond
803  * EOF (when zero_damaged_pages is ON) and so a previous attempt to
804  * read a block beyond EOF could have left a "valid" zero-filled
805  * buffer. Unfortunately, we have also seen this case occurring
806  * because of buggy Linux kernels that sometimes return an
807  * lseek(SEEK_END) result that doesn't account for a recent write. In
808  * that situation, the pre-existing buffer would contain valid data
809  * that we don't want to overwrite. Since the legitimate case should
810  * always have left a zero-filled buffer, complain if not PageIsNew.
811  */
812  bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
813  if (!PageIsNew((Page) bufBlock))
814  ereport(ERROR,
815  (errmsg("unexpected data beyond EOF in block %u of relation %s",
816  blockNum, relpath(smgr->smgr_rnode, forkNum)),
817  errhint("This has been seen to occur with buggy kernels; consider updating your system.")));
818 
819  /*
820  * We *must* do smgrextend before succeeding, else the page will not
821  * be reserved by the kernel, and the next P_NEW call will decide to
822  * return the same page. Clear the BM_VALID bit, do the StartBufferIO
823  * call that BufferAlloc didn't, and proceed.
824  */
825  if (isLocalBuf)
826  {
827  /* Only need to adjust flags */
828  uint32 buf_state = pg_atomic_read_u32(&bufHdr->state);
829 
830  Assert(buf_state & BM_VALID);
831  buf_state &= ~BM_VALID;
832  pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
833  }
834  else
835  {
836  /*
837  * Loop to handle the very small possibility that someone re-sets
838  * BM_VALID between our clearing it and StartBufferIO inspecting
839  * it.
840  */
841  do
842  {
843  uint32 buf_state = LockBufHdr(bufHdr);
844 
845  Assert(buf_state & BM_VALID);
846  buf_state &= ~BM_VALID;
847  UnlockBufHdr(bufHdr, buf_state);
848  } while (!StartBufferIO(bufHdr, true));
849  }
850  }
851 
852  /*
853  * if we have gotten to this point, we have allocated a buffer for the
854  * page but its contents are not yet valid. IO_IN_PROGRESS is set for it,
855  * if it's a shared buffer.
856  *
857  * Note: if smgrextend fails, we will end up with a buffer that is
858  * allocated but not marked BM_VALID. P_NEW will still select the same
859  * block number (because the relation didn't get any longer on disk) and
860  * so future attempts to extend the relation will find the same buffer (if
861  * it's not been recycled) but come right back here to try smgrextend
862  * again.
863  */
864  Assert(!(pg_atomic_read_u32(&bufHdr->state) & BM_VALID)); /* spinlock not needed */
865 
866  bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
867 
868  if (isExtend)
869  {
870  /* new buffers are zero-filled */
871  MemSet((char *) bufBlock, 0, BLCKSZ);
872  /* don't set checksum for all-zero page */
873  smgrextend(smgr, forkNum, blockNum, (char *) bufBlock, false);
874 
875  /*
876  * NB: we're *not* doing a ScheduleBufferTagForWriteback here;
877  * although we're essentially performing a write. At least on linux
878  * doing so defeats the 'delayed allocation' mechanism, leading to
879  * increased file fragmentation.
880  */
881  }
882  else
883  {
884  /*
885  * Read in the page, unless the caller intends to overwrite it and
886  * just wants us to allocate a buffer.
887  */
888  if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
889  MemSet((char *) bufBlock, 0, BLCKSZ);
890  else
891  {
892  instr_time io_start,
893  io_time;
894 
895  if (track_io_timing)
896  INSTR_TIME_SET_CURRENT(io_start);
897 
898  smgrread(smgr, forkNum, blockNum, (char *) bufBlock);
899 
900  if (track_io_timing)
901  {
902  INSTR_TIME_SET_CURRENT(io_time);
903  INSTR_TIME_SUBTRACT(io_time, io_start);
906  }
907 
908  /* check for garbage data */
909  if (!PageIsVerified((Page) bufBlock, blockNum))
910  {
911  if (mode == RBM_ZERO_ON_ERROR || zero_damaged_pages)
912  {
915  errmsg("invalid page in block %u of relation %s; zeroing out page",
916  blockNum,
917  relpath(smgr->smgr_rnode, forkNum))));
918  MemSet((char *) bufBlock, 0, BLCKSZ);
919  }
920  else
921  ereport(ERROR,
923  errmsg("invalid page in block %u of relation %s",
924  blockNum,
925  relpath(smgr->smgr_rnode, forkNum))));
926  }
927  }
928  }
929 
930  /*
931  * In RBM_ZERO_AND_LOCK mode, grab the buffer content lock before marking
932  * the page as valid, to make sure that no other backend sees the zeroed
933  * page before the caller has had a chance to initialize it.
934  *
935  * Since no-one else can be looking at the page contents yet, there is no
936  * difference between an exclusive lock and a cleanup-strength lock. (Note
937  * that we cannot use LockBuffer() or LockBufferForCleanup() here, because
938  * they assert that the buffer is already valid.)
939  */
940  if ((mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) &&
941  !isLocalBuf)
942  {
944  }
945 
946  if (isLocalBuf)
947  {
948  /* Only need to adjust flags */
949  uint32 buf_state = pg_atomic_read_u32(&bufHdr->state);
950 
951  buf_state |= BM_VALID;
952  pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
953  }
954  else
955  {
956  /* Set BM_VALID, terminate IO, and wake up any waiters */
957  TerminateBufferIO(bufHdr, false, BM_VALID);
958  }
959 
960  VacuumPageMiss++;
961  if (VacuumCostActive)
963 
964  TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
965  smgr->smgr_rnode.node.spcNode,
966  smgr->smgr_rnode.node.dbNode,
967  smgr->smgr_rnode.node.relNode,
968  smgr->smgr_rnode.backend,
969  isExtend,
970  found);
971 
972  return BufferDescriptorGetBuffer(bufHdr);
973 }
974 
975 /*
976  * BufferAlloc -- subroutine for ReadBuffer. Handles lookup of a shared
977  * buffer. If no buffer exists already, selects a replacement
978  * victim and evicts the old page, but does NOT read in new page.
979  *
980  * "strategy" can be a buffer replacement strategy object, or NULL for
981  * the default strategy. The selected buffer's usage_count is advanced when
982  * using the default strategy, but otherwise possibly not (see PinBuffer).
983  *
984  * The returned buffer is pinned and is already marked as holding the
985  * desired page. If it already did have the desired page, *foundPtr is
986  * set true. Otherwise, *foundPtr is set false and the buffer is marked
987  * as IO_IN_PROGRESS; ReadBuffer will now need to do I/O to fill it.
988  *
989  * *foundPtr is actually redundant with the buffer's BM_VALID flag, but
990  * we keep it for simplicity in ReadBuffer.
991  *
992  * No locks are held either at entry or exit.
993  */
994 static BufferDesc *
995 BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
996  BlockNumber blockNum,
997  BufferAccessStrategy strategy,
998  bool *foundPtr)
999 {
1000  BufferTag newTag; /* identity of requested block */
1001  uint32 newHash; /* hash value for newTag */
1002  LWLock *newPartitionLock; /* buffer partition lock for it */
1003  BufferTag oldTag; /* previous identity of selected buffer */
1004  uint32 oldHash; /* hash value for oldTag */
1005  LWLock *oldPartitionLock; /* buffer partition lock for it */
1006  uint32 oldFlags;
1007  int buf_id;
1008  BufferDesc *buf;
1009  bool valid;
1010  uint32 buf_state;
1011 
1012  /* create a tag so we can lookup the buffer */
1013  INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
1014 
1015  /* determine its hash code and partition lock ID */
1016  newHash = BufTableHashCode(&newTag);
1017  newPartitionLock = BufMappingPartitionLock(newHash);
1018 
1019  /* see if the block is in the buffer pool already */
1020  LWLockAcquire(newPartitionLock, LW_SHARED);
1021  buf_id = BufTableLookup(&newTag, newHash);
1022  if (buf_id >= 0)
1023  {
1024  /*
1025  * Found it. Now, pin the buffer so no one can steal it from the
1026  * buffer pool, and check to see if the correct data has been loaded
1027  * into the buffer.
1028  */
1029  buf = GetBufferDescriptor(buf_id);
1030 
1031  valid = PinBuffer(buf, strategy);
1032 
1033  /* Can release the mapping lock as soon as we've pinned it */
1034  LWLockRelease(newPartitionLock);
1035 
1036  *foundPtr = true;
1037 
1038  if (!valid)
1039  {
1040  /*
1041  * We can only get here if (a) someone else is still reading in
1042  * the page, or (b) a previous read attempt failed. We have to
1043  * wait for any active read attempt to finish, and then set up our
1044  * own read attempt if the page is still not BM_VALID.
1045  * StartBufferIO does it all.
1046  */
1047  if (StartBufferIO(buf, true))
1048  {
1049  /*
1050  * If we get here, previous attempts to read the buffer must
1051  * have failed ... but we shall bravely try again.
1052  */
1053  *foundPtr = false;
1054  }
1055  }
1056 
1057  return buf;
1058  }
1059 
1060  /*
1061  * Didn't find it in the buffer pool. We'll have to initialize a new
1062  * buffer. Remember to unlock the mapping lock while doing the work.
1063  */
1064  LWLockRelease(newPartitionLock);
1065 
1066  /* Loop here in case we have to try another victim buffer */
1067  for (;;)
1068  {
1069  /*
1070  * Ensure, while the spinlock's not yet held, that there's a free
1071  * refcount entry.
1072  */
1074 
1075  /*
1076  * Select a victim buffer. The buffer is returned with its header
1077  * spinlock still held!
1078  */
1079  buf = StrategyGetBuffer(strategy, &buf_state);
1080 
1081  Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 0);
1082 
1083  /* Must copy buffer flags while we still hold the spinlock */
1084  oldFlags = buf_state & BUF_FLAG_MASK;
1085 
1086  /* Pin the buffer and then release the buffer spinlock */
1087  PinBuffer_Locked(buf);
1088 
1089  /*
1090  * If the buffer was dirty, try to write it out. There is a race
1091  * condition here, in that someone might dirty it after we released it
1092  * above, or even while we are writing it out (since our share-lock
1093  * won't prevent hint-bit updates). We will recheck the dirty bit
1094  * after re-locking the buffer header.
1095  */
1096  if (oldFlags & BM_DIRTY)
1097  {
1098  /*
1099  * We need a share-lock on the buffer contents to write it out
1100  * (else we might write invalid data, eg because someone else is
1101  * compacting the page contents while we write). We must use a
1102  * conditional lock acquisition here to avoid deadlock. Even
1103  * though the buffer was not pinned (and therefore surely not
1104  * locked) when StrategyGetBuffer returned it, someone else could
1105  * have pinned and exclusive-locked it by the time we get here. If
1106  * we try to get the lock unconditionally, we'd block waiting for
1107  * them; if they later block waiting for us, deadlock ensues.
1108  * (This has been observed to happen when two backends are both
1109  * trying to split btree index pages, and the second one just
1110  * happens to be trying to split the page the first one got from
1111  * StrategyGetBuffer.)
1112  */
1114  LW_SHARED))
1115  {
1116  /*
1117  * If using a nondefault strategy, and writing the buffer
1118  * would require a WAL flush, let the strategy decide whether
1119  * to go ahead and write/reuse the buffer or to choose another
1120  * victim. We need lock to inspect the page LSN, so this
1121  * can't be done inside StrategyGetBuffer.
1122  */
1123  if (strategy != NULL)
1124  {
1125  XLogRecPtr lsn;
1126 
1127  /* Read the LSN while holding buffer header lock */
1128  buf_state = LockBufHdr(buf);
1129  lsn = BufferGetLSN(buf);
1130  UnlockBufHdr(buf, buf_state);
1131 
1132  if (XLogNeedsFlush(lsn) &&
1133  StrategyRejectBuffer(strategy, buf))
1134  {
1135  /* Drop lock/pin and loop around for another buffer */
1137  UnpinBuffer(buf, true);
1138  continue;
1139  }
1140  }
1141 
1142  /* OK, do the I/O */
1143  TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_START(forkNum, blockNum,
1144  smgr->smgr_rnode.node.spcNode,
1145  smgr->smgr_rnode.node.dbNode,
1146  smgr->smgr_rnode.node.relNode);
1147 
1148  FlushBuffer(buf, NULL);
1150 
1152  &buf->tag);
1153 
1154  TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_DONE(forkNum, blockNum,
1155  smgr->smgr_rnode.node.spcNode,
1156  smgr->smgr_rnode.node.dbNode,
1157  smgr->smgr_rnode.node.relNode);
1158  }
1159  else
1160  {
1161  /*
1162  * Someone else has locked the buffer, so give it up and loop
1163  * back to get another one.
1164  */
1165  UnpinBuffer(buf, true);
1166  continue;
1167  }
1168  }
1169 
1170  /*
1171  * To change the association of a valid buffer, we'll need to have
1172  * exclusive lock on both the old and new mapping partitions.
1173  */
1174  if (oldFlags & BM_TAG_VALID)
1175  {
1176  /*
1177  * Need to compute the old tag's hashcode and partition lock ID.
1178  * XXX is it worth storing the hashcode in BufferDesc so we need
1179  * not recompute it here? Probably not.
1180  */
1181  oldTag = buf->tag;
1182  oldHash = BufTableHashCode(&oldTag);
1183  oldPartitionLock = BufMappingPartitionLock(oldHash);
1184 
1185  /*
1186  * Must lock the lower-numbered partition first to avoid
1187  * deadlocks.
1188  */
1189  if (oldPartitionLock < newPartitionLock)
1190  {
1191  LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
1192  LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1193  }
1194  else if (oldPartitionLock > newPartitionLock)
1195  {
1196  LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1197  LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
1198  }
1199  else
1200  {
1201  /* only one partition, only one lock */
1202  LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1203  }
1204  }
1205  else
1206  {
1207  /* if it wasn't valid, we need only the new partition */
1208  LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1209  /* remember we have no old-partition lock or tag */
1210  oldPartitionLock = NULL;
1211  /* keep the compiler quiet about uninitialized variables */
1212  oldHash = 0;
1213  }
1214 
1215  /*
1216  * Try to make a hashtable entry for the buffer under its new tag.
1217  * This could fail because while we were writing someone else
1218  * allocated another buffer for the same block we want to read in.
1219  * Note that we have not yet removed the hashtable entry for the old
1220  * tag.
1221  */
1222  buf_id = BufTableInsert(&newTag, newHash, buf->buf_id);
1223 
1224  if (buf_id >= 0)
1225  {
1226  /*
1227  * Got a collision. Someone has already done what we were about to
1228  * do. We'll just handle this as if it were found in the buffer
1229  * pool in the first place. First, give up the buffer we were
1230  * planning to use.
1231  */
1232  UnpinBuffer(buf, true);
1233 
1234  /* Can give up that buffer's mapping partition lock now */
1235  if (oldPartitionLock != NULL &&
1236  oldPartitionLock != newPartitionLock)
1237  LWLockRelease(oldPartitionLock);
1238 
1239  /* remaining code should match code at top of routine */
1240 
1241  buf = GetBufferDescriptor(buf_id);
1242 
1243  valid = PinBuffer(buf, strategy);
1244 
1245  /* Can release the mapping lock as soon as we've pinned it */
1246  LWLockRelease(newPartitionLock);
1247 
1248  *foundPtr = true;
1249 
1250  if (!valid)
1251  {
1252  /*
1253  * We can only get here if (a) someone else is still reading
1254  * in the page, or (b) a previous read attempt failed. We
1255  * have to wait for any active read attempt to finish, and
1256  * then set up our own read attempt if the page is still not
1257  * BM_VALID. StartBufferIO does it all.
1258  */
1259  if (StartBufferIO(buf, true))
1260  {
1261  /*
1262  * If we get here, previous attempts to read the buffer
1263  * must have failed ... but we shall bravely try again.
1264  */
1265  *foundPtr = false;
1266  }
1267  }
1268 
1269  return buf;
1270  }
1271 
1272  /*
1273  * Need to lock the buffer header too in order to change its tag.
1274  */
1275  buf_state = LockBufHdr(buf);
1276 
1277  /*
1278  * Somebody could have pinned or re-dirtied the buffer while we were
1279  * doing the I/O and making the new hashtable entry. If so, we can't
1280  * recycle this buffer; we must undo everything we've done and start
1281  * over with a new victim buffer.
1282  */
1283  oldFlags = buf_state & BUF_FLAG_MASK;
1284  if (BUF_STATE_GET_REFCOUNT(buf_state) == 1 && !(oldFlags & BM_DIRTY))
1285  break;
1286 
1287  UnlockBufHdr(buf, buf_state);
1288  BufTableDelete(&newTag, newHash);
1289  if (oldPartitionLock != NULL &&
1290  oldPartitionLock != newPartitionLock)
1291  LWLockRelease(oldPartitionLock);
1292  LWLockRelease(newPartitionLock);
1293  UnpinBuffer(buf, true);
1294  }
1295 
1296  /*
1297  * Okay, it's finally safe to rename the buffer.
1298  *
1299  * Clearing BM_VALID here is necessary, clearing the dirtybits is just
1300  * paranoia. We also reset the usage_count since any recency of use of
1301  * the old content is no longer relevant. (The usage_count starts out at
1302  * 1 so that the buffer can survive one clock-sweep pass.)
1303  *
1304  * Make sure BM_PERMANENT is set for buffers that must be written at every
1305  * checkpoint. Unlogged buffers only need to be written at shutdown
1306  * checkpoints, except for their "init" forks, which need to be treated
1307  * just like permanent relations.
1308  */
1309  buf->tag = newTag;
1310  buf_state &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED |
1313  if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM)
1314  buf_state |= BM_TAG_VALID | BM_PERMANENT | BUF_USAGECOUNT_ONE;
1315  else
1316  buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
1317 
1318  UnlockBufHdr(buf, buf_state);
1319 
1320  if (oldPartitionLock != NULL)
1321  {
1322  BufTableDelete(&oldTag, oldHash);
1323  if (oldPartitionLock != newPartitionLock)
1324  LWLockRelease(oldPartitionLock);
1325  }
1326 
1327  LWLockRelease(newPartitionLock);
1328 
1329  /*
1330  * Buffer contents are currently invalid. Try to get the io_in_progress
1331  * lock. If StartBufferIO returns false, then someone else managed to
1332  * read it before we did, so there's nothing left for BufferAlloc() to do.
1333  */
1334  if (StartBufferIO(buf, true))
1335  *foundPtr = false;
1336  else
1337  *foundPtr = true;
1338 
1339  return buf;
1340 }
1341 
1342 /*
1343  * InvalidateBuffer -- mark a shared buffer invalid and return it to the
1344  * freelist.
1345  *
1346  * The buffer header spinlock must be held at entry. We drop it before
1347  * returning. (This is sane because the caller must have locked the
1348  * buffer in order to be sure it should be dropped.)
1349  *
1350  * This is used only in contexts such as dropping a relation. We assume
1351  * that no other backend could possibly be interested in using the page,
1352  * so the only reason the buffer might be pinned is if someone else is
1353  * trying to write it out. We have to let them finish before we can
1354  * reclaim the buffer.
1355  *
1356  * The buffer could get reclaimed by someone else while we are waiting
1357  * to acquire the necessary locks; if so, don't mess it up.
1358  */
1359 static void
1360 InvalidateBuffer(BufferDesc *buf)
1361 {
1362  BufferTag oldTag;
1363  uint32 oldHash; /* hash value for oldTag */
1364  LWLock *oldPartitionLock; /* buffer partition lock for it */
1365  uint32 oldFlags;
1366  uint32 buf_state;
1367 
1368  /* Save the original buffer tag before dropping the spinlock */
1369  oldTag = buf->tag;
1370 
1371  buf_state = pg_atomic_read_u32(&buf->state);
1372  Assert(buf_state & BM_LOCKED);
1373  UnlockBufHdr(buf, buf_state);
1374 
1375  /*
1376  * Need to compute the old tag's hashcode and partition lock ID. XXX is it
1377  * worth storing the hashcode in BufferDesc so we need not recompute it
1378  * here? Probably not.
1379  */
1380  oldHash = BufTableHashCode(&oldTag);
1381  oldPartitionLock = BufMappingPartitionLock(oldHash);
1382 
1383 retry:
1384 
1385  /*
1386  * Acquire exclusive mapping lock in preparation for changing the buffer's
1387  * association.
1388  */
1389  LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
1390 
1391  /* Re-lock the buffer header */
1392  buf_state = LockBufHdr(buf);
1393 
1394  /* If it's changed while we were waiting for lock, do nothing */
1395  if (!BUFFERTAGS_EQUAL(buf->tag, oldTag))
1396  {
1397  UnlockBufHdr(buf, buf_state);
1398  LWLockRelease(oldPartitionLock);
1399  return;
1400  }
1401 
1402  /*
1403  * We assume the only reason for it to be pinned is that someone else is
1404  * flushing the page out. Wait for them to finish. (This could be an
1405  * infinite loop if the refcount is messed up... it would be nice to time
1406  * out after awhile, but there seems no way to be sure how many loops may
1407  * be needed. Note that if the other guy has pinned the buffer but not
1408  * yet done StartBufferIO, WaitIO will fall through and we'll effectively
1409  * be busy-looping here.)
1410  */
1411  if (BUF_STATE_GET_REFCOUNT(buf_state) != 0)
1412  {
1413  UnlockBufHdr(buf, buf_state);
1414  LWLockRelease(oldPartitionLock);
1415  /* safety check: should definitely not be our *own* pin */
1417  elog(ERROR, "buffer is pinned in InvalidateBuffer");
1418  WaitIO(buf);
1419  goto retry;
1420  }
1421 
1422  /*
1423  * Clear out the buffer's tag and flags. We must do this to ensure that
1424  * linear scans of the buffer array don't think the buffer is valid.
1425  */
1426  oldFlags = buf_state & BUF_FLAG_MASK;
1427  CLEAR_BUFFERTAG(buf->tag);
1428  buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
1429  UnlockBufHdr(buf, buf_state);
1430 
1431  /*
1432  * Remove the buffer from the lookup hashtable, if it was in there.
1433  */
1434  if (oldFlags & BM_TAG_VALID)
1435  BufTableDelete(&oldTag, oldHash);
1436 
1437  /*
1438  * Done with mapping lock.
1439  */
1440  LWLockRelease(oldPartitionLock);
1441 
1442  /*
1443  * Insert the buffer at the head of the list of free buffers.
1444  */
1445  StrategyFreeBuffer(buf);
1446 }
1447 
1448 /*
1449  * MarkBufferDirty
1450  *
1451  * Marks buffer contents as dirty (actual write happens later).
1452  *
1453  * Buffer must be pinned and exclusive-locked. (If caller does not hold
1454  * exclusive lock, then somebody could be in process of writing the buffer,
1455  * leading to risk of bad data written to disk.)
1456  */
1457 void
1458 MarkBufferDirty(Buffer buffer)
1459 {
1460  BufferDesc *bufHdr;
1461  uint32 buf_state;
1462  uint32 old_buf_state;
1463 
1464  if (!BufferIsValid(buffer))
1465  elog(ERROR, "bad buffer ID: %d", buffer);
1466 
1467  if (BufferIsLocal(buffer))
1468  {
1469  MarkLocalBufferDirty(buffer);
1470  return;
1471  }
1472 
1473  bufHdr = GetBufferDescriptor(buffer - 1);
1474 
1475  Assert(BufferIsPinned(buffer));
1477  LW_EXCLUSIVE));
1478 
1479  old_buf_state = pg_atomic_read_u32(&bufHdr->state);
1480  for (;;)
1481  {
1482  if (old_buf_state & BM_LOCKED)
1483  old_buf_state = WaitBufHdrUnlocked(bufHdr);
1484 
1485  buf_state = old_buf_state;
1486 
1487  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
1488  buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
1489 
1490  if (pg_atomic_compare_exchange_u32(&bufHdr->state, &old_buf_state,
1491  buf_state))
1492  break;
1493  }
1494 
1495  /*
1496  * If the buffer was not dirty already, do vacuum accounting.
1497  */
1498  if (!(old_buf_state & BM_DIRTY))
1499  {
1500  VacuumPageDirty++;
1502  if (VacuumCostActive)
1504  }
1505 }
1506 
1507 /*
1508  * ReleaseAndReadBuffer -- combine ReleaseBuffer() and ReadBuffer()
1509  *
1510  * Formerly, this saved one cycle of acquiring/releasing the BufMgrLock
1511  * compared to calling the two routines separately. Now it's mainly just
1512  * a convenience function. However, if the passed buffer is valid and
1513  * already contains the desired block, we just return it as-is; and that
1514  * does save considerable work compared to a full release and reacquire.
1515  *
1516  * Note: it is OK to pass buffer == InvalidBuffer, indicating that no old
1517  * buffer actually needs to be released. This case is the same as ReadBuffer,
1518  * but can save some tests in the caller.
1519  */
1520 Buffer
1521 ReleaseAndReadBuffer(Buffer buffer,
1522  Relation relation,
1523  BlockNumber blockNum)
1524 {
1525  ForkNumber forkNum = MAIN_FORKNUM;
1526  BufferDesc *bufHdr;
1527 
1528  if (BufferIsValid(buffer))
1529  {
1530  Assert(BufferIsPinned(buffer));
1531  if (BufferIsLocal(buffer))
1532  {
1533  bufHdr = GetLocalBufferDescriptor(-buffer - 1);
1534  if (bufHdr->tag.blockNum == blockNum &&
1535  RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node) &&
1536  bufHdr->tag.forkNum == forkNum)
1537  return buffer;
1539  LocalRefCount[-buffer - 1]--;
1540  }
1541  else
1542  {
1543  bufHdr = GetBufferDescriptor(buffer - 1);
1544  /* we have pin, so it's ok to examine tag without spinlock */
1545  if (bufHdr->tag.blockNum == blockNum &&
1546  RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node) &&
1547  bufHdr->tag.forkNum == forkNum)
1548  return buffer;
1549  UnpinBuffer(bufHdr, true);
1550  }
1551  }
1552 
1553  return ReadBuffer(relation, blockNum);
1554 }
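/*
 * Illustrative sketch (not part of the original file): a caller following a
 * chain of blocks can start with InvalidBuffer and let ReleaseAndReadBuffer()
 * drop the previous pin each time; when the block number repeats, the pinned
 * buffer is simply returned again.  "rel" and "blkno" are placeholders.
 *
 *	Buffer	buf = InvalidBuffer;
 *
 *	while (BlockNumberIsValid(blkno))
 *	{
 *		buf = ReleaseAndReadBuffer(buf, rel, blkno);
 *		LockBuffer(buf, BUFFER_LOCK_SHARE);
 *		// ... examine the page, compute the next blkno ...
 *		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
 *	}
 *	if (BufferIsValid(buf))
 *		ReleaseBuffer(buf);
 */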
1555 
1556 /*
1557  * PinBuffer -- make buffer unavailable for replacement.
1558  *
1559  * For the default access strategy, the buffer's usage_count is incremented
1560  * when we first pin it; for other strategies we just make sure the usage_count
1561  * isn't zero. (The idea of the latter is that we don't want synchronized
1562  * heap scans to inflate the count, but we need it to not be zero to discourage
1563  * other backends from stealing buffers from our ring. As long as we cycle
1564  * through the ring faster than the global clock-sweep cycles, buffers in
1565  * our ring won't be chosen as victims for replacement by other backends.)
1566  *
1567  * This should be applied only to shared buffers, never local ones.
1568  *
1569  * Since buffers are pinned/unpinned very frequently, pin buffers without
1570  * taking the buffer header lock; instead update the state variable in loop of
1571  * CAS operations. Hopefully it's just a single CAS.
1572  *
1573  * Note that ResourceOwnerEnlargeBuffers must have been done already.
1574  *
1575  * Returns true if buffer is BM_VALID, else false. This provision allows
1576  * some callers to avoid an extra spinlock cycle.
1577  */
1578 static bool
1579 PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
1580 {
1581  Buffer b = BufferDescriptorGetBuffer(buf);
1582  bool result;
1583  PrivateRefCountEntry *ref;
1584 
1585  ref = GetPrivateRefCountEntry(b, true);
1586 
1587  if (ref == NULL)
1588  {
1589  uint32 buf_state;
1590  uint32 old_buf_state;
1591 
1593  ref = NewPrivateRefCountEntry(b);
1594 
1595  old_buf_state = pg_atomic_read_u32(&buf->state);
1596  for (;;)
1597  {
1598  if (old_buf_state & BM_LOCKED)
1599  old_buf_state = WaitBufHdrUnlocked(buf);
1600 
1601  buf_state = old_buf_state;
1602 
1603  /* increase refcount */
1604  buf_state += BUF_REFCOUNT_ONE;
1605 
1606  if (strategy == NULL)
1607  {
1608  /* Default case: increase usagecount unless already max. */
1610  buf_state += BUF_USAGECOUNT_ONE;
1611  }
1612  else
1613  {
1614  /*
1615  * Ring buffers shouldn't evict others from pool. Thus we
1616  * don't make usagecount more than 1.
1617  */
1618  if (BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
1619  buf_state += BUF_USAGECOUNT_ONE;
1620  }
1621 
1622  if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
1623  buf_state))
1624  {
1625  result = (buf_state & BM_VALID) != 0;
1626  break;
1627  }
1628  }
1629  }
1630  else
1631  {
1632  /* If we previously pinned the buffer, it must surely be valid */
1633  result = true;
1634  }
1635 
1636  ref->refcount++;
1637  Assert(ref->refcount > 0);
1639  return result;
1640 }
1641 
1642 /*
1643  * PinBuffer_Locked -- as above, but caller already locked the buffer header.
1644  * The spinlock is released before return.
1645  *
1646  * As this function is called with the spinlock held, the caller has to
1647  * previously call ReservePrivateRefCountEntry().
1648  *
1649  * Currently, no callers of this function want to modify the buffer's
1650  * usage_count at all, so there's no need for a strategy parameter.
1651  * Also we don't bother with a BM_VALID test (the caller could check that for
1652  * itself).
1653  *
1654  * Also all callers only ever use this function when it's known that the
1655  * buffer can't have a preexisting pin by this backend. That allows us to skip
1656  * searching the private refcount array & hash, which is a boon, because the
1657  * spinlock is still held.
1658  *
1659  * Note: use of this routine is frequently mandatory, not just an optimization
1660  * to save a spin lock/unlock cycle, because we need to pin a buffer before
1661  * its state can change under us.
1662  */
1663 static void
1664 PinBuffer_Locked(BufferDesc *buf)
1665 {
1666  Buffer b;
1667  PrivateRefCountEntry *ref;
1668  uint32 buf_state;
1669 
1670  /*
1671  * As explained, we don't expect any preexisting pins. That allows us to
1672  * manipulate the PrivateRefCount after releasing the spinlock
1673  */
1675 
1676  /*
1677  * Since we hold the buffer spinlock, we can update the buffer state and
1678  * release the lock in one operation.
1679  */
1680  buf_state = pg_atomic_read_u32(&buf->state);
1681  Assert(buf_state & BM_LOCKED);
1682  buf_state += BUF_REFCOUNT_ONE;
1683  UnlockBufHdr(buf, buf_state);
1684 
1685  b = BufferDescriptorGetBuffer(buf);
1686 
1687  ref = NewPrivateRefCountEntry(b);
1688  ref->refcount++;
1689 
1691 }
1692 
1693 /*
1694  * UnpinBuffer -- make buffer available for replacement.
1695  *
1696  * This should be applied only to shared buffers, never local ones.
1697  *
1698  * Most but not all callers want CurrentResourceOwner to be adjusted.
1699  * Those that don't should pass fixOwner = false.
1700  */
1701 static void
1702 UnpinBuffer(BufferDesc *buf, bool fixOwner)
1703 {
1704  PrivateRefCountEntry *ref;
1705  Buffer b = BufferDescriptorGetBuffer(buf);
1706
1707  /* not moving as we're likely deleting it soon anyway */
1708  ref = GetPrivateRefCountEntry(b, false);
1709  Assert(ref != NULL);
1710 
1711  if (fixOwner)
1713 
1714  Assert(ref->refcount > 0);
1715  ref->refcount--;
1716  if (ref->refcount == 0)
1717  {
1718  uint32 buf_state;
1719  uint32 old_buf_state;
1720 
1721  /* I'd better not still hold any locks on the buffer */
1724 
1725  /*
1726  * Decrement the shared reference count.
1727  *
1728  * Since buffer spinlock holder can update status using just write,
1729  * it's not safe to use atomic decrement here; thus use a CAS loop.
1730  */
1731  old_buf_state = pg_atomic_read_u32(&buf->state);
1732  for (;;)
1733  {
1734  if (old_buf_state & BM_LOCKED)
1735  old_buf_state = WaitBufHdrUnlocked(buf);
1736 
1737  buf_state = old_buf_state;
1738 
1739  buf_state -= BUF_REFCOUNT_ONE;
1740 
1741  if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
1742  buf_state))
1743  break;
1744  }
1745 
1746  /* Support LockBufferForCleanup() */
1747  if (buf_state & BM_PIN_COUNT_WAITER)
1748  {
1749  /*
1750  * Acquire the buffer header lock, re-check that there's a waiter.
1751  * Another backend could have unpinned this buffer, and already
1752  * woken up the waiter. There's no danger of the buffer being
1753  * replaced after we unpinned it above, as it's pinned by the
1754  * waiter.
1755  */
1756  buf_state = LockBufHdr(buf);
1757 
1758  if ((buf_state & BM_PIN_COUNT_WAITER) &&
1759  BUF_STATE_GET_REFCOUNT(buf_state) == 1)
1760  {
1761  /* we just released the last pin other than the waiter's */
1762  int wait_backend_pid = buf->wait_backend_pid;
1763 
1764  buf_state &= ~BM_PIN_COUNT_WAITER;
1765  UnlockBufHdr(buf, buf_state);
1766  ProcSendSignal(wait_backend_pid);
1767  }
1768  else
1769  UnlockBufHdr(buf, buf_state);
1770  }
1772  }
1773 }
1774 
1775 /*
1776  * BufferSync -- Write out all dirty buffers in the pool.
1777  *
1778  * This is called at checkpoint time to write out all dirty shared buffers.
1779  * The checkpoint request flags should be passed in. If CHECKPOINT_IMMEDIATE
1780  * is set, we disable delays between writes; if CHECKPOINT_IS_SHUTDOWN,
1781  * CHECKPOINT_END_OF_RECOVERY or CHECKPOINT_FLUSH_ALL is set, we write even
1782  * unlogged buffers, which are otherwise skipped. The remaining flags
1783  * currently have no effect here.
1784  */
1785 static void
1786 BufferSync(int flags)
1787 {
1788  uint32 buf_state;
1789  int buf_id;
1790  int num_to_scan;
1791  int num_spaces;
1792  int num_processed;
1793  int num_written;
1794  CkptTsStatus *per_ts_stat = NULL;
1795  Oid last_tsid;
1796  binaryheap *ts_heap;
1797  int i;
1798  int mask = BM_DIRTY;
1799  WritebackContext wb_context;
1800 
1801  /* Make sure we can handle the pin inside SyncOneBuffer */
1803 
1804  /*
1805  * Unless this is a shutdown checkpoint or we have been explicitly told,
1806  * we write only permanent, dirty buffers. But at shutdown or end of
1807  * recovery, we write all dirty buffers.
1808  */
1811  mask |= BM_PERMANENT;
1812 
1813  /*
1814  * Loop over all buffers, and mark the ones that need to be written with
1815  * BM_CHECKPOINT_NEEDED. Count them as we go (num_to_scan), so that we
1816  * can estimate how much work needs to be done.
1817  *
1818  * This allows us to write only those pages that were dirty when the
1819  * checkpoint began, and not those that get dirtied while it proceeds.
1820  * Whenever a page with BM_CHECKPOINT_NEEDED is written out, either by us
1821  * later in this function, or by normal backends or the bgwriter cleaning
1822  * scan, the flag is cleared. Any buffer dirtied after this point won't
1823  * have the flag set.
1824  *
1825  * Note that if we fail to write some buffer, we may leave buffers with
1826  * BM_CHECKPOINT_NEEDED still set. This is OK since any such buffer would
1827  * certainly need to be written for the next checkpoint attempt, too.
1828  */
1829  num_to_scan = 0;
1830  for (buf_id = 0; buf_id < NBuffers; buf_id++)
1831  {
1832  BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
1833 
1834  /*
1835  * Header spinlock is enough to examine BM_DIRTY, see comment in
1836  * SyncOneBuffer.
1837  */
1838  buf_state = LockBufHdr(bufHdr);
1839 
1840  if ((buf_state & mask) == mask)
1841  {
1842  CkptSortItem *item;
1843 
1844  buf_state |= BM_CHECKPOINT_NEEDED;
1845 
1846  item = &CkptBufferIds[num_to_scan++];
1847  item->buf_id = buf_id;
1848  item->tsId = bufHdr->tag.rnode.spcNode;
1849  item->relNode = bufHdr->tag.rnode.relNode;
1850  item->forkNum = bufHdr->tag.forkNum;
1851  item->blockNum = bufHdr->tag.blockNum;
1852  }
1853 
1854  UnlockBufHdr(bufHdr, buf_state);
1855 
1856  /* Check for barrier events in case NBuffers is large. */
1857  if (ProcSignalBarrierPending)
1858  ProcessProcSignalBarrier();
1859  }
1860 
1861  if (num_to_scan == 0)
1862  return; /* nothing to do */
1863 
1864  WritebackContextInit(&wb_context, &checkpoint_flush_after);
1865 
1866  TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_scan);
1867 
1868  /*
1869  * Sort buffers that need to be written to reduce the likelihood of random
1870  * IO. The sorting is also important for the implementation of balancing
1871  * writes between tablespaces. Without balancing writes we'd potentially
1872  * end up writing to the tablespaces one-by-one; possibly overloading the
1873  * underlying system.
1874  */
1875  qsort(CkptBufferIds, num_to_scan, sizeof(CkptSortItem),
1876  ckpt_buforder_comparator);
1877 
1878  num_spaces = 0;
1879 
1880  /*
1881  * Allocate progress status for each tablespace with buffers that need to
1882  * be flushed. This requires the to-be-flushed array to be sorted.
1883  */
1884  last_tsid = InvalidOid;
1885  for (i = 0; i < num_to_scan; i++)
1886  {
1887  CkptTsStatus *s;
1888  Oid cur_tsid;
1889 
1890  cur_tsid = CkptBufferIds[i].tsId;
1891 
1892  /*
1893  * Grow the array of per-tablespace status structs every time a new
1894  * tablespace is found.
1895  */
1896  if (last_tsid == InvalidOid || last_tsid != cur_tsid)
1897  {
1898  Size sz;
1899 
1900  num_spaces++;
1901 
1902  /*
1903  * Not worth adding grow-by-power-of-2 logic here - even with a
1904  * few hundred tablespaces this should be fine.
1905  */
1906  sz = sizeof(CkptTsStatus) * num_spaces;
1907 
1908  if (per_ts_stat == NULL)
1909  per_ts_stat = (CkptTsStatus *) palloc(sz);
1910  else
1911  per_ts_stat = (CkptTsStatus *) repalloc(per_ts_stat, sz);
1912 
1913  s = &per_ts_stat[num_spaces - 1];
1914  memset(s, 0, sizeof(*s));
1915  s->tsId = cur_tsid;
1916 
1917  /*
1918  * The first buffer in this tablespace. As CkptBufferIds is sorted
1919  * by tablespace all (s->num_to_scan) buffers in this tablespace
1920  * will follow afterwards.
1921  */
1922  s->index = i;
1923 
1924  /*
1925  * progress_slice will be determined once we know how many buffers
1926  * are in each tablespace, i.e. after this loop.
1927  */
1928 
1929  last_tsid = cur_tsid;
1930  }
1931  else
1932  {
1933  s = &per_ts_stat[num_spaces - 1];
1934  }
1935 
1936  s->num_to_scan++;
1937 
1938  /* Check for barrier events. */
1939  if (ProcSignalBarrierPending)
1940  ProcessProcSignalBarrier();
1941  }
1942 
1943  Assert(num_spaces > 0);
1944 
1945  /*
1946  * Build a min-heap over the write-progress in the individual tablespaces,
1947  * and compute how large a portion of the total progress a single
1948  * processed buffer is.
1949  */
1950  ts_heap = binaryheap_allocate(num_spaces,
1951  ts_ckpt_progress_comparator,
1952  NULL);
1953 
1954  for (i = 0; i < num_spaces; i++)
1955  {
1956  CkptTsStatus *ts_stat = &per_ts_stat[i];
1957 
1958  ts_stat->progress_slice = (float8) num_to_scan / ts_stat->num_to_scan;
1959 
1960  binaryheap_add_unordered(ts_heap, PointerGetDatum(ts_stat));
1961  }
1962 
1963  binaryheap_build(ts_heap);
1964 
1965  /*
1966  * Iterate through to-be-checkpointed buffers and write the ones (still)
1967  * marked with BM_CHECKPOINT_NEEDED. The writes are balanced between
1968  * tablespaces; otherwise the sorting would lead to only one tablespace
1969  * receiving writes at a time, making inefficient use of the hardware.
1970  */
1971  num_processed = 0;
1972  num_written = 0;
1973  while (!binaryheap_empty(ts_heap))
1974  {
1975  BufferDesc *bufHdr = NULL;
1976  CkptTsStatus *ts_stat = (CkptTsStatus *)
1977  DatumGetPointer(binaryheap_first(ts_heap));
1978 
1979  buf_id = CkptBufferIds[ts_stat->index].buf_id;
1980  Assert(buf_id != -1);
1981 
1982  bufHdr = GetBufferDescriptor(buf_id);
1983 
1984  num_processed++;
1985 
1986  /*
1987  * We don't need to acquire the lock here, because we're only looking
1988  * at a single bit. It's possible that someone else writes the buffer
1989  * and clears the flag right after we check, but that doesn't matter
1990  * since SyncOneBuffer will then do nothing. However, there is a
1991  * further race condition: it's conceivable that between the time we
1992  * examine the bit here and the time SyncOneBuffer acquires the lock,
1993  * someone else not only wrote the buffer but replaced it with another
1994  * page and dirtied it. In that improbable case, SyncOneBuffer will
1995  * write the buffer though we didn't need to. It doesn't seem worth
1996  * guarding against this, though.
1997  */
1998  if (pg_atomic_read_u32(&bufHdr->state) & BM_CHECKPOINT_NEEDED)
1999  {
2000  if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
2001  {
2002  TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
2003  BgWriterStats.m_buf_written_checkpoints++;
2004  num_written++;
2005  }
2006  }
2007 
2008  /*
2009  * Measure progress independent of actually having to flush the buffer
2010  * - otherwise the writes become unbalanced.
2011  */
2012  ts_stat->progress += ts_stat->progress_slice;
2013  ts_stat->num_scanned++;
2014  ts_stat->index++;
2015 
2016  /* Have all the buffers from the tablespace been processed? */
2017  if (ts_stat->num_scanned == ts_stat->num_to_scan)
2018  {
2019  binaryheap_remove_first(ts_heap);
2020  }
2021  else
2022  {
2023  /* update heap with the new progress */
2024  binaryheap_replace_first(ts_heap, PointerGetDatum(ts_stat));
2025  }
2026 
2027  /*
2028  * Sleep to throttle our I/O rate.
2029  *
2030  * (This will check for barrier events even if it doesn't sleep.)
2031  */
2032  CheckpointWriteDelay(flags, (double) num_processed / num_to_scan);
2033  }
2034 
2035  /* issue all pending flushes */
2036  IssuePendingWritebacks(&wb_context);
2037 
2038  pfree(per_ts_stat);
2039  per_ts_stat = NULL;
2040  binaryheap_free(ts_heap);
2041 
2042  /*
2043  * Update checkpoint statistics. As noted above, this doesn't include
2044  * buffers written by other backends or bgwriter scan.
2045  */
2046  CheckpointStats.ckpt_bufs_written += num_written;
2047 
2048  TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_scan);
2049 }
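
To make the tablespace balancing above concrete, here is a small standalone sketch (not part of the PostgreSQL sources) of the progress_slice arithmetic: every tablespace's progress runs from 0 to the same total, so repeatedly picking the least-progressed tablespace interleaves writes in proportion to how many buffers each one holds. The real code keeps the per-tablespace state in a binary min-heap instead of the linear scan used here.

#include <stdio.h>

int
main(void)
{
	int			num_to_scan[2] = {6, 2};	/* buffers per "tablespace" */
	double		progress[2] = {0.0, 0.0};
	int			scanned[2] = {0, 0};
	int			total = num_to_scan[0] + num_to_scan[1];

	for (int written = 0; written < total; written++)
	{
		/* pick the tablespace with the least progress that still has work */
		int			pick = -1;

		for (int ts = 0; ts < 2; ts++)
		{
			if (scanned[ts] >= num_to_scan[ts])
				continue;
			if (pick < 0 || progress[ts] < progress[pick])
				pick = ts;
		}

		/* progress_slice = total / number of buffers in this tablespace */
		progress[pick] += (double) total / num_to_scan[pick];
		scanned[pick]++;
		printf("write #%d goes to tablespace %d\n", written + 1, pick);
	}
	return 0;
}
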
2050 
2051 /*
2052  * BgBufferSync -- Write out some dirty buffers in the pool.
2053  *
2054  * This is called periodically by the background writer process.
2055  *
2056  * Returns true if it's appropriate for the bgwriter process to go into
2057  * low-power hibernation mode. (This happens if the strategy clock sweep
2058  * has been "lapped" and no buffer allocations have occurred recently,
2059  * or if the bgwriter has been effectively disabled by setting
2060  * bgwriter_lru_maxpages to 0.)
2061  */
2062 bool
2063 BgBufferSync(WritebackContext *wb_context)
2064 {
2065  /* info obtained from freelist.c */
2066  int strategy_buf_id;
2067  uint32 strategy_passes;
2068  uint32 recent_alloc;
2069 
2070  /*
2071  * Information saved between calls so we can determine the strategy
2072  * point's advance rate and avoid scanning already-cleaned buffers.
2073  */
2074  static bool saved_info_valid = false;
2075  static int prev_strategy_buf_id;
2076  static uint32 prev_strategy_passes;
2077  static int next_to_clean;
2078  static uint32 next_passes;
2079 
2080  /* Moving averages of allocation rate and clean-buffer density */
2081  static float smoothed_alloc = 0;
2082  static float smoothed_density = 10.0;
2083 
2084  /* Potentially these could be tunables, but for now, not */
2085  float smoothing_samples = 16;
2086  float scan_whole_pool_milliseconds = 120000.0;
2087 
2088  /* Used to compute how far we scan ahead */
2089  long strategy_delta;
2090  int bufs_to_lap;
2091  int bufs_ahead;
2092  float scans_per_alloc;
2093  int reusable_buffers_est;
2094  int upcoming_alloc_est;
2095  int min_scan_buffers;
2096 
2097  /* Variables for the scanning loop proper */
2098  int num_to_scan;
2099  int num_written;
2100  int reusable_buffers;
2101 
2102  /* Variables for final smoothed_density update */
2103  long new_strategy_delta;
2104  uint32 new_recent_alloc;
2105 
2106  /*
2107  * Find out where the freelist clock sweep currently is, and how many
2108  * buffer allocations have happened since our last call.
2109  */
2110  strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc);
2111 
2112  /* Report buffer alloc counts to pgstat */
2113  BgWriterStats.m_buf_alloc += recent_alloc;
2114 
2115  /*
2116  * If we're not running the LRU scan, just stop after doing the stats
2117  * stuff. We mark the saved state invalid so that we can recover sanely
2118  * if LRU scan is turned back on later.
2119  */
2120  if (bgwriter_lru_maxpages <= 0)
2121  {
2122  saved_info_valid = false;
2123  return true;
2124  }
2125 
2126  /*
2127  * Compute strategy_delta = how many buffers have been scanned by the
2128  * clock sweep since last time. If first time through, assume none. Then
2129  * see if we are still ahead of the clock sweep, and if so, how many
2130  * buffers we could scan before we'd catch up with it and "lap" it. Note:
2131  * the weird-looking coding of the xxx_passes comparisons is to avoid bogus
2132  * behavior when the passes counts wrap around.
2133  */
2134  if (saved_info_valid)
2135  {
2136  int32 passes_delta = strategy_passes - prev_strategy_passes;
2137 
2138  strategy_delta = strategy_buf_id - prev_strategy_buf_id;
2139  strategy_delta += (long) passes_delta * NBuffers;
2140 
2141  Assert(strategy_delta >= 0);
2142 
2143  if ((int32) (next_passes - strategy_passes) > 0)
2144  {
2145  /* we're one pass ahead of the strategy point */
2146  bufs_to_lap = strategy_buf_id - next_to_clean;
2147 #ifdef BGW_DEBUG
2148  elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
2149  next_passes, next_to_clean,
2150  strategy_passes, strategy_buf_id,
2151  strategy_delta, bufs_to_lap);
2152 #endif
2153  }
2154  else if (next_passes == strategy_passes &&
2155  next_to_clean >= strategy_buf_id)
2156  {
2157  /* on same pass, but ahead or at least not behind */
2158  bufs_to_lap = NBuffers - (next_to_clean - strategy_buf_id);
2159 #ifdef BGW_DEBUG
2160  elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
2161  next_passes, next_to_clean,
2162  strategy_passes, strategy_buf_id,
2163  strategy_delta, bufs_to_lap);
2164 #endif
2165  }
2166  else
2167  {
2168  /*
2169  * We're behind, so skip forward to the strategy point and start
2170  * cleaning from there.
2171  */
2172 #ifdef BGW_DEBUG
2173  elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
2174  next_passes, next_to_clean,
2175  strategy_passes, strategy_buf_id,
2176  strategy_delta);
2177 #endif
2178  next_to_clean = strategy_buf_id;
2179  next_passes = strategy_passes;
2180  bufs_to_lap = NBuffers;
2181  }
2182  }
2183  else
2184  {
2185  /*
2186  * Initializing at startup or after LRU scanning had been off. Always
2187  * start at the strategy point.
2188  */
2189 #ifdef BGW_DEBUG
2190  elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
2191  strategy_passes, strategy_buf_id);
2192 #endif
2193  strategy_delta = 0;
2194  next_to_clean = strategy_buf_id;
2195  next_passes = strategy_passes;
2196  bufs_to_lap = NBuffers;
2197  }
2198 
2199  /* Update saved info for next time */
2200  prev_strategy_buf_id = strategy_buf_id;
2201  prev_strategy_passes = strategy_passes;
2202  saved_info_valid = true;
2203 
2204  /*
2205  * Compute how many buffers had to be scanned for each new allocation, ie,
2206  * 1/density of reusable buffers, and track a moving average of that.
2207  *
2208  * If the strategy point didn't move, we don't update the density estimate
2209  */
2210  if (strategy_delta > 0 && recent_alloc > 0)
2211  {
2212  scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
2213  smoothed_density += (scans_per_alloc - smoothed_density) /
2214  smoothing_samples;
2215  }
2216 
2217  /*
2218  * Estimate how many reusable buffers there are between the current
2219  * strategy point and where we've scanned ahead to, based on the smoothed
2220  * density estimate.
2221  */
2222  bufs_ahead = NBuffers - bufs_to_lap;
2223  reusable_buffers_est = (float) bufs_ahead / smoothed_density;
2224 
2225  /*
2226  * Track a moving average of recent buffer allocations. Here, rather than
2227  * a true average we want a fast-attack, slow-decline behavior: we
2228  * immediately follow any increase.
2229  */
2230  if (smoothed_alloc <= (float) recent_alloc)
2231  smoothed_alloc = recent_alloc;
2232  else
2233  smoothed_alloc += ((float) recent_alloc - smoothed_alloc) /
2234  smoothing_samples;
2235 
2236  /* Scale the estimate by a GUC to allow more aggressive tuning. */
2237  upcoming_alloc_est = (int) (smoothed_alloc * bgwriter_lru_multiplier);
2238 
2239  /*
2240  * If recent_alloc remains at zero for many cycles, smoothed_alloc will
2241  * eventually underflow to zero, and the underflows produce annoying
2242  * kernel warnings on some platforms. Once upcoming_alloc_est has gone to
2243  * zero, there's no point in tracking smaller and smaller values of
2244  * smoothed_alloc, so just reset it to exactly zero to avoid this
2245  * syndrome. It will pop back up as soon as recent_alloc increases.
2246  */
2247  if (upcoming_alloc_est == 0)
2248  smoothed_alloc = 0;
2249 
2250  /*
2251  * Even in cases where there's been little or no buffer allocation
2252  * activity, we want to make a small amount of progress through the buffer
2253  * cache so that as many reusable buffers as possible are clean after an
2254  * idle period.
2255  *
2256  * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many times
2257  * the BGW will be called during the scan_whole_pool time; slice the
2258  * buffer pool into that many sections.
2259  */
2260  min_scan_buffers = (int) (NBuffers / (scan_whole_pool_milliseconds / BgWriterDelay));
2261 
2262  if (upcoming_alloc_est < (min_scan_buffers + reusable_buffers_est))
2263  {
2264 #ifdef BGW_DEBUG
2265  elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
2266  upcoming_alloc_est, min_scan_buffers, reusable_buffers_est);
2267 #endif
2268  upcoming_alloc_est = min_scan_buffers + reusable_buffers_est;
2269  }
2270 
2271  /*
2272  * Now write out dirty reusable buffers, working forward from the
2273  * next_to_clean point, until we have lapped the strategy scan, or cleaned
2274  * enough buffers to match our estimate of the next cycle's allocation
2275  * requirements, or hit the bgwriter_lru_maxpages limit.
2276  */
2277 
2278  /* Make sure we can handle the pin inside SyncOneBuffer */
2279  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
2280 
2281  num_to_scan = bufs_to_lap;
2282  num_written = 0;
2283  reusable_buffers = reusable_buffers_est;
2284 
2285  /* Execute the LRU scan */
2286  while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
2287  {
2288  int sync_state = SyncOneBuffer(next_to_clean, true,
2289  wb_context);
2290 
2291  if (++next_to_clean >= NBuffers)
2292  {
2293  next_to_clean = 0;
2294  next_passes++;
2295  }
2296  num_to_scan--;
2297 
2298  if (sync_state & BUF_WRITTEN)
2299  {
2300  reusable_buffers++;
2301  if (++num_written >= bgwriter_lru_maxpages)
2302  {
2303  BgWriterStats.m_maxwritten_clean++;
2304  break;
2305  }
2306  }
2307  else if (sync_state & BUF_REUSABLE)
2308  reusable_buffers++;
2309  }
2310 
2311  BgWriterStats.m_buf_written_clean += num_written;
2312 
2313 #ifdef BGW_DEBUG
2314  elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
2315  recent_alloc, smoothed_alloc, strategy_delta, bufs_ahead,
2316  smoothed_density, reusable_buffers_est, upcoming_alloc_est,
2317  bufs_to_lap - num_to_scan,
2318  num_written,
2319  reusable_buffers - reusable_buffers_est);
2320 #endif
2321 
2322  /*
2323  * Consider the above scan as being like a new allocation scan.
2324  * Characterize its density and update the smoothed one based on it. This
2325  * effectively halves the moving average period in cases where both the
2326  * strategy and the background writer are doing some useful scanning,
2327  * which is helpful because a long memory isn't as desirable on the
2328  * density estimates.
2329  */
2330  new_strategy_delta = bufs_to_lap - num_to_scan;
2331  new_recent_alloc = reusable_buffers - reusable_buffers_est;
2332  if (new_strategy_delta > 0 && new_recent_alloc > 0)
2333  {
2334  scans_per_alloc = (float) new_strategy_delta / (float) new_recent_alloc;
2335  smoothed_density += (scans_per_alloc - smoothed_density) /
2336  smoothing_samples;
2337 
2338 #ifdef BGW_DEBUG
2339  elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f",
2340  new_recent_alloc, new_strategy_delta,
2341  scans_per_alloc, smoothed_density);
2342 #endif
2343  }
2344 
2345  /* Return true if OK to hibernate */
2346  return (bufs_to_lap == 0 && recent_alloc == 0);
2347 }
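
The "fast-attack, slow-decline" smoothing of recent allocations described above can be illustrated with a short standalone sketch (not part of the PostgreSQL sources); the constants and the input series below are invented for the example.

#include <stdio.h>

int
main(void)
{
	const float smoothing_samples = 16;
	float		smoothed_alloc = 0;
	unsigned	recent_alloc[] = {0, 400, 0, 0, 0, 0, 0, 0};

	for (int i = 0; i < 8; i++)
	{
		if (smoothed_alloc <= (float) recent_alloc[i])
			smoothed_alloc = recent_alloc[i];	/* follow any increase at once */
		else
			smoothed_alloc += ((float) recent_alloc[i] - smoothed_alloc) /
				smoothing_samples;				/* decay over ~16 cycles */

		printf("cycle %d: recent=%u smoothed=%.1f\n",
			   i, recent_alloc[i], smoothed_alloc);
	}
	return 0;
}
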
2348 
2349 /*
2350  * SyncOneBuffer -- process a single buffer during syncing.
2351  *
2352  * If skip_recently_used is true, we don't write currently-pinned buffers, nor
2353  * buffers marked recently used, as these are not replacement candidates.
2354  *
2355  * Returns a bitmask containing the following flag bits:
2356  * BUF_WRITTEN: we wrote the buffer.
2357  * BUF_REUSABLE: buffer is available for replacement, ie, it has
2358  * pin count 0 and usage count 0.
2359  *
2360  * (BUF_WRITTEN could be set in error if FlushBuffer finds the buffer clean
2361  * after locking it, but we don't care all that much.)
2362  *
2363  * Note: caller must have done ResourceOwnerEnlargeBuffers.
2364  */
2365 static int
2366 SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
2367 {
2368  BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
2369  int result = 0;
2370  uint32 buf_state;
2371  BufferTag tag;
2372 
2373  ReservePrivateRefCountEntry();
2374 
2375  /*
2376  * Check whether buffer needs writing.
2377  *
2378  * We can make this check without taking the buffer content lock so long
2379  * as we mark pages dirty in access methods *before* logging changes with
2380  * XLogInsert(): if someone marks the buffer dirty just after our check we
2381  * don't worry, because our checkpoint.redo points before the log record for
2382  * the upcoming changes, so we are not required to write such a dirty buffer.
2383  */
2384  buf_state = LockBufHdr(bufHdr);
2385 
2386  if (BUF_STATE_GET_REFCOUNT(buf_state) == 0 &&
2387  BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
2388  {
2389  result |= BUF_REUSABLE;
2390  }
2391  else if (skip_recently_used)
2392  {
2393  /* Caller told us not to write recently-used buffers */
2394  UnlockBufHdr(bufHdr, buf_state);
2395  return result;
2396  }
2397 
2398  if (!(buf_state & BM_VALID) || !(buf_state & BM_DIRTY))
2399  {
2400  /* It's clean, so nothing to do */
2401  UnlockBufHdr(bufHdr, buf_state);
2402  return result;
2403  }
2404 
2405  /*
2406  * Pin it, share-lock it, write it. (FlushBuffer will do nothing if the
2407  * buffer is clean by the time we've locked it.)
2408  */
2409  PinBuffer_Locked(bufHdr);
2411 
2412  FlushBuffer(bufHdr, NULL);
2413 
2415 
2416  tag = bufHdr->tag;
2417 
2418  UnpinBuffer(bufHdr, true);
2419 
2420  ScheduleBufferTagForWriteback(wb_context, &tag);
2421 
2422  return result | BUF_WRITTEN;
2423 }
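
Since SyncOneBuffer returns a bitmask rather than a single status code, callers test the flag bits individually, as BufferSync and BgBufferSync do above. A minimal standalone sketch (not part of the PostgreSQL sources), with flag values mirroring the BUF_WRITTEN / BUF_REUSABLE definitions near the top of this file:

#include <stdio.h>

#define SKETCH_BUF_WRITTEN	0x01
#define SKETCH_BUF_REUSABLE 0x02

int
main(void)
{
	int			sync_state = SKETCH_BUF_WRITTEN | SKETCH_BUF_REUSABLE;

	if (sync_state & SKETCH_BUF_WRITTEN)
		printf("buffer was written out\n");
	if (sync_state & SKETCH_BUF_REUSABLE)
		printf("buffer is a replacement candidate\n");
	return 0;
}
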
2424 
2425 /*
2426  * AtEOXact_Buffers - clean up at end of transaction.
2427  *
2428  * As of PostgreSQL 8.0, buffer pins should get released by the
2429  * ResourceOwner mechanism. This routine is just a debugging
2430  * cross-check that no pins remain.
2431  */
2432 void
2433 AtEOXact_Buffers(bool isCommit)
2434 {
2435  CheckForBufferLeaks();
2436 
2437  AtEOXact_LocalBuffers(isCommit);
2438 
2439  Assert(PrivateRefCountOverflowed == 0);
2440 }
2441 
2442 /*
2443  * Initialize access to shared buffer pool
2444  *
2445  * This is called during backend startup (whether standalone or under the
2446  * postmaster). It sets up for this backend's access to the already-existing
2447  * buffer pool.
2448  *
2449  * NB: this is called before InitProcess(), so we do not have a PGPROC and
2450  * cannot do LWLockAcquire; hence we can't actually access stuff in
2451  * shared memory yet. We are only initializing local data here.
2452  * (See also InitBufferPoolBackend)
2453  */
2454 void
2455 InitBufferPoolAccess(void)
2456 {
2457  HASHCTL hash_ctl;
2458 
2459  memset(&PrivateRefCountArray, 0, sizeof(PrivateRefCountArray));
2460 
2461  MemSet(&hash_ctl, 0, sizeof(hash_ctl));
2462  hash_ctl.keysize = sizeof(int32);
2463  hash_ctl.entrysize = sizeof(PrivateRefCountEntry);
2464 
2465  PrivateRefCountHash = hash_create("PrivateRefCount", 100, &hash_ctl,
2466  HASH_ELEM | HASH_BLOBS);
2467 }
2468 
2469 /*
2470  * InitBufferPoolBackend --- second-stage initialization of a new backend
2471  *
2472  * This is called after we have acquired a PGPROC and so can safely get
2473  * LWLocks. We don't currently need to do anything at this stage ...
2474  * except register a shmem-exit callback. AtProcExit_Buffers needs LWLock
2475  * access, and thereby has to be called at the corresponding phase of
2476  * backend shutdown.
2477  */
2478 void
2479 InitBufferPoolBackend(void)
2480 {
2481  on_shmem_exit(AtProcExit_Buffers, 0);
2482 }
2483 
2484 /*
2485  * During backend exit, ensure that we released all shared-buffer locks and
2486  * assert that we have no remaining pins.
2487  */
2488 static void
2489 AtProcExit_Buffers(int code, Datum arg)
2490 {
2491  AbortBufferIO();
2492  UnlockBuffers();
2493 
2494  CheckForBufferLeaks();
2495 
2496  /* localbuf.c needs a chance too */
2497  AtProcExit_LocalBuffers();
2498 }
2499 
2500 /*
2501  * CheckForBufferLeaks - ensure this backend holds no buffer pins
2502  *
2503  * As of PostgreSQL 8.0, buffer pins should get released by the
2504  * ResourceOwner mechanism. This routine is just a debugging
2505  * cross-check that no pins remain.
2506  */
2507 static void
2508 CheckForBufferLeaks(void)
2509 {
2510 #ifdef USE_ASSERT_CHECKING
2511  int RefCountErrors = 0;
2512  PrivateRefCountEntry *res;
2513  int i;
2514 
2515  /* check the array */
2516  for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
2517  {
2518  res = &PrivateRefCountArray[i];
2519 
2520  if (res->buffer != InvalidBuffer)
2521  {
2522  PrintBufferLeakWarning(res->buffer);
2523  RefCountErrors++;
2524  }
2525  }
2526 
2527  /* if necessary search the hash */
2528  if (PrivateRefCountOverflowed)
2529  {
2530  HASH_SEQ_STATUS hstat;
2531 
2532  hash_seq_init(&hstat, PrivateRefCountHash);
2533  while ((res = (PrivateRefCountEntry *) hash_seq_search(&hstat)) != NULL)
2534  {
2535  PrintBufferLeakWarning(res->buffer);
2536  RefCountErrors++;
2537  }
2538 
2539  }
2540 
2541  Assert(RefCountErrors == 0);
2542 #endif
2543 }
2544 
2545 /*
2546  * Helper routine to issue warnings when a buffer is unexpectedly pinned
2547  */
2548 void
2549 PrintBufferLeakWarning(Buffer buffer)
2550 {
2551  BufferDesc *buf;
2552  int32 loccount;
2553  char *path;
2554  BackendId backend;
2555  uint32 buf_state;
2556 
2557  Assert(BufferIsValid(buffer));
2558  if (BufferIsLocal(buffer))
2559  {
2560  buf = GetLocalBufferDescriptor(-buffer - 1);
2561  loccount = LocalRefCount[-buffer - 1];
2562  backend = MyBackendId;
2563  }
2564  else
2565  {
2566  buf = GetBufferDescriptor(buffer - 1);
2567  loccount = GetPrivateRefCount(buffer);
2568  backend = InvalidBackendId;
2569  }
2570 
2571  /* theoretically we should lock the bufhdr here */
2572  path = relpathbackend(buf->tag.rnode, backend, buf->tag.forkNum);
2573  buf_state = pg_atomic_read_u32(&buf->state);
2574  elog(WARNING,
2575  "buffer refcount leak: [%03d] "
2576  "(rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
2577  buffer, path,
2578  buf->tag.blockNum, buf_state & BUF_FLAG_MASK,
2579  BUF_STATE_GET_REFCOUNT(buf_state), loccount);
2580  pfree(path);
2581 }
2582 
2583 /*
2584  * CheckPointBuffers
2585  *
2586  * Flush all dirty blocks in buffer pool to disk at checkpoint time.
2587  *
2588  * Note: temporary relations do not participate in checkpoints, so they don't
2589  * need to be flushed.
2590  */
2591 void
2592 CheckPointBuffers(int flags)
2593 {
2594  TRACE_POSTGRESQL_BUFFER_CHECKPOINT_START(flags);
2595  CheckpointStats.ckpt_write_t = GetCurrentTimestamp();
2596  BufferSync(flags);
2597  CheckpointStats.ckpt_sync_t = GetCurrentTimestamp();
2598  TRACE_POSTGRESQL_BUFFER_CHECKPOINT_SYNC_START();
2599  ProcessSyncRequests();
2600  CheckpointStats.ckpt_sync_end_t = GetCurrentTimestamp();
2601  TRACE_POSTGRESQL_BUFFER_CHECKPOINT_DONE();
2602 }
2603 
2604 
2605 /*
2606  * Do whatever is needed to prepare for commit at the bufmgr and smgr levels
2607  */
2608 void
2609 BufmgrCommit(void)
2610 {
2611  /* Nothing to do in bufmgr anymore... */
2612 }
2613 
2614 /*
2615  * BufferGetBlockNumber
2616  * Returns the block number associated with a buffer.
2617  *
2618  * Note:
2619  * Assumes that the buffer is valid and pinned, else the
2620  * value may be obsolete immediately...
2621  */
2622 BlockNumber
2623 BufferGetBlockNumber(Buffer buffer)
2624 {
2625  BufferDesc *bufHdr;
2626 
2627  Assert(BufferIsPinned(buffer));
2628 
2629  if (BufferIsLocal(buffer))
2630  bufHdr = GetLocalBufferDescriptor(-buffer - 1);
2631  else
2632  bufHdr = GetBufferDescriptor(buffer - 1);
2633 
2634  /* pinned, so OK to read tag without spinlock */
2635  return bufHdr->tag.blockNum;
2636 }
2637 
2638 /*
2639  * BufferGetTag
2640  * Returns the relfilenode, fork number and block number associated with
2641  * a buffer.
2642  */
2643 void
2644 BufferGetTag(Buffer buffer, RelFileNode *rnode, ForkNumber *forknum,
2645  BlockNumber *blknum)
2646 {
2647  BufferDesc *bufHdr;
2648 
2649  /* Do the same checks as BufferGetBlockNumber. */
2650  Assert(BufferIsPinned(buffer));
2651 
2652  if (BufferIsLocal(buffer))
2653  bufHdr = GetLocalBufferDescriptor(-buffer - 1);
2654  else
2655  bufHdr = GetBufferDescriptor(buffer - 1);
2656 
2657  /* pinned, so OK to read tag without spinlock */
2658  *rnode = bufHdr->tag.rnode;
2659  *forknum = bufHdr->tag.forkNum;
2660  *blknum = bufHdr->tag.blockNum;
2661 }
2662 
2663 /*
2664  * FlushBuffer
2665  * Physically write out a shared buffer.
2666  *
2667  * NOTE: this actually just passes the buffer contents to the kernel; the
2668  * real write to disk won't happen until the kernel feels like it. This
2669  * is okay from our point of view since we can redo the changes from WAL.
2670  * However, we will need to force the changes to disk via fsync before
2671  * we can checkpoint WAL.
2672  *
2673  * The caller must hold a pin on the buffer and have share-locked the
2674  * buffer contents. (Note: a share-lock does not prevent updates of
2675  * hint bits in the buffer, so the page could change while the write
2676  * is in progress, but we assume that that will not invalidate the data
2677  * written.)
2678  *
2679  * If the caller has an smgr reference for the buffer's relation, pass it
2680  * as the second parameter. If not, pass NULL.
2681  */
2682 static void
2683 FlushBuffer(BufferDesc *buf, SMgrRelation reln)
2684 {
2685  XLogRecPtr recptr;
2686  ErrorContextCallback errcallback;
2687  instr_time io_start,
2688  io_time;
2689  Block bufBlock;
2690  char *bufToWrite;
2691  uint32 buf_state;
2692 
2693  /*
2694  * Acquire the buffer's io_in_progress lock. If StartBufferIO returns
2695  * false, then someone else flushed the buffer before we could, so we need
2696  * not do anything.
2697  */
2698  if (!StartBufferIO(buf, false))
2699  return;
2700 
2701  /* Setup error traceback support for ereport() */
2702  errcallback.callback = shared_buffer_write_error_callback;
2703  errcallback.arg = (void *) buf;
2704  errcallback.previous = error_context_stack;
2705  error_context_stack = &errcallback;
2706 
2707  /* Find smgr relation for buffer */
2708  if (reln == NULL)
2709  reln = smgropen(buf->tag.rnode, InvalidBackendId);
2710 
2711  TRACE_POSTGRESQL_BUFFER_FLUSH_START(buf->tag.forkNum,
2712  buf->tag.blockNum,
2713  reln->smgr_rnode.node.spcNode,
2714  reln->smgr_rnode.node.dbNode,
2715  reln->smgr_rnode.node.relNode);
2716 
2717  buf_state = LockBufHdr(buf);
2718 
2719  /*
2720  * Run PageGetLSN while holding header lock, since we don't have the
2721  * buffer locked exclusively in all cases.
2722  */
2723  recptr = BufferGetLSN(buf);
2724 
2725  /* To check if block content changes while flushing. - vadim 01/17/97 */
2726  buf_state &= ~BM_JUST_DIRTIED;
2727  UnlockBufHdr(buf, buf_state);
2728 
2729  /*
2730  * Force XLOG flush up to buffer's LSN. This implements the basic WAL
2731  * rule that log updates must hit disk before any of the data-file changes
2732  * they describe do.
2733  *
2734  * However, this rule does not apply to unlogged relations, which will be
2735  * lost after a crash anyway. Most unlogged relation pages do not bear
2736  * LSNs since we never emit WAL records for them, and therefore flushing
2737  * up through the buffer LSN would be useless, but harmless. However,
2738  * GiST indexes use LSNs internally to track page-splits, and therefore
2739  * unlogged GiST pages bear "fake" LSNs generated by
2740  * GetFakeLSNForUnloggedRel. It is unlikely but possible that the fake
2741  * LSN counter could advance past the WAL insertion point; and if it did
2742  * happen, attempting to flush WAL through that location would fail, with
2743  * disastrous system-wide consequences. To make sure that can't happen,
2744  * skip the flush if the buffer isn't permanent.
2745  */
2746  if (buf_state & BM_PERMANENT)
2747  XLogFlush(recptr);
2748 
2749  /*
2750  * Now it's safe to write buffer to disk. Note that no one else should
2751  * have been able to write it while we were busy with log flushing because
2752  * we have the io_in_progress lock.
2753  */
2754  bufBlock = BufHdrGetBlock(buf);
2755 
2756  /*
2757  * Update page checksum if desired. Since we have only shared lock on the
2758  * buffer, other processes might be updating hint bits in it, so we must
2759  * copy the page to private storage if we do checksumming.
2760  */
2761  bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
2762 
2763  if (track_io_timing)
2764  INSTR_TIME_SET_CURRENT(io_start);
2765 
2766  /*
2767  * bufToWrite is either the shared buffer or a copy, as appropriate.
2768  */
2769  smgrwrite(reln,
2770  buf->tag.forkNum,
2771  buf->tag.blockNum,
2772  bufToWrite,
2773  false);
2774 
2775  if (track_io_timing)
2776  {
2777  INSTR_TIME_SET_CURRENT(io_time);
2778  INSTR_TIME_SUBTRACT(io_time, io_start);
2779  pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time));
2780  INSTR_TIME_ADD(pgBufferUsage.blk_write_time, io_time);
2781  }
2782 
2783  pgBufferUsage.shared_blks_written++;
2784 
2785  /*
2786  * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
2787  * end the io_in_progress state.
2788  */
2789  TerminateBufferIO(buf, true, 0);
2790 
2791  TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(buf->tag.forkNum,
2792  buf->tag.blockNum,
2793  reln->smgr_rnode.node.spcNode,
2794  reln->smgr_rnode.node.dbNode,
2795  reln->smgr_rnode.node.relNode);
2796 
2797  /* Pop the error context stack */
2798  error_context_stack = errcallback.previous;
2799 }
2800 
2801 /*
2802  * RelationGetNumberOfBlocksInFork
2803  * Determines the current number of pages in the specified relation fork.
2804  *
2805  * Note that the accuracy of the result will depend on the details of the
2806  * relation's storage. For builtin AMs it'll be accurate, but for external AMs
2807  * it might not be.
2808  */
2809 BlockNumber
2810 RelationGetNumberOfBlocksInFork(Relation relation, ForkNumber forkNum)
2811 {
2812  switch (relation->rd_rel->relkind)
2813  {
2814  case RELKIND_SEQUENCE:
2815  case RELKIND_INDEX:
2816  case RELKIND_PARTITIONED_INDEX:
2817  /* Open it at the smgr level if not already done */
2818  RelationOpenSmgr(relation);
2819 
2820  return smgrnblocks(relation->rd_smgr, forkNum);
2821 
2822  case RELKIND_RELATION:
2823  case RELKIND_TOASTVALUE:
2824  case RELKIND_MATVIEW:
2825  {
2826  /*
2827  * Not every table AM uses BLCKSZ wide fixed size blocks.
2828  * Therefore tableam returns the size in bytes - but for the
2829  * purpose of this routine, we want the number of blocks.
2830  * Therefore divide, rounding up.
2831  */
2832  uint64 szbytes;
2833 
2834  szbytes = table_relation_size(relation, forkNum);
2835 
2836  return (szbytes + (BLCKSZ - 1)) / BLCKSZ;
2837  }
2838  case RELKIND_VIEW:
2839  case RELKIND_COMPOSITE_TYPE:
2840  case RELKIND_FOREIGN_TABLE:
2841  case RELKIND_PARTITIONED_TABLE:
2842  default:
2843  Assert(false);
2844  break;
2845  }
2846 
2847  return 0; /* keep compiler quiet */
2848 }
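
The bytes-to-blocks conversion above is a plain round-up division; a tiny standalone sketch (not part of the PostgreSQL sources), with an assumed 8kB block size:

#include <stdio.h>
#include <stdint.h>

#define SKETCH_BLCKSZ 8192

int
main(void)
{
	uint64_t	sizes[] = {0, 1, 8192, 8193, 24576};

	for (int i = 0; i < 5; i++)
	{
		/* round up: a partial trailing block still counts as a block */
		uint64_t	nblocks = (sizes[i] + (SKETCH_BLCKSZ - 1)) / SKETCH_BLCKSZ;

		printf("%llu bytes -> %llu blocks\n",
			   (unsigned long long) sizes[i], (unsigned long long) nblocks);
	}
	return 0;
}
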
2849 
2850 /*
2851  * BufferIsPermanent
2852  * Determines whether a buffer will potentially still be around after
2853  * a crash. Caller must hold a buffer pin.
2854  */
2855 bool
2856 BufferIsPermanent(Buffer buffer)
2857 {
2858  BufferDesc *bufHdr;
2859 
2860  /* Local buffers are used only for temp relations. */
2861  if (BufferIsLocal(buffer))
2862  return false;
2863 
2864  /* Make sure we've got a real buffer, and that we hold a pin on it. */
2865  Assert(BufferIsValid(buffer));
2866  Assert(BufferIsPinned(buffer));
2867 
2868  /*
2869  * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
2870  * need not bother with the buffer header spinlock. Even if someone else
2871  * changes the buffer header state while we're doing this, the state is
2872  * changed atomically, so we'll read the old value or the new value, but
2873  * not random garbage.
2874  */
2875  bufHdr = GetBufferDescriptor(buffer - 1);
2876  return (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT) != 0;
2877 }
2878 
2879 /*
2880  * BufferGetLSNAtomic
2881  * Retrieves the LSN of the buffer atomically using a buffer header lock.
2882  * This is necessary for some callers who may not have an exclusive lock
2883  * on the buffer.
2884  */
2885 XLogRecPtr
2886 BufferGetLSNAtomic(Buffer buffer)
2887 {
2888  BufferDesc *bufHdr = GetBufferDescriptor(buffer - 1);
2889  char *page = BufferGetPage(buffer);
2890  XLogRecPtr lsn;
2891  uint32 buf_state;
2892 
2893  /*
2894  * If we don't need locking for correctness, fastpath out.
2895  */
2896  if (!XLogHintBitIsNeeded() || BufferIsLocal(buffer))
2897  return PageGetLSN(page);
2898 
2899  /* Make sure we've got a real buffer, and that we hold a pin on it. */
2900  Assert(BufferIsValid(buffer));
2901  Assert(BufferIsPinned(buffer));
2902 
2903  buf_state = LockBufHdr(bufHdr);
2904  lsn = PageGetLSN(page);
2905  UnlockBufHdr(bufHdr, buf_state);
2906 
2907  return lsn;
2908 }
2909 
2910 /* ---------------------------------------------------------------------
2911  * DropRelFileNodeBuffers
2912  *
2913  * This function removes from the buffer pool all the pages of the
2914  * specified relation forks that have block numbers >= firstDelBlock.
2915  * (In particular, with firstDelBlock = 0, all pages are removed.)
2916  * Dirty pages are simply dropped, without bothering to write them
2917  * out first. Therefore, this is NOT rollback-able, and so should be
2918  * used only with extreme caution!
2919  *
2920  * Currently, this is called only from smgr.c when the underlying file
2921  * is about to be deleted or truncated (firstDelBlock is needed for
2922  * the truncation case). The data in the affected pages would therefore
2923  * be deleted momentarily anyway, and there is no point in writing it.
2924  * It is the responsibility of higher-level code to ensure that the
2925  * deletion or truncation does not lose any data that could be needed
2926  * later. It is also the responsibility of higher-level code to ensure
2927  * that no other process could be trying to load more pages of the
2928  * relation into buffers.
2929  *
2930  * XXX currently it sequentially searches the buffer pool, should be
2931  * changed to more clever ways of searching. However, this routine
2932  * is used only in code paths that aren't very performance-critical,
2933  * and we shouldn't slow down the hot paths to make it faster ...
2934  * --------------------------------------------------------------------
2935  */
2936 void
2937 DropRelFileNodeBuffers(RelFileNodeBackend rnode, ForkNumber *forkNum,
2938  int nforks, BlockNumber *firstDelBlock)
2939 {
2940  int i;
2941  int j;
2942 
2943  /* If it's a local relation, it's localbuf.c's problem. */
2944  if (RelFileNodeBackendIsTemp(rnode))
2945  {
2946  if (rnode.backend == MyBackendId)
2947  {
2948  for (j = 0; j < nforks; j++)
2949  DropRelFileNodeLocalBuffers(rnode.node, forkNum[j],
2950  firstDelBlock[j]);
2951  }
2952  return;
2953  }
2954 
2955  for (i = 0; i < NBuffers; i++)
2956  {
2957  BufferDesc *bufHdr = GetBufferDescriptor(i);
2958  uint32 buf_state;
2959 
2960  /*
2961  * We can make this a tad faster by prechecking the buffer tag before
2962  * we attempt to lock the buffer; this saves a lot of lock
2963  * acquisitions in typical cases. It should be safe because the
2964  * caller must have AccessExclusiveLock on the relation, or some other
2965  * reason to be certain that no one is loading new pages of the rel
2966  * into the buffer pool. (Otherwise we might well miss such pages
2967  * entirely.) Therefore, while the tag might be changing while we
2968  * look at it, it can't be changing *to* a value we care about, only
2969  * *away* from such a value. So false negatives are impossible, and
2970  * false positives are safe because we'll recheck after getting the
2971  * buffer lock.
2972  *
2973  * We could check forkNum and blockNum as well as the rnode, but the
2974  * incremental win from doing so seems small.
2975  */
2976  if (!RelFileNodeEquals(bufHdr->tag.rnode, rnode.node))
2977  continue;
2978 
2979  buf_state = LockBufHdr(bufHdr);
2980 
2981  for (j = 0; j < nforks; j++)
2982  {
2983  if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node) &&
2984  bufHdr->tag.forkNum == forkNum[j] &&
2985  bufHdr->tag.blockNum >= firstDelBlock[j])
2986  {
2987  InvalidateBuffer(bufHdr); /* releases spinlock */
2988  break;
2989  }
2990  }
2991  if (j >= nforks)
2992  UnlockBufHdr(bufHdr, buf_state);
2993  }
2994 }
2995 
2996 /* ---------------------------------------------------------------------
2997  * DropRelFileNodesAllBuffers
2998  *
2999  * This function removes from the buffer pool all the pages of all
3000  * forks of the specified relations. It's equivalent to calling
3001  * DropRelFileNodeBuffers once per fork per relation with
3002  * firstDelBlock = 0.
3003  * --------------------------------------------------------------------
3004  */
3005 void
3006 DropRelFileNodesAllBuffers(RelFileNodeBackend *rnodes, int nnodes)
3007 {
3008  int i,
3009  n = 0;
3010  RelFileNode *nodes;
3011  bool use_bsearch;
3012 
3013  if (nnodes == 0)
3014  return;
3015 
3016  nodes = palloc(sizeof(RelFileNode) * nnodes); /* non-local relations */
3017 
3018  /* If it's a local relation, it's localbuf.c's problem. */
3019  for (i = 0; i < nnodes; i++)
3020  {
3021  if (RelFileNodeBackendIsTemp(rnodes[i]))
3022  {
3023  if (rnodes[i].backend == MyBackendId)
3024  DropRelFileNodeAllLocalBuffers(rnodes[i].node);
3025  }
3026  else
3027  nodes[n++] = rnodes[i].node;
3028  }
3029 
3030  /*
3031  * If there are no non-local relations, then we're done. Release the
3032  * memory and return.
3033  */
3034  if (n == 0)
3035  {
3036  pfree(nodes);
3037  return;
3038  }
3039 
3040  /*
3041  * For a small number of relations to drop, just use a simple walk-through to
3042  * save the bsearch overhead. The threshold to use is more of a guess than
3043  * an exactly determined value, as it depends on many factors (CPU and RAM
3044  * speeds, amount of shared buffers etc.).
3045  */
3046  use_bsearch = n > DROP_RELS_BSEARCH_THRESHOLD;
3047 
3048  /* sort the list of rnodes if necessary */
3049  if (use_bsearch)
3050  pg_qsort(nodes, n, sizeof(RelFileNode), rnode_comparator);
3051 
3052  for (i = 0; i < NBuffers; i++)
3053  {
3054  RelFileNode *rnode = NULL;
3055  BufferDesc *bufHdr = GetBufferDescriptor(i);
3056  uint32 buf_state;
3057 
3058  /*
3059  * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
3060  * and saves some cycles.
3061  */
3062 
3063  if (!use_bsearch)
3064  {
3065  int j;
3066 
3067  for (j = 0; j < n; j++)
3068  {
3069  if (RelFileNodeEquals(bufHdr->tag.rnode, nodes[j]))
3070  {
3071  rnode = &nodes[j];
3072  break;
3073  }
3074  }
3075  }
3076  else
3077  {
3078  rnode = bsearch((const void *) &(bufHdr->tag.rnode),
3079  nodes, n, sizeof(RelFileNode),
3080  rnode_comparator);
3081  }
3082 
3083  /* buffer doesn't belong to any of the given relfilenodes; skip it */
3084  if (rnode == NULL)
3085  continue;
3086 
3087  buf_state = LockBufHdr(bufHdr);
3088  if (RelFileNodeEquals(bufHdr->tag.rnode, (*rnode)))
3089  InvalidateBuffer(bufHdr); /* releases spinlock */
3090  else
3091  UnlockBufHdr(bufHdr, buf_state);
3092  }
3093 
3094  pfree(nodes);
3095 }
3096 
3097 /* ---------------------------------------------------------------------
3098  * DropDatabaseBuffers
3099  *
3100  * This function removes all the buffers in the buffer cache for a
3101  * particular database. Dirty pages are simply dropped, without
3102  * bothering to write them out first. This is used when we destroy a
3103  * database, to avoid trying to flush data to disk when the directory
3104  * tree no longer exists. Implementation is pretty similar to
3105  * DropRelFileNodeBuffers() which is for destroying just one relation.
3106  * --------------------------------------------------------------------
3107  */
3108 void
3109 DropDatabaseBuffers(Oid dbid)
3110 {
3111  int i;
3112 
3113  /*
3114  * We needn't consider local buffers, since by assumption the target
3115  * database isn't our own.
3116  */
3117 
3118  for (i = 0; i < NBuffers; i++)
3119  {
3120  BufferDesc *bufHdr = GetBufferDescriptor(i);
3121  uint32 buf_state;
3122 
3123  /*
3124  * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
3125  * and saves some cycles.
3126  */
3127  if (bufHdr->tag.rnode.dbNode != dbid)
3128  continue;
3129 
3130  buf_state = LockBufHdr(bufHdr);
3131  if (bufHdr->tag.rnode.dbNode == dbid)
3132  InvalidateBuffer(bufHdr); /* releases spinlock */
3133  else
3134  UnlockBufHdr(bufHdr, buf_state);
3135  }
3136 }
3137 
3138 /* -----------------------------------------------------------------
3139  * PrintBufferDescs
3140  *
3141  * this function prints all the buffer descriptors, for debugging
3142  * use only.
3143  * -----------------------------------------------------------------
3144  */
3145 #ifdef NOT_USED
3146 void
3147 PrintBufferDescs(void)
3148 {
3149  int i;
3150 
3151  for (i = 0; i < NBuffers; ++i)
3152  {
3153  BufferDesc *buf = GetBufferDescriptor(i);
3154  Buffer b = BufferDescriptorGetBuffer(buf);
3155 
3156  /* theoretically we should lock the bufhdr here */
3157  elog(LOG,
3158  "[%02d] (freeNext=%d, rel=%s, "
3159  "blockNum=%u, flags=0x%x, refcount=%u %d)",
3160  i, buf->freeNext,
3162  buf->tag.blockNum, buf->flags,
3163  buf->refcount, GetPrivateRefCount(b));
3164  }
3165 }
3166 #endif
3167 
3168 #ifdef NOT_USED
3169 void
3170 PrintPinnedBufs(void)
3171 {
3172  int i;
3173 
3174  for (i = 0; i < NBuffers; ++i)
3175  {
3176  BufferDesc *buf = GetBufferDescriptor(i);
3177  Buffer b = BufferDescriptorGetBuffer(buf);
3178 
3179  if (GetPrivateRefCount(b) > 0)
3180  {
3181  /* theoretically we should lock the bufhdr here */
3182  elog(LOG,
3183  "[%02d] (freeNext=%d, rel=%s, "
3184  "blockNum=%u, flags=0x%x, refcount=%u %d)",
3185  i, buf->freeNext,
3186  relpathperm(buf->tag.rnode, buf->tag.forkNum),
3187  buf->tag.blockNum, buf->flags,
3188  buf->refcount, GetPrivateRefCount(b));
3189  }
3190  }
3191 }
3192 #endif
3193 
3194 /* ---------------------------------------------------------------------
3195  * FlushRelationBuffers
3196  *
3197  * This function writes all dirty pages of a relation out to disk
3198  * (or more accurately, out to kernel disk buffers), ensuring that the
3199  * kernel has an up-to-date view of the relation.
3200  *
3201  * Generally, the caller should be holding AccessExclusiveLock on the
3202  * target relation to ensure that no other backend is busy dirtying
3203  * more blocks of the relation; the effects can't be expected to last
3204  * after the lock is released.
3205  *
3206  * XXX currently it sequentially searches the buffer pool, should be
3207  * changed to more clever ways of searching. This routine is not
3208  * used in any performance-critical code paths, so it's not worth
3209  * adding additional overhead to normal paths to make it go faster;
3210  * but see also DropRelFileNodeBuffers.
3211  * --------------------------------------------------------------------
3212  */
3213 void
3214 FlushRelationBuffers(Relation rel)
3215 {
3216  int i;
3217  BufferDesc *bufHdr;
3218 
3219  /* Open rel at the smgr level if not already done */
3220  RelationOpenSmgr(rel);
3221 
3222  if (RelationUsesLocalBuffers(rel))
3223  {
3224  for (i = 0; i < NLocBuffer; i++)
3225  {
3226  uint32 buf_state;
3227 
3228  bufHdr = GetLocalBufferDescriptor(i);
3229  if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
3230  ((buf_state = pg_atomic_read_u32(&bufHdr->state)) &
3231  (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
3232  {
3233  ErrorContextCallback errcallback;
3234  Page localpage;
3235 
3236  localpage = (char *) LocalBufHdrGetBlock(bufHdr);
3237 
3238  /* Setup error traceback support for ereport() */
3239  errcallback.callback = local_buffer_write_error_callback;
3240  errcallback.arg = (void *) bufHdr;
3241  errcallback.previous = error_context_stack;
3242  error_context_stack = &errcallback;
3243 
3244  PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
3245 
3246  smgrwrite(rel->rd_smgr,
3247  bufHdr->tag.forkNum,
3248  bufHdr->tag.blockNum,
3249  localpage,
3250  false);
3251 
3252  buf_state &= ~(BM_DIRTY | BM_JUST_DIRTIED);
3253  pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
3254 
3255  /* Pop the error context stack */
3256  error_context_stack = errcallback.previous;
3257  }
3258  }
3259 
3260  return;
3261  }
3262 
3263  /* Make sure we can handle the pin inside the loop */
3264  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
3265 
3266  for (i = 0; i < NBuffers; i++)
3267  {
3268  uint32 buf_state;
3269 
3270  bufHdr = GetBufferDescriptor(i);
3271 
3272  /*
3273  * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
3274  * and saves some cycles.
3275  */
3276  if (!RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node))
3277  continue;
3278 
3278 
3279  ReservePrivateRefCountEntry();
3280 
3281  buf_state = LockBufHdr(bufHdr);
3282  if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
3283  (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
3284  {
3285  PinBuffer_Locked(bufHdr);
3286  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
3287  FlushBuffer(bufHdr, rel->rd_smgr);
3288  LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
3289  UnpinBuffer(bufHdr, true);
3290  }
3291  else
3292  UnlockBufHdr(bufHdr, buf_state);
3293  }
3294 }
3295 
3296 /* ---------------------------------------------------------------------
3297  * FlushDatabaseBuffers
3298  *
3299  * This function writes all dirty pages of a database out to disk
3300  * (or more accurately, out to kernel disk buffers), ensuring that the
3301  * kernel has an up-to-date view of the database.
3302  *
3303  * Generally, the caller should be holding an appropriate lock to ensure
3304  * no other backend is active in the target database; otherwise more
3305  * pages could get dirtied.
3306  *
3307  * Note we don't worry about flushing any pages of temporary relations.
3308  * It's assumed these wouldn't be interesting.
3309  * --------------------------------------------------------------------
3310  */
3311 void
3312 FlushDatabaseBuffers(Oid dbid)
3313 {
3314  int i;
3315  BufferDesc *bufHdr;
3316 
3317  /* Make sure we can handle the pin inside the loop */
3318  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
3319 
3320  for (i = 0; i < NBuffers; i++)
3321  {
3322  uint32 buf_state;
3323 
3324  bufHdr = GetBufferDescriptor(i);
3325 
3326  /*
3327  * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
3328  * and saves some cycles.
3329  */
3330  if (bufHdr->tag.rnode.dbNode != dbid)
3331  continue;
3332 
3333  ReservePrivateRefCountEntry();
3334 
3335  buf_state = LockBufHdr(bufHdr);
3336  if (bufHdr->tag.rnode.dbNode == dbid &&
3337  (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
3338  {
3339  PinBuffer_Locked(bufHdr);
3340  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
3341  FlushBuffer(bufHdr, NULL);
3342  LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
3343  UnpinBuffer(bufHdr, true);
3344  }
3345  else
3346  UnlockBufHdr(bufHdr, buf_state);
3347  }
3348 }
3349 
3350 /*
3351  * Flush a buffer that was previously pinned and locked (in either share or
3352  * exclusive mode) to the OS.
3353  */
3354 void
3355 FlushOneBuffer(Buffer buffer)
3356 {
3357  BufferDesc *bufHdr;
3358 
3359  /* currently not needed, but no fundamental reason not to support */
3360  Assert(!BufferIsLocal(buffer));
3361 
3362  Assert(BufferIsPinned(buffer));
3363 
3364  bufHdr = GetBufferDescriptor(buffer - 1);
3365 
3366  Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
3367 
3368  FlushBuffer(bufHdr, NULL);
3369 }
3370 
3371 /*
3372  * ReleaseBuffer -- release the pin on a buffer
3373  */
3374 void
3375 ReleaseBuffer(Buffer buffer)
3376 {
3377  if (!BufferIsValid(buffer))
3378  elog(ERROR, "bad buffer ID: %d", buffer);
3379 
3380  if (BufferIsLocal(buffer))
3381  {
3382  ResourceOwnerForgetBuffer(CurrentResourceOwner, buffer);
3383 
3384  Assert(LocalRefCount[-buffer - 1] > 0);
3385  LocalRefCount[-buffer - 1]--;
3386  return;
3387  }
3388 
3389  UnpinBuffer(GetBufferDescriptor(buffer - 1), true);
3390 }
3391 
3392 /*
3393  * UnlockReleaseBuffer -- release the content lock and pin on a buffer
3394  *
3395  * This is just a shorthand for a common combination.
3396  */
3397 void
3398 UnlockReleaseBuffer(Buffer buffer)
3399 {
3400  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3401  ReleaseBuffer(buffer);
3402 }
3403 
3404 /*
3405  * IncrBufferRefCount
3406  * Increment the pin count on a buffer that we have *already* pinned
3407  * at least once.
3408  *
3409  * This function cannot be used on a buffer we do not have pinned,
3410  * because it doesn't change the shared buffer state.
3411  */
3412 void
3413 IncrBufferRefCount(Buffer buffer)
3414 {
3415  Assert(BufferIsPinned(buffer));
3416  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
3417  if (BufferIsLocal(buffer))
3418  LocalRefCount[-buffer - 1]++;
3419  else
3420  {
3421  PrivateRefCountEntry *ref;
3422 
3423  ref = GetPrivateRefCountEntry(buffer, true);
3424  Assert(ref != NULL);
3425  ref->refcount++;
3426  }
3427  ResourceOwnerRememberBuffer(CurrentResourceOwner, buffer);
3428 }
3429 
3430 /*
3431  * MarkBufferDirtyHint
3432  *
3433  * Mark a buffer dirty for non-critical changes.
3434  *
3435  * This is essentially the same as MarkBufferDirty, except:
3436  *
3437  * 1. The caller does not write WAL; so if checksums are enabled, we may need
3438  * to write an XLOG_FPI WAL record to protect against torn pages.
3439  * 2. The caller might have only share-lock instead of exclusive-lock on the
3440  * buffer's content lock.
3441  * 3. This function does not guarantee that the buffer is always marked dirty
3442  * (due to a race condition), so it cannot be used for important changes.
3443  */
3444 void
3445 MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
3446 {
3447  BufferDesc *bufHdr;
3448  Page page = BufferGetPage(buffer);
3449 
3450  if (!BufferIsValid(buffer))
3451  elog(ERROR, "bad buffer ID: %d", buffer);
3452 
3453  if (BufferIsLocal(buffer))
3454  {
3455  MarkLocalBufferDirty(buffer);
3456  return;
3457  }
3458 
3459  bufHdr = GetBufferDescriptor(buffer - 1);
3460 
3461  Assert(GetPrivateRefCount(buffer) > 0);
3462  /* here, either share or exclusive lock is OK */
3463  Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
3464 
3465  /*
3466  * This routine might get called many times on the same page, if we are
3467  * making the first scan after commit of an xact that added/deleted many
3468  * tuples. So, be as quick as we can if the buffer is already dirty. We
3469  * do this by not acquiring spinlock if it looks like the status bits are
3470  * already set. Since we make this test unlocked, there's a chance we
3471  * might fail to notice that the flags have just been cleared, and fail
3472  * to reset them, due to memory-ordering issues. But since this function
3473  * is only intended to be used in cases where failing to write out the
3474  * data would be harmless anyway, it doesn't really matter.
3475  */
3476  if ((pg_atomic_read_u32(&bufHdr->state) & (BM_DIRTY | BM_JUST_DIRTIED)) !=
3477  (BM_DIRTY | BM_JUST_DIRTIED))
3478  {
3479  XLogRecPtr lsn = InvalidXLogRecPtr;
3480  bool dirtied = false;
3481  bool delayChkpt = false;
3482  uint32 buf_state;
3483 
3484  /*
3485  * If we need to protect hint bit updates from torn writes, WAL-log a
3486  * full page image of the page. This full page image is only necessary
3487  * if the hint bit update is the first change to the page since the
3488  * last checkpoint.
3489  *
3490  * We don't check full_page_writes here because that logic is included
3491  * when we call XLogInsert() since the value changes dynamically.
3492  */
3493  if (XLogHintBitIsNeeded() &&
3494  (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT))
3495  {
3496  /*
3497  * If we're in recovery we cannot dirty a page because of a hint.
3498  * We can set the hint, just not dirty the page as a result so the
3499  * hint is lost when we evict the page or shut down.
3500  *
3501  * See src/backend/storage/page/README for longer discussion.
3502  */
3503  if (RecoveryInProgress())
3504  return;
3505 
3506  /*
3507  * If the block is already dirty because we either made a change
3508  * or set a hint already, then we don't need to write a full page
3509  * image. Note that aggressive cleaning of blocks dirtied by hint
3510  * bit setting would increase the call rate. Bulk setting of hint
3511  * bits would reduce the call rate...
3512  *
3513  * We must issue the WAL record before we mark the buffer dirty.
3514  * Otherwise we might write the page before we write the WAL. That
3515  * causes a race condition, since a checkpoint might occur between
3516  * writing the WAL record and marking the buffer dirty. We solve
3517  * that with a kluge, but one that is already in use during
3518  * transaction commit to prevent race conditions. Basically, we
3519  * simply prevent the checkpoint WAL record from being written
3520  * until we have marked the buffer dirty. We don't start the
3521  * checkpoint flush until we have marked dirty, so our checkpoint
3522  * must flush the change to disk successfully or the checkpoint
3523  * never gets written, in which case crash recovery will fix things up.
3524  *
3525  * It's possible we may enter here without an xid, so it is
3526  * essential that CreateCheckpoint waits for virtual transactions
3527  * rather than full transactionids.
3528  */
3529  MyPgXact->delayChkpt = delayChkpt = true;
3530  lsn = XLogSaveBufferForHint(buffer, buffer_std);
3531  }
3532 
3533  buf_state = LockBufHdr(bufHdr);
3534 
3535  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
3536 
3537  if (!(buf_state & BM_DIRTY))
3538  {
3539  dirtied = true; /* Means "will be dirtied by this action" */
3540 
3541  /*
3542  * Set the page LSN if we wrote a backup block. We aren't supposed
3543  * to set this when only holding a share lock but as long as we
3544  * serialise it somehow we're OK. We choose to set LSN while
3545  * holding the buffer header lock, which causes any reader of an
3546  * LSN who holds only a share lock to also obtain a buffer header
3547  * lock before using PageGetLSN(), which is enforced in
3548  * BufferGetLSNAtomic().
3549  *
3550  * If checksums are enabled, you might think we should reset the
3551  * checksum here. That will happen when the page is written
3552  * sometime later in this checkpoint cycle.
3553  */
3554  if (!XLogRecPtrIsInvalid(lsn))
3555  PageSetLSN(page, lsn);
3556  }
3557 
3558  buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
3559  UnlockBufHdr(bufHdr, buf_state);
3560 
3561  if (delayChkpt)
3562  MyPgXact->delayChkpt = false;
3563 
3564  if (dirtied)
3565  {
3566  VacuumPageDirty++;
3567  pgBufferUsage.shared_blks_dirtied++;
3568  if (VacuumCostActive)
3569  VacuumCostBalance += VacuumCostPageDirty;
3570  }
3571  }
3572 }
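
A hypothetical caller of MarkBufferDirtyHint might look like the sketch below (not part of the PostgreSQL sources). It assumes the backend environment and the usual bufmgr.h API; "rel", "blkno" and the sketch_set_hint name are placeholders, and the hint-bit update itself is elided.

#include "postgres.h"

#include "storage/bufmgr.h"
#include "utils/rel.h"

/* hypothetical helper: set a hint bit under a share lock, then record it */
static void
sketch_set_hint(Relation rel, BlockNumber blkno)
{
	Buffer		buffer = ReadBuffer(rel, blkno);

	LockBuffer(buffer, BUFFER_LOCK_SHARE);

	/* ... inspect BufferGetPage(buffer) and set a hint bit on it ... */

	/* the page uses the standard layout, so buffer_std = true */
	MarkBufferDirtyHint(buffer, true);

	UnlockReleaseBuffer(buffer);
}
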
3573 
3574 /*
3575  * Release buffer content locks for shared buffers.
3576  *
3577  * Used to clean up after errors.
3578  *
3579  * Currently, we can expect that lwlock.c's LWLockReleaseAll() took care
3580  * of releasing buffer content locks per se; the only thing we need to deal
3581  * with here is clearing any PIN_COUNT request that was in progress.
3582  */
3583 void
3584 UnlockBuffers(void)
3585 {
3586  BufferDesc *buf = PinCountWaitBuf;
3587 
3588  if (buf)
3589  {
3590  uint32 buf_state;
3591 
3592  buf_state = LockBufHdr(buf);
3593 
3594  /*
3595  * Don't complain if flag bit not set; it could have been reset but we
3596  * got a cancel/die interrupt before getting the signal.
3597  */
3598  if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
3599  buf->wait_backend_pid == MyProcPid)
3600  buf_state &= ~BM_PIN_COUNT_WAITER;
3601 
3602  UnlockBufHdr(buf, buf_state);
3603 
3604  PinCountWaitBuf = NULL;
3605  }
3606 }
3607 
3608 /*
3609  * Acquire or release the content_lock for the buffer.
3610  */
3611 void
3612 LockBuffer(Buffer buffer, int mode)
3613 {
3614  BufferDesc *buf;
3615 
3616  Assert(BufferIsValid(buffer));
3617  if (BufferIsLocal(buffer))
3618  return; /* local buffers need no lock */
3619 
3620  buf = GetBufferDescriptor(buffer - 1);
3621 
3622  if (mode == BUFFER_LOCK_UNLOCK)
3623  LWLockRelease(BufferDescriptorGetContentLock(buf));
3624  else if (mode == BUFFER_LOCK_SHARE)
3625  LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_SHARED);
3626  else if (mode == BUFFER_LOCK_EXCLUSIVE)
3627  LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_EXCLUSIVE);
3628  else
3629  elog(ERROR, "unrecognized buffer lock mode: %d", mode);
3630 }
3631 
3632 /*
3633  * Acquire the content_lock for the buffer, but only if we don't have to wait.
3634  *
3635  * This assumes the caller wants BUFFER_LOCK_EXCLUSIVE mode.
3636  */
3637 bool
3638 ConditionalLockBuffer(Buffer buffer)
3639 {
3640  BufferDesc *buf;
3641 
3642  Assert(BufferIsValid(buffer));
3643  if (BufferIsLocal(buffer))
3644  return true; /* act as though we got it */
3645 
3646  buf = GetBufferDescriptor(buffer - 1);
3647 
3648  return LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
3649  LW_EXCLUSIVE);
3650 }
3651 
3652 /*
3653  * LockBufferForCleanup - lock a buffer in preparation for deleting items
3654  *
3655  * Items may be deleted from a disk page only when the caller (a) holds an
3656  * exclusive lock on the buffer and (b) has observed that no other backend
3657  * holds a pin on the buffer. If there is a pin, then the other backend
3658  * might have a pointer into the buffer (for example, a heapscan reference
3659  * to an item --- see README for more details). It's OK if a pin is added
3660  * after the cleanup starts, however; the newly-arrived backend will be
3661  * unable to look at the page until we release the exclusive lock.
3662  *
3663  * To implement this protocol, a would-be deleter must pin the buffer and
3664  * then call LockBufferForCleanup(). LockBufferForCleanup() is similar to
3665  * LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE), except that it loops until
3666  * it has successfully observed pin count = 1.
3667  */
3668 void
3669 LockBufferForCleanup(Buffer buffer)
3670 {
3671  BufferDesc *bufHdr;
3672 
3673  Assert(BufferIsValid(buffer));
3674  Assert(PinCountWaitBuf == NULL);
3675 
3676  if (BufferIsLocal(buffer))
3677  {
3678  /* There should be exactly one pin */
3679  if (LocalRefCount[-buffer - 1] != 1)
3680  elog(ERROR, "incorrect local pin count: %d",
3681  LocalRefCount[-buffer - 1]);
3682  /* Nobody else to wait for */
3683  return;
3684  }
3685 
3686  /* There should be exactly one local pin */
3687  if (GetPrivateRefCount(buffer) != 1)
3688  elog(ERROR, "incorrect local pin count: %d",
3689  GetPrivateRefCount(buffer));
3690 
3691  bufHdr = GetBufferDescriptor(buffer - 1);
3692 
3693  for (;;)
3694  {
3695  uint32 buf_state;
3696 
3697  /* Try to acquire lock */
3698  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3699  buf_state = LockBufHdr(bufHdr);
3700 
3701  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
3702  if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
3703  {
3704  /* Successfully acquired exclusive lock with pincount 1 */
3705  UnlockBufHdr(bufHdr, buf_state);
3706  return;
3707  }
3708  /* Failed, so mark myself as waiting for pincount 1 */
3709  if (buf_state & BM_PIN_COUNT_WAITER)
3710  {
3711  UnlockBufHdr(bufHdr, buf_state);
3712  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3713  elog(ERROR, "multiple backends attempting to wait for pincount 1");
3714  }
3715  bufHdr->wait_backend_pid = MyProcPid;
3716  PinCountWaitBuf = bufHdr;
3717  buf_state |= BM_PIN_COUNT_WAITER;
3718  UnlockBufHdr(bufHdr, buf_state);
3719  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3720 
3721  /* Wait to be signaled by UnpinBuffer() */
3722  if (InHotStandby)
3723  {
3724  /* Publish the bufid that Startup process waits on */
3725  SetStartupBufferPinWaitBufId(buffer - 1);
3726  /* Set alarm and then wait to be signaled by UnpinBuffer() */
3727  ResolveRecoveryConflictWithBufferPin();
3728  /* Reset the published bufid */
3729  SetStartupBufferPinWaitBufId(-1);
3730  }
3731  else
3732  ProcWaitForSignal(PG_WAIT_BUFFER_PIN);
3733 
3734  /*
3735  * Remove flag marking us as waiter. Normally this will not be set
3736  * anymore, but ProcWaitForSignal() can return for other signals as
3737  * well. We take care to only reset the flag if we're the waiter, as
3738  * theoretically another backend could have started waiting. That's
3739  * impossible with the current usages due to table level locking, but
3740  * better be safe.
3741  */
3742  buf_state = LockBufHdr(bufHdr);
3743  if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
3744  bufHdr->wait_backend_pid == MyProcPid)
3745  buf_state &= ~BM_PIN_COUNT_WAITER;
3746  UnlockBufHdr(bufHdr, buf_state);
3747 
3748  PinCountWaitBuf = NULL;
3749  /* Loop back and try again */
3750  }
3751 }
3752 
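/*
 * Illustrative sketch (not part of bufmgr.c): the cleanup-lock protocol
 * described above.  The caller pins the buffer first, then waits until it is
 * the only pin holder before deleting items.  example_cleanup_page() is a
 * hypothetical name.
 */
static void
example_cleanup_page(Relation rel, BlockNumber blkno)
{
    Buffer      buf = ReadBuffer(rel, blkno);   /* acquire our own pin */

    LockBufferForCleanup(buf);  /* exclusive lock + wait for pin count 1 */

    /* ... safe to delete or move items on the page here ... */

    UnlockReleaseBuffer(buf);   /* release content lock and pin */
}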
3753 /*
3754  * Check called from RecoveryConflictInterrupt handler when Startup
3755  * process requests cancellation of all pin holders that are blocking it.
3756  */
3757 bool
3758 HoldingBufferPinThatDelaysRecovery(void)
3759 {
3760  int bufid = GetStartupBufferPinWaitBufId();
3761 
3762  /*
3763  * If we get woken slowly then it's possible that the Startup process was
3764  * already woken by other backends before we got here. Also possible that
3765  * we get here by multiple interrupts or interrupts at inappropriate
3766  * times, so make sure we do nothing if the bufid is not set.
3767  */
3768  if (bufid < 0)
3769  return false;
3770 
3771  if (GetPrivateRefCount(bufid + 1) > 0)
3772  return true;
3773 
3774  return false;
3775 }
3776 
3777 /*
3778  * ConditionalLockBufferForCleanup - as above, but don't wait to get the lock
3779  *
3780  * We won't loop, but just check once to see if the pin count is OK. If
3781  * not, return false with no lock held.
3782  */
3783 bool
3784 ConditionalLockBufferForCleanup(Buffer buffer)
3785 {
3786  BufferDesc *bufHdr;
3787  uint32 buf_state,
3788  refcount;
3789 
3790  Assert(BufferIsValid(buffer));
3791 
3792  if (BufferIsLocal(buffer))
3793  {
3794  refcount = LocalRefCount[-buffer - 1];
3795  /* There should be exactly one pin */
3796  Assert(refcount > 0);
3797  if (refcount != 1)
3798  return false;
3799  /* Nobody else to wait for */
3800  return true;
3801  }
3802 
3803  /* There should be exactly one local pin */
3804  refcount = GetPrivateRefCount(buffer);
3805  Assert(refcount);
3806  if (refcount != 1)
3807  return false;
3808 
3809  /* Try to acquire lock */
3810  if (!ConditionalLockBuffer(buffer))
3811  return false;
3812 
3813  bufHdr = GetBufferDescriptor(buffer - 1);
3814  buf_state = LockBufHdr(bufHdr);
3815  refcount = BUF_STATE_GET_REFCOUNT(buf_state);
3816 
3817  Assert(refcount > 0);
3818  if (refcount == 1)
3819  {
3820  /* Successfully acquired exclusive lock with pincount 1 */
3821  UnlockBufHdr(bufHdr, buf_state);
3822  return true;
3823  }
3824 
3825  /* Failed, so release the lock */
3826  UnlockBufHdr(bufHdr, buf_state);
3827  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3828  return false;
3829 }
3830 
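/*
 * Illustrative sketch (not part of bufmgr.c): the non-blocking variant is
 * useful when cleanup is optional and can be retried later.  On success the
 * buffer is left exclusively locked.  example_try_cleanup() is a hypothetical
 * name.
 */
static bool
example_try_cleanup(Buffer buf)
{
    if (!ConditionalLockBufferForCleanup(buf))
        return false;           /* extra pins or lock contention; skip */

    /* ... perform work that requires a cleanup lock here ... */

    LockBuffer(buf, BUFFER_LOCK_UNLOCK);
    return true;
}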
3831 /*
3832  * IsBufferCleanupOK - as above, but we already have the lock
3833  *
3834  * Check whether it's OK to perform cleanup on a buffer we've already
3835  * locked. If we observe that the pin count is 1, our exclusive lock
3836  * happens to be a cleanup lock, and we can proceed with anything that
3837  * would have been allowable had we sought a cleanup lock originally.
3838  */
3839 bool
3840 IsBufferCleanupOK(Buffer buffer)
3841 {
3842  BufferDesc *bufHdr;
3843  uint32 buf_state;
3844 
3845  Assert(BufferIsValid(buffer));
3846 
3847  if (BufferIsLocal(buffer))
3848  {
3849  /* There should be exactly one pin */
3850  if (LocalRefCount[-buffer - 1] != 1)
3851  return false;
3852  /* Nobody else to wait for */
3853  return true;
3854  }
3855 
3856  /* There should be exactly one local pin */
3857  if (GetPrivateRefCount(buffer) != 1)
3858  return false;
3859 
3860  bufHdr = GetBufferDescriptor(buffer - 1);
3861 
3862  /* caller must hold exclusive lock on buffer */
3863  Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
3864  LW_EXCLUSIVE));
3865 
3866  buf_state = LockBufHdr(bufHdr);
3867 
3868  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
3869  if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
3870  {
3871  /* pincount is OK. */
3872  UnlockBufHdr(bufHdr, buf_state);
3873  return true;
3874  }
3875 
3876  UnlockBufHdr(bufHdr, buf_state);
3877  return false;
3878 }
3879 
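/*
 * Illustrative sketch (not part of bufmgr.c): when the exclusive content lock
 * is already held, IsBufferCleanupOK() checks whether it happens to be a
 * cleanup lock, so stricter work can be done opportunistically.
 * example_maybe_do_cleanup_work() is a hypothetical name.
 */
static void
example_maybe_do_cleanup_work(Buffer buf)
{
    /* caller already holds BUFFER_LOCK_EXCLUSIVE on buf */
    if (IsBufferCleanupOK(buf))
    {
        /* ... work that needs a cleanup lock ... */
    }
    else
    {
        /* ... fall back to work allowed under a plain exclusive lock ... */
    }
}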
3880 
3881 /*
3882  * Functions for buffer I/O handling
3883  *
3884  * Note: We assume that nested buffer I/O never occurs.
3885  * i.e., at most one io_in_progress lock is held per proc.
3886  *
3887  * Also note that these are used only for shared buffers, not local ones.
3888  */
3889 
3890 /*
3891  * WaitIO -- Block until the IO_IN_PROGRESS flag on 'buf' is cleared.
3892  */
3893 static void
3894 WaitIO(BufferDesc *buf)
3895 {
3896  /*
3897  * Changed to wait until there's no IO - Inoue 01/13/2000
3898  *
3899  * Note this is *necessary* because an error abort in the process doing
3900  * I/O could release the io_in_progress_lock prematurely. See
3901  * AbortBufferIO.
3902  */
3903  for (;;)
3904  {
3905  uint32 buf_state;
3906 
3907  /*
3908  * It may not be necessary to acquire the spinlock to check the flag
3909  * here, but since this test is essential for correctness, we'd better
3910  * play it safe.
3911  */
3912  buf_state = LockBufHdr(buf);
3913  UnlockBufHdr(buf, buf_state);
3914 
3915  if (!(buf_state & BM_IO_IN_PROGRESS))
3916  break;
3917  LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_SHARED);
3918  LWLockRelease(BufferDescriptorGetIOLock(buf));
3919  }
3920 }
3921 
3922 /*
3923  * StartBufferIO: begin I/O on this buffer
3924  * (Assumptions)
3925  * My process is executing no IO
3926  * The buffer is Pinned
3927  *
3928  * In some scenarios there are race conditions in which multiple backends
3929  * could attempt the same I/O operation concurrently. If someone else
3930  * has already started I/O on this buffer then we will block on the
3931  * io_in_progress lock until he's done.
3932  *
3933  * Input operations are only attempted on buffers that are not BM_VALID,
3934  * and output operations only on buffers that are BM_VALID and BM_DIRTY,
3935  * so we can always tell if the work is already done.
3936  *
3937  * Returns true if we successfully marked the buffer as I/O busy,
3938  * false if someone else already did the work.
3939  */
3940 static bool
3941 StartBufferIO(BufferDesc *buf, bool forInput)
3942 {
3943  uint32 buf_state;
3944 
3945  Assert(!InProgressBuf);
3946 
3947  for (;;)
3948  {
3949  /*
3950  * Grab the io_in_progress lock so that other processes can wait for
3951  * me to finish the I/O.
3952  */
3953  LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
3954 
3955  buf_state = LockBufHdr(buf);
3956 
3957  if (!(buf_state & BM_IO_IN_PROGRESS))
3958  break;
3959 
3960  /*
3961  * The only way BM_IO_IN_PROGRESS could be set when the io_in_progress
3962  * lock isn't held is if the process doing the I/O is recovering from
3963  * an error (see AbortBufferIO). If that's the case, we must wait for
3964  * him to get unwedged.
3965  */
3966  UnlockBufHdr(buf, buf_state);
3967  LWLockRelease(BufferDescriptorGetIOLock(buf));
3968  WaitIO(buf);
3969  }
3970 
3971  /* Once we get here, there is definitely no I/O active on this buffer */
3972 
3973  if (forInput ? (buf_state & BM_VALID) : !(buf_state & BM_DIRTY))
3974  {
3975  /* someone else already did the I/O */
3976  UnlockBufHdr(buf, buf_state);
3977  LWLockRelease(BufferDescriptorGetIOLock(buf));
3978  return false;
3979  }
3980 
3981  buf_state |= BM_IO_IN_PROGRESS;
3982  UnlockBufHdr(buf, buf_state);
3983 
3984  InProgressBuf = buf;
3985  IsForInput = forInput;
3986 
3987  return true;
3988 }
3989 
3990 /*
3991  * TerminateBufferIO: release a buffer we were doing I/O on
3992  * (Assumptions)
3993  * My process is executing IO for the buffer
3994  * BM_IO_IN_PROGRESS bit is set for the buffer
3995  * We hold the buffer's io_in_progress lock
3996  * The buffer is Pinned
3997  *
3998  * If clear_dirty is true and BM_JUST_DIRTIED is not set, we clear the
3999  * buffer's BM_DIRTY flag. This is appropriate when terminating a
4000  * successful write. The check on BM_JUST_DIRTIED is necessary to avoid
4001  * marking the buffer clean if it was re-dirtied while we were writing.
4002  *
4003  * set_flag_bits gets ORed into the buffer's flags. It must include
4004  * BM_IO_ERROR in a failure case. For successful completion it could
4005  * be 0, or BM_VALID if we just finished reading in the page.
4006  */
4007 static void
4008 TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits)
4009 {
4010  uint32 buf_state;
4011 
4012  Assert(buf == InProgressBuf);
4013 
4014  buf_state = LockBufHdr(buf);
4015 
4016  Assert(buf_state & BM_IO_IN_PROGRESS);
4017 
4018  buf_state &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR);
4019  if (clear_dirty && !(buf_state & BM_JUST_DIRTIED))
4020  buf_state &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
4021 
4022  buf_state |= set_flag_bits;
4023  UnlockBufHdr(buf, buf_state);
4024 
4025  InProgressBuf = NULL;
4026 
4027  LWLockRelease(BufferDescriptorGetIOLock(buf));
4028 }
4029 
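/*
 * Illustrative sketch (not part of bufmgr.c): how callers within this file
 * pair StartBufferIO() and TerminateBufferIO() around a read, under the
 * assumptions stated above (buffer pinned, no nested I/O in this backend).
 * example_read_into_buffer() is a hypothetical name.
 */
static void
example_read_into_buffer(SMgrRelation smgr, BufferDesc *bufHdr)
{
    /* returns false if another backend already completed the read */
    if (StartBufferIO(bufHdr, true))
    {
        smgrread(smgr, bufHdr->tag.forkNum, bufHdr->tag.blockNum,
                 (char *) BufHdrGetBlock(bufHdr));
        /* mark the page valid and release the io_in_progress lock */
        TerminateBufferIO(bufHdr, false, BM_VALID);
    }
}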
4030 /*
4031  * AbortBufferIO: Clean up any active buffer I/O after an error.
4032  *
4033  * All LWLocks we might have held have been released,
4034  * but we haven't yet released buffer pins, so the buffer is still pinned.
4035  *
4036  * If I/O was in progress, we always set BM_IO_ERROR, even though it's
4037  * possible the error condition wasn't related to the I/O.
4038  */
4039 void
4040 AbortBufferIO(void)
4041 {
4042  BufferDesc *buf = InProgressBuf;
4043 
4044  if (buf)
4045  {
4046  uint32 buf_state;
4047 
4048  /*
4049  * Since LWLockReleaseAll has already been called, we're not holding
4050  * the buffer's io_in_progress_lock. We have to re-acquire it so that
4051  * we can use TerminateBufferIO. Anyone who's executing WaitIO on the
4052  * buffer will be in a busy spin until we succeed in doing this.
4053  */
4054  LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
4055 
4056  buf_state = LockBufHdr(buf);
4057  Assert(buf_state & BM_IO_IN_PROGRESS);
4058  if (IsForInput)
4059  {
4060  Assert(!(buf_state & BM_DIRTY));
4061 
4062  /* We'd better not think buffer is valid yet */
4063  Assert(!(buf_state & BM_VALID));
4064  UnlockBufHdr(buf, buf_state);
4065  }
4066  else
4067  {
4068  Assert(buf_state & BM_DIRTY);
4069  UnlockBufHdr(buf, buf_state);
4070  /* Issue notice if this is not the first failure... */
4071  if (buf_state & BM_IO_ERROR)
4072  {
4073  /* Buffer is pinned, so we can read tag without spinlock */
4074  char *path;
4075 
4076  path = relpathperm(buf->tag.rnode, buf->tag.forkNum);
4077  ereport(WARNING,
4078  (errcode(ERRCODE_IO_ERROR),
4079  errmsg("could not write block %u of %s",
4080  buf->tag.blockNum, path),
4081  errdetail("Multiple failures --- write error might be permanent.")));
4082  pfree(path);
4083  }
4084  }
4085  TerminateBufferIO(buf, false, BM_IO_ERROR);
4086  }
4087 }
4088 
4089 /*
4090  * Error context callback for errors occurring during shared buffer writes.
4091  */
4092 static void
4093 shared_buffer_write_error_callback(void *arg)
4094 {
4095  BufferDesc *bufHdr = (BufferDesc *) arg;
4096 
4097  /* Buffer is pinned, so we can read the tag without locking the spinlock */
4098  if (bufHdr != NULL)
4099  {
4100  char *path = relpathperm(bufHdr->tag.rnode, bufHdr->tag.forkNum);
4101 
4102  errcontext("writing block %u of relation %s",
4103  bufHdr->tag.blockNum, path);
4104  pfree(path);
4105  }
4106 }
4107 
4108 /*
4109  * Error context callback for errors occurring during local buffer writes.
4110  */
4111 static void
4112 local_buffer_write_error_callback(void *arg)
4113 {
4114  BufferDesc *bufHdr = (BufferDesc *) arg;
4115 
4116  if (bufHdr != NULL)
4117  {
4118  char *path = relpathbackend(bufHdr->tag.rnode, MyBackendId,
4119  bufHdr->tag.forkNum);
4120 
4121  errcontext("writing block %u of relation %s",
4122  bufHdr->tag.blockNum, path);
4123  pfree(path);
4124  }
4125 }
4126 
4127 /*
4128  * RelFileNode qsort/bsearch comparator; see RelFileNodeEquals.
4129  */
4130 static int
4131 rnode_comparator(const void *p1, const void *p2)
4132 {
4133  RelFileNode n1 = *(const RelFileNode *) p1;
4134  RelFileNode n2 = *(const RelFileNode *) p2;
4135 
4136  if (n1.relNode < n2.relNode)
4137  return -1;
4138  else if (n1.relNode > n2.relNode)
4139  return 1;
4140 
4141  if (n1.dbNode < n2.dbNode)
4142  return -1;
4143  else if (n1.dbNode > n2.dbNode)
4144  return 1;
4145 
4146  if (n1.spcNode < n2.spcNode)
4147  return -1;
4148  else if (n1.spcNode > n2.spcNode)
4149  return 1;
4150  else
4151  return 0;
4152 }
4153 
4154 /*
4155  * Lock buffer header - set BM_LOCKED in buffer state.
4156  */
4157 uint32
4158 LockBufHdr(BufferDesc *desc)
4159 {
4160  SpinDelayStatus delayStatus;
4161  uint32 old_buf_state;
4162 
4163  init_local_spin_delay(&delayStatus);
4164 
4165  while (true)
4166  {
4167  /* set BM_LOCKED flag */
4168  old_buf_state = pg_atomic_fetch_or_u32(&desc->state, BM_LOCKED);
4169  /* if it wasn't set before we're OK */
4170  if (!(old_buf_state & BM_LOCKED))
4171  break;
4172  perform_spin_delay(&delayStatus);
4173  }
4174  finish_spin_delay(&delayStatus);
4175  return old_buf_state | BM_LOCKED;
4176 }
4177 
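/*
 * Illustrative sketch (not part of bufmgr.c): the usual pattern around the
 * buffer header spinlock -- take it with LockBufHdr(), adjust the state bits,
 * and hand the updated value back through UnlockBufHdr().
 * example_set_dirty_flags() is a hypothetical name.
 */
static void
example_set_dirty_flags(BufferDesc *bufHdr)
{
    uint32      buf_state;

    buf_state = LockBufHdr(bufHdr);
    buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
    UnlockBufHdr(bufHdr, buf_state);
}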
4178 /*
4179  * Wait until the BM_LOCKED flag isn't set anymore and return the buffer's
4180  * state at that point.
4181  *
4182  * Obviously the buffer could be locked by the time the value is returned, so
4183  * this is primarily useful in CAS style loops.
4184  */
4185 static uint32
4186 WaitBufHdrUnlocked(BufferDesc *buf)
4187 {
4188  SpinDelayStatus delayStatus;
4189  uint32 buf_state;
4190 
4191  init_local_spin_delay(&delayStatus);
4192 
4193  buf_state = pg_atomic_read_u32(&buf->state);
4194 
4195  while (buf_state & BM_LOCKED)
4196  {
4197  perform_spin_delay(&delayStatus);
4198  buf_state = pg_atomic_read_u32(&buf->state);
4199  }
4200 
4201  finish_spin_delay(&delayStatus);
4202 
4203  return buf_state;
4204 }
4205 
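/*
 * Illustrative sketch (not part of bufmgr.c): the CAS-style loop mentioned
 * above, in the style of callers such as PinBuffer().  If the header is
 * spinlocked we wait for it to be released, then retry the
 * compare-and-exchange; on failure old_buf_state is refreshed automatically.
 * example_bump_refcount() is a hypothetical name.
 */
static void
example_bump_refcount(BufferDesc *buf)
{
    uint32      old_buf_state = pg_atomic_read_u32(&buf->state);
    uint32      buf_state;

    for (;;)
    {
        if (old_buf_state & BM_LOCKED)
            old_buf_state = WaitBufHdrUnlocked(buf);

        buf_state = old_buf_state;
        buf_state += BUF_REFCOUNT_ONE;

        if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
                                           buf_state))
            break;              /* success */
    }
}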
4206 /*
4207  * BufferTag comparator.
4208  */
4209 static int
4210 buffertag_comparator(const void *a, const void *b)
4211 {
4212  const BufferTag *ba = (const BufferTag *) a;
4213  const BufferTag *bb = (const BufferTag *) b;
4214  int ret;
4215 
4216  ret = rnode_comparator(&ba->rnode, &bb->rnode);
4217 
4218  if (ret != 0)
4219  return ret;
4220 
4221  if (ba->forkNum < bb->forkNum)
4222  return -1;
4223  if (ba->forkNum > bb->forkNum)
4224  return 1;
4225 
4226  if (ba->blockNum < bb->blockNum)
4227  return -1;
4228  if (ba->blockNum > bb->blockNum)
4229  return 1;
4230 
4231  return 0;
4232 }
4233 
4234 /*
4235  * Comparator determining the writeout order in a checkpoint.
4236  *
4237  * It is important that tablespaces are compared first, the logic balancing
4238  * writes between tablespaces relies on it.
4239  */
4240 static int
4241 ckpt_buforder_comparator(const void *pa, const void *pb)
4242 {
4243  const CkptSortItem *a = (const CkptSortItem *) pa;
4244  const CkptSortItem *b = (const CkptSortItem *) pb;
4245 
4246  /* compare tablespace */
4247  if (a->tsId < b->tsId)
4248  return -1;
4249  else if (a->tsId > b->tsId)
4250  return 1;
4251  /* compare relation */
4252  if (a->relNode < b->relNode)
4253  return -1;
4254  else if (a->relNode > b->relNode)
4255  return 1;
4256  /* compare fork */
4257  else if (a->forkNum < b->forkNum)
4258  return -1;
4259  else if (a->forkNum > b->forkNum)
4260  return 1;
4261  /* compare block number */
4262  else if (a->blockNum < b->blockNum)
4263  return -1;
4264  else if (a->blockNum > b->blockNum)
4265  return 1;
4266  /* equal page IDs are unlikely, but not impossible */
4267  return 0;
4268 }
4269 
4270 /*
4271  * Comparator for a Min-Heap over the per-tablespace checkpoint completion
4272  * progress.
4273  */
4274 static int
4275 ts_ckpt_progress_comparator(Datum a, Datum b, void *arg)
4276 {
4277  CkptTsStatus *sa = (CkptTsStatus *) a;
4278  CkptTsStatus *sb = (CkptTsStatus *) b;
4279 
4280  /* we want a min-heap, so return 1 when a < b */
4281  if (sa->progress < sb->progress)
4282  return 1;
4283  else if (sa->progress == sb->progress)
4284  return 0;
4285  else
4286  return -1;
4287 }
4288 
4289 /*
4290  * Initialize a writeback context, discarding potential previous state.
4291  *
4292  * *max_pending is a pointer instead of an immediate value, so the coalesce
4293  * *max_pending is a pointer instead of an immediate value, so the coalesce
4294  * limits can easily be changed by the GUC mechanism, and so calling code does
4295  * not have to check the current configuration. A value of 0 means that no
4296  */
4297 void
4298 WritebackContextInit(WritebackContext *context, int *max_pending)
4299 {
4300  Assert(*max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
4301 
4302  context->max_pending = max_pending;
4303  context->nr_pending = 0;
4304 }
4305 
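/*
 * Illustrative sketch (not part of bufmgr.c): a writeback context is
 * initialized with a pointer to a flush-after GUC (checkpoint_flush_after is
 * used here purely as an example), buffers are scheduled as they are written
 * out, and any remainder is flushed explicitly at the end.
 * example_writeback_usage() is a hypothetical name.
 */
static void
example_writeback_usage(BufferTag *tag)
{
    WritebackContext wb_context;

    WritebackContextInit(&wb_context, &checkpoint_flush_after);

    /* ... after writing out a buffer with the given tag ... */
    ScheduleBufferTagForWriteback(&wb_context, tag);

    /* flush whatever is still pending to the kernel */
    IssuePendingWritebacks(&wb_context);
}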
4306 /*
4307  * Add buffer to list of pending writeback requests.
4308  */
4309 void
4310 ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag)
4311 {
4312  PendingWriteback *pending;
4313 
4314  /*
4315  * Add buffer to the pending writeback array, unless writeback control is
4316  * disabled.
4317  */
4318  if (*context->max_pending > 0)
4319  {
4320  Assert(*context->max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
4321 
4322  pending = &context->pending_writebacks[context->nr_pending++];
4323 
4324  pending->tag = *tag;
4325  }
4326 
4327  /*
4328  * Perform pending flushes if the writeback limit is exceeded. This
4329  * includes the case where previously an item has been added, but control
4330  * is now disabled.
4331  */
4332  if (context->nr_pending >= *context->max_pending)
4333  IssuePendingWritebacks(context);
4334 }
4335 
4336 /*
4337  * Issue all pending writeback requests, previously scheduled with
4338  * ScheduleBufferTagForWriteback, to the OS.
4339  *
4340  * Because this is only used to improve the OS's I/O scheduling, we try never to
4341  * error out - it's just a hint.
4342  */
4343 void
4344 IssuePendingWritebacks(WritebackContext *context)
4345 {
4346  int i;
4347 
4348  if (context->nr_pending == 0)
4349  return;
4350 
4351  /*
4352  * Executing the writes in-order can make them a lot faster, and allows us to
4353  * merge writeback requests for consecutive blocks into larger writebacks.
4354  */
4355  qsort(&context->pending_writebacks, context->nr_pending,
4356  sizeof(PendingWriteback), buffertag_comparator);
4357 
4358  /*
4359  * Coalesce neighbouring writes, but nothing else. For that we iterate
4360  * through the, now sorted, array of pending flushes, and look forward to
4361  * find all neighbouring (or identical) writes.
4362  */
4363  for (i = 0; i < context->nr_pending; i++)
4364  {
4365  PendingWriteback *cur;
4366  PendingWriteback *next;
4367  SMgrRelation reln;
4368  int ahead;
4369  BufferTag tag;
4370  Size nblocks = 1;
4371 
4372  cur = &context->pending_writebacks[i];
4373  tag = cur->tag;
4374 
4375  /*
4376  * Peek ahead, into following writeback requests, to see if they can
4377  * be combined with the current one.
4378  */
4379  for (ahead = 0; i + ahead + 1 < context->nr_pending; ahead++)
4380  {
4381  next = &context->pending_writebacks[i + ahead + 1];
4382 
4383  /* different file, stop */
4384  if (!RelFileNodeEquals(cur->tag.rnode, next->tag.rnode) ||
4385  cur->tag.forkNum != next->tag.forkNum)
4386  break;
4387 
4388  /* ok, block queued twice, skip */
4389  if (cur->tag.blockNum == next->tag.blockNum)
4390  continue;
4391 
4392  /* only merge consecutive writes */
4393  if (cur->tag.blockNum + 1 != next->tag.blockNum)
4394  break;
4395 
4396  nblocks++;
4397  cur = next;
4398  }
4399 
4400  i += ahead;
4401 
4402  /* and finally tell the kernel to write the data to storage */
4403  reln = smgropen(tag.rnode, InvalidBackendId);
4404  smgrwriteback(reln, tag.forkNum, tag.blockNum, nblocks);
4405  }
4406 
4407  context->nr_pending = 0;
4408 }
4409 
4410 
4411 /*
4412  * Implement slower/larger portions of TestForOldSnapshot
4413  *
4414  * Smaller/faster portions are put inline, but the entire set of logic is too
4415  * big for that.
4416  */
4417 void
4418 TestForOldSnapshot_impl(Snapshot snapshot, Relation relation)
4419 {
4420  if (RelationAllowsEarlyPruning(relation)
4421  && (snapshot)->whenTaken < GetOldSnapshotThresholdTimestamp())
4422  ereport(ERROR,
4423  (errcode(ERRCODE_SNAPSHOT_TOO_OLD),
4424  errmsg("snapshot too old")));
4425 }