bufmgr.c
1 /*-------------------------------------------------------------------------
2  *
3  * bufmgr.c
4  * buffer manager interface routines
5  *
6  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/storage/buffer/bufmgr.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 /*
16  * Principal entry points:
17  *
18  * ReadBuffer() -- find or create a buffer holding the requested page,
19  * and pin it so that no one can destroy it while this process
20  * is using it.
21  *
22  * ReleaseBuffer() -- unpin a buffer
23  *
24  * MarkBufferDirty() -- mark a pinned buffer's contents as "dirty".
25  * The disk write is delayed until buffer replacement or checkpoint.
26  *
27  * See also these files:
28  * freelist.c -- chooses victim for buffer replacement
29  * buf_table.c -- manages the buffer lookup table
30  */
31 #include "postgres.h"
32 
33 #include <sys/file.h>
34 #include <unistd.h>
35 
36 #include "access/tableam.h"
37 #include "access/xlog.h"
38 #include "catalog/catalog.h"
39 #include "catalog/storage.h"
40 #include "executor/instrument.h"
41 #include "lib/binaryheap.h"
42 #include "miscadmin.h"
43 #include "pg_trace.h"
44 #include "pgstat.h"
45 #include "postmaster/bgwriter.h"
46 #include "storage/buf_internals.h"
47 #include "storage/bufmgr.h"
48 #include "storage/ipc.h"
49 #include "storage/proc.h"
50 #include "storage/smgr.h"
51 #include "storage/standby.h"
52 #include "utils/rel.h"
53 #include "utils/resowner_private.h"
54 #include "utils/timestamp.h"
55 
56 
57 /* Note: these two macros only work on shared buffers, not local ones! */
58 #define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
59 #define BufferGetLSN(bufHdr) (PageGetLSN(BufHdrGetBlock(bufHdr)))
60 
61 /* Note: this macro only works on local buffers, not shared ones! */
62 #define LocalBufHdrGetBlock(bufHdr) \
63  LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
64 
65 /* Bits in SyncOneBuffer's return value */
66 #define BUF_WRITTEN 0x01
67 #define BUF_REUSABLE 0x02
68 
69 #define DROP_RELS_BSEARCH_THRESHOLD 20
70 
71 typedef struct PrivateRefCountEntry
72 {
73  Buffer buffer;
74  int32 refcount;
75 } PrivateRefCountEntry;
76 
77 /* 64 bytes, about the size of a cache line on common systems */
78 #define REFCOUNT_ARRAY_ENTRIES 8
79 
80 /*
81  * Status of buffers to checkpoint for a particular tablespace, used
82  * internally in BufferSync.
83  */
84 typedef struct CkptTsStatus
85 {
86  /* oid of the tablespace */
87  Oid tsId;
88 
89  /*
90  * Checkpoint progress for this tablespace. To make progress comparable
91  * between tablespaces the progress is, for each tablespace, measured as a
92  * number between 0 and the total number of to-be-checkpointed pages. Each
93  * page checkpointed in this tablespace increments this space's progress
94  * by progress_slice.
95  */
96  float8 progress;
97  float8 progress_slice;
98 
99  /* number of to-be checkpointed pages in this tablespace */
100  int num_to_scan;
101  /* already processed pages in this tablespace */
102  int num_scanned;
103 
104  /* current offset in CkptBufferIds for this tablespace */
105  int index;
106 } CkptTsStatus;
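/*
 * Editor's note: illustrative sketch, not part of the original file.  The
 * scaling described in the struct comment above is applied in BufferSync()
 * further down: each tablespace's slice is the grand total of
 * to-be-checkpointed pages divided by that tablespace's own count, so every
 * tablespace's progress runs from 0 to the same grand total and tablespaces
 * of very different sizes can be compared directly:
 *
 *		ts_stat->progress_slice = (float8) num_to_scan / ts_stat->num_to_scan;
 *		...
 *		ts_stat->progress += ts_stat->progress_slice;	(once per page written)
 */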
107 
108 /* GUC variables */
109 bool zero_damaged_pages = false;
110 int bgwriter_lru_maxpages = 100;
111 double bgwriter_lru_multiplier = 2.0;
112 bool track_io_timing = false;
113 int effective_io_concurrency = 0;
114 
115 /*
116  * GUC variables about triggering kernel writeback for buffers written; OS
117  * dependent defaults are set via the GUC mechanism.
118  */
119 int checkpoint_flush_after = 0;
120 int bgwriter_flush_after = 0;
121 int backend_flush_after = 0;
122 
123 /*
124  * How many buffers PrefetchBuffer callers should try to stay ahead of their
125  * ReadBuffer calls by. This is maintained by the assign hook for
126  * effective_io_concurrency. Zero means "never prefetch". This value is
127  * only used for buffers not belonging to tablespaces that have their
128  * effective_io_concurrency parameter set.
129  */
130 int target_prefetch_pages = 0;
131 
132 /* local state for StartBufferIO and related functions */
133 static BufferDesc *InProgressBuf = NULL;
134 static bool IsForInput;
135 
136 /* local state for LockBufferForCleanup */
137 static BufferDesc *PinCountWaitBuf = NULL;
138 
139 /*
140  * Backend-Private refcount management:
141  *
142  * Each buffer also has a private refcount that keeps track of the number of
143  * times the buffer is pinned in the current process. This is so that the
144  * shared refcount needs to be modified only once if a buffer is pinned more
145  * than once by an individual backend. It's also used to check that no buffers
146  * are still pinned at the end of transactions and when exiting.
147  *
148  *
149  * To avoid - as we used to - requiring an array with NBuffers entries to keep
150  * track of local buffers, we use a small sequentially searched array
151  * (PrivateRefCountArray) and an overflow hash table (PrivateRefCountHash) to
152  * keep track of backend local pins.
153  *
154  * Until no more than REFCOUNT_ARRAY_ENTRIES buffers are pinned at once, all
155  * refcounts are kept track of in the array; after that, new array entries
156  * displace old ones into the hash table. That way a frequently used entry
157  * can't get "stuck" in the hashtable while infrequent ones clog the array.
158  *
159  * Note that in most scenarios the number of pinned buffers will not exceed
160  * REFCOUNT_ARRAY_ENTRIES.
161  *
162  *
163  * To enter a buffer into the refcount tracking mechanism first reserve a free
164  * entry using ReservePrivateRefCountEntry() and then later, if necessary,
165  * fill it with NewPrivateRefCountEntry(). That split lets us avoid doing
166  * memory allocations in NewPrivateRefCountEntry() which can be important
167  * because in some scenarios it's called with a spinlock held...
168  */
169 static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES];
170 static HTAB *PrivateRefCountHash = NULL;
171 static int32 PrivateRefCountOverflowed = 0;
172 static uint32 PrivateRefCountClock = 0;
173 static PrivateRefCountEntry *ReservedRefCountEntry = NULL;
174 
175 static void ReservePrivateRefCountEntry(void);
176 static PrivateRefCountEntry *NewPrivateRefCountEntry(Buffer buffer);
177 static PrivateRefCountEntry *GetPrivateRefCountEntry(Buffer buffer, bool do_move);
178 static inline int32 GetPrivateRefCount(Buffer buffer);
179 static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref);
180 
181 /*
182  * Ensure that the PrivateRefCountArray has sufficient space to store one more
183  * entry. This has to be called before using NewPrivateRefCountEntry() to fill
184  * a new entry - but it's perfectly fine to not use a reserved entry.
185  */
186 static void
187 ReservePrivateRefCountEntry(void)
188 {
189  /* Already reserved (or freed), nothing to do */
190  if (ReservedRefCountEntry != NULL)
191  return;
192 
193  /*
194  * First search for a free entry in the array; that'll be sufficient in the
195  * majority of cases.
196  */
197  {
198  int i;
199 
200  for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
201  {
202  PrivateRefCountEntry *res;
203 
204  res = &PrivateRefCountArray[i];
205 
206  if (res->buffer == InvalidBuffer)
207  {
208  ReservedRefCountEntry = res;
209  return;
210  }
211  }
212  }
213 
214  /*
215  * No luck. All array entries are full. Move one array entry into the hash
216  * table.
217  */
218  {
219  /*
220  * Move entry from the current clock position in the array into the
221  * hashtable. Use that slot.
222  */
223  PrivateRefCountEntry *hashent;
224  bool found;
225 
226  /* select victim slot */
227  ReservedRefCountEntry =
228  &PrivateRefCountArray[PrivateRefCountClock++ % REFCOUNT_ARRAY_ENTRIES];
229 
230  /* Better be used, otherwise we shouldn't get here. */
231  Assert(ReservedRefCountEntry->buffer != InvalidBuffer);
232 
233  /* enter victim array entry into hashtable */
234  hashent = hash_search(PrivateRefCountHash,
235  (void *) &(ReservedRefCountEntry->buffer),
236  HASH_ENTER,
237  &found);
238  Assert(!found);
239  hashent->refcount = ReservedRefCountEntry->refcount;
240 
241  /* clear the now free array slot */
242  ReservedRefCountEntry->buffer = InvalidBuffer;
243  ReservedRefCountEntry->refcount = 0;
244 
246  }
247 }
248 
249 /*
250  * Fill a previously reserved refcount entry.
251  */
252 static PrivateRefCountEntry *
253 NewPrivateRefCountEntry(Buffer buffer)
254 {
255  PrivateRefCountEntry *res;
256 
257  /* only allowed to be called when a reservation has been made */
258  Assert(ReservedRefCountEntry != NULL);
259 
260  /* use up the reserved entry */
261  res = ReservedRefCountEntry;
262  ReservedRefCountEntry = NULL;
263 
264  /* and fill it */
265  res->buffer = buffer;
266  res->refcount = 0;
267 
268  return res;
269 }
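/*
 * Editor's note: illustrative sketch, not part of the original file.  The
 * intended calling pattern for the two functions above: reserve while no
 * spinlock is held, so that NewPrivateRefCountEntry() never has to allocate
 * (or push an old entry into the hash table) under a buffer header spinlock:
 *
 *		ReservePrivateRefCountEntry();
 *		buf_state = LockBufHdr(buf);
 *		... decide to pin, bump the shared refcount ...
 *		UnlockBufHdr(buf, buf_state);
 *		ref = NewPrivateRefCountEntry(BufferDescriptorGetBuffer(buf));
 *		ref->refcount++;
 *
 * PinBuffer_Locked(), later in this file, follows essentially this shape.
 */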
270 
271 /*
272  * Return the PrivateRefCount entry for the passed buffer.
273  *
274  * Returns NULL if a buffer doesn't have a refcount entry. Otherwise, if
275  * do_move is true, and the entry resides in the hashtable the entry is
276  * optimized for frequent access by moving it to the array.
277  */
278 static PrivateRefCountEntry *
279 GetPrivateRefCountEntry(Buffer buffer, bool do_move)
280 {
281  PrivateRefCountEntry *res;
282  int i;
283 
284  Assert(BufferIsValid(buffer));
285  Assert(!BufferIsLocal(buffer));
286 
287  /*
288  * First search for references in the array, that'll be sufficient in the
289  * majority of cases.
290  */
291  for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
292  {
293  res = &PrivateRefCountArray[i];
294 
295  if (res->buffer == buffer)
296  return res;
297  }
298 
299  /*
300  * By here we know that the buffer, if already pinned, isn't residing in
301  * the array.
302  *
303  * Only look up the buffer in the hashtable if we've previously overflowed
304  * into it.
305  */
306  if (PrivateRefCountOverflowed == 0)
307  return NULL;
308 
309  res = hash_search(PrivateRefCountHash,
310  (void *) &buffer,
311  HASH_FIND,
312  NULL);
313 
314  if (res == NULL)
315  return NULL;
316  else if (!do_move)
317  {
318  /* caller doesn't want us to move the hash entry into the array */
319  return res;
320  }
321  else
322  {
323  /* move buffer from hashtable into the free array slot */
324  bool found;
325  PrivateRefCountEntry *free;
326 
327  /* Ensure there's a free array slot */
328  ReservePrivateRefCountEntry();
329 
330  /* Use up the reserved slot */
331  Assert(ReservedRefCountEntry != NULL);
332  free = ReservedRefCountEntry;
333  ReservedRefCountEntry = NULL;
334  Assert(free->buffer == InvalidBuffer);
335 
336  /* and fill it */
337  free->buffer = buffer;
338  free->refcount = res->refcount;
339 
340  /* delete from hashtable */
341  hash_search(PrivateRefCountHash,
342  (void *) &buffer,
343  HASH_REMOVE,
344  &found);
345  Assert(found);
346  Assert(PrivateRefCountOverflowed > 0);
347  PrivateRefCountOverflowed--;
348 
349  return free;
350  }
351 }
352 
353 /*
354  * Returns how many times the passed buffer is pinned by this backend.
355  *
356  * Only works for shared memory buffers!
357  */
358 static inline int32
359 GetPrivateRefCount(Buffer buffer)
360 {
361  PrivateRefCountEntry *ref;
362 
363  Assert(BufferIsValid(buffer));
364  Assert(!BufferIsLocal(buffer));
365 
366  /*
367  * Not moving the entry - that's ok for the current users, but we might
368  * want to change this one day.
369  */
370  ref = GetPrivateRefCountEntry(buffer, false);
371 
372  if (ref == NULL)
373  return 0;
374  return ref->refcount;
375 }
376 
377 /*
378  * Release resources used to track the reference count of a buffer which we no
379  * longer have pinned and don't want to pin again immediately.
380  */
381 static void
382 ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref)
383 {
384  Assert(ref->refcount == 0);
385 
386  if (ref >= &PrivateRefCountArray[0] &&
387  ref < &PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES])
388  {
389  ref->buffer = InvalidBuffer;
390 
391  /*
392  * Mark the just used entry as reserved - in many scenarios that
393  * allows us to avoid ever having to search the array/hash for free
394  * entries.
395  */
396  ReservedRefCountEntry = ref;
397  }
398  else
399  {
400  bool found;
401  Buffer buffer = ref->buffer;
402 
403  hash_search(PrivateRefCountHash,
404  (void *) &buffer,
405  HASH_REMOVE,
406  &found);
407  Assert(found);
408  Assert(PrivateRefCountOverflowed > 0);
409  PrivateRefCountOverflowed--;
410  }
411 }
412 
413 /*
414  * BufferIsPinned
415  * True iff the buffer is pinned (also checks for valid buffer number).
416  *
417  * NOTE: what we check here is that *this* backend holds a pin on
418  * the buffer. We do not care whether some other backend does.
419  */
420 #define BufferIsPinned(bufnum) \
421 ( \
422  !BufferIsValid(bufnum) ? \
423  false \
424  : \
425  BufferIsLocal(bufnum) ? \
426  (LocalRefCount[-(bufnum) - 1] > 0) \
427  : \
428  (GetPrivateRefCount(bufnum) > 0) \
429 )
430 
431 
432 static Buffer ReadBuffer_common(SMgrRelation smgr, char relpersistence,
433  ForkNumber forkNum, BlockNumber blockNum,
434  ReadBufferMode mode, BufferAccessStrategy strategy,
435  bool *hit);
436 static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy);
437 static void PinBuffer_Locked(BufferDesc *buf);
438 static void UnpinBuffer(BufferDesc *buf, bool fixOwner);
439 static void BufferSync(int flags);
440 static uint32 WaitBufHdrUnlocked(BufferDesc *buf);
441 static int SyncOneBuffer(int buf_id, bool skip_recently_used,
442  WritebackContext *wb_context);
443 static void WaitIO(BufferDesc *buf);
444 static bool StartBufferIO(BufferDesc *buf, bool forInput);
445 static void TerminateBufferIO(BufferDesc *buf, bool clear_dirty,
446  uint32 set_flag_bits);
447 static void shared_buffer_write_error_callback(void *arg);
448 static void local_buffer_write_error_callback(void *arg);
449 static BufferDesc *BufferAlloc(SMgrRelation smgr,
450  char relpersistence,
451  ForkNumber forkNum,
452  BlockNumber blockNum,
453  BufferAccessStrategy strategy,
454  bool *foundPtr);
455 static void FlushBuffer(BufferDesc *buf, SMgrRelation reln);
456 static void AtProcExit_Buffers(int code, Datum arg);
457 static void CheckForBufferLeaks(void);
458 static int rnode_comparator(const void *p1, const void *p2);
459 static int buffertag_comparator(const void *p1, const void *p2);
460 static int ckpt_buforder_comparator(const void *pa, const void *pb);
461 static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg);
462 
463 
464 /*
465  * ComputeIoConcurrency -- get the number of pages to prefetch for a given
466  * number of spindles.
467  */
468 bool
469 ComputeIoConcurrency(int io_concurrency, double *target)
470 {
471  double new_prefetch_pages = 0.0;
472  int i;
473 
474  /*
475  * Make sure the io_concurrency value is within valid range; it may have
476  * been forced with a manual pg_tablespace update.
477  */
478  io_concurrency = Min(Max(io_concurrency, 0), MAX_IO_CONCURRENCY);
479 
480  /*----------
481  * The user-visible GUC parameter is the number of drives (spindles),
482  * which we need to translate to a number-of-pages-to-prefetch target.
483  * The target value is stashed in *extra and then assigned to the actual
484  * variable by assign_effective_io_concurrency.
485  *
486  * The expected number of prefetch pages needed to keep N drives busy is:
487  *
488  * drives | I/O requests
489  * -------+----------------
490  * 1 | 1
491  * 2 | 2/1 + 2/2 = 3
492  * 3 | 3/1 + 3/2 + 3/3 = 5 1/2
493  * 4 | 4/1 + 4/2 + 4/3 + 4/4 = 8 1/3
494  * n | n * H(n)
495  *
496  * This is called the "coupon collector problem" and H(n) is called the
497  * harmonic series. This could be approximated by n * ln(n), but for
498  * reasonable numbers of drives we might as well just compute the series.
499  *
500  * Alternatively we could set the target to the number of pages necessary
501  * so that the expected number of active spindles is some arbitrary
502  * percentage of the total. This sounds the same but is actually slightly
503  * different. The result ends up being ln(1-P)/ln((n-1)/n) where P is
504  * that desired fraction.
505  *
506  * Experimental results show that both of these formulas aren't aggressive
507  * enough, but we don't really have any better proposals.
508  *
509  * Note that if io_concurrency = 0 (disabled), we must set target = 0.
510  *----------
511  */
512 
513  for (i = 1; i <= io_concurrency; i++)
514  new_prefetch_pages += (double) io_concurrency / (double) i;
515 
516  *target = new_prefetch_pages;
517 
518  /* This range check shouldn't fail, but let's be paranoid */
519  return (new_prefetch_pages >= 0.0 && new_prefetch_pages < (double) INT_MAX);
520 }
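/*
 * Editor's note: illustrative sketch, not part of the original file.  For
 * example, io_concurrency = 4 yields 4/1 + 4/2 + 4/3 + 4/4 = 8.33..., so a
 * caller would aim to keep roughly eight prefetch requests in flight:
 *
 *		double		target;
 *
 *		if (ComputeIoConcurrency(4, &target))
 *			elog(DEBUG1, "prefetch target: %g pages", target);
 */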
521 
522 /*
523  * PrefetchBuffer -- initiate asynchronous read of a block of a relation
524  *
525  * This is named by analogy to ReadBuffer but doesn't actually allocate a
526  * buffer. Instead it tries to ensure that a future ReadBuffer for the given
527  * block will not be delayed by the I/O. Prefetching is optional.
528  * No-op if prefetching isn't compiled in.
529  */
530 void
531 PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
532 {
533 #ifdef USE_PREFETCH
534  Assert(RelationIsValid(reln));
535  Assert(BlockNumberIsValid(blockNum));
536 
537  /* Open it at the smgr level if not already done */
538  RelationOpenSmgr(reln);
539 
540  if (RelationUsesLocalBuffers(reln))
541  {
542  /* see comments in ReadBufferExtended */
543  if (RELATION_IS_OTHER_TEMP(reln))
544  ereport(ERROR,
545  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
546  errmsg("cannot access temporary tables of other sessions")));
547 
548  /* pass it off to localbuf.c */
549  LocalPrefetchBuffer(reln->rd_smgr, forkNum, blockNum);
550  }
551  else
552  {
553  BufferTag newTag; /* identity of requested block */
554  uint32 newHash; /* hash value for newTag */
555  LWLock *newPartitionLock; /* buffer partition lock for it */
556  int buf_id;
557 
558  /* create a tag so we can lookup the buffer */
559  INIT_BUFFERTAG(newTag, reln->rd_smgr->smgr_rnode.node,
560  forkNum, blockNum);
561 
562  /* determine its hash code and partition lock ID */
563  newHash = BufTableHashCode(&newTag);
564  newPartitionLock = BufMappingPartitionLock(newHash);
565 
566  /* see if the block is in the buffer pool already */
567  LWLockAcquire(newPartitionLock, LW_SHARED);
568  buf_id = BufTableLookup(&newTag, newHash);
569  LWLockRelease(newPartitionLock);
570 
571  /* If not in buffers, initiate prefetch */
572  if (buf_id < 0)
573  smgrprefetch(reln->rd_smgr, forkNum, blockNum);
574 
575  /*
576  * If the block *is* in buffers, we do nothing. This is not really
577  * ideal: the block might be just about to be evicted, which would be
578  * stupid since we know we are going to need it soon. But the only
579  * easy answer is to bump the usage_count, which does not seem like a
580  * great solution: when the caller does ultimately touch the block,
581  * usage_count would get bumped again, resulting in too much
582  * favoritism for blocks that are involved in a prefetch sequence. A
583  * real fix would involve some additional per-buffer state, and it's
584  * not clear that there's enough of a problem to justify that.
585  */
586  }
587 #endif /* USE_PREFETCH */
588 }
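/*
 * Editor's note: illustrative sketch, not part of the original file.  A
 * typical PrefetchBuffer() caller issues prefetches a fixed distance ahead
 * of its reads; "example_prefetching_scan" and the distance of 8 blocks are
 * invented stand-ins for whatever a real caller derives from
 * effective_io_concurrency.
 */
static void
example_prefetching_scan(Relation rel, BlockNumber nblocks)
{
	BlockNumber blkno;

	for (blkno = 0; blkno < nblocks; blkno++)
	{
		Buffer		buf;

		/* hint at a block we will need shortly */
		if (blkno + 8 < nblocks)
			PrefetchBuffer(rel, MAIN_FORKNUM, blkno + 8);

		buf = ReadBuffer(rel, blkno);
		/* ... examine the page while the pin is held ... */
		ReleaseBuffer(buf);
	}
}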
589 
590 
591 /*
592  * ReadBuffer -- a shorthand for ReadBufferExtended, for reading from main
593  * fork with RBM_NORMAL mode and default strategy.
594  */
595 Buffer
596 ReadBuffer(Relation reln, BlockNumber blockNum)
597 {
598  return ReadBufferExtended(reln, MAIN_FORKNUM, blockNum, RBM_NORMAL, NULL);
599 }
600 
601 /*
602  * ReadBufferExtended -- returns a buffer containing the requested
603  * block of the requested relation. If the blknum
604  * requested is P_NEW, extend the relation file and
605  * allocate a new block. (Caller is responsible for
606  * ensuring that only one backend tries to extend a
607  * relation at the same time!)
608  *
609  * Returns: the buffer number for the buffer containing
610  * the block read. The returned buffer has been pinned.
611  * Does not return on error --- elog's instead.
612  *
613  * Assume when this function is called, that reln has been opened already.
614  *
615  * In RBM_NORMAL mode, the page is read from disk, and the page header is
616  * validated. An error is thrown if the page header is not valid. (But
617  * note that an all-zero page is considered "valid"; see PageIsVerified().)
618  *
619  * RBM_ZERO_ON_ERROR is like the normal mode, but if the page header is not
620  * valid, the page is zeroed instead of throwing an error. This is intended
621  * for non-critical data, where the caller is prepared to repair errors.
622  *
623  * In RBM_ZERO_AND_LOCK mode, if the page isn't in buffer cache already, it's
624  * filled with zeros instead of reading it from disk. Useful when the caller
625  * is going to fill the page from scratch, since this saves I/O and avoids
626  * unnecessary failure if the page-on-disk has corrupt page headers.
627  * The page is returned locked to ensure that the caller has a chance to
628  * initialize the page before it's made visible to others.
629  * Caution: do not use this mode to read a page that is beyond the relation's
630  * current physical EOF; that is likely to cause problems in md.c when
631  * the page is modified and written out. P_NEW is OK, though.
632  *
633  * RBM_ZERO_AND_CLEANUP_LOCK is the same as RBM_ZERO_AND_LOCK, but acquires
634  * a cleanup-strength lock on the page.
635  *
636  * RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here.
637  *
638  * If strategy is not NULL, a nondefault buffer access strategy is used.
639  * See buffer/README for details.
640  */
641 Buffer
642 ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum,
643  ReadBufferMode mode, BufferAccessStrategy strategy)
644 {
645  bool hit;
646  Buffer buf;
647 
648  /* Open it at the smgr level if not already done */
649  RelationOpenSmgr(reln);
650 
651  /*
652  * Reject attempts to read non-local temporary relations; we would be
653  * likely to get wrong data since we have no visibility into the owning
654  * session's local buffers.
655  */
656  if (RELATION_IS_OTHER_TEMP(reln))
657  ereport(ERROR,
658  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
659  errmsg("cannot access temporary tables of other sessions")));
660 
661  /*
662  * Read the buffer, and update pgstat counters to reflect a cache hit or
663  * miss.
664  */
665  pgstat_count_buffer_read(reln);
666  buf = ReadBuffer_common(reln->rd_smgr, reln->rd_rel->relpersistence,
667  forkNum, blockNum, mode, strategy, &hit);
668  if (hit)
669  pgstat_count_buffer_hit(reln);
670  return buf;
671 }
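/*
 * Editor's note: illustrative sketch, not part of the original file.
 * Extending a relation through this interface: P_NEW allocates the next
 * block, which comes back zero-filled and pinned.  A real caller also holds
 * the relation extension lock (the "only one backend extends at a time"
 * rule above) and WAL-logs the initialized page; both are omitted here.
 * "example_extend_relation" is an invented name.
 */
static Buffer
example_extend_relation(Relation rel)
{
	Buffer		buf;
	Page		page;

	buf = ReadBufferExtended(rel, MAIN_FORKNUM, P_NEW, RBM_NORMAL, NULL);
	LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

	page = BufferGetPage(buf);
	PageInit(page, BufferGetPageSize(buf), 0);
	MarkBufferDirty(buf);

	LockBuffer(buf, BUFFER_LOCK_UNLOCK);
	return buf;					/* still pinned; caller must release it */
}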
672 
673 
674 /*
675  * ReadBufferWithoutRelcache -- like ReadBufferExtended, but doesn't require
676  * a relcache entry for the relation.
677  *
678  * NB: At present, this function may only be used on permanent relations, which
679  * is OK, because we only use it during XLOG replay. If in the future we
680  * want to use it on temporary or unlogged relations, we could pass additional
681  * parameters.
682  */
683 Buffer
684 ReadBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum,
685  BlockNumber blockNum, ReadBufferMode mode,
686  BufferAccessStrategy strategy)
687 {
688  bool hit;
689 
690  SMgrRelation smgr = smgropen(rnode, InvalidBackendId);
691 
693 
694  return ReadBuffer_common(smgr, RELPERSISTENCE_PERMANENT, forkNum, blockNum,
695  mode, strategy, &hit);
696 }
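/*
 * Editor's note: illustrative sketch, not part of the original file.  A redo
 * routine usually has only a RelFileNode taken from the WAL record rather
 * than a Relation, which is why this relcache-free variant exists; in
 * practice it is reached through xlogutils.c rather than called directly:
 *
 *		buf = ReadBufferWithoutRelcache(rnode, MAIN_FORKNUM, blkno,
 *										RBM_ZERO_ON_ERROR, NULL);
 */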
697 
698 
699 /*
700  * ReadBuffer_common -- common logic for all ReadBuffer variants
701  *
702  * *hit is set to true if the request was satisfied from shared buffer cache.
703  */
704 static Buffer
705 ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
706  BlockNumber blockNum, ReadBufferMode mode,
707  BufferAccessStrategy strategy, bool *hit)
708 {
709  BufferDesc *bufHdr;
710  Block bufBlock;
711  bool found;
712  bool isExtend;
713  bool isLocalBuf = SmgrIsTemp(smgr);
714 
715  *hit = false;
716 
717  /* Make sure we will have room to remember the buffer pin */
718  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
719 
720  isExtend = (blockNum == P_NEW);
721 
722  TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
723  smgr->smgr_rnode.node.spcNode,
724  smgr->smgr_rnode.node.dbNode,
725  smgr->smgr_rnode.node.relNode,
726  smgr->smgr_rnode.backend,
727  isExtend);
728 
729  /* Substitute proper block number if caller asked for P_NEW */
730  if (isExtend)
731  blockNum = smgrnblocks(smgr, forkNum);
732 
733  if (isLocalBuf)
734  {
735  bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, &found);
736  if (found)
738  else if (isExtend)
740  else if (mode == RBM_NORMAL || mode == RBM_NORMAL_NO_LOG ||
741  mode == RBM_ZERO_ON_ERROR)
743  }
744  else
745  {
746  /*
747  * lookup the buffer. IO_IN_PROGRESS is set if the requested block is
748  * not currently in memory.
749  */
750  bufHdr = BufferAlloc(smgr, relpersistence, forkNum, blockNum,
751  strategy, &found);
752  if (found)
753  pgBufferUsage.shared_blks_hit++;
754  else if (isExtend)
755  pgBufferUsage.shared_blks_written++;
756  else if (mode == RBM_NORMAL || mode == RBM_NORMAL_NO_LOG ||
757  mode == RBM_ZERO_ON_ERROR)
758  pgBufferUsage.shared_blks_read++;
759  }
760 
761  /* At this point we do NOT hold any locks. */
762 
763  /* if it was already in the buffer pool, we're done */
764  if (found)
765  {
766  if (!isExtend)
767  {
768  /* Just need to update stats before we exit */
769  *hit = true;
770  VacuumPageHit++;
771 
772  if (VacuumCostActive)
773  VacuumCostBalance += VacuumCostPageHit;
774 
775  TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
776  smgr->smgr_rnode.node.spcNode,
777  smgr->smgr_rnode.node.dbNode,
778  smgr->smgr_rnode.node.relNode,
779  smgr->smgr_rnode.backend,
780  isExtend,
781  found);
782 
783  /*
784  * In RBM_ZERO_AND_LOCK mode the caller expects the page to be
785  * locked on return.
786  */
787  if (!isLocalBuf)
788  {
789  if (mode == RBM_ZERO_AND_LOCK)
790  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr),
791  LW_EXCLUSIVE);
792  else if (mode == RBM_ZERO_AND_CLEANUP_LOCK)
793  LockBufferForCleanup(BufferDescriptorGetBuffer(bufHdr));
794  }
795 
796  return BufferDescriptorGetBuffer(bufHdr);
797  }
798 
799  /*
800  * We get here only in the corner case where we are trying to extend
801  * the relation but we found a pre-existing buffer marked BM_VALID.
802  * This can happen because mdread doesn't complain about reads beyond
803  * EOF (when zero_damaged_pages is ON) and so a previous attempt to
804  * read a block beyond EOF could have left a "valid" zero-filled
805  * buffer. Unfortunately, we have also seen this case occurring
806  * because of buggy Linux kernels that sometimes return an
807  * lseek(SEEK_END) result that doesn't account for a recent write. In
808  * that situation, the pre-existing buffer would contain valid data
809  * that we don't want to overwrite. Since the legitimate case should
810  * always have left a zero-filled buffer, complain if not PageIsNew.
811  */
812  bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
813  if (!PageIsNew((Page) bufBlock))
814  ereport(ERROR,
815  (errmsg("unexpected data beyond EOF in block %u of relation %s",
816  blockNum, relpath(smgr->smgr_rnode, forkNum)),
817  errhint("This has been seen to occur with buggy kernels; consider updating your system.")));
818 
819  /*
820  * We *must* do smgrextend before succeeding, else the page will not
821  * be reserved by the kernel, and the next P_NEW call will decide to
822  * return the same page. Clear the BM_VALID bit, do the StartBufferIO
823  * call that BufferAlloc didn't, and proceed.
824  */
825  if (isLocalBuf)
826  {
827  /* Only need to adjust flags */
828  uint32 buf_state = pg_atomic_read_u32(&bufHdr->state);
829 
830  Assert(buf_state & BM_VALID);
831  buf_state &= ~BM_VALID;
832  pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
833  }
834  else
835  {
836  /*
837  * Loop to handle the very small possibility that someone re-sets
838  * BM_VALID between our clearing it and StartBufferIO inspecting
839  * it.
840  */
841  do
842  {
843  uint32 buf_state = LockBufHdr(bufHdr);
844 
845  Assert(buf_state & BM_VALID);
846  buf_state &= ~BM_VALID;
847  UnlockBufHdr(bufHdr, buf_state);
848  } while (!StartBufferIO(bufHdr, true));
849  }
850  }
851 
852  /*
853  * if we have gotten to this point, we have allocated a buffer for the
854  * page but its contents are not yet valid. IO_IN_PROGRESS is set for it,
855  * if it's a shared buffer.
856  *
857  * Note: if smgrextend fails, we will end up with a buffer that is
858  * allocated but not marked BM_VALID. P_NEW will still select the same
859  * block number (because the relation didn't get any longer on disk) and
860  * so future attempts to extend the relation will find the same buffer (if
861  * it's not been recycled) but come right back here to try smgrextend
862  * again.
863  */
864  Assert(!(pg_atomic_read_u32(&bufHdr->state) & BM_VALID)); /* spinlock not needed */
865 
866  bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
867 
868  if (isExtend)
869  {
870  /* new buffers are zero-filled */
871  MemSet((char *) bufBlock, 0, BLCKSZ);
872  /* don't set checksum for all-zero page */
873  smgrextend(smgr, forkNum, blockNum, (char *) bufBlock, false);
874 
875  /*
876  * NB: we're *not* doing a ScheduleBufferTagForWriteback here;
877  * although we're essentially performing a write. At least on linux
878  * doing so defeats the 'delayed allocation' mechanism, leading to
879  * increased file fragmentation.
880  */
881  }
882  else
883  {
884  /*
885  * Read in the page, unless the caller intends to overwrite it and
886  * just wants us to allocate a buffer.
887  */
888  if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
889  MemSet((char *) bufBlock, 0, BLCKSZ);
890  else
891  {
892  instr_time io_start,
893  io_time;
894 
895  if (track_io_timing)
896  INSTR_TIME_SET_CURRENT(io_start);
897 
898  smgrread(smgr, forkNum, blockNum, (char *) bufBlock);
899 
900  if (track_io_timing)
901  {
902  INSTR_TIME_SET_CURRENT(io_time);
903  INSTR_TIME_SUBTRACT(io_time, io_start);
904  pgstat_count_buffer_read_time(INSTR_TIME_GET_MICROSEC(io_time));
905  INSTR_TIME_ADD(pgBufferUsage.blk_read_time, io_time);
906  }
907 
908  /* check for garbage data */
909  if (!PageIsVerified((Page) bufBlock, blockNum))
910  {
911  if (mode == RBM_ZERO_ON_ERROR || zero_damaged_pages)
912  {
913  ereport(WARNING,
914  (errcode(ERRCODE_DATA_CORRUPTED),
915  errmsg("invalid page in block %u of relation %s; zeroing out page",
916  blockNum,
917  relpath(smgr->smgr_rnode, forkNum))));
918  MemSet((char *) bufBlock, 0, BLCKSZ);
919  }
920  else
921  ereport(ERROR,
922  (errcode(ERRCODE_DATA_CORRUPTED),
923  errmsg("invalid page in block %u of relation %s",
924  blockNum,
925  relpath(smgr->smgr_rnode, forkNum))));
926  }
927  }
928  }
929 
930  /*
931  * In RBM_ZERO_AND_LOCK mode, grab the buffer content lock before marking
932  * the page as valid, to make sure that no other backend sees the zeroed
933  * page before the caller has had a chance to initialize it.
934  *
935  * Since no-one else can be looking at the page contents yet, there is no
936  * difference between an exclusive lock and a cleanup-strength lock. (Note
937  * that we cannot use LockBuffer() or LockBufferForCleanup() here, because
938  * they assert that the buffer is already valid.)
939  */
940  if ((mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) &&
941  !isLocalBuf)
942  {
943  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_EXCLUSIVE);
944  }
945 
946  if (isLocalBuf)
947  {
948  /* Only need to adjust flags */
949  uint32 buf_state = pg_atomic_read_u32(&bufHdr->state);
950 
951  buf_state |= BM_VALID;
952  pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
953  }
954  else
955  {
956  /* Set BM_VALID, terminate IO, and wake up any waiters */
957  TerminateBufferIO(bufHdr, false, BM_VALID);
958  }
959 
960  VacuumPageMiss++;
961  if (VacuumCostActive)
962  VacuumCostBalance += VacuumCostPageMiss;
963 
964  TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
965  smgr->smgr_rnode.node.spcNode,
966  smgr->smgr_rnode.node.dbNode,
967  smgr->smgr_rnode.node.relNode,
968  smgr->smgr_rnode.backend,
969  isExtend,
970  found);
971 
972  return BufferDescriptorGetBuffer(bufHdr);
973 }
974 
975 /*
976  * BufferAlloc -- subroutine for ReadBuffer. Handles lookup of a shared
977  * buffer. If no buffer exists already, selects a replacement
978  * victim and evicts the old page, but does NOT read in new page.
979  *
980  * "strategy" can be a buffer replacement strategy object, or NULL for
981  * the default strategy. The selected buffer's usage_count is advanced when
982  * using the default strategy, but otherwise possibly not (see PinBuffer).
983  *
984  * The returned buffer is pinned and is already marked as holding the
985  * desired page. If it already did have the desired page, *foundPtr is
986  * set true. Otherwise, *foundPtr is set false and the buffer is marked
987  * as IO_IN_PROGRESS; ReadBuffer will now need to do I/O to fill it.
988  *
989  * *foundPtr is actually redundant with the buffer's BM_VALID flag, but
990  * we keep it for simplicity in ReadBuffer.
991  *
992  * No locks are held either at entry or exit.
993  */
994 static BufferDesc *
995 BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
996  BlockNumber blockNum,
997  BufferAccessStrategy strategy,
998  bool *foundPtr)
999 {
1000  BufferTag newTag; /* identity of requested block */
1001  uint32 newHash; /* hash value for newTag */
1002  LWLock *newPartitionLock; /* buffer partition lock for it */
1003  BufferTag oldTag; /* previous identity of selected buffer */
1004  uint32 oldHash; /* hash value for oldTag */
1005  LWLock *oldPartitionLock; /* buffer partition lock for it */
1006  uint32 oldFlags;
1007  int buf_id;
1008  BufferDesc *buf;
1009  bool valid;
1010  uint32 buf_state;
1011 
1012  /* create a tag so we can lookup the buffer */
1013  INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
1014 
1015  /* determine its hash code and partition lock ID */
1016  newHash = BufTableHashCode(&newTag);
1017  newPartitionLock = BufMappingPartitionLock(newHash);
1018 
1019  /* see if the block is in the buffer pool already */
1020  LWLockAcquire(newPartitionLock, LW_SHARED);
1021  buf_id = BufTableLookup(&newTag, newHash);
1022  if (buf_id >= 0)
1023  {
1024  /*
1025  * Found it. Now, pin the buffer so no one can steal it from the
1026  * buffer pool, and check to see if the correct data has been loaded
1027  * into the buffer.
1028  */
1029  buf = GetBufferDescriptor(buf_id);
1030 
1031  valid = PinBuffer(buf, strategy);
1032 
1033  /* Can release the mapping lock as soon as we've pinned it */
1034  LWLockRelease(newPartitionLock);
1035 
1036  *foundPtr = true;
1037 
1038  if (!valid)
1039  {
1040  /*
1041  * We can only get here if (a) someone else is still reading in
1042  * the page, or (b) a previous read attempt failed. We have to
1043  * wait for any active read attempt to finish, and then set up our
1044  * own read attempt if the page is still not BM_VALID.
1045  * StartBufferIO does it all.
1046  */
1047  if (StartBufferIO(buf, true))
1048  {
1049  /*
1050  * If we get here, previous attempts to read the buffer must
1051  * have failed ... but we shall bravely try again.
1052  */
1053  *foundPtr = false;
1054  }
1055  }
1056 
1057  return buf;
1058  }
1059 
1060  /*
1061  * Didn't find it in the buffer pool. We'll have to initialize a new
1062  * buffer. Remember to unlock the mapping lock while doing the work.
1063  */
1064  LWLockRelease(newPartitionLock);
1065 
1066  /* Loop here in case we have to try another victim buffer */
1067  for (;;)
1068  {
1069  /*
1070  * Ensure, while the spinlock's not yet held, that there's a free
1071  * refcount entry.
1072  */
1073  ReservePrivateRefCountEntry();
1074 
1075  /*
1076  * Select a victim buffer. The buffer is returned with its header
1077  * spinlock still held!
1078  */
1079  buf = StrategyGetBuffer(strategy, &buf_state);
1080 
1081  Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 0);
1082 
1083  /* Must copy buffer flags while we still hold the spinlock */
1084  oldFlags = buf_state & BUF_FLAG_MASK;
1085 
1086  /* Pin the buffer and then release the buffer spinlock */
1087  PinBuffer_Locked(buf);
1088 
1089  /*
1090  * If the buffer was dirty, try to write it out. There is a race
1091  * condition here, in that someone might dirty it after we released it
1092  * above, or even while we are writing it out (since our share-lock
1093  * won't prevent hint-bit updates). We will recheck the dirty bit
1094  * after re-locking the buffer header.
1095  */
1096  if (oldFlags & BM_DIRTY)
1097  {
1098  /*
1099  * We need a share-lock on the buffer contents to write it out
1100  * (else we might write invalid data, eg because someone else is
1101  * compacting the page contents while we write). We must use a
1102  * conditional lock acquisition here to avoid deadlock. Even
1103  * though the buffer was not pinned (and therefore surely not
1104  * locked) when StrategyGetBuffer returned it, someone else could
1105  * have pinned and exclusive-locked it by the time we get here. If
1106  * we try to get the lock unconditionally, we'd block waiting for
1107  * them; if they later block waiting for us, deadlock ensues.
1108  * (This has been observed to happen when two backends are both
1109  * trying to split btree index pages, and the second one just
1110  * happens to be trying to split the page the first one got from
1111  * StrategyGetBuffer.)
1112  */
1113  if (LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
1114  LW_SHARED))
1115  {
1116  /*
1117  * If using a nondefault strategy, and writing the buffer
1118  * would require a WAL flush, let the strategy decide whether
1119  * to go ahead and write/reuse the buffer or to choose another
1120  * victim. We need lock to inspect the page LSN, so this
1121  * can't be done inside StrategyGetBuffer.
1122  */
1123  if (strategy != NULL)
1124  {
1125  XLogRecPtr lsn;
1126 
1127  /* Read the LSN while holding buffer header lock */
1128  buf_state = LockBufHdr(buf);
1129  lsn = BufferGetLSN(buf);
1130  UnlockBufHdr(buf, buf_state);
1131 
1132  if (XLogNeedsFlush(lsn) &&
1133  StrategyRejectBuffer(strategy, buf))
1134  {
1135  /* Drop lock/pin and loop around for another buffer */
1136  LWLockRelease(BufferDescriptorGetContentLock(buf));
1137  UnpinBuffer(buf, true);
1138  continue;
1139  }
1140  }
1141 
1142  /* OK, do the I/O */
1143  TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_START(forkNum, blockNum,
1144  smgr->smgr_rnode.node.spcNode,
1145  smgr->smgr_rnode.node.dbNode,
1146  smgr->smgr_rnode.node.relNode);
1147 
1148  FlushBuffer(buf, NULL);
1149  LWLockRelease(BufferDescriptorGetContentLock(buf));
1150 
1151  ScheduleBufferTagForWriteback(&BackendWritebackContext,
1152  &buf->tag);
1153 
1154  TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_DONE(forkNum, blockNum,
1155  smgr->smgr_rnode.node.spcNode,
1156  smgr->smgr_rnode.node.dbNode,
1157  smgr->smgr_rnode.node.relNode);
1158  }
1159  else
1160  {
1161  /*
1162  * Someone else has locked the buffer, so give it up and loop
1163  * back to get another one.
1164  */
1165  UnpinBuffer(buf, true);
1166  continue;
1167  }
1168  }
1169 
1170  /*
1171  * To change the association of a valid buffer, we'll need to have
1172  * exclusive lock on both the old and new mapping partitions.
1173  */
1174  if (oldFlags & BM_TAG_VALID)
1175  {
1176  /*
1177  * Need to compute the old tag's hashcode and partition lock ID.
1178  * XXX is it worth storing the hashcode in BufferDesc so we need
1179  * not recompute it here? Probably not.
1180  */
1181  oldTag = buf->tag;
1182  oldHash = BufTableHashCode(&oldTag);
1183  oldPartitionLock = BufMappingPartitionLock(oldHash);
1184 
1185  /*
1186  * Must lock the lower-numbered partition first to avoid
1187  * deadlocks.
1188  */
1189  if (oldPartitionLock < newPartitionLock)
1190  {
1191  LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
1192  LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1193  }
1194  else if (oldPartitionLock > newPartitionLock)
1195  {
1196  LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1197  LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
1198  }
1199  else
1200  {
1201  /* only one partition, only one lock */
1202  LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1203  }
1204  }
1205  else
1206  {
1207  /* if it wasn't valid, we need only the new partition */
1208  LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1209  /* remember we have no old-partition lock or tag */
1210  oldPartitionLock = NULL;
1211  /* this just keeps the compiler quiet about uninit variables */
1212  oldHash = 0;
1213  }
1214 
1215  /*
1216  * Try to make a hashtable entry for the buffer under its new tag.
1217  * This could fail because while we were writing someone else
1218  * allocated another buffer for the same block we want to read in.
1219  * Note that we have not yet removed the hashtable entry for the old
1220  * tag.
1221  */
1222  buf_id = BufTableInsert(&newTag, newHash, buf->buf_id);
1223 
1224  if (buf_id >= 0)
1225  {
1226  /*
1227  * Got a collision. Someone has already done what we were about to
1228  * do. We'll just handle this as if it were found in the buffer
1229  * pool in the first place. First, give up the buffer we were
1230  * planning to use.
1231  */
1232  UnpinBuffer(buf, true);
1233 
1234  /* Can give up that buffer's mapping partition lock now */
1235  if (oldPartitionLock != NULL &&
1236  oldPartitionLock != newPartitionLock)
1237  LWLockRelease(oldPartitionLock);
1238 
1239  /* remaining code should match code at top of routine */
1240 
1241  buf = GetBufferDescriptor(buf_id);
1242 
1243  valid = PinBuffer(buf, strategy);
1244 
1245  /* Can release the mapping lock as soon as we've pinned it */
1246  LWLockRelease(newPartitionLock);
1247 
1248  *foundPtr = true;
1249 
1250  if (!valid)
1251  {
1252  /*
1253  * We can only get here if (a) someone else is still reading
1254  * in the page, or (b) a previous read attempt failed. We
1255  * have to wait for any active read attempt to finish, and
1256  * then set up our own read attempt if the page is still not
1257  * BM_VALID. StartBufferIO does it all.
1258  */
1259  if (StartBufferIO(buf, true))
1260  {
1261  /*
1262  * If we get here, previous attempts to read the buffer
1263  * must have failed ... but we shall bravely try again.
1264  */
1265  *foundPtr = false;
1266  }
1267  }
1268 
1269  return buf;
1270  }
1271 
1272  /*
1273  * Need to lock the buffer header too in order to change its tag.
1274  */
1275  buf_state = LockBufHdr(buf);
1276 
1277  /*
1278  * Somebody could have pinned or re-dirtied the buffer while we were
1279  * doing the I/O and making the new hashtable entry. If so, we can't
1280  * recycle this buffer; we must undo everything we've done and start
1281  * over with a new victim buffer.
1282  */
1283  oldFlags = buf_state & BUF_FLAG_MASK;
1284  if (BUF_STATE_GET_REFCOUNT(buf_state) == 1 && !(oldFlags & BM_DIRTY))
1285  break;
1286 
1287  UnlockBufHdr(buf, buf_state);
1288  BufTableDelete(&newTag, newHash);
1289  if (oldPartitionLock != NULL &&
1290  oldPartitionLock != newPartitionLock)
1291  LWLockRelease(oldPartitionLock);
1292  LWLockRelease(newPartitionLock);
1293  UnpinBuffer(buf, true);
1294  }
1295 
1296  /*
1297  * Okay, it's finally safe to rename the buffer.
1298  *
1299  * Clearing BM_VALID here is necessary, clearing the dirtybits is just
1300  * paranoia. We also reset the usage_count since any recency of use of
1301  * the old content is no longer relevant. (The usage_count starts out at
1302  * 1 so that the buffer can survive one clock-sweep pass.)
1303  *
1304  * Make sure BM_PERMANENT is set for buffers that must be written at every
1305  * checkpoint. Unlogged buffers only need to be written at shutdown
1306  * checkpoints, except for their "init" forks, which need to be treated
1307  * just like permanent relations.
1308  */
1309  buf->tag = newTag;
1310  buf_state &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED |
1311  BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT |
1312  BUF_USAGECOUNT_MASK);
1313  if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM)
1314  buf_state |= BM_TAG_VALID | BM_PERMANENT | BUF_USAGECOUNT_ONE;
1315  else
1316  buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
1317 
1318  UnlockBufHdr(buf, buf_state);
1319 
1320  if (oldPartitionLock != NULL)
1321  {
1322  BufTableDelete(&oldTag, oldHash);
1323  if (oldPartitionLock != newPartitionLock)
1324  LWLockRelease(oldPartitionLock);
1325  }
1326 
1327  LWLockRelease(newPartitionLock);
1328 
1329  /*
1330  * Buffer contents are currently invalid. Try to get the io_in_progress
1331  * lock. If StartBufferIO returns false, then someone else managed to
1332  * read it before we did, so there's nothing left for BufferAlloc() to do.
1333  */
1334  if (StartBufferIO(buf, true))
1335  *foundPtr = false;
1336  else
1337  *foundPtr = true;
1338 
1339  return buf;
1340 }
1341 
1342 /*
1343  * InvalidateBuffer -- mark a shared buffer invalid and return it to the
1344  * freelist.
1345  *
1346  * The buffer header spinlock must be held at entry. We drop it before
1347  * returning. (This is sane because the caller must have locked the
1348  * buffer in order to be sure it should be dropped.)
1349  *
1350  * This is used only in contexts such as dropping a relation. We assume
1351  * that no other backend could possibly be interested in using the page,
1352  * so the only reason the buffer might be pinned is if someone else is
1353  * trying to write it out. We have to let them finish before we can
1354  * reclaim the buffer.
1355  *
1356  * The buffer could get reclaimed by someone else while we are waiting
1357  * to acquire the necessary locks; if so, don't mess it up.
1358  */
1359 static void
1360 InvalidateBuffer(BufferDesc *buf)
1361 {
1362  BufferTag oldTag;
1363  uint32 oldHash; /* hash value for oldTag */
1364  LWLock *oldPartitionLock; /* buffer partition lock for it */
1365  uint32 oldFlags;
1366  uint32 buf_state;
1367 
1368  /* Save the original buffer tag before dropping the spinlock */
1369  oldTag = buf->tag;
1370 
1371  buf_state = pg_atomic_read_u32(&buf->state);
1372  Assert(buf_state & BM_LOCKED);
1373  UnlockBufHdr(buf, buf_state);
1374 
1375  /*
1376  * Need to compute the old tag's hashcode and partition lock ID. XXX is it
1377  * worth storing the hashcode in BufferDesc so we need not recompute it
1378  * here? Probably not.
1379  */
1380  oldHash = BufTableHashCode(&oldTag);
1381  oldPartitionLock = BufMappingPartitionLock(oldHash);
1382 
1383 retry:
1384 
1385  /*
1386  * Acquire exclusive mapping lock in preparation for changing the buffer's
1387  * association.
1388  */
1389  LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
1390 
1391  /* Re-lock the buffer header */
1392  buf_state = LockBufHdr(buf);
1393 
1394  /* If it's changed while we were waiting for lock, do nothing */
1395  if (!BUFFERTAGS_EQUAL(buf->tag, oldTag))
1396  {
1397  UnlockBufHdr(buf, buf_state);
1398  LWLockRelease(oldPartitionLock);
1399  return;
1400  }
1401 
1402  /*
1403  * We assume the only reason for it to be pinned is that someone else is
1404  * flushing the page out. Wait for them to finish. (This could be an
1405  * infinite loop if the refcount is messed up... it would be nice to time
1406  * out after awhile, but there seems no way to be sure how many loops may
1407  * be needed. Note that if the other guy has pinned the buffer but not
1408  * yet done StartBufferIO, WaitIO will fall through and we'll effectively
1409  * be busy-looping here.)
1410  */
1411  if (BUF_STATE_GET_REFCOUNT(buf_state) != 0)
1412  {
1413  UnlockBufHdr(buf, buf_state);
1414  LWLockRelease(oldPartitionLock);
1415  /* safety check: should definitely not be our *own* pin */
1416  if (GetPrivateRefCount(BufferDescriptorGetBuffer(buf)) > 0)
1417  elog(ERROR, "buffer is pinned in InvalidateBuffer");
1418  WaitIO(buf);
1419  goto retry;
1420  }
1421 
1422  /*
1423  * Clear out the buffer's tag and flags. We must do this to ensure that
1424  * linear scans of the buffer array don't think the buffer is valid.
1425  */
1426  oldFlags = buf_state & BUF_FLAG_MASK;
1427  CLEAR_BUFFERTAG(buf->tag);
1428  buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
1429  UnlockBufHdr(buf, buf_state);
1430 
1431  /*
1432  * Remove the buffer from the lookup hashtable, if it was in there.
1433  */
1434  if (oldFlags & BM_TAG_VALID)
1435  BufTableDelete(&oldTag, oldHash);
1436 
1437  /*
1438  * Done with mapping lock.
1439  */
1440  LWLockRelease(oldPartitionLock);
1441 
1442  /*
1443  * Insert the buffer at the head of the list of free buffers.
1444  */
1445  StrategyFreeBuffer(buf);
1446 }
1447 
1448 /*
1449  * MarkBufferDirty
1450  *
1451  * Marks buffer contents as dirty (actual write happens later).
1452  *
1453  * Buffer must be pinned and exclusive-locked. (If caller does not hold
1454  * exclusive lock, then somebody could be in process of writing the buffer,
1455  * leading to risk of bad data written to disk.)
1456  */
1457 void
1458 MarkBufferDirty(Buffer buffer)
1459 {
1460  BufferDesc *bufHdr;
1461  uint32 buf_state;
1462  uint32 old_buf_state;
1463 
1464  if (!BufferIsValid(buffer))
1465  elog(ERROR, "bad buffer ID: %d", buffer);
1466 
1467  if (BufferIsLocal(buffer))
1468  {
1469  MarkLocalBufferDirty(buffer);
1470  return;
1471  }
1472 
1473  bufHdr = GetBufferDescriptor(buffer - 1);
1474 
1475  Assert(BufferIsPinned(buffer));
1476  Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
1477  LW_EXCLUSIVE));
1478 
1479  old_buf_state = pg_atomic_read_u32(&bufHdr->state);
1480  for (;;)
1481  {
1482  if (old_buf_state & BM_LOCKED)
1483  old_buf_state = WaitBufHdrUnlocked(bufHdr);
1484 
1485  buf_state = old_buf_state;
1486 
1487  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
1488  buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
1489 
1490  if (pg_atomic_compare_exchange_u32(&bufHdr->state, &old_buf_state,
1491  buf_state))
1492  break;
1493  }
1494 
1495  /*
1496  * If the buffer was not dirty already, do vacuum accounting.
1497  */
1498  if (!(old_buf_state & BM_DIRTY))
1499  {
1500  VacuumPageDirty++;
1501  pgBufferUsage.shared_blks_dirtied++;
1502  if (VacuumCostActive)
1503  VacuumCostBalance += VacuumCostPageDirty;
1504  }
1505 }
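/*
 * Editor's note: illustrative sketch, not part of the original file.  The
 * intended MarkBufferDirty() pattern: exclusive content lock, then modify
 * the page and mark it dirty inside a critical section.  A real caller also
 * emits a WAL record and sets the page LSN there; that part is omitted.
 * "example_modify_block" is an invented name.
 */
static void
example_modify_block(Relation rel, BlockNumber blkno)
{
	Buffer		buf;

	buf = ReadBuffer(rel, blkno);
	LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

	START_CRIT_SECTION();
	/* ... change something on BufferGetPage(buf) here ... */
	MarkBufferDirty(buf);		/* the actual disk write happens much later */
	END_CRIT_SECTION();

	UnlockReleaseBuffer(buf);
}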
1506 
1507 /*
1508  * ReleaseAndReadBuffer -- combine ReleaseBuffer() and ReadBuffer()
1509  *
1510  * Formerly, this saved one cycle of acquiring/releasing the BufMgrLock
1511  * compared to calling the two routines separately. Now it's mainly just
1512  * a convenience function. However, if the passed buffer is valid and
1513  * already contains the desired block, we just return it as-is; and that
1514  * does save considerable work compared to a full release and reacquire.
1515  *
1516  * Note: it is OK to pass buffer == InvalidBuffer, indicating that no old
1517  * buffer actually needs to be released. This case is the same as ReadBuffer,
1518  * but can save some tests in the caller.
1519  */
1520 Buffer
1521 ReleaseAndReadBuffer(Buffer buffer,
1522  Relation relation,
1523  BlockNumber blockNum)
1524 {
1525  ForkNumber forkNum = MAIN_FORKNUM;
1526  BufferDesc *bufHdr;
1527 
1528  if (BufferIsValid(buffer))
1529  {
1530  Assert(BufferIsPinned(buffer));
1531  if (BufferIsLocal(buffer))
1532  {
1533  bufHdr = GetLocalBufferDescriptor(-buffer - 1);
1534  if (bufHdr->tag.blockNum == blockNum &&
1535  RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node) &&
1536  bufHdr->tag.forkNum == forkNum)
1537  return buffer;
1538  ResourceOwnerForgetBuffer(CurrentResourceOwner, buffer);
1539  LocalRefCount[-buffer - 1]--;
1540  }
1541  else
1542  {
1543  bufHdr = GetBufferDescriptor(buffer - 1);
1544  /* we have pin, so it's ok to examine tag without spinlock */
1545  if (bufHdr->tag.blockNum == blockNum &&
1546  RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node) &&
1547  bufHdr->tag.forkNum == forkNum)
1548  return buffer;
1549  UnpinBuffer(bufHdr, true);
1550  }
1551  }
1552 
1553  return ReadBuffer(relation, blockNum);
1554 }
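/*
 * Editor's note: illustrative sketch, not part of the original file.  When a
 * scan steps from block to block, ReleaseAndReadBuffer() replaces a
 * ReleaseBuffer()/ReadBuffer() pair and, whenever the requested block is the
 * one already held, simply reuses the existing pin.  "example_walk_blocks"
 * is an invented name.
 */
static void
example_walk_blocks(Relation rel, BlockNumber start, BlockNumber end)
{
	Buffer		buf = InvalidBuffer;
	BlockNumber blkno;

	for (blkno = start; blkno < end; blkno++)
	{
		buf = ReleaseAndReadBuffer(buf, rel, blkno);
		LockBuffer(buf, BUFFER_LOCK_SHARE);
		/* ... examine the page ... */
		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
	}

	if (BufferIsValid(buf))
		ReleaseBuffer(buf);
}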
1555 
1556 /*
1557  * PinBuffer -- make buffer unavailable for replacement.
1558  *
1559  * For the default access strategy, the buffer's usage_count is incremented
1560  * when we first pin it; for other strategies we just make sure the usage_count
1561  * isn't zero. (The idea of the latter is that we don't want synchronized
1562  * heap scans to inflate the count, but we need it to not be zero to discourage
1563  * other backends from stealing buffers from our ring. As long as we cycle
1564  * through the ring faster than the global clock-sweep cycles, buffers in
1565  * our ring won't be chosen as victims for replacement by other backends.)
1566  *
1567  * This should be applied only to shared buffers, never local ones.
1568  *
1569  * Since buffers are pinned/unpinned very frequently, pin buffers without
1570  * taking the buffer header lock; instead update the state variable in loop of
1571  * CAS operations. Hopefully it's just a single CAS.
1572  *
1573  * Note that ResourceOwnerEnlargeBuffers must have been done already.
1574  *
1575  * Returns true if buffer is BM_VALID, else false. This provision allows
1576  * some callers to avoid an extra spinlock cycle.
1577  */
1578 static bool
1579 PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
1580 {
1581  Buffer b = BufferDescriptorGetBuffer(buf);
1582  bool result;
1583  PrivateRefCountEntry *ref;
1584 
1585  ref = GetPrivateRefCountEntry(b, true);
1586 
1587  if (ref == NULL)
1588  {
1589  uint32 buf_state;
1590  uint32 old_buf_state;
1591 
1592  ReservePrivateRefCountEntry();
1593  ref = NewPrivateRefCountEntry(b);
1594 
1595  old_buf_state = pg_atomic_read_u32(&buf->state);
1596  for (;;)
1597  {
1598  if (old_buf_state & BM_LOCKED)
1599  old_buf_state = WaitBufHdrUnlocked(buf);
1600 
1601  buf_state = old_buf_state;
1602 
1603  /* increase refcount */
1604  buf_state += BUF_REFCOUNT_ONE;
1605 
1606  if (strategy == NULL)
1607  {
1608  /* Default case: increase usagecount unless already max. */
1609  if (BUF_STATE_GET_USAGECOUNT(buf_state) != BM_MAX_USAGE_COUNT)
1610  buf_state += BUF_USAGECOUNT_ONE;
1611  }
1612  else
1613  {
1614  /*
1615  * Ring buffers shouldn't evict others from pool. Thus we
1616  * don't make usagecount more than 1.
1617  */
1618  if (BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
1619  buf_state += BUF_USAGECOUNT_ONE;
1620  }
1621 
1622  if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
1623  buf_state))
1624  {
1625  result = (buf_state & BM_VALID) != 0;
1626  break;
1627  }
1628  }
1629  }
1630  else
1631  {
1632  /* If we previously pinned the buffer, it must surely be valid */
1633  result = true;
1634  }
1635 
1636  ref->refcount++;
1637  Assert(ref->refcount > 0);
1638  ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
1639  return result;
1640 }
1641 
1642 /*
1643  * PinBuffer_Locked -- as above, but caller already locked the buffer header.
1644  * The spinlock is released before return.
1645  *
1646  * As this function is called with the spinlock held, the caller has to
1647  * previously call ReservePrivateRefCountEntry().
1648  *
1649  * Currently, no callers of this function want to modify the buffer's
1650  * usage_count at all, so there's no need for a strategy parameter.
1651  * Also we don't bother with a BM_VALID test (the caller could check that for
1652  * itself).
1653  *
1654  * Also all callers only ever use this function when it's known that the
1655  * buffer can't have a preexisting pin by this backend. That allows us to skip
1656  * searching the private refcount array & hash, which is a boon, because the
1657  * spinlock is still held.
1658  *
1659  * Note: use of this routine is frequently mandatory, not just an optimization
1660  * to save a spin lock/unlock cycle, because we need to pin a buffer before
1661  * its state can change under us.
1662  */
1663 static void
1664 PinBuffer_Locked(BufferDesc *buf)
1665 {
1666  Buffer b;
1667  PrivateRefCountEntry *ref;
1668  uint32 buf_state;
1669 
1670  /*
1671  * As explained, we don't expect any preexisting pins. That allows us to
1672  * manipulate the PrivateRefCount after releasing the spinlock
1673  */
1674  Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
1675 
1676  /*
1677  * Since we hold the buffer spinlock, we can update the buffer state and
1678  * release the lock in one operation.
1679  */
1680  buf_state = pg_atomic_read_u32(&buf->state);
1681  Assert(buf_state & BM_LOCKED);
1682  buf_state += BUF_REFCOUNT_ONE;
1683  UnlockBufHdr(buf, buf_state);
1684 
1685  b = BufferDescriptorGetBuffer(buf);
1686 
1687  ref = NewPrivateRefCountEntry(b);
1688  ref->refcount++;
1689 
1690  ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
1691 }
1692 
1693 /*
1694  * UnpinBuffer -- make buffer available for replacement.
1695  *
1696  * This should be applied only to shared buffers, never local ones.
1697  *
1698  * Most but not all callers want CurrentResourceOwner to be adjusted.
1699  * Those that don't should pass fixOwner = false.
1700  */
1701 static void
1702 UnpinBuffer(BufferDesc *buf, bool fixOwner)
1703 {
1704  PrivateRefCountEntry *ref;
1705  Buffer b = BufferDescriptorGetBuffer(buf);
1706 
1707  /* not moving as we're likely deleting it soon anyway */
1708  ref = GetPrivateRefCountEntry(b, false);
1709  Assert(ref != NULL);
1710 
1711  if (fixOwner)
1712  ResourceOwnerForgetBuffer(CurrentResourceOwner, b);
1713 
1714  Assert(ref->refcount > 0);
1715  ref->refcount--;
1716  if (ref->refcount == 0)
1717  {
1718  uint32 buf_state;
1719  uint32 old_buf_state;
1720 
1721  /* I'd better not still hold any locks on the buffer */
1722  Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf)));
1723  Assert(!LWLockHeldByMe(BufferDescriptorGetIOLock(buf)));
1724 
1725  /*
1726  * Decrement the shared reference count.
1727  *
1728  * Since buffer spinlock holder can update status using just write,
1729  * it's not safe to use atomic decrement here; thus use a CAS loop.
1730  */
1731  old_buf_state = pg_atomic_read_u32(&buf->state);
1732  for (;;)
1733  {
1734  if (old_buf_state & BM_LOCKED)
1735  old_buf_state = WaitBufHdrUnlocked(buf);
1736 
1737  buf_state = old_buf_state;
1738 
1739  buf_state -= BUF_REFCOUNT_ONE;
1740 
1741  if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
1742  buf_state))
1743  break;
1744  }
1745 
1746  /* Support LockBufferForCleanup() */
1747  if (buf_state & BM_PIN_COUNT_WAITER)
1748  {
1749  /*
1750  * Acquire the buffer header lock, re-check that there's a waiter.
1751  * Another backend could have unpinned this buffer, and already
1752  * woken up the waiter. There's no danger of the buffer being
1753  * replaced after we unpinned it above, as it's pinned by the
1754  * waiter.
1755  */
1756  buf_state = LockBufHdr(buf);
1757 
1758  if ((buf_state & BM_PIN_COUNT_WAITER) &&
1759  BUF_STATE_GET_REFCOUNT(buf_state) == 1)
1760  {
1761  /* we just released the last pin other than the waiter's */
1762  int wait_backend_pid = buf->wait_backend_pid;
1763 
1764  buf_state &= ~BM_PIN_COUNT_WAITER;
1765  UnlockBufHdr(buf, buf_state);
1766  ProcSendSignal(wait_backend_pid);
1767  }
1768  else
1769  UnlockBufHdr(buf, buf_state);
1770  }
1771  ForgetPrivateRefCountEntry(ref);
1772  }
1773 }
1774 
1775 /*
1776  * BufferSync -- Write out all dirty buffers in the pool.
1777  *
1778  * This is called at checkpoint time to write out all dirty shared buffers.
1779  * The checkpoint request flags should be passed in. If CHECKPOINT_IMMEDIATE
1780  * is set, we disable delays between writes; if CHECKPOINT_IS_SHUTDOWN,
1781  * CHECKPOINT_END_OF_RECOVERY or CHECKPOINT_FLUSH_ALL is set, we write even
1782  * unlogged buffers, which are otherwise skipped. The remaining flags
1783  * currently have no effect here.
1784  */
1785 static void
1786 BufferSync(int flags)
1787 {
1788  uint32 buf_state;
1789  int buf_id;
1790  int num_to_scan;
1791  int num_spaces;
1792  int num_processed;
1793  int num_written;
1794  CkptTsStatus *per_ts_stat = NULL;
1795  Oid last_tsid;
1796  binaryheap *ts_heap;
1797  int i;
1798  int mask = BM_DIRTY;
1799  WritebackContext wb_context;
1800 
1801  /* Make sure we can handle the pin inside SyncOneBuffer */
1802  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
1803 
1804  /*
1805  * Unless this is a shutdown checkpoint or we have been explicitly told,
1806  * we write only permanent, dirty buffers. But at shutdown or end of
1807  * recovery, we write all dirty buffers.
1808  */
1809  if (!((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
1810  CHECKPOINT_FLUSH_ALL))))
1811  mask |= BM_PERMANENT;
1812 
1813  /*
1814  * Loop over all buffers, and mark the ones that need to be written with
1815  * BM_CHECKPOINT_NEEDED. Count them as we go (num_to_scan), so that we
1816  * can estimate how much work needs to be done.
1817  *
1818  * This allows us to write only those pages that were dirty when the
1819  * checkpoint began, and not those that get dirtied while it proceeds.
1820  * Whenever a page with BM_CHECKPOINT_NEEDED is written out, either by us
1821  * later in this function, or by normal backends or the bgwriter cleaning
1822  * scan, the flag is cleared. Any buffer dirtied after this point won't
1823  * have the flag set.
1824  *
1825  * Note that if we fail to write some buffer, we may leave buffers with
1826  * BM_CHECKPOINT_NEEDED still set. This is OK since any such buffer would
1827  * certainly need to be written for the next checkpoint attempt, too.
1828  */
1829  num_to_scan = 0;
1830  for (buf_id = 0; buf_id < NBuffers; buf_id++)
1831  {
1832  BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
1833 
1834  /*
1835  * Header spinlock is enough to examine BM_DIRTY, see comment in
1836  * SyncOneBuffer.
1837  */
1838  buf_state = LockBufHdr(bufHdr);
1839 
1840  if ((buf_state & mask) == mask)
1841  {
1842  CkptSortItem *item;
1843 
1844  buf_state |= BM_CHECKPOINT_NEEDED;
1845 
1846  item = &CkptBufferIds[num_to_scan++];
1847  item->buf_id = buf_id;
1848  item->tsId = bufHdr->tag.rnode.spcNode;
1849  item->relNode = bufHdr->tag.rnode.relNode;
1850  item->forkNum = bufHdr->tag.forkNum;
1851  item->blockNum = bufHdr->tag.blockNum;
1852  }
1853 
1854  UnlockBufHdr(bufHdr, buf_state);
1855  }
1856 
1857  if (num_to_scan == 0)
1858  return; /* nothing to do */
1859 
1860  WritebackContextInit(&wb_context, &checkpoint_flush_after);
1861 
1862  TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_scan);
1863 
1864  /*
1865  * Sort buffers that need to be written to reduce the likelihood of random
1866  * IO. The sorting is also important for the implementation of balancing
1867  * writes between tablespaces. Without balancing writes we'd potentially
1868  * end up writing to the tablespaces one-by-one; possibly overloading the
1869  * underlying system.
1870  */
1871  qsort(CkptBufferIds, num_to_scan, sizeof(CkptSortItem),
1872  ckpt_buforder_comparator);
1873 
1874  num_spaces = 0;
1875 
1876  /*
1877  * Allocate progress status for each tablespace with buffers that need to
1878  * be flushed. This requires the to-be-flushed array to be sorted.
1879  */
1880  last_tsid = InvalidOid;
1881  for (i = 0; i < num_to_scan; i++)
1882  {
1883  CkptTsStatus *s;
1884  Oid cur_tsid;
1885 
1886  cur_tsid = CkptBufferIds[i].tsId;
1887 
1888  /*
1889  * Grow array of per-tablespace status structs, every time a new
1890  * tablespace is found.
1891  */
1892  if (last_tsid == InvalidOid || last_tsid != cur_tsid)
1893  {
1894  Size sz;
1895 
1896  num_spaces++;
1897 
1898  /*
1899  * Not worth adding grow-by-power-of-2 logic here - even with a
1900  * few hundred tablespaces this should be fine.
1901  */
1902  sz = sizeof(CkptTsStatus) * num_spaces;
1903 
1904  if (per_ts_stat == NULL)
1905  per_ts_stat = (CkptTsStatus *) palloc(sz);
1906  else
1907  per_ts_stat = (CkptTsStatus *) repalloc(per_ts_stat, sz);
1908 
1909  s = &per_ts_stat[num_spaces - 1];
1910  memset(s, 0, sizeof(*s));
1911  s->tsId = cur_tsid;
1912 
1913  /*
1914  * The first buffer in this tablespace. As CkptBufferIds is sorted
1915  * by tablespace all (s->num_to_scan) buffers in this tablespace
1916  * will follow afterwards.
1917  */
1918  s->index = i;
1919 
1920  /*
1921  * progress_slice will be determined once we know how many buffers
1922  * are in each tablespace, i.e. after this loop.
1923  */
1924 
1925  last_tsid = cur_tsid;
1926  }
1927  else
1928  {
1929  s = &per_ts_stat[num_spaces - 1];
1930  }
1931 
1932  s->num_to_scan++;
1933  }
1934 
1935  Assert(num_spaces > 0);
1936 
1937  /*
1938  * Build a min-heap over the write-progress in the individual tablespaces,
1939  * and compute how large a portion of the total progress a single
1940  * processed buffer is.
1941  */
1942  ts_heap = binaryheap_allocate(num_spaces,
1943  ts_ckpt_progress_comparator,
1944  NULL);
1945 
1946  for (i = 0; i < num_spaces; i++)
1947  {
1948  CkptTsStatus *ts_stat = &per_ts_stat[i];
1949 
1950  ts_stat->progress_slice = (float8) num_to_scan / ts_stat->num_to_scan;
1951 
1952  binaryheap_add_unordered(ts_heap, PointerGetDatum(ts_stat));
1953  }
1954 
1955  binaryheap_build(ts_heap);
1956 
1957  /*
1958  * Iterate through to-be-checkpointed buffers and write the ones (still)
1959  * marked with BM_CHECKPOINT_NEEDED. The writes are balanced between
1960  * tablespaces; otherwise the sorting would lead to only one tablespace
1961  * receiving writes at a time, making inefficient use of the hardware.
1962  */
1963  num_processed = 0;
1964  num_written = 0;
1965  while (!binaryheap_empty(ts_heap))
1966  {
1967  BufferDesc *bufHdr = NULL;
1968  CkptTsStatus *ts_stat = (CkptTsStatus *)
1969  DatumGetPointer(binaryheap_first(ts_heap));
1970 
1971  buf_id = CkptBufferIds[ts_stat->index].buf_id;
1972  Assert(buf_id != -1);
1973 
1974  bufHdr = GetBufferDescriptor(buf_id);
1975 
1976  num_processed++;
1977 
1978  /*
1979  * We don't need to acquire the lock here, because we're only looking
1980  * at a single bit. It's possible that someone else writes the buffer
1981  * and clears the flag right after we check, but that doesn't matter
1982  * since SyncOneBuffer will then do nothing. However, there is a
1983  * further race condition: it's conceivable that between the time we
1984  * examine the bit here and the time SyncOneBuffer acquires the lock,
1985  * someone else not only wrote the buffer but replaced it with another
1986  * page and dirtied it. In that improbable case, SyncOneBuffer will
1987  * write the buffer though we didn't need to. It doesn't seem worth
1988  * guarding against this, though.
1989  */
1990  if (pg_atomic_read_u32(&bufHdr->state) & BM_CHECKPOINT_NEEDED)
1991  {
1992  if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
1993  {
1994  TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
1995  BgWriterStats.m_buf_written_checkpoints++;
1996  num_written++;
1997  }
1998  }
1999 
2000  /*
2001  * Measure progress independently of whether we actually had to flush the
2002  * buffer - otherwise the writes would become unbalanced.
2003  */
2004  ts_stat->progress += ts_stat->progress_slice;
2005  ts_stat->num_scanned++;
2006  ts_stat->index++;
2007 
2008  /* Have all the buffers from the tablespace been processed? */
2009  if (ts_stat->num_scanned == ts_stat->num_to_scan)
2010  {
2011  binaryheap_remove_first(ts_heap);
2012  }
2013  else
2014  {
2015  /* update heap with the new progress */
2016  binaryheap_replace_first(ts_heap, PointerGetDatum(ts_stat));
2017  }
2018 
2019  /*
2020  * Sleep to throttle our I/O rate.
2021  */
2022  CheckpointWriteDelay(flags, (double) num_processed / num_to_scan);
2023  }
2024 
2025  /* issue all pending flushes */
2026  IssuePendingWritebacks(&wb_context);
2027 
2028  pfree(per_ts_stat);
2029  per_ts_stat = NULL;
2030  binaryheap_free(ts_heap);
2031 
2032  /*
2033  * Update checkpoint statistics. As noted above, this doesn't include
2034  * buffers written by other backends or bgwriter scan.
2035  */
2036  CheckpointStats.ckpt_bufs_written += num_written;
2037 
2038  TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_scan);
2039 }
2040 
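/*
 * Editor's note: a small worked example of the progress_slice arithmetic in
 * BufferSync() above, with made-up numbers.  Suppose 1000 buffers must be
 * checkpointed in total: a tablespace holding 800 of them gets
 * progress_slice = 1000/800 = 1.25, and one holding 200 gets 1000/200 = 5.0.
 * After each tablespace has written half of its own buffers, both report a
 * progress of 500, so the min-heap treats them as equally far along and keeps
 * alternating between them.
 */
#include <stdio.h>

int
main(void)
{
	const int	total = 1000;				/* buffers to checkpoint overall */
	const int	per_ts[] = {800, 200};		/* per-tablespace counts */

	for (int i = 0; i < 2; i++)
	{
		double		slice = (double) total / per_ts[i];
		double		progress_at_half = slice * (per_ts[i] / 2);

		printf("tablespace %d: slice=%.2f progress@half=%.1f\n",
			   i, slice, progress_at_half);
	}
	return 0;
}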
2041 /*
2042  * BgBufferSync -- Write out some dirty buffers in the pool.
2043  *
2044  * This is called periodically by the background writer process.
2045  *
2046  * Returns true if it's appropriate for the bgwriter process to go into
2047  * low-power hibernation mode. (This happens if the strategy clock sweep
2048  * has been "lapped" and no buffer allocations have occurred recently,
2049  * or if the bgwriter has been effectively disabled by setting
2050  * bgwriter_lru_maxpages to 0.)
2051  */
2052 bool
2053 BgBufferSync(WritebackContext *wb_context)
2054 {
2055  /* info obtained from freelist.c */
2056  int strategy_buf_id;
2057  uint32 strategy_passes;
2058  uint32 recent_alloc;
2059 
2060  /*
2061  * Information saved between calls so we can determine the strategy
2062  * point's advance rate and avoid scanning already-cleaned buffers.
2063  */
2064  static bool saved_info_valid = false;
2065  static int prev_strategy_buf_id;
2066  static uint32 prev_strategy_passes;
2067  static int next_to_clean;
2068  static uint32 next_passes;
2069 
2070  /* Moving averages of allocation rate and clean-buffer density */
2071  static float smoothed_alloc = 0;
2072  static float smoothed_density = 10.0;
2073 
2074  /* Potentially these could be tunables, but for now, not */
2075  float smoothing_samples = 16;
2076  float scan_whole_pool_milliseconds = 120000.0;
2077 
2078  /* Used to compute how far we scan ahead */
2079  long strategy_delta;
2080  int bufs_to_lap;
2081  int bufs_ahead;
2082  float scans_per_alloc;
2083  int reusable_buffers_est;
2084  int upcoming_alloc_est;
2085  int min_scan_buffers;
2086 
2087  /* Variables for the scanning loop proper */
2088  int num_to_scan;
2089  int num_written;
2090  int reusable_buffers;
2091 
2092  /* Variables for final smoothed_density update */
2093  long new_strategy_delta;
2094  uint32 new_recent_alloc;
2095 
2096  /*
2097  * Find out where the freelist clock sweep currently is, and how many
2098  * buffer allocations have happened since our last call.
2099  */
2100  strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc);
2101 
2102  /* Report buffer alloc counts to pgstat */
2103  BgWriterStats.m_buf_alloc += recent_alloc;
2104 
2105  /*
2106  * If we're not running the LRU scan, just stop after doing the stats
2107  * stuff. We mark the saved state invalid so that we can recover sanely
2108  * if LRU scan is turned back on later.
2109  */
2110  if (bgwriter_lru_maxpages <= 0)
2111  {
2112  saved_info_valid = false;
2113  return true;
2114  }
2115 
2116  /*
2117  * Compute strategy_delta = how many buffers have been scanned by the
2118  * clock sweep since last time. If first time through, assume none. Then
2119  * see if we are still ahead of the clock sweep, and if so, how many
2120  * buffers we could scan before we'd catch up with it and "lap" it. Note:
2121  * the weird-looking coding of the xxx_passes comparisons is to avoid bogus
2122  * behavior when the passes counts wrap around.
2123  */
2124  if (saved_info_valid)
2125  {
2126  int32 passes_delta = strategy_passes - prev_strategy_passes;
2127 
2128  strategy_delta = strategy_buf_id - prev_strategy_buf_id;
2129  strategy_delta += (long) passes_delta * NBuffers;
2130 
2131  Assert(strategy_delta >= 0);
2132 
2133  if ((int32) (next_passes - strategy_passes) > 0)
2134  {
2135  /* we're one pass ahead of the strategy point */
2136  bufs_to_lap = strategy_buf_id - next_to_clean;
2137 #ifdef BGW_DEBUG
2138  elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
2139  next_passes, next_to_clean,
2140  strategy_passes, strategy_buf_id,
2141  strategy_delta, bufs_to_lap);
2142 #endif
2143  }
2144  else if (next_passes == strategy_passes &&
2145  next_to_clean >= strategy_buf_id)
2146  {
2147  /* on same pass, but ahead or at least not behind */
2148  bufs_to_lap = NBuffers - (next_to_clean - strategy_buf_id);
2149 #ifdef BGW_DEBUG
2150  elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
2151  next_passes, next_to_clean,
2152  strategy_passes, strategy_buf_id,
2153  strategy_delta, bufs_to_lap);
2154 #endif
2155  }
2156  else
2157  {
2158  /*
2159  * We're behind, so skip forward to the strategy point and start
2160  * cleaning from there.
2161  */
2162 #ifdef BGW_DEBUG
2163  elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
2164  next_passes, next_to_clean,
2165  strategy_passes, strategy_buf_id,
2166  strategy_delta);
2167 #endif
2168  next_to_clean = strategy_buf_id;
2169  next_passes = strategy_passes;
2170  bufs_to_lap = NBuffers;
2171  }
2172  }
2173  else
2174  {
2175  /*
2176  * Initializing at startup or after LRU scanning had been off. Always
2177  * start at the strategy point.
2178  */
2179 #ifdef BGW_DEBUG
2180  elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
2181  strategy_passes, strategy_buf_id);
2182 #endif
2183  strategy_delta = 0;
2184  next_to_clean = strategy_buf_id;
2185  next_passes = strategy_passes;
2186  bufs_to_lap = NBuffers;
2187  }
2188 
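/*
 * Editor's note: the pass-count tests above use the standard trick for
 * comparing wrapping uint32 counters: subtract as unsigned and look at the
 * sign of the result interpreted as a signed 32-bit value, which is correct
 * as long as the two counters are less than 2^31 apart.  A tiny standalone
 * illustration; the helper name is made up for this sketch.
 */
#include <stdint.h>
#include <stdio.h>

/* Returns nonzero if a is logically ahead of b, tolerating wraparound. */
static int
demo_passes_ahead(uint32_t a, uint32_t b)
{
	return (int32_t) (a - b) > 0;
}

int
main(void)
{
	/* a has wrapped past zero while b is still near UINT32_MAX ... */
	printf("%d\n", demo_passes_ahead(5, UINT32_MAX - 2));	/* prints 1 */
	/* ... and the comparison is antisymmetric, as expected. */
	printf("%d\n", demo_passes_ahead(UINT32_MAX - 2, 5));	/* prints 0 */
	return 0;
}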
2189  /* Update saved info for next time */
2190  prev_strategy_buf_id = strategy_buf_id;
2191  prev_strategy_passes = strategy_passes;
2192  saved_info_valid = true;
2193 
2194  /*
2195  * Compute how many buffers had to be scanned for each new allocation, ie,
2196  * 1/density of reusable buffers, and track a moving average of that.
2197  *
2198  * If the strategy point didn't move, we don't update the density estimate.
2199  */
2200  if (strategy_delta > 0 && recent_alloc > 0)
2201  {
2202  scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
2203  smoothed_density += (scans_per_alloc - smoothed_density) /
2204  smoothing_samples;
2205  }
2206 
2207  /*
2208  * Estimate how many reusable buffers there are between the current
2209  * strategy point and where we've scanned ahead to, based on the smoothed
2210  * density estimate.
2211  */
2212  bufs_ahead = NBuffers - bufs_to_lap;
2213  reusable_buffers_est = (float) bufs_ahead / smoothed_density;
2214 
2215  /*
2216  * Track a moving average of recent buffer allocations. Here, rather than
2217  * a true average we want a fast-attack, slow-decline behavior: we
2218  * immediately follow any increase.
2219  */
2220  if (smoothed_alloc <= (float) recent_alloc)
2221  smoothed_alloc = recent_alloc;
2222  else
2223  smoothed_alloc += ((float) recent_alloc - smoothed_alloc) /
2224  smoothing_samples;
2225 
2226  /* Scale the estimate by a GUC to allow more aggressive tuning. */
2227  upcoming_alloc_est = (int) (smoothed_alloc * bgwriter_lru_multiplier);
2228 
2229  /*
2230  * If recent_alloc remains at zero for many cycles, smoothed_alloc will
2231  * eventually underflow to zero, and the underflows produce annoying
2232  * kernel warnings on some platforms. Once upcoming_alloc_est has gone to
2233  * zero, there's no point in tracking smaller and smaller values of
2234  * smoothed_alloc, so just reset it to exactly zero to avoid this
2235  * syndrome. It will pop back up as soon as recent_alloc increases.
2236  */
2237  if (upcoming_alloc_est == 0)
2238  smoothed_alloc = 0;
2239 
2240  /*
2241  * Even in cases where there's been little or no buffer allocation
2242  * activity, we want to make a small amount of progress through the buffer
2243  * cache so that as many reusable buffers as possible are clean after an
2244  * idle period.
2245  *
2246  * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many times
2247  * the BGW will be called during the scan_whole_pool time; slice the
2248  * buffer pool into that many sections.
2249  */
2250  min_scan_buffers = (int) (NBuffers / (scan_whole_pool_milliseconds / BgWriterDelay));
2251 
2252  if (upcoming_alloc_est < (min_scan_buffers + reusable_buffers_est))
2253  {
2254 #ifdef BGW_DEBUG
2255  elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
2256  upcoming_alloc_est, min_scan_buffers, reusable_buffers_est);
2257 #endif
2258  upcoming_alloc_est = min_scan_buffers + reusable_buffers_est;
2259  }
2260 
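/*
 * Editor's note: a worked example (made-up numbers) of how the scan target
 * above comes together.  Say recent_alloc = 400 while smoothed_alloc was 200:
 * the fast-attack rule snaps smoothed_alloc to 400.  With
 * bgwriter_lru_multiplier = 2.0, upcoming_alloc_est = 800.  With
 * NBuffers = 16384, BgWriterDelay = 200 ms and the 120000 ms
 * scan_whole_pool target, min_scan_buffers = 16384 / 600 = 27, so the
 * estimate of 800 already exceeds that floor and is used as-is.  The
 * standalone sketch below just reproduces this arithmetic.
 */
#include <stdio.h>

int
main(void)
{
	float	smoothed_alloc = 200.0f;
	int		recent_alloc = 400;
	double	lru_multiplier = 2.0;		/* plays the role of the GUC */
	int		nbuffers = 16384;
	int		bgwriter_delay_ms = 200;
	double	scan_whole_pool_ms = 120000.0;

	if (smoothed_alloc <= (float) recent_alloc)
		smoothed_alloc = (float) recent_alloc;	/* fast attack on increases */

	{
		int		upcoming_alloc_est = (int) (smoothed_alloc * lru_multiplier);
		int		min_scan_buffers =
			(int) (nbuffers / (scan_whole_pool_ms / bgwriter_delay_ms));

		printf("upcoming_alloc_est=%d min_scan_buffers=%d\n",
			   upcoming_alloc_est, min_scan_buffers);	/* 800 and 27 */
	}
	return 0;
}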
2261  /*
2262  * Now write out dirty reusable buffers, working forward from the
2263  * next_to_clean point, until we have lapped the strategy scan, or cleaned
2264  * enough buffers to match our estimate of the next cycle's allocation
2265  * requirements, or hit the bgwriter_lru_maxpages limit.
2266  */
2267 
2268  /* Make sure we can handle the pin inside SyncOneBuffer */
2269  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
2270 
2271  num_to_scan = bufs_to_lap;
2272  num_written = 0;
2273  reusable_buffers = reusable_buffers_est;
2274 
2275  /* Execute the LRU scan */
2276  while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
2277  {
2278  int sync_state = SyncOneBuffer(next_to_clean, true,
2279  wb_context);
2280 
2281  if (++next_to_clean >= NBuffers)
2282  {
2283  next_to_clean = 0;
2284  next_passes++;
2285  }
2286  num_to_scan--;
2287 
2288  if (sync_state & BUF_WRITTEN)
2289  {
2290  reusable_buffers++;
2291  if (++num_written >= bgwriter_lru_maxpages)
2292  {
2293  BgWriterStats.m_maxwritten_clean++;
2294  break;
2295  }
2296  }
2297  else if (sync_state & BUF_REUSABLE)
2298  reusable_buffers++;
2299  }
2300 
2301  BgWriterStats.m_buf_written_clean += num_written;
2302 
2303 #ifdef BGW_DEBUG
2304  elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
2305  recent_alloc, smoothed_alloc, strategy_delta, bufs_ahead,
2306  smoothed_density, reusable_buffers_est, upcoming_alloc_est,
2307  bufs_to_lap - num_to_scan,
2308  num_written,
2309  reusable_buffers - reusable_buffers_est);
2310 #endif
2311 
2312  /*
2313  * Consider the above scan as being like a new allocation scan.
2314  * Characterize its density and update the smoothed one based on it. This
2315  * effectively halves the moving average period in cases where both the
2316  * strategy and the background writer are doing some useful scanning,
2317  * which is helpful because a long memory isn't as desirable on the
2318  * density estimates.
2319  */
2320  new_strategy_delta = bufs_to_lap - num_to_scan;
2321  new_recent_alloc = reusable_buffers - reusable_buffers_est;
2322  if (new_strategy_delta > 0 && new_recent_alloc > 0)
2323  {
2324  scans_per_alloc = (float) new_strategy_delta / (float) new_recent_alloc;
2325  smoothed_density += (scans_per_alloc - smoothed_density) /
2326  smoothing_samples;
2327 
2328 #ifdef BGW_DEBUG
2329  elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f",
2330  new_recent_alloc, new_strategy_delta,
2331  scans_per_alloc, smoothed_density);
2332 #endif
2333  }
2334 
2335  /* Return true if OK to hibernate */
2336  return (bufs_to_lap == 0 && recent_alloc == 0);
2337 }
2338 
2339 /*
2340  * SyncOneBuffer -- process a single buffer during syncing.
2341  *
2342  * If skip_recently_used is true, we don't write currently-pinned buffers, nor
2343  * buffers marked recently used, as these are not replacement candidates.
2344  *
2345  * Returns a bitmask containing the following flag bits:
2346  * BUF_WRITTEN: we wrote the buffer.
2347  * BUF_REUSABLE: buffer is available for replacement, ie, it has
2348  * pin count 0 and usage count 0.
2349  *
2350  * (BUF_WRITTEN could be set in error if FlushBuffer finds the buffer clean
2351  * after locking it, but we don't care all that much.)
2352  *
2353  * Note: caller must have done ResourceOwnerEnlargeBuffers.
2354  */
2355 static int
2356 SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
2357 {
2358  BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
2359  int result = 0;
2360  uint32 buf_state;
2361  BufferTag tag;
2362 
2363  ReservePrivateRefCountEntry();
2364 
2365  /*
2366  * Check whether buffer needs writing.
2367  *
2368  * We can make this check without taking the buffer content lock so long
2369  * as we mark pages dirty in access methods *before* logging changes with
2370  * XLogInsert(): if someone marks the buffer dirty just after our check we
2371  * don't worry, because our checkpoint.redo points before the log record for
2372  * the upcoming changes and so we are not required to write such a dirty buffer.
2373  */
2374  buf_state = LockBufHdr(bufHdr);
2375 
2376  if (BUF_STATE_GET_REFCOUNT(buf_state) == 0 &&
2377  BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
2378  {
2379  result |= BUF_REUSABLE;
2380  }
2381  else if (skip_recently_used)
2382  {
2383  /* Caller told us not to write recently-used buffers */
2384  UnlockBufHdr(bufHdr, buf_state);
2385  return result;
2386  }
2387 
2388  if (!(buf_state & BM_VALID) || !(buf_state & BM_DIRTY))
2389  {
2390  /* It's clean, so nothing to do */
2391  UnlockBufHdr(bufHdr, buf_state);
2392  return result;
2393  }
2394 
2395  /*
2396  * Pin it, share-lock it, write it. (FlushBuffer will do nothing if the
2397  * buffer is clean by the time we've locked it.)
2398  */
2399  PinBuffer_Locked(bufHdr);
2400  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
2401 
2402  FlushBuffer(bufHdr, NULL);
2403 
2404  LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
2405 
2406  tag = bufHdr->tag;
2407 
2408  UnpinBuffer(bufHdr, true);
2409 
2410  ScheduleBufferTagForWriteback(wb_context, &tag);
2411 
2412  return result | BUF_WRITTEN;
2413 }
2414 
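/*
 * Editor's note: a minimal sketch of how a caller can interpret the bitmask
 * that SyncOneBuffer() returns, mirroring the tests made in BufferSync() and
 * BgBufferSync() above.  The DEMO_* flag values and the fake per-buffer
 * results below are illustrative only.
 */
#include <stdio.h>

#define DEMO_BUF_WRITTEN  0x01		/* the buffer was written out */
#define DEMO_BUF_REUSABLE 0x02		/* pin count 0 and usage count 0 */

int
main(void)
{
	/* pretend results from scanning four buffers */
	int		results[] = {DEMO_BUF_REUSABLE | DEMO_BUF_WRITTEN,
						 DEMO_BUF_REUSABLE,
						 0,
						 DEMO_BUF_WRITTEN};
	int		written = 0;
	int		reusable = 0;

	for (int i = 0; i < 4; i++)
	{
		if (results[i] & DEMO_BUF_WRITTEN)
			written++;
		if (results[i] & DEMO_BUF_REUSABLE)
			reusable++;
	}
	printf("written=%d reusable=%d\n", written, reusable);	/* 2 and 2 */
	return 0;
}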
2415 /*
2416  * AtEOXact_Buffers - clean up at end of transaction.
2417  *
2418  * As of PostgreSQL 8.0, buffer pins should get released by the
2419  * ResourceOwner mechanism. This routine is just a debugging
2420  * cross-check that no pins remain.
2421  */
2422 void
2423 AtEOXact_Buffers(bool isCommit)
2424 {
2425  CheckForBufferLeaks();
2426 
2427  AtEOXact_LocalBuffers(isCommit);
2428 
2429  Assert(PrivateRefCountOverflowed == 0);
2430 }
2431 
2432 /*
2433  * Initialize access to shared buffer pool
2434  *
2435  * This is called during backend startup (whether standalone or under the
2436  * postmaster). It sets up for this backend's access to the already-existing
2437  * buffer pool.
2438  *
2439  * NB: this is called before InitProcess(), so we do not have a PGPROC and
2440  * cannot do LWLockAcquire; hence we can't actually access stuff in
2441  * shared memory yet. We are only initializing local data here.
2442  * (See also InitBufferPoolBackend)
2443  */
2444 void
2445 InitBufferPoolAccess(void)
2446 {
2447  HASHCTL hash_ctl;
2448 
2449  memset(&PrivateRefCountArray, 0, sizeof(PrivateRefCountArray));
2450 
2451  MemSet(&hash_ctl, 0, sizeof(hash_ctl));
2452  hash_ctl.keysize = sizeof(int32);
2453  hash_ctl.entrysize = sizeof(PrivateRefCountEntry);
2454 
2455  PrivateRefCountHash = hash_create("PrivateRefCount", 100, &hash_ctl,
2456  HASH_ELEM | HASH_BLOBS);
2457 }
2458 
2459 /*
2460  * InitBufferPoolBackend --- second-stage initialization of a new backend
2461  *
2462  * This is called after we have acquired a PGPROC and so can safely get
2463  * LWLocks. We don't currently need to do anything at this stage ...
2464  * except register a shmem-exit callback. AtProcExit_Buffers needs LWLock
2465  * access, and thereby has to be called at the corresponding phase of
2466  * backend shutdown.
2467  */
2468 void
2469 InitBufferPoolBackend(void)
2470 {
2471  on_shmem_exit(AtProcExit_Buffers, 0);
2472 }
2473 
2474 /*
2475  * During backend exit, ensure that we released all shared-buffer locks and
2476  * assert that we have no remaining pins.
2477  */
2478 static void
2479 AtProcExit_Buffers(int code, Datum arg)
2480 {
2481  AbortBufferIO();
2482  UnlockBuffers();
2483 
2484  CheckForBufferLeaks();
2485 
2486  /* localbuf.c needs a chance too */
2487  AtEOXact_LocalBuffers(false);
2488 }
2489 
2490 /*
2491  * CheckForBufferLeaks - ensure this backend holds no buffer pins
2492  *
2493  * As of PostgreSQL 8.0, buffer pins should get released by the
2494  * ResourceOwner mechanism. This routine is just a debugging
2495  * cross-check that no pins remain.
2496  */
2497 static void
2498 CheckForBufferLeaks(void)
2499 {
2500 #ifdef USE_ASSERT_CHECKING
2501  int RefCountErrors = 0;
2502  PrivateRefCountEntry *res;
2503  int i;
2504 
2505  /* check the array */
2506  for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
2507  {
2508  res = &PrivateRefCountArray[i];
2509 
2510  if (res->buffer != InvalidBuffer)
2511  {
2512  PrintBufferLeakWarning(res->buffer);
2513  RefCountErrors++;
2514  }
2515  }
2516 
2517  /* if necessary search the hash */
2518  if (PrivateRefCountOverflowed)
2519  {
2520  HASH_SEQ_STATUS hstat;
2521 
2522  hash_seq_init(&hstat, PrivateRefCountHash);
2523  while ((res = (PrivateRefCountEntry *) hash_seq_search(&hstat)) != NULL)
2524  {
2525  PrintBufferLeakWarning(res->buffer);
2526  RefCountErrors++;
2527  }
2528 
2529  }
2530 
2531  Assert(RefCountErrors == 0);
2532 #endif
2533 }
2534 
2535 /*
2536  * Helper routine to issue warnings when a buffer is unexpectedly pinned
2537  */
2538 void
2539 PrintBufferLeakWarning(Buffer buffer)
2540 {
2541  BufferDesc *buf;
2542  int32 loccount;
2543  char *path;
2544  BackendId backend;
2545  uint32 buf_state;
2546 
2547  Assert(BufferIsValid(buffer));
2548  if (BufferIsLocal(buffer))
2549  {
2550  buf = GetLocalBufferDescriptor(-buffer - 1);
2551  loccount = LocalRefCount[-buffer - 1];
2552  backend = MyBackendId;
2553  }
2554  else
2555  {
2556  buf = GetBufferDescriptor(buffer - 1);
2557  loccount = GetPrivateRefCount(buffer);
2558  backend = InvalidBackendId;
2559  }
2560 
2561  /* theoretically we should lock the bufhdr here */
2562  path = relpathbackend(buf->tag.rnode, backend, buf->tag.forkNum);
2563  buf_state = pg_atomic_read_u32(&buf->state);
2564  elog(WARNING,
2565  "buffer refcount leak: [%03d] "
2566  "(rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
2567  buffer, path,
2568  buf->tag.blockNum, buf_state & BUF_FLAG_MASK,
2569  BUF_STATE_GET_REFCOUNT(buf_state), loccount);
2570  pfree(path);
2571 }
2572 
2573 /*
2574  * CheckPointBuffers
2575  *
2576  * Flush all dirty blocks in buffer pool to disk at checkpoint time.
2577  *
2578  * Note: temporary relations do not participate in checkpoints, so they don't
2579  * need to be flushed.
2580  */
2581 void
2582 CheckPointBuffers(int flags)
2583 {
2584  TRACE_POSTGRESQL_BUFFER_CHECKPOINT_START(flags);
2585  CheckpointStats.ckpt_write_t = GetCurrentTimestamp();
2586  BufferSync(flags);
2587  CheckpointStats.ckpt_sync_t = GetCurrentTimestamp();
2588  TRACE_POSTGRESQL_BUFFER_CHECKPOINT_SYNC_START();
2589  ProcessSyncRequests();
2590  CheckpointStats.ckpt_sync_end_t = GetCurrentTimestamp();
2591  TRACE_POSTGRESQL_BUFFER_CHECKPOINT_DONE();
2592 }
2593 
2594 
2595 /*
2596  * Do whatever is needed to prepare for commit at the bufmgr and smgr levels
2597  */
2598 void
2599 BufmgrCommit(void)
2600 {
2601  /* Nothing to do in bufmgr anymore... */
2602 }
2603 
2604 /*
2605  * BufferGetBlockNumber
2606  * Returns the block number associated with a buffer.
2607  *
2608  * Note:
2609  * Assumes that the buffer is valid and pinned, else the
2610  * value may be obsolete immediately...
2611  */
2612 BlockNumber
2613 BufferGetBlockNumber(Buffer buffer)
2614 {
2615  BufferDesc *bufHdr;
2616 
2617  Assert(BufferIsPinned(buffer));
2618 
2619  if (BufferIsLocal(buffer))
2620  bufHdr = GetLocalBufferDescriptor(-buffer - 1);
2621  else
2622  bufHdr = GetBufferDescriptor(buffer - 1);
2623 
2624  /* pinned, so OK to read tag without spinlock */
2625  return bufHdr->tag.blockNum;
2626 }
2627 
2628 /*
2629  * BufferGetTag
2630  * Returns the relfilenode, fork number and block number associated with
2631  * a buffer.
2632  */
2633 void
2634 BufferGetTag(Buffer buffer, RelFileNode *rnode, ForkNumber *forknum,
2635  BlockNumber *blknum)
2636 {
2637  BufferDesc *bufHdr;
2638 
2639  /* Do the same checks as BufferGetBlockNumber. */
2640  Assert(BufferIsPinned(buffer));
2641 
2642  if (BufferIsLocal(buffer))
2643  bufHdr = GetLocalBufferDescriptor(-buffer - 1);
2644  else
2645  bufHdr = GetBufferDescriptor(buffer - 1);
2646 
2647  /* pinned, so OK to read tag without spinlock */
2648  *rnode = bufHdr->tag.rnode;
2649  *forknum = bufHdr->tag.forkNum;
2650  *blknum = bufHdr->tag.blockNum;
2651 }
2652 
2653 /*
2654  * FlushBuffer
2655  * Physically write out a shared buffer.
2656  *
2657  * NOTE: this actually just passes the buffer contents to the kernel; the
2658  * real write to disk won't happen until the kernel feels like it. This
2659  * is okay from our point of view since we can redo the changes from WAL.
2660  * However, we will need to force the changes to disk via fsync before
2661  * we can checkpoint WAL.
2662  *
2663  * The caller must hold a pin on the buffer and have share-locked the
2664  * buffer contents. (Note: a share-lock does not prevent updates of
2665  * hint bits in the buffer, so the page could change while the write
2666  * is in progress, but we assume that that will not invalidate the data
2667  * written.)
2668  *
2669  * If the caller has an smgr reference for the buffer's relation, pass it
2670  * as the second parameter. If not, pass NULL.
2671  */
2672 static void
2673 FlushBuffer(BufferDesc *buf, SMgrRelation reln)
2674 {
2675  XLogRecPtr recptr;
2676  ErrorContextCallback errcallback;
2677  instr_time io_start,
2678  io_time;
2679  Block bufBlock;
2680  char *bufToWrite;
2681  uint32 buf_state;
2682 
2683  /*
2684  * Acquire the buffer's io_in_progress lock. If StartBufferIO returns
2685  * false, then someone else flushed the buffer before we could, so we need
2686  * not do anything.
2687  */
2688  if (!StartBufferIO(buf, false))
2689  return;
2690 
2691  /* Setup error traceback support for ereport() */
2692  errcallback.callback = shared_buffer_write_error_callback;
2693  errcallback.arg = (void *) buf;
2694  errcallback.previous = error_context_stack;
2695  error_context_stack = &errcallback;
2696 
2697  /* Find smgr relation for buffer */
2698  if (reln == NULL)
2699  reln = smgropen(buf->tag.rnode, InvalidBackendId);
2700 
2701  TRACE_POSTGRESQL_BUFFER_FLUSH_START(buf->tag.forkNum,
2702  buf->tag.blockNum,
2703  reln->smgr_rnode.node.spcNode,
2704  reln->smgr_rnode.node.dbNode,
2705  reln->smgr_rnode.node.relNode);
2706 
2707  buf_state = LockBufHdr(buf);
2708 
2709  /*
2710  * Run PageGetLSN while holding header lock, since we don't have the
2711  * buffer locked exclusively in all cases.
2712  */
2713  recptr = BufferGetLSN(buf);
2714 
2715  /* To check if block content changes while flushing. - vadim 01/17/97 */
2716  buf_state &= ~BM_JUST_DIRTIED;
2717  UnlockBufHdr(buf, buf_state);
2718 
2719  /*
2720  * Force XLOG flush up to buffer's LSN. This implements the basic WAL
2721  * rule that log updates must hit disk before any of the data-file changes
2722  * they describe do.
2723  *
2724  * However, this rule does not apply to unlogged relations, which will be
2725  * lost after a crash anyway. Most unlogged relation pages do not bear
2726  * LSNs since we never emit WAL records for them, and therefore flushing
2727  * up through the buffer LSN would be useless, but harmless. However,
2728  * GiST indexes use LSNs internally to track page-splits, and therefore
2729  * unlogged GiST pages bear "fake" LSNs generated by
2730  * GetFakeLSNForUnloggedRel. It is unlikely but possible that the fake
2731  * LSN counter could advance past the WAL insertion point; and if it did
2732  * happen, attempting to flush WAL through that location would fail, with
2733  * disastrous system-wide consequences. To make sure that can't happen,
2734  * skip the flush if the buffer isn't permanent.
2735  */
2736  if (buf_state & BM_PERMANENT)
2737  XLogFlush(recptr);
2738 
2739  /*
2740  * Now it's safe to write buffer to disk. Note that no one else should
2741  * have been able to write it while we were busy with log flushing because
2742  * we have the io_in_progress lock.
2743  */
2744  bufBlock = BufHdrGetBlock(buf);
2745 
2746  /*
2747  * Update page checksum if desired. Since we have only shared lock on the
2748  * buffer, other processes might be updating hint bits in it, so we must
2749  * copy the page to private storage if we do checksumming.
2750  */
2751  bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
2752 
2753  if (track_io_timing)
2754  INSTR_TIME_SET_CURRENT(io_start);
2755 
2756  /*
2757  * bufToWrite is either the shared buffer or a copy, as appropriate.
2758  */
2759  smgrwrite(reln,
2760  buf->tag.forkNum,
2761  buf->tag.blockNum,
2762  bufToWrite,
2763  false);
2764 
2765  if (track_io_timing)
2766  {
2767  INSTR_TIME_SET_CURRENT(io_time);
2768  INSTR_TIME_SUBTRACT(io_time, io_start);
2769  pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time));
2770  INSTR_TIME_ADD(pgBufferUsage.blk_write_time, io_time);
2771  }
2772 
2773  pgBufferUsage.shared_blks_written++;
2774 
2775  /*
2776  * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
2777  * end the io_in_progress state.
2778  */
2779  TerminateBufferIO(buf, true, 0);
2780 
2781  TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(buf->tag.forkNum,
2782  buf->tag.blockNum,
2783  reln->smgr_rnode.node.spcNode,
2784  reln->smgr_rnode.node.dbNode,
2785  reln->smgr_rnode.node.relNode);
2786 
2787  /* Pop the error context stack */
2788  error_context_stack = errcallback.previous;
2789 }
2790 
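/*
 * Editor's note: a compact sketch of the WAL-before-data ordering that
 * FlushBuffer() enforces above.  Everything here is hypothetical (the demo_*
 * names are not PostgreSQL APIs); the point is only the ordering: WAL must be
 * durable at least up to the page's LSN before the page itself is written.
 */
#include <stdint.h>

typedef uint64_t demo_lsn;			/* stand-in for XLogRecPtr */

static demo_lsn demo_wal_durable;	/* how far the demo "WAL" has been synced */

/* Hypothetical helper: make WAL durable up to 'upto'. */
static void
demo_flush_wal(demo_lsn upto)
{
	if (demo_wal_durable < upto)
		demo_wal_durable = upto;	/* pretend the log was fsync'ed here */
}

/* Write a data page only once the WAL describing its changes is durable. */
static void
demo_write_page(demo_lsn page_lsn, const void *page)
{
	demo_flush_wal(page_lsn);		/* 1. log first ... */
	(void) page;					/* 2. ... then the page (write omitted) */
}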
2791 /*
2792  * RelationGetNumberOfBlocksInFork
2793  * Determines the current number of pages in the specified relation fork.
2794  *
2795  * Note that the accuracy of the result will depend on the details of the
2796  * relation's storage. For builtin AMs it'll be accurate, but for external AMs
2797  * it might not be.
2798  */
2799 BlockNumber
2800 RelationGetNumberOfBlocksInFork(Relation relation, ForkNumber forkNum)
2801 {
2802  switch (relation->rd_rel->relkind)
2803  {
2804  case RELKIND_SEQUENCE:
2805  case RELKIND_INDEX:
2806  case RELKIND_PARTITIONED_INDEX:
2807  /* Open it at the smgr level if not already done */
2808  RelationOpenSmgr(relation);
2809 
2810  return smgrnblocks(relation->rd_smgr, forkNum);
2811 
2812  case RELKIND_RELATION:
2813  case RELKIND_TOASTVALUE:
2814  case RELKIND_MATVIEW:
2815  {
2816  /*
2817  * Not every table AM uses BLCKSZ wide fixed size blocks.
2818  * Therefore tableam returns the size in bytes - but for the
2819  * purpose of this routine, we want the number of blocks.
2820  * Therefore divide, rounding up.
2821  */
2822  uint64 szbytes;
2823 
2824  szbytes = table_relation_size(relation, forkNum);
2825 
2826  return (szbytes + (BLCKSZ - 1)) / BLCKSZ;
2827  }
2828  case RELKIND_VIEW:
2829  case RELKIND_COMPOSITE_TYPE:
2830  case RELKIND_FOREIGN_TABLE:
2831  case RELKIND_PARTITIONED_TABLE:
2832  default:
2833  Assert(false);
2834  break;
2835  }
2836 
2837  return 0; /* keep compiler quiet */
2838 }
2839 
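/*
 * Editor's note: the bytes-to-blocks conversion above uses the usual
 * round-up-division idiom.  A standalone illustration, assuming an
 * 8192-byte block size (the stock BLCKSZ default):
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	const uint64_t	blcksz = 8192;
	uint64_t		sizes[] = {0, 1, 8192, 8193, 24576};

	for (int i = 0; i < 5; i++)
		printf("%llu bytes -> %llu blocks\n",
			   (unsigned long long) sizes[i],
			   (unsigned long long) ((sizes[i] + (blcksz - 1)) / blcksz));
	/* prints 0, 1, 1, 2 and 3 blocks respectively */
	return 0;
}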
2840 /*
2841  * BufferIsPermanent
2842  * Determines whether a buffer will potentially still be around after
2843  * a crash. Caller must hold a buffer pin.
2844  */
2845 bool
2846 BufferIsPermanent(Buffer buffer)
2847 {
2848  BufferDesc *bufHdr;
2849 
2850  /* Local buffers are used only for temp relations. */
2851  if (BufferIsLocal(buffer))
2852  return false;
2853 
2854  /* Make sure we've got a real buffer, and that we hold a pin on it. */
2855  Assert(BufferIsValid(buffer));
2856  Assert(BufferIsPinned(buffer));
2857 
2858  /*
2859  * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
2860  * need not bother with the buffer header spinlock. Even if someone else
2861  * changes the buffer header state while we're doing this, the state is
2862  * changed atomically, so we'll read the old value or the new value, but
2863  * not random garbage.
2864  */
2865  bufHdr = GetBufferDescriptor(buffer - 1);
2866  return (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT) != 0;
2867 }
2868 
2869 /*
2870  * BufferGetLSNAtomic
2871  * Retrieves the LSN of the buffer atomically using a buffer header lock.
2872  * This is necessary for some callers who may not have an exclusive lock
2873  * on the buffer.
2874  */
2875 XLogRecPtr
2876 BufferGetLSNAtomic(Buffer buffer)
2877 {
2878  BufferDesc *bufHdr = GetBufferDescriptor(buffer - 1);
2879  char *page = BufferGetPage(buffer);
2880  XLogRecPtr lsn;
2881  uint32 buf_state;
2882 
2883  /*
2884  * If we don't need locking for correctness, fastpath out.
2885  */
2886  if (!XLogHintBitIsNeeded() || BufferIsLocal(buffer))
2887  return PageGetLSN(page);
2888 
2889  /* Make sure we've got a real buffer, and that we hold a pin on it. */
2890  Assert(BufferIsValid(buffer));
2891  Assert(BufferIsPinned(buffer));
2892 
2893  buf_state = LockBufHdr(bufHdr);
2894  lsn = PageGetLSN(page);
2895  UnlockBufHdr(bufHdr, buf_state);
2896 
2897  return lsn;
2898 }
2899 
2900 /* ---------------------------------------------------------------------
2901  * DropRelFileNodeBuffers
2902  *
2903  * This function removes from the buffer pool all the pages of the
2904  * specified relation fork that have block numbers >= firstDelBlock.
2905  * (In particular, with firstDelBlock = 0, all pages are removed.)
2906  * Dirty pages are simply dropped, without bothering to write them
2907  * out first. Therefore, this is NOT rollback-able, and so should be
2908  * used only with extreme caution!
2909  *
2910  * Currently, this is called only from smgr.c when the underlying file
2911  * is about to be deleted or truncated (firstDelBlock is needed for
2912  * the truncation case). The data in the affected pages would therefore
2913  * be deleted momentarily anyway, and there is no point in writing it.
2914  * It is the responsibility of higher-level code to ensure that the
2915  * deletion or truncation does not lose any data that could be needed
2916  * later. It is also the responsibility of higher-level code to ensure
2917  * that no other process could be trying to load more pages of the
2918  * relation into buffers.
2919  *
2920  * XXX currently it sequentially searches the buffer pool, should be
2921  * changed to more clever ways of searching. However, this routine
2922  * is used only in code paths that aren't very performance-critical,
2923  * and we shouldn't slow down the hot paths to make it faster ...
2924  * --------------------------------------------------------------------
2925  */
2926 void
2927 DropRelFileNodeBuffers(RelFileNodeBackend rnode, ForkNumber forkNum,
2928  BlockNumber firstDelBlock)
2929 {
2930  int i;
2931 
2932  /* If it's a local relation, it's localbuf.c's problem. */
2933  if (RelFileNodeBackendIsTemp(rnode))
2934  {
2935  if (rnode.backend == MyBackendId)
2936  DropRelFileNodeLocalBuffers(rnode.node, forkNum, firstDelBlock);
2937  return;
2938  }
2939 
2940  for (i = 0; i < NBuffers; i++)
2941  {
2942  BufferDesc *bufHdr = GetBufferDescriptor(i);
2943  uint32 buf_state;
2944 
2945  /*
2946  * We can make this a tad faster by prechecking the buffer tag before
2947  * we attempt to lock the buffer; this saves a lot of lock
2948  * acquisitions in typical cases. It should be safe because the
2949  * caller must have AccessExclusiveLock on the relation, or some other
2950  * reason to be certain that no one is loading new pages of the rel
2951  * into the buffer pool. (Otherwise we might well miss such pages
2952  * entirely.) Therefore, while the tag might be changing while we
2953  * look at it, it can't be changing *to* a value we care about, only
2954  * *away* from such a value. So false negatives are impossible, and
2955  * false positives are safe because we'll recheck after getting the
2956  * buffer lock.
2957  *
2958  * We could check forkNum and blockNum as well as the rnode, but the
2959  * incremental win from doing so seems small.
2960  */
2961  if (!RelFileNodeEquals(bufHdr->tag.rnode, rnode.node))
2962  continue;
2963 
2964  buf_state = LockBufHdr(bufHdr);
2965  if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node) &&
2966  bufHdr->tag.forkNum == forkNum &&
2967  bufHdr->tag.blockNum >= firstDelBlock)
2968  InvalidateBuffer(bufHdr); /* releases spinlock */
2969  else
2970  UnlockBufHdr(bufHdr, buf_state);
2971  }
2972 }
2973 
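/*
 * Editor's note: a stripped-down sketch of the "unlocked precheck, then
 * recheck under the lock" pattern the comment above describes.  The struct,
 * mutex and field names are illustrative; slot->lock is assumed to have been
 * initialized (e.g. with PTHREAD_MUTEX_INITIALIZER).  As argued above, the
 * pattern is only safe when the field cannot change *to* a value we care
 * about while we are not holding the lock.
 */
#include <pthread.h>
#include <stdbool.h>

typedef struct demo_slot
{
	pthread_mutex_t lock;
	int				owner;		/* which "relation" this slot belongs to */
	bool			valid;
} demo_slot;

/* Invalidate the slot if it (still) belongs to 'target'. */
static void
demo_drop_if_owned(demo_slot *slot, int target)
{
	/* Cheap unlocked precheck; false negatives are impossible, see above. */
	if (slot->owner != target)
		return;

	pthread_mutex_lock(&slot->lock);
	if (slot->owner == target)	/* recheck now that we hold the lock */
		slot->valid = false;
	pthread_mutex_unlock(&slot->lock);
}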
2974 /* ---------------------------------------------------------------------
2975  * DropRelFileNodesAllBuffers
2976  *
2977  * This function removes from the buffer pool all the pages of all
2978  * forks of the specified relations. It's equivalent to calling
2979  * DropRelFileNodeBuffers once per fork per relation with
2980  * firstDelBlock = 0.
2981  * --------------------------------------------------------------------
2982  */
2983 void
2984 DropRelFileNodesAllBuffers(RelFileNodeBackend *rnodes, int nnodes)
2985 {
2986  int i,
2987  n = 0;
2988  RelFileNode *nodes;
2989  bool use_bsearch;
2990 
2991  if (nnodes == 0)
2992  return;
2993 
2994  nodes = palloc(sizeof(RelFileNode) * nnodes); /* non-local relations */
2995 
2996  /* If it's a local relation, it's localbuf.c's problem. */
2997  for (i = 0; i < nnodes; i++)
2998  {
2999  if (RelFileNodeBackendIsTemp(rnodes[i]))
3000  {
3001  if (rnodes[i].backend == MyBackendId)
3002  DropRelFileNodeAllLocalBuffers(rnodes[i].node);
3003  }
3004  else
3005  nodes[n++] = rnodes[i].node;
3006  }
3007 
3008  /*
3009  * If there are no non-local relations, then we're done. Release the
3010  * memory and return.
3011  */
3012  if (n == 0)
3013  {
3014  pfree(nodes);
3015  return;
3016  }
3017 
3018  /*
3019  * For low number of relations to drop just use a simple walk through, to
3020  * save the bsearch overhead. The threshold to use is rather a guess than
3021  * an exactly determined value, as it depends on many factors (CPU and RAM
3022  * speeds, amount of shared buffers etc.).
3023  */
3024  use_bsearch = n > DROP_RELS_BSEARCH_THRESHOLD;
3025 
3026  /* sort the list of rnodes if necessary */
3027  if (use_bsearch)
3028  pg_qsort(nodes, n, sizeof(RelFileNode), rnode_comparator);
3029 
3030  for (i = 0; i < NBuffers; i++)
3031  {
3032  RelFileNode *rnode = NULL;
3033  BufferDesc *bufHdr = GetBufferDescriptor(i);
3034  uint32 buf_state;
3035 
3036  /*
3037  * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
3038  * and saves some cycles.
3039  */
3040 
3041  if (!use_bsearch)
3042  {
3043  int j;
3044 
3045  for (j = 0; j < n; j++)
3046  {
3047  if (RelFileNodeEquals(bufHdr->tag.rnode, nodes[j]))
3048  {
3049  rnode = &nodes[j];
3050  break;
3051  }
3052  }
3053  }
3054  else
3055  {
3056  rnode = bsearch((const void *) &(bufHdr->tag.rnode),
3057  nodes, n, sizeof(RelFileNode),
3058  rnode_comparator);
3059  }
3060 
3061  /* buffer doesn't belong to any of the given relfilenodes; skip it */
3062  if (rnode == NULL)
3063  continue;
3064 
3065  buf_state = LockBufHdr(bufHdr);
3066  if (RelFileNodeEquals(bufHdr->tag.rnode, (*rnode)))
3067  InvalidateBuffer(bufHdr); /* releases spinlock */
3068  else
3069  UnlockBufHdr(bufHdr, buf_state);
3070  }
3071 
3072  pfree(nodes);
3073 }
3074 
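/*
 * Editor's note: a standalone illustration of the linear-scan-versus-bsearch
 * choice made above, using plain libc qsort/bsearch on an int key instead of
 * RelFileNode.  The threshold value here is arbitrary and only mimics the
 * idea behind DROP_RELS_BSEARCH_THRESHOLD; all names are illustrative.
 */
#include <stdlib.h>
#include <stdbool.h>

#define DEMO_BSEARCH_THRESHOLD 20

static int
demo_cmp_int(const void *a, const void *b)
{
	int			ia = *(const int *) a;
	int			ib = *(const int *) b;

	return (ia > ib) - (ia < ib);
}

/* Decide once whether to sort + bsearch, then probe the key list many times. */
static int
demo_count_matches(int *keys, int nkeys, const int *probes, int nprobes)
{
	bool		use_bsearch = (nkeys > DEMO_BSEARCH_THRESHOLD);
	int			hits = 0;

	if (use_bsearch)
		qsort(keys, nkeys, sizeof(int), demo_cmp_int);	/* sort the keys once */

	for (int i = 0; i < nprobes; i++)
	{
		bool		found = false;

		if (!use_bsearch)
		{
			/* few keys: a simple walk is cheaper than sorting + searching */
			for (int j = 0; j < nkeys; j++)
			{
				if (keys[j] == probes[i])
				{
					found = true;
					break;
				}
			}
		}
		else
			found = bsearch(&probes[i], keys, nkeys, sizeof(int),
							demo_cmp_int) != NULL;

		if (found)
			hits++;
	}
	return hits;
}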
3075 /* ---------------------------------------------------------------------
3076  * DropDatabaseBuffers
3077  *
3078  * This function removes all the buffers in the buffer cache for a
3079  * particular database. Dirty pages are simply dropped, without
3080  * bothering to write them out first. This is used when we destroy a
3081  * database, to avoid trying to flush data to disk when the directory
3082  * tree no longer exists. Implementation is pretty similar to
3083  * DropRelFileNodeBuffers() which is for destroying just one relation.
3084  * --------------------------------------------------------------------
3085  */
3086 void
3087 DropDatabaseBuffers(Oid dbid)
3088 {
3089  int i;
3090 
3091  /*
3092  * We needn't consider local buffers, since by assumption the target
3093  * database isn't our own.
3094  */
3095 
3096  for (i = 0; i < NBuffers; i++)
3097  {
3098  BufferDesc *bufHdr = GetBufferDescriptor(i);
3099  uint32 buf_state;
3100 
3101  /*
3102  * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
3103  * and saves some cycles.
3104  */
3105  if (bufHdr->tag.rnode.dbNode != dbid)
3106  continue;
3107 
3108  buf_state = LockBufHdr(bufHdr);
3109  if (bufHdr->tag.rnode.dbNode == dbid)
3110  InvalidateBuffer(bufHdr); /* releases spinlock */
3111  else
3112  UnlockBufHdr(bufHdr, buf_state);
3113  }
3114 }
3115 
3116 /* -----------------------------------------------------------------
3117  * PrintBufferDescs
3118  *
3119  * this function prints all the buffer descriptors, for debugging
3120  * use only.
3121  * -----------------------------------------------------------------
3122  */
3123 #ifdef NOT_USED
3124 void
3125 PrintBufferDescs(void)
3126 {
3127  int i;
3128 
3129  for (i = 0; i < NBuffers; ++i)
3130  {
3131  BufferDesc *buf = GetBufferDescriptor(i);
3132  Buffer b = BufferDescriptorGetBuffer(buf);
3133 
3134  /* theoretically we should lock the bufhdr here */
3135  elog(LOG,
3136  "[%02d] (freeNext=%d, rel=%s, "
3137  "blockNum=%u, flags=0x%x, refcount=%u %d)",
3138  i, buf->freeNext,
3139  relpathbackend(buf->tag.rnode, InvalidBackendId, buf->tag.forkNum),
3140  buf->tag.blockNum, buf->flags,
3141  buf->refcount, GetPrivateRefCount(b));
3142  }
3143 }
3144 #endif
3145 
3146 #ifdef NOT_USED
3147 void
3148 PrintPinnedBufs(void)
3149 {
3150  int i;
3151 
3152  for (i = 0; i < NBuffers; ++i)
3153  {
3154  BufferDesc *buf = GetBufferDescriptor(i);
3155  Buffer b = BufferDescriptorGetBuffer(buf);
3156 
3157  if (GetPrivateRefCount(b) > 0)
3158  {
3159  /* theoretically we should lock the bufhdr here */
3160  elog(LOG,
3161  "[%02d] (freeNext=%d, rel=%s, "
3162  "blockNum=%u, flags=0x%x, refcount=%u %d)",
3163  i, buf->freeNext,
3164  relpathperm(buf->tag.rnode, buf->tag.forkNum),
3165  buf->tag.blockNum, buf->flags,
3166  buf->refcount, GetPrivateRefCount(b));
3167  }
3168  }
3169 }
3170 #endif
3171 
3172 /* ---------------------------------------------------------------------
3173  * FlushRelationBuffers
3174  *
3175  * This function writes all dirty pages of a relation out to disk
3176  * (or more accurately, out to kernel disk buffers), ensuring that the
3177  * kernel has an up-to-date view of the relation.
3178  *
3179  * Generally, the caller should be holding AccessExclusiveLock on the
3180  * target relation to ensure that no other backend is busy dirtying
3181  * more blocks of the relation; the effects can't be expected to last
3182  * after the lock is released.
3183  *
3184  * XXX currently it sequentially searches the buffer pool, should be
3185  * changed to more clever ways of searching. This routine is not
3186  * used in any performance-critical code paths, so it's not worth
3187  * adding additional overhead to normal paths to make it go faster;
3188  * but see also DropRelFileNodeBuffers.
3189  * --------------------------------------------------------------------
3190  */
3191 void
3192 FlushRelationBuffers(Relation rel)
3193 {
3194  int i;
3195  BufferDesc *bufHdr;
3196 
3197  /* Open rel at the smgr level if not already done */
3198  RelationOpenSmgr(rel);
3199 
3200  if (RelationUsesLocalBuffers(rel))
3201  {
3202  for (i = 0; i < NLocBuffer; i++)
3203  {
3204  uint32 buf_state;
3205 
3206  bufHdr = GetLocalBufferDescriptor(i);
3207  if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
3208  ((buf_state = pg_atomic_read_u32(&bufHdr->state)) &
3209  (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
3210  {
3211  ErrorContextCallback errcallback;
3212  Page localpage;
3213 
3214  localpage = (char *) LocalBufHdrGetBlock(bufHdr);
3215 
3216  /* Setup error traceback support for ereport() */
3217  errcallback.callback = local_buffer_write_error_callback;
3218  errcallback.arg = (void *) bufHdr;
3219  errcallback.previous = error_context_stack;
3220  error_context_stack = &errcallback;
3221 
3222  PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
3223 
3224  smgrwrite(rel->rd_smgr,
3225  bufHdr->tag.forkNum,
3226  bufHdr->tag.blockNum,
3227  localpage,
3228  false);
3229 
3230  buf_state &= ~(BM_DIRTY | BM_JUST_DIRTIED);
3231  pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
3232 
3233  /* Pop the error context stack */
3234  error_context_stack = errcallback.previous;
3235  }
3236  }
3237 
3238  return;
3239  }
3240 
3241  /* Make sure we can handle the pin inside the loop */
3242  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
3243 
3244  for (i = 0; i < NBuffers; i++)
3245  {
3246  uint32 buf_state;
3247 
3248  bufHdr = GetBufferDescriptor(i);
3249 
3250  /*
3251  * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
3252  * and saves some cycles.
3253  */
3254  if (!RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node))
3255  continue;
3256 
3257  ReservePrivateRefCountEntry();
3258 
3259  buf_state = LockBufHdr(bufHdr);
3260  if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
3261  (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
3262  {
3263  PinBuffer_Locked(bufHdr);
3264  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
3265  FlushBuffer(bufHdr, rel->rd_smgr);
3266  LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
3267  UnpinBuffer(bufHdr, true);
3268  }
3269  else
3270  UnlockBufHdr(bufHdr, buf_state);
3271  }
3272 }
3273 
3274 /* ---------------------------------------------------------------------
3275  * FlushDatabaseBuffers
3276  *
3277  * This function writes all dirty pages of a database out to disk
3278  * (or more accurately, out to kernel disk buffers), ensuring that the
3279  * kernel has an up-to-date view of the database.
3280  *
3281  * Generally, the caller should be holding an appropriate lock to ensure
3282  * no other backend is active in the target database; otherwise more
3283  * pages could get dirtied.
3284  *
3285  * Note we don't worry about flushing any pages of temporary relations.
3286  * It's assumed these wouldn't be interesting.
3287  * --------------------------------------------------------------------
3288  */
3289 void
3290 FlushDatabaseBuffers(Oid dbid)
3291 {
3292  int i;
3293  BufferDesc *bufHdr;
3294 
3295  /* Make sure we can handle the pin inside the loop */
3296  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
3297 
3298  for (i = 0; i < NBuffers; i++)
3299  {
3300  uint32 buf_state;
3301 
3302  bufHdr = GetBufferDescriptor(i);
3303 
3304  /*
3305  * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
3306  * and saves some cycles.
3307  */
3308  if (bufHdr->tag.rnode.dbNode != dbid)
3309  continue;
3310 
3311  ReservePrivateRefCountEntry();
3312 
3313  buf_state = LockBufHdr(bufHdr);
3314  if (bufHdr->tag.rnode.dbNode == dbid &&
3315  (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
3316  {
3317  PinBuffer_Locked(bufHdr);
3318  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
3319  FlushBuffer(bufHdr, NULL);
3320  LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
3321  UnpinBuffer(bufHdr, true);
3322  }
3323  else
3324  UnlockBufHdr(bufHdr, buf_state);
3325  }
3326 }
3327 
3328 /*
3329  * Flush a previously, shared or exclusively, locked and pinned buffer to the
3330  * OS.
3331  */
3332 void
3333 FlushOneBuffer(Buffer buffer)
3334 {
3335  BufferDesc *bufHdr;
3336 
3337  /* currently not needed, but no fundamental reason not to support */
3338  Assert(!BufferIsLocal(buffer));
3339 
3340  Assert(BufferIsPinned(buffer));
3341 
3342  bufHdr = GetBufferDescriptor(buffer - 1);
3343 
3344  Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
3345 
3346  FlushBuffer(bufHdr, NULL);
3347 }
3348 
3349 /*
3350  * ReleaseBuffer -- release the pin on a buffer
3351  */
3352 void
3353 ReleaseBuffer(Buffer buffer)
3354 {
3355  if (!BufferIsValid(buffer))
3356  elog(ERROR, "bad buffer ID: %d", buffer);
3357 
3358  if (BufferIsLocal(buffer))
3359  {
3360  ResourceOwnerForgetBuffer(CurrentResourceOwner, buffer);
3361 
3362  Assert(LocalRefCount[-buffer - 1] > 0);
3363  LocalRefCount[-buffer - 1]--;
3364  return;
3365  }
3366 
3367  UnpinBuffer(GetBufferDescriptor(buffer - 1), true);
3368 }
3369 
3370 /*
3371  * UnlockReleaseBuffer -- release the content lock and pin on a buffer
3372  *
3373  * This is just a shorthand for a common combination.
3374  */
3375 void
3376 UnlockReleaseBuffer(Buffer buffer)
3377 {
3378  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3379  ReleaseBuffer(buffer);
3380 }
3381 
3382 /*
3383  * IncrBufferRefCount
3384  * Increment the pin count on a buffer that we have *already* pinned
3385  * at least once.
3386  *
3387  * This function cannot be used on a buffer we do not have pinned,
3388  * because it doesn't change the shared buffer state.
3389  */
3390 void
3391 IncrBufferRefCount(Buffer buffer)
3392 {
3393  Assert(BufferIsPinned(buffer));
3394  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
3395  if (BufferIsLocal(buffer))
3396  LocalRefCount[-buffer - 1]++;
3397  else
3398  {
3399  PrivateRefCountEntry *ref;
3400 
3401  ref = GetPrivateRefCountEntry(buffer, true);
3402  Assert(ref != NULL);
3403  ref->refcount++;
3404  }
3405  ResourceOwnerRememberBuffer(CurrentResourceOwner, buffer);
3406 }
3407 
3408 /*
3409  * MarkBufferDirtyHint
3410  *
3411  * Mark a buffer dirty for non-critical changes.
3412  *
3413  * This is essentially the same as MarkBufferDirty, except:
3414  *
3415  * 1. The caller does not write WAL; so if checksums are enabled, we may need
3416  * to write an XLOG_FPI WAL record to protect against torn pages.
3417  * 2. The caller might have only share-lock instead of exclusive-lock on the
3418  * buffer's content lock.
3419  * 3. This function does not guarantee that the buffer is always marked dirty
3420  * (due to a race condition), so it cannot be used for important changes.
3421  */
3422 void
3423 MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
3424 {
3425  BufferDesc *bufHdr;
3426  Page page = BufferGetPage(buffer);
3427 
3428  if (!BufferIsValid(buffer))
3429  elog(ERROR, "bad buffer ID: %d", buffer);
3430 
3431  if (BufferIsLocal(buffer))
3432  {
3433  MarkLocalBufferDirty(buffer);
3434  return;
3435  }
3436 
3437  bufHdr = GetBufferDescriptor(buffer - 1);
3438 
3439  Assert(GetPrivateRefCount(buffer) > 0);
3440  /* here, either share or exclusive lock is OK */
3441  Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
3442 
3443  /*
3444  * This routine might get called many times on the same page, if we are
3445  * making the first scan after commit of an xact that added/deleted many
3446  * tuples. So, be as quick as we can if the buffer is already dirty. We
3447  * do this by not acquiring spinlock if it looks like the status bits are
3448  * already set. Since we make this test unlocked, there's a chance we
3449  * might fail to notice that the flags have just been cleared, and failed
3450  * to reset them, due to memory-ordering issues. But since this function
3451  * is only intended to be used in cases where failing to write out the
3452  * data would be harmless anyway, it doesn't really matter.
3453  */
3454  if ((pg_atomic_read_u32(&bufHdr->state) & (BM_DIRTY | BM_JUST_DIRTIED)) !=
3455  (BM_DIRTY | BM_JUST_DIRTIED))
3456  {
3457  XLogRecPtr lsn = InvalidXLogRecPtr;
3458  bool dirtied = false;
3459  bool delayChkpt = false;
3460  uint32 buf_state;
3461 
3462  /*
3463  * If we need to protect hint bit updates from torn writes, WAL-log a
3464  * full page image of the page. This full page image is only necessary
3465  * if the hint bit update is the first change to the page since the
3466  * last checkpoint.
3467  *
3468  * We don't check full_page_writes here because that logic is included
3469  * when we call XLogInsert() since the value changes dynamically.
3470  */
3471  if (XLogHintBitIsNeeded() &&
3472  (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT))
3473  {
3474  /*
3475  * If we're in recovery we cannot dirty a page because of a hint.
3476  * We can still set the hint, we just cannot dirty the page as a result,
3477  * so the hint is lost when we evict the page or shut down.
3478  *
3479  * See src/backend/storage/page/README for longer discussion.
3480  */
3481  if (RecoveryInProgress())
3482  return;
3483 
3484  /*
3485  * If the block is already dirty because we either made a change
3486  * or set a hint already, then we don't need to write a full page
3487  * image. Note that aggressive cleaning of blocks dirtied by hint
3488  * bit setting would increase the call rate. Bulk setting of hint
3489  * bits would reduce the call rate...
3490  *
3491  * We must issue the WAL record before we mark the buffer dirty.
3492  * Otherwise we might write the page before we write the WAL. That
3493  * causes a race condition, since a checkpoint might occur between
3494  * writing the WAL record and marking the buffer dirty. We solve
3495  * that with a kluge, but one that is already in use during
3496  * transaction commit to prevent race conditions. Basically, we
3497  * simply prevent the checkpoint WAL record from being written
3498  * until we have marked the buffer dirty. We don't start the
3499  * checkpoint flush until we have marked dirty, so our checkpoint
3500  * must flush the change to disk successfully or the checkpoint
3501  * never gets written, in which case crash recovery will fix things up.
3502  *
3503  * It's possible we may enter here without an xid, so it is
3504  * essential that CreateCheckpoint waits for virtual transactions
3505  * rather than full transactionids.
3506  */
3507  MyPgXact->delayChkpt = delayChkpt = true;
3508  lsn = XLogSaveBufferForHint(buffer, buffer_std);
3509  }
3510 
3511  buf_state = LockBufHdr(bufHdr);
3512 
3513  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
3514 
3515  if (!(buf_state & BM_DIRTY))
3516  {
3517  dirtied = true; /* Means "will be dirtied by this action" */
3518 
3519  /*
3520  * Set the page LSN if we wrote a backup block. We aren't supposed
3521  * to set this when only holding a share lock but as long as we
3522  * serialise it somehow we're OK. We choose to set LSN while
3523  * holding the buffer header lock, which causes any reader of an
3524  * LSN who holds only a share lock to also obtain a buffer header
3525  * lock before using PageGetLSN(), which is enforced in
3526  * BufferGetLSNAtomic().
3527  *
3528  * If checksums are enabled, you might think we should reset the
3529  * checksum here. That will happen when the page is written
3530  * sometime later in this checkpoint cycle.
3531  */
3532  if (!XLogRecPtrIsInvalid(lsn))
3533  PageSetLSN(page, lsn);
3534  }
3535 
3536  buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
3537  UnlockBufHdr(bufHdr, buf_state);
3538 
3539  if (delayChkpt)
3540  MyPgXact->delayChkpt = false;
3541 
3542  if (dirtied)
3543  {
3544  VacuumPageDirty++;
3545  pgBufferUsage.shared_blks_dirtied++;
3546  if (VacuumCostActive)
3547  VacuumCostBalance += VacuumCostPageDirty;
3548  }
3549  }
3550 }
3551 
3552 /*
3553  * Release buffer content locks for shared buffers.
3554  *
3555  * Used to clean up after errors.
3556  *
3557  * Currently, we can expect that lwlock.c's LWLockReleaseAll() took care
3558  * of releasing buffer content locks per se; the only thing we need to deal
3559  * with here is clearing any PIN_COUNT request that was in progress.
3560  */
3561 void
3562 UnlockBuffers(void)
3563 {
3564  BufferDesc *buf = PinCountWaitBuf;
3565 
3566  if (buf)
3567  {
3568  uint32 buf_state;
3569 
3570  buf_state = LockBufHdr(buf);
3571 
3572  /*
3573  * Don't complain if flag bit not set; it could have been reset but we
3574  * got a cancel/die interrupt before getting the signal.
3575  */
3576  if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
3577  buf->wait_backend_pid == MyProcPid)
3578  buf_state &= ~BM_PIN_COUNT_WAITER;
3579 
3580  UnlockBufHdr(buf, buf_state);
3581 
3582  PinCountWaitBuf = NULL;
3583  }
3584 }
3585 
3586 /*
3587  * Acquire or release the content_lock for the buffer.
3588  */
3589 void
3590 LockBuffer(Buffer buffer, int mode)
3591 {
3592  BufferDesc *buf;
3593 
3594  Assert(BufferIsValid(buffer));
3595  if (BufferIsLocal(buffer))
3596  return; /* local buffers need no lock */
3597 
3598  buf = GetBufferDescriptor(buffer - 1);
3599 
3600  if (mode == BUFFER_LOCK_UNLOCK)
3601  LWLockRelease(BufferDescriptorGetContentLock(buf));
3602  else if (mode == BUFFER_LOCK_SHARE)
3603  LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_SHARED);
3604  else if (mode == BUFFER_LOCK_EXCLUSIVE)
3605  LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_EXCLUSIVE);
3606  else
3607  elog(ERROR, "unrecognized buffer lock mode: %d", mode);
3608 }
3609 
3610 /*
3611  * Acquire the content_lock for the buffer, but only if we don't have to wait.
3612  *
3613  * This assumes the caller wants BUFFER_LOCK_EXCLUSIVE mode.
3614  */
3615 bool
3616 ConditionalLockBuffer(Buffer buffer)
3617 {
3618  BufferDesc *buf;
3619 
3620  Assert(BufferIsValid(buffer));
3621  if (BufferIsLocal(buffer))
3622  return true; /* act as though we got it */
3623 
3624  buf = GetBufferDescriptor(buffer - 1);
3625 
3626  return LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
3627  LW_EXCLUSIVE);
3628 }
3629 
3630 /*
3631  * LockBufferForCleanup - lock a buffer in preparation for deleting items
3632  *
3633  * Items may be deleted from a disk page only when the caller (a) holds an
3634  * exclusive lock on the buffer and (b) has observed that no other backend
3635  * holds a pin on the buffer. If there is a pin, then the other backend
3636  * might have a pointer into the buffer (for example, a heapscan reference
3637  * to an item --- see README for more details). It's OK if a pin is added
3638  * after the cleanup starts, however; the newly-arrived backend will be
3639  * unable to look at the page until we release the exclusive lock.
3640  *
3641  * To implement this protocol, a would-be deleter must pin the buffer and
3642  * then call LockBufferForCleanup(). LockBufferForCleanup() is similar to
3643  * LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE), except that it loops until
3644  * it has successfully observed pin count = 1.
3645  */
3646 void
3647 LockBufferForCleanup(Buffer buffer)
3648 {
3649  BufferDesc *bufHdr;
3650 
3651  Assert(BufferIsValid(buffer));
3652  Assert(PinCountWaitBuf == NULL);
3653 
3654  if (BufferIsLocal(buffer))
3655  {
3656  /* There should be exactly one pin */
3657  if (LocalRefCount[-buffer - 1] != 1)
3658  elog(ERROR, "incorrect local pin count: %d",
3659  LocalRefCount[-buffer - 1]);
3660  /* Nobody else to wait for */
3661  return;
3662  }
3663 
3664  /* There should be exactly one local pin */
3665  if (GetPrivateRefCount(buffer) != 1)
3666  elog(ERROR, "incorrect local pin count: %d",
3667  GetPrivateRefCount(buffer));
3668 
3669  bufHdr = GetBufferDescriptor(buffer - 1);
3670 
3671  for (;;)
3672  {
3673  uint32 buf_state;
3674 
3675  /* Try to acquire lock */
3676  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3677  buf_state = LockBufHdr(bufHdr);
3678 
3679  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
3680  if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
3681  {
3682  /* Successfully acquired exclusive lock with pincount 1 */
3683  UnlockBufHdr(bufHdr, buf_state);
3684  return;
3685  }
3686  /* Failed, so mark myself as waiting for pincount 1 */
3687  if (buf_state & BM_PIN_COUNT_WAITER)
3688  {
3689  UnlockBufHdr(bufHdr, buf_state);
3690  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3691  elog(ERROR, "multiple backends attempting to wait for pincount 1");
3692  }
3693  bufHdr->wait_backend_pid = MyProcPid;
3694  PinCountWaitBuf = bufHdr;
3695  buf_state |= BM_PIN_COUNT_WAITER;
3696  UnlockBufHdr(bufHdr, buf_state);
3697  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3698 
3699  /* Wait to be signaled by UnpinBuffer() */
3700  if (InHotStandby)
3701  {
3702  /* Publish the bufid that Startup process waits on */
3703  SetStartupBufferPinWaitBufId(buffer - 1);
3704  /* Set alarm and then wait to be signaled by UnpinBuffer() */
3705  ResolveRecoveryConflictWithBufferPin();
3706  /* Reset the published bufid */
3707  SetStartupBufferPinWaitBufId(-1);
3708  }
3709  else
3710  ProcWaitForSignal(PG_WAIT_BUFFER_PIN);
3711 
3712  /*
3713  * Remove flag marking us as waiter. Normally this will not be set
3714  * anymore, but ProcWaitForSignal() can return for other signals as
3715  * well. We take care to only reset the flag if we're the waiter, as
3716  * theoretically another backend could have started waiting. That's
3717  * impossible with the current usages due to table level locking, but
3718  * better be safe.
3719  */
3720  buf_state = LockBufHdr(bufHdr);
3721  if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
3722  bufHdr->wait_backend_pid == MyProcPid)
3723  buf_state &= ~BM_PIN_COUNT_WAITER;
3724  UnlockBufHdr(bufHdr, buf_state);
3725 
3726  PinCountWaitBuf = NULL;
3727  /* Loop back and try again */
3728  }
3729 }
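
Following the protocol in the comment above, a would-be deleter takes its own pin first and only then asks for the cleanup lock. An illustrative sketch (rel and blkno are assumed inputs, not part of this file):

#include "postgres.h"
#include "storage/bufmgr.h"

/* Hypothetical caller using the cleanup-lock protocol. */
static void
cleanup_block_example(Relation rel, BlockNumber blkno)
{
	Buffer		buf = ReadBuffer(rel, blkno);	/* take our own pin first */

	LockBufferForCleanup(buf);	/* returns once we are the only pin holder */

	/* ... safe to delete items: no other backend can point into the page ... */
	MarkBufferDirty(buf);

	UnlockReleaseBuffer(buf);
}
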
3730 
3731 /*
3732  * Check called from RecoveryConflictInterrupt handler when Startup
3733  * process requests cancellation of all pin holders that are blocking it.
3734  */
3735 bool
3736 HoldingBufferPinThatDelaysRecovery(void)
3737 {
3738  int bufid = GetStartupBufferPinWaitBufId();
3739 
3740  /*
3741  * If we get woken slowly then it's possible that the Startup process was
3742  * already woken by other backends before we got here. Also possible that
3743  * we get here by multiple interrupts or interrupts at inappropriate
3744  * times, so make sure we do nothing if the bufid is not set.
3745  */
3746  if (bufid < 0)
3747  return false;
3748 
3749  if (GetPrivateRefCount(bufid + 1) > 0)
3750  return true;
3751 
3752  return false;
3753 }
3754 
3755 /*
3756  * ConditionalLockBufferForCleanup - as above, but don't wait to get the lock
3757  *
3758  * We won't loop, but just check once to see if the pin count is OK. If
3759  * not, return false with no lock held.
3760  */
3761 bool
3762 ConditionalLockBufferForCleanup(Buffer buffer)
3763 {
3764  BufferDesc *bufHdr;
3765  uint32 buf_state,
3766  refcount;
3767 
3768  Assert(BufferIsValid(buffer));
3769 
3770  if (BufferIsLocal(buffer))
3771  {
3772  refcount = LocalRefCount[-buffer - 1];
3773  /* There should be exactly one pin */
3774  Assert(refcount > 0);
3775  if (refcount != 1)
3776  return false;
3777  /* Nobody else to wait for */
3778  return true;
3779  }
3780 
3781  /* There should be exactly one local pin */
3782  refcount = GetPrivateRefCount(buffer);
3783  Assert(refcount);
3784  if (refcount != 1)
3785  return false;
3786 
3787  /* Try to acquire lock */
3788  if (!ConditionalLockBuffer(buffer))
3789  return false;
3790 
3791  bufHdr = GetBufferDescriptor(buffer - 1);
3792  buf_state = LockBufHdr(bufHdr);
3793  refcount = BUF_STATE_GET_REFCOUNT(buf_state);
3794 
3795  Assert(refcount > 0);
3796  if (refcount == 1)
3797  {
3798  /* Successfully acquired exclusive lock with pincount 1 */
3799  UnlockBufHdr(bufHdr, buf_state);
3800  return true;
3801  }
3802 
3803  /* Failed, so release the lock */
3804  UnlockBufHdr(bufHdr, buf_state);
3805  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3806  return false;
3807 }
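
Opportunistic cleanup paths typically try this and simply skip the page when it fails rather than wait. An illustrative sketch, assuming buf is already pinned by the caller:

/* Hypothetical opportunistic cleanup: never wait for a cleanup lock. */
if (ConditionalLockBufferForCleanup(buf))
{
	/* exclusive lock held and we are the sole pinner: prune freely */
	/* ... remove dead items, MarkBufferDirty(buf) if anything changed ... */
	UnlockReleaseBuffer(buf);
}
else
{
	/* page is busy; drop our pin and move on to the next block */
	ReleaseBuffer(buf);
}
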
3808 
3809 /*
3810  * IsBufferCleanupOK - as above, but we already have the lock
3811  *
3812  * Check whether it's OK to perform cleanup on a buffer we've already
3813  * locked. If we observe that the pin count is 1, our exclusive lock
3814  * happens to be a cleanup lock, and we can proceed with anything that
3815  * would have been allowable had we sought a cleanup lock originally.
3816  */
3817 bool
3818 IsBufferCleanupOK(Buffer buffer)
3819 {
3820  BufferDesc *bufHdr;
3821  uint32 buf_state;
3822 
3823  Assert(BufferIsValid(buffer));
3824 
3825  if (BufferIsLocal(buffer))
3826  {
3827  /* There should be exactly one pin */
3828  if (LocalRefCount[-buffer - 1] != 1)
3829  return false;
3830  /* Nobody else to wait for */
3831  return true;
3832  }
3833 
3834  /* There should be exactly one local pin */
3835  if (GetPrivateRefCount(buffer) != 1)
3836  return false;
3837 
3838  bufHdr = GetBufferDescriptor(buffer - 1);
3839 
3840  /* caller must hold exclusive lock on buffer */
3841  Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
3842  LW_EXCLUSIVE));
3843 
3844  buf_state = LockBufHdr(bufHdr);
3845 
3846  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
3847  if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
3848  {
3849  /* pincount is OK. */
3850  UnlockBufHdr(bufHdr, buf_state);
3851  return true;
3852  }
3853 
3854  UnlockBufHdr(bufHdr, buf_state);
3855  return false;
3856 }
3857 
3858 
3859 /*
3860  * Functions for buffer I/O handling
3861  *
3862  * Note: We assume that nested buffer I/O never occurs.
3863  * i.e. at most one io_in_progress lock is held per proc.
3864  *
3865  * Also note that these are used only for shared buffers, not local ones.
3866  */
3867 
3868 /*
3869  * WaitIO -- Block until the IO_IN_PROGRESS flag on 'buf' is cleared.
3870  */
3871 static void
3872 WaitIO(BufferDesc *buf)
3873 {
3874  /*
3875  * Changed to wait until there's no IO - Inoue 01/13/2000
3876  *
3877  * Note this is *necessary* because an error abort in the process doing
3878  * I/O could release the io_in_progress_lock prematurely. See
3879  * AbortBufferIO.
3880  */
3881  for (;;)
3882  {
3883  uint32 buf_state;
3884 
3885  /*
3886  * It may not be necessary to acquire the spinlock to check the flag
3887  * here, but since this test is essential for correctness, we'd better
3888  * play it safe.
3889  */
3890  buf_state = LockBufHdr(buf);
3891  UnlockBufHdr(buf, buf_state);
3892 
3893  if (!(buf_state & BM_IO_IN_PROGRESS))
3894  break;
3895  LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_SHARED);
3896  LWLockRelease(BufferDescriptorGetIOLock(buf));
3897  }
3898 }
3899 
3900 /*
3901  * StartBufferIO: begin I/O on this buffer
3902  * (Assumptions)
3903  * My process is executing no IO
3904  * The buffer is Pinned
3905  *
3906  * In some scenarios there are race conditions in which multiple backends
3907  * could attempt the same I/O operation concurrently. If someone else
3908  * has already started I/O on this buffer then we will block on the
3909  * io_in_progress lock until he's done.
3910  *
3911  * Input operations are only attempted on buffers that are not BM_VALID,
3912  * and output operations only on buffers that are BM_VALID and BM_DIRTY,
3913  * so we can always tell if the work is already done.
3914  *
3915  * Returns true if we successfully marked the buffer as I/O busy,
3916  * false if someone else already did the work.
3917  */
3918 static bool
3919 StartBufferIO(BufferDesc *buf, bool forInput)
3920 {
3921  uint32 buf_state;
3922 
3923  Assert(!InProgressBuf);
3924 
3925  for (;;)
3926  {
3927  /*
3928  * Grab the io_in_progress lock so that other processes can wait for
3929  * me to finish the I/O.
3930  */
3931  LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
3932 
3933  buf_state = LockBufHdr(buf);
3934 
3935  if (!(buf_state & BM_IO_IN_PROGRESS))
3936  break;
3937 
3938  /*
3939  * The only way BM_IO_IN_PROGRESS could be set when the io_in_progress
3940  * lock isn't held is if the process doing the I/O is recovering from
3941  * an error (see AbortBufferIO). If that's the case, we must wait for
3942  * him to get unwedged.
3943  */
3944  UnlockBufHdr(buf, buf_state);
3945  LWLockRelease(BufferDescriptorGetIOLock(buf));
3946  WaitIO(buf);
3947  }
3948 
3949  /* Once we get here, there is definitely no I/O active on this buffer */
3950 
3951  if (forInput ? (buf_state & BM_VALID) : !(buf_state & BM_DIRTY))
3952  {
3953  /* someone else already did the I/O */
3954  UnlockBufHdr(buf, buf_state);
3955  LWLockRelease(BufferDescriptorGetIOLock(buf));
3956  return false;
3957  }
3958 
3959  buf_state |= BM_IO_IN_PROGRESS;
3960  UnlockBufHdr(buf, buf_state);
3961 
3962  InProgressBuf = buf;
3963  IsForInput = forInput;
3964 
3965  return true;
3966 }
3967 
3968 /*
3969  * TerminateBufferIO: release a buffer we were doing I/O on
3970  * (Assumptions)
3971  * My process is executing IO for the buffer
3972  * BM_IO_IN_PROGRESS bit is set for the buffer
3973  * We hold the buffer's io_in_progress lock
3974  * The buffer is Pinned
3975  *
3976  * If clear_dirty is true and BM_JUST_DIRTIED is not set, we clear the
3977  * buffer's BM_DIRTY flag. This is appropriate when terminating a
3978  * successful write. The check on BM_JUST_DIRTIED is necessary to avoid
3979  * marking the buffer clean if it was re-dirtied while we were writing.
3980  *
3981  * set_flag_bits gets ORed into the buffer's flags. It must include
3982  * BM_IO_ERROR in a failure case. For successful completion it could
3983  * be 0, or BM_VALID if we just finished reading in the page.
3984  */
3985 static void
3986 TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits)
3987 {
3988  uint32 buf_state;
3989 
3990  Assert(buf == InProgressBuf);
3991 
3992  buf_state = LockBufHdr(buf);
3993 
3994  Assert(buf_state & BM_IO_IN_PROGRESS);
3995 
3996  buf_state &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR);
3997  if (clear_dirty && !(buf_state & BM_JUST_DIRTIED))
3998  buf_state &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
3999 
4000  buf_state |= set_flag_bits;
4001  UnlockBufHdr(buf, buf_state);
4002 
4003  InProgressBuf = NULL;
4004 
4005  LWLockRelease(BufferDescriptorGetIOLock(buf));
4006 }
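
StartBufferIO() and TerminateBufferIO() bracket every physical read or write of a shared buffer. The fragment below sketches the shape of that bracket on the read side; it is illustrative only and assumes bufHdr is pinned and that smgr, forkNum and blockNum describe the block being read.

/* Illustrative shape of the I/O bracket used by this file's read path. */
if (StartBufferIO(bufHdr, true))
{
	/* we won the right to perform the read; no other backend is doing it */
	smgrread(smgr, forkNum, blockNum, (char *) BufHdrGetBlock(bufHdr));

	/* mark the page valid and release the io_in_progress lock */
	TerminateBufferIO(bufHdr, false, BM_VALID);
}
/* else: someone else completed the I/O while we waited; nothing left to do */
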
4007 
4008 /*
4009  * AbortBufferIO: Clean up any active buffer I/O after an error.
4010  *
4011  * All LWLocks we might have held have been released,
4012  * but we haven't yet released buffer pins, so the buffer is still pinned.
4013  *
4014  * If I/O was in progress, we always set BM_IO_ERROR, even though it's
4015  * possible the error condition wasn't related to the I/O.
4016  */
4017 void
4018 AbortBufferIO(void)
4019 {
4020  BufferDesc *buf = InProgressBuf;
4021 
4022  if (buf)
4023  {
4024  uint32 buf_state;
4025 
4026  /*
4027  * Since LWLockReleaseAll has already been called, we're not holding
4028  * the buffer's io_in_progress_lock. We have to re-acquire it so that
4029  * we can use TerminateBufferIO. Anyone who's executing WaitIO on the
4030  * buffer will be in a busy spin until we succeed in doing this.
4031  */
4032  LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
4033 
4034  buf_state = LockBufHdr(buf);
4035  Assert(buf_state & BM_IO_IN_PROGRESS);
4036  if (IsForInput)
4037  {
4038  Assert(!(buf_state & BM_DIRTY));
4039 
4040  /* We'd better not think buffer is valid yet */
4041  Assert(!(buf_state & BM_VALID));
4042  UnlockBufHdr(buf, buf_state);
4043  }
4044  else
4045  {
4046  Assert(buf_state & BM_DIRTY);
4047  UnlockBufHdr(buf, buf_state);
4048  /* Issue notice if this is not the first failure... */
4049  if (buf_state & BM_IO_ERROR)
4050  {
4051  /* Buffer is pinned, so we can read tag without spinlock */
4052  char *path;
4053 
4054  path = relpathperm(buf->tag.rnode, buf->tag.forkNum);
4055  ereport(WARNING,
4056  (errcode(ERRCODE_IO_ERROR),
4057  errmsg("could not write block %u of %s",
4058  buf->tag.blockNum, path),
4059  errdetail("Multiple failures --- write error might be permanent.")));
4060  pfree(path);
4061  }
4062  }
4063  TerminateBufferIO(buf, false, BM_IO_ERROR);
4064  }
4065 }
4066 
4067 /*
4068  * Error context callback for errors occurring during shared buffer writes.
4069  */
4070 static void
4071 shared_buffer_write_error_callback(void *arg)
4072 {
4073  BufferDesc *bufHdr = (BufferDesc *) arg;
4074 
4075  /* Buffer is pinned, so we can read the tag without locking the spinlock */
4076  if (bufHdr != NULL)
4077  {
4078  char *path = relpathperm(bufHdr->tag.rnode, bufHdr->tag.forkNum);
4079 
4080  errcontext("writing block %u of relation %s",
4081  bufHdr->tag.blockNum, path);
4082  pfree(path);
4083  }
4084 }
4085 
4086 /*
4087  * Error context callback for errors occurring during local buffer writes.
4088  */
4089 static void
4090 local_buffer_write_error_callback(void *arg)
4091 {
4092  BufferDesc *bufHdr = (BufferDesc *) arg;
4093 
4094  if (bufHdr != NULL)
4095  {
4096  char *path = relpathbackend(bufHdr->tag.rnode, MyBackendId,
4097  bufHdr->tag.forkNum);
4098 
4099  errcontext("writing block %u of relation %s",
4100  bufHdr->tag.blockNum, path);
4101  pfree(path);
4102  }
4103 }
4104 
4105 /*
4106  * RelFileNode qsort/bsearch comparator; see RelFileNodeEquals.
4107  */
4108 static int
4109 rnode_comparator(const void *p1, const void *p2)
4110 {
4111  RelFileNode n1 = *(const RelFileNode *) p1;
4112  RelFileNode n2 = *(const RelFileNode *) p2;
4113 
4114  if (n1.relNode < n2.relNode)
4115  return -1;
4116  else if (n1.relNode > n2.relNode)
4117  return 1;
4118 
4119  if (n1.dbNode < n2.dbNode)
4120  return -1;
4121  else if (n1.dbNode > n2.dbNode)
4122  return 1;
4123 
4124  if (n1.spcNode < n2.spcNode)
4125  return -1;
4126  else if (n1.spcNode > n2.spcNode)
4127  return 1;
4128  else
4129  return 0;
4130 }
4131 
4132 /*
4133  * Lock buffer header - set BM_LOCKED in buffer state.
4134  */
4135 uint32
4136 LockBufHdr(BufferDesc *desc)
4137 {
4138  SpinDelayStatus delayStatus;
4139  uint32 old_buf_state;
4140 
4141  init_local_spin_delay(&delayStatus);
4142 
4143  while (true)
4144  {
4145  /* set BM_LOCKED flag */
4146  old_buf_state = pg_atomic_fetch_or_u32(&desc->state, BM_LOCKED);
4147  /* if it wasn't set before we're OK */
4148  if (!(old_buf_state & BM_LOCKED))
4149  break;
4150  perform_spin_delay(&delayStatus);
4151  }
4152  finish_spin_delay(&delayStatus);
4153  return old_buf_state | BM_LOCKED;
4154 }
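
The header lock protects only short read-modify-write sequences on the state word; the updated state is handed back through UnlockBufHdr(), which also clears BM_LOCKED. A minimal sketch of that pattern, much as MarkBufferDirtyHint() above sets the dirty bits:

/* Typical brief critical section on a buffer header. */
uint32		buf_state;

buf_state = LockBufHdr(bufHdr);
buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
UnlockBufHdr(bufHdr, buf_state);
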
4155 
4156 /*
4157  * Wait until the BM_LOCKED flag isn't set anymore and return the buffer's
4158  * state at that point.
4159  *
4160  * Obviously the buffer could be locked by the time the value is returned, so
4161  * this is primarily useful in CAS style loops.
4162  */
4163 static uint32
4164 WaitBufHdrUnlocked(BufferDesc *buf)
4165 {
4166  SpinDelayStatus delayStatus;
4167  uint32 buf_state;
4168 
4169  init_local_spin_delay(&delayStatus);
4170 
4171  buf_state = pg_atomic_read_u32(&buf->state);
4172 
4173  while (buf_state & BM_LOCKED)
4174  {
4175  perform_spin_delay(&delayStatus);
4176  buf_state = pg_atomic_read_u32(&buf->state);
4177  }
4178 
4179  finish_spin_delay(&delayStatus);
4180 
4181  return buf_state;
4182 }
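
As the comment says, this is meant for compare-and-swap loops: read the state, wait out BM_LOCKED if it is set, attempt the CAS, and retry on failure. A sketch of that loop shape, modelled loosely on how PinBuffer() bumps the reference count (illustrative, not a copy of that function):

/* CAS-style update of a buffer's state word, waiting out the header lock. */
uint32		old_buf_state = pg_atomic_read_u32(&buf->state);

for (;;)
{
	uint32		buf_state;

	if (old_buf_state & BM_LOCKED)
		old_buf_state = WaitBufHdrUnlocked(buf);

	buf_state = old_buf_state + BUF_REFCOUNT_ONE;	/* the intended change */

	if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state, buf_state))
		break;					/* success */
	/* CAS failed: old_buf_state has been refreshed; loop and try again */
}
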
4183 
4184 /*
4185  * BufferTag comparator.
4186  */
4187 static int
4188 buffertag_comparator(const void *a, const void *b)
4189 {
4190  const BufferTag *ba = (const BufferTag *) a;
4191  const BufferTag *bb = (const BufferTag *) b;
4192  int ret;
4193 
4194  ret = rnode_comparator(&ba->rnode, &bb->rnode);
4195 
4196  if (ret != 0)
4197  return ret;
4198 
4199  if (ba->forkNum < bb->forkNum)
4200  return -1;
4201  if (ba->forkNum > bb->forkNum)
4202  return 1;
4203 
4204  if (ba->blockNum < bb->blockNum)
4205  return -1;
4206  if (ba->blockNum > bb->blockNum)
4207  return 1;
4208 
4209  return 0;
4210 }
4211 
4212 /*
4213  * Comparator determining the writeout order in a checkpoint.
4214  *
4215  * It is important that tablespaces are compared first, the logic balancing
4216  * writes between tablespaces relies on it.
4217  */
4218 static int
4219 ckpt_buforder_comparator(const void *pa, const void *pb)
4220 {
4221  const CkptSortItem *a = (const CkptSortItem *) pa;
4222  const CkptSortItem *b = (const CkptSortItem *) pb;
4223 
4224  /* compare tablespace */
4225  if (a->tsId < b->tsId)
4226  return -1;
4227  else if (a->tsId > b->tsId)
4228  return 1;
4229  /* compare relation */
4230  if (a->relNode < b->relNode)
4231  return -1;
4232  else if (a->relNode > b->relNode)
4233  return 1;
4234  /* compare fork */
4235  else if (a->forkNum < b->forkNum)
4236  return -1;
4237  else if (a->forkNum > b->forkNum)
4238  return 1;
4239  /* compare block number */
4240  else if (a->blockNum < b->blockNum)
4241  return -1;
4242  else if (a->blockNum > b->blockNum)
4243  return 1;
4244  /* equal page IDs are unlikely, but not impossible */
4245  return 0;
4246 }
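
BufferSync() sorts the checkpoint's collected buffer IDs with this comparator so that each tablespace's blocks are written in file order, roughly as below (num_to_scan stands for BufferSync()'s count of collected entries):

/* Roughly how BufferSync() orders the to-be-checkpointed buffers. */
qsort(CkptBufferIds, num_to_scan, sizeof(CkptSortItem),
	  ckpt_buforder_comparator);
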
4247 
4248 /*
4249  * Comparator for a Min-Heap over the per-tablespace checkpoint completion
4250  * progress.
4251  */
4252 static int
4253 ts_ckpt_progress_comparator(Datum a, Datum b, void *arg)
4254 {
4255  CkptTsStatus *sa = (CkptTsStatus *) a;
4256  CkptTsStatus *sb = (CkptTsStatus *) b;
4257 
4258  /* we want a min-heap, so return 1 when a < b */
4259  if (sa->progress < sb->progress)
4260  return 1;
4261  else if (sa->progress == sb->progress)
4262  return 0;
4263  else
4264  return -1;
4265 }
4266 
4267 /*
4268  * Initialize a writeback context, discarding potential previous state.
4269  *
4270  * *max_pending is a pointer instead of an immediate value, so the coalesce
4271  * limits can easily be changed by the GUC mechanism, and so calling code does
4272  * not have to check the current configuration. A value of 0 means that no
4273  * writeback control will be performed.
4274  */
4275 void
4276 WritebackContextInit(WritebackContext *context, int *max_pending)
4277 {
4278  Assert(*max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
4279 
4280  context->max_pending = max_pending;
4281  context->nr_pending = 0;
4282 }
4283 
4284 /*
4285  * Add buffer to list of pending writeback requests.
4286  */
4287 void
4288 ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag)
4289 {
4290  PendingWriteback *pending;
4291 
4292  /*
4293  * Add buffer to the pending writeback array, unless writeback control is
4294  * disabled.
4295  */
4296  if (*context->max_pending > 0)
4297  {
4298  Assert(*context->max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
4299 
4300  pending = &context->pending_writebacks[context->nr_pending++];
4301 
4302  pending->tag = *tag;
4303  }
4304 
4305  /*
4306  * Perform pending flushes if the writeback limit is exceeded. This
4307  * includes the case where previously an item has been added, but control
4308  * is now disabled.
4309  */
4310  if (context->nr_pending >= *context->max_pending)
4311  IssuePendingWritebacks(context);
4312 }
4313 
4314 /*
4315  * Issue all pending writeback requests, previously scheduled with
4316  * ScheduleBufferTagForWriteback, to the OS.
4317  *
4318  * Because this is only used to improve the OS's IO scheduling we try to never
4319  * error out - it's just a hint.
4320  */
4321 void
4322 IssuePendingWritebacks(WritebackContext *context)
4323 {
4324  int i;
4325 
4326  if (context->nr_pending == 0)
4327  return;
4328 
4329  /*
4330  * Executing the writes in-order can make them a lot faster, and allows us to
4331  * merge writeback requests to consecutive blocks into larger writebacks.
4332  */
4333  qsort(&context->pending_writebacks, context->nr_pending,
4334  sizeof(PendingWriteback), buffertag_comparator);
4335 
4336  /*
4337  * Coalesce neighbouring writes, but nothing else. For that we iterate
4338  * through the, now sorted, array of pending flushes, and look forward to
4339  * find all neighbouring (or identical) writes.
4340  */
4341  for (i = 0; i < context->nr_pending; i++)
4342  {
4343  PendingWriteback *cur;
4344  PendingWriteback *next;
4345  SMgrRelation reln;
4346  int ahead;
4347  BufferTag tag;
4348  Size nblocks = 1;
4349 
4350  cur = &context->pending_writebacks[i];
4351  tag = cur->tag;
4352 
4353  /*
4354  * Peek ahead, into following writeback requests, to see if they can
4355  * be combined with the current one.
4356  */
4357  for (ahead = 0; i + ahead + 1 < context->nr_pending; ahead++)
4358  {
4359  next = &context->pending_writebacks[i + ahead + 1];
4360 
4361  /* different file, stop */
4362  if (!RelFileNodeEquals(cur->tag.rnode, next->tag.rnode) ||
4363  cur->tag.forkNum != next->tag.forkNum)
4364  break;
4365 
4366  /* ok, block queued twice, skip */
4367  if (cur->tag.blockNum == next->tag.blockNum)
4368  continue;
4369 
4370  /* only merge consecutive writes */
4371  if (cur->tag.blockNum + 1 != next->tag.blockNum)
4372  break;
4373 
4374  nblocks++;
4375  cur = next;
4376  }
4377 
4378  i += ahead;
4379 
4380  /* and finally tell the kernel to write the data to storage */
4381  reln = smgropen(tag.rnode, InvalidBackendId);
4382  smgrwriteback(reln, tag.forkNum, tag.blockNum, nblocks);
4383  }
4384 
4385  context->nr_pending = 0;
4386 }
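
Taken together, the writeback-control API is used as: initialize once, schedule a tag for each buffer just written, and issue whatever is still pending at the end. A condensed sketch of the checkpoint-side usage (the tag variable is assumed to describe a buffer that FlushBuffer() has just written):

/* Condensed usage pattern for the writeback-control API. */
WritebackContext wb_context;
BufferTag	tag;				/* filled in for each buffer written */

WritebackContextInit(&wb_context, &checkpoint_flush_after);

/* ... inside the write loop, after each buffer is flushed ... */
ScheduleBufferTagForWriteback(&wb_context, &tag);

/* after the loop, push any remaining writeback hints to the kernel */
IssuePendingWritebacks(&wb_context);
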
4387 
4388 
4389 /*
4390  * Implement slower/larger portions of TestForOldSnapshot
4391  *
4392  * Smaller/faster portions are put inline, but the entire set of logic is too
4393  * big for that.
4394  */
4395 void
4396 TestForOldSnapshot_impl(Snapshot snapshot, Relation relation)
4397 {
4398  if (RelationAllowsEarlyPruning(relation)
4399  && (snapshot)->whenTaken < GetOldSnapshotThresholdTimestamp())
4400  ereport(ERROR,
4401  (errcode(ERRCODE_SNAPSHOT_TOO_OLD),
4402  errmsg("snapshot too old")));
4403 }
Definition: bufmgr.c:2479