1 /*-------------------------------------------------------------------------
2  *
3  * bufmgr.c
4  * buffer manager interface routines
5  *
6  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/storage/buffer/bufmgr.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 /*
16  * Principal entry points:
17  *
18  * ReadBuffer() -- find or create a buffer holding the requested page,
19  * and pin it so that no one can destroy it while this process
20  * is using it.
21  *
22  * ReleaseBuffer() -- unpin a buffer
23  *
24  * MarkBufferDirty() -- mark a pinned buffer's contents as "dirty".
25  * The disk write is delayed until buffer replacement or checkpoint.
26  *
27  * See also these files:
28  * freelist.c -- chooses victim for buffer replacement
29  * buf_table.c -- manages the buffer lookup table
30  */
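For orientation, here is an editorial sketch (not part of bufmgr.c) of how a caller typically combines these entry points; the helper name touch_block is hypothetical, and real callers also hold an appropriate relation lock and emit WAL before dirtying the page.

#include "postgres.h"

#include "storage/bufmgr.h"
#include "utils/rel.h"

/* Hypothetical helper: pin, lock, modify, dirty, and release one block. */
static void
touch_block(Relation rel, BlockNumber blkno)
{
	Buffer		buf;

	/* find or create a buffer holding the block; it comes back pinned */
	buf = ReadBuffer(rel, blkno);

	/* take the content lock before touching the page */
	LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

	/* ... modify the page here (and normally write WAL for the change) ... */

	/* the actual disk write is deferred to buffer replacement or checkpoint */
	MarkBufferDirty(buf);

	/* drop the content lock and the pin */
	UnlockReleaseBuffer(buf);
}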
31 #include "postgres.h"
32 
33 #include <sys/file.h>
34 #include <unistd.h>
35 
36 #include "access/xlog.h"
37 #include "catalog/catalog.h"
38 #include "catalog/storage.h"
39 #include "executor/instrument.h"
40 #include "lib/binaryheap.h"
41 #include "miscadmin.h"
42 #include "pg_trace.h"
43 #include "pgstat.h"
44 #include "postmaster/bgwriter.h"
45 #include "storage/buf_internals.h"
46 #include "storage/bufmgr.h"
47 #include "storage/ipc.h"
48 #include "storage/proc.h"
49 #include "storage/smgr.h"
50 #include "storage/standby.h"
51 #include "utils/rel.h"
52 #include "utils/resowner_private.h"
53 #include "utils/timestamp.h"
54 
55 
56 /* Note: these two macros only work on shared buffers, not local ones! */
57 #define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
58 #define BufferGetLSN(bufHdr) (PageGetLSN(BufHdrGetBlock(bufHdr)))
59 
60 /* Note: this macro only works on local buffers, not shared ones! */
61 #define LocalBufHdrGetBlock(bufHdr) \
62  LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
63 
64 /* Bits in SyncOneBuffer's return value */
65 #define BUF_WRITTEN 0x01
66 #define BUF_REUSABLE 0x02
67 
68 #define DROP_RELS_BSEARCH_THRESHOLD 20
69 
70 typedef struct PrivateRefCountEntry
71 {
72  Buffer buffer;
73  int32 refcount;
74 } PrivateRefCountEntry;
75 
76 /* 64 bytes, about the size of a cache line on common systems */
77 #define REFCOUNT_ARRAY_ENTRIES 8
78 
79 /*
80  * Status of buffers to checkpoint for a particular tablespace, used
81  * internally in BufferSync.
82  */
83 typedef struct CkptTsStatus
84 {
85  /* oid of the tablespace */
86  Oid tsId;
87 
88  /*
89  * Checkpoint progress for this tablespace. To make progress comparable
90  * between tablespaces the progress is, for each tablespace, measured as a
91  * number between 0 and the total number of to-be-checkpointed pages. Each
92  * page checkpointed in this tablespace increments this space's progress
93  * by progress_slice.
94  */
95  double progress;
96  double progress_slice;
97 
98  /* number of to-be checkpointed pages in this tablespace */
99  int num_to_scan;
100  /* already processed pages in this tablespace */
101  int num_scanned;
102 
103  /* current offset in CkptBufferIds for this tablespace */
104  int index;
105 } CkptTsStatus;
106 
107 /* GUC variables */
108 bool zero_damaged_pages = false;
109 int bgwriter_lru_maxpages = 100;
110 double bgwriter_lru_multiplier = 2.0;
111 bool track_io_timing = false;
112 int effective_io_concurrency = 0;
113 
114 /*
115  * GUC variables about triggering kernel writeback for buffers written; OS
116  * dependent defaults are set via the GUC mechanism.
117  */
118 int checkpoint_flush_after = 0;
119 int bgwriter_flush_after = 0;
120 int backend_flush_after = 0;
121 
122 /*
123  * How many buffers PrefetchBuffer callers should try to stay ahead of their
124  * ReadBuffer calls by. This is maintained by the assign hook for
125  * effective_io_concurrency. Zero means "never prefetch". This value is
126  * only used for buffers not belonging to tablespaces that have their
127  * effective_io_concurrency parameter set.
128  */
129 int target_prefetch_pages = 0;
130 
131 /* local state for StartBufferIO and related functions */
132 static BufferDesc *InProgressBuf = NULL;
133 static bool IsForInput;
134 
135 /* local state for LockBufferForCleanup */
136 static BufferDesc *PinCountWaitBuf = NULL;
137 
138 /*
139  * Backend-Private refcount management:
140  *
141  * Each buffer also has a private refcount that keeps track of the number of
142  * times the buffer is pinned in the current process. This is so that the
143  * shared refcount needs to be modified only once if a buffer is pinned more
144  * than once by an individual backend. It's also used to check that no buffers
145  * are still pinned at the end of transactions and when exiting.
146  *
147  *
148  * To avoid - as we used to - requiring an array with NBuffers entries to keep
149  * track of local buffers, we use a small sequentially searched array
150  * (PrivateRefCountArray) and an overflow hash table (PrivateRefCountHash) to
151  * keep track of backend local pins.
152  *
153  * Until no more than REFCOUNT_ARRAY_ENTRIES buffers are pinned at once, all
154  * refcounts are kept track of in the array; after that, new array entries
155  * displace old ones into the hash table. That way a frequently used entry
156  * can't get "stuck" in the hashtable while infrequent ones clog the array.
157  *
158  * Note that in most scenarios the number of pinned buffers will not exceed
159  * REFCOUNT_ARRAY_ENTRIES.
160  *
161  *
162  * To enter a buffer into the refcount tracking mechanism first reserve a free
163  * entry using ReservePrivateRefCountEntry() and then later, if necessary,
164  * fill it with NewPrivateRefCountEntry(). That split lets us avoid doing
165  * memory allocations in NewPrivateRefCountEntry() which can be important
166  * because in some scenarios it's called with a spinlock held...
167  */
168 static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES];
169 static HTAB *PrivateRefCountHash = NULL;
170 static int32 PrivateRefCountOverflowed = 0;
171 static uint32 PrivateRefCountClock = 0;
172 static PrivateRefCountEntry *ReservedRefCountEntry = NULL;
173 
174 static void ReservePrivateRefCountEntry(void);
175 static PrivateRefCountEntry *NewPrivateRefCountEntry(Buffer buffer);
176 static PrivateRefCountEntry *GetPrivateRefCountEntry(Buffer buffer, bool do_move);
177 static inline int32 GetPrivateRefCount(Buffer buffer);
178 static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref);
179 
180 /*
181  * Ensure that the PrivateRefCountArray has sufficient space to store one more
182  * entry. This has to be called before using NewPrivateRefCountEntry() to fill
183  * a new entry - but it's perfectly fine to not use a reserved entry.
184  */
185 static void
186 ReservePrivateRefCountEntry(void)
187 {
188  /* Already reserved (or freed), nothing to do */
189  if (ReservedRefCountEntry != NULL)
190  return;
191 
192  /*
193  * First search for a free entry in the array; that'll be sufficient in the
194  * majority of cases.
195  */
196  {
197  int i;
198 
199  for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
200  {
201  PrivateRefCountEntry *res;
202 
203  res = &PrivateRefCountArray[i];
204 
205  if (res->buffer == InvalidBuffer)
206  {
207  ReservedRefCountEntry = res;
208  return;
209  }
210  }
211  }
212 
213  /*
214  * No luck. All array entries are full. Move one array entry into the hash
215  * table.
216  */
217  {
218  /*
219  * Move entry from the current clock position in the array into the
220  * hashtable. Use that slot.
221  */
222  PrivateRefCountEntry *hashent;
223  bool found;
224 
225  /* select victim slot */
226  ReservedRefCountEntry =
227  &PrivateRefCountArray[PrivateRefCountClock++ % REFCOUNT_ARRAY_ENTRIES];
228 
229  /* Better be used, otherwise we shouldn't get here. */
230  Assert(ReservedRefCountEntry->buffer != InvalidBuffer);
231 
232  /* enter victim array entry into hashtable */
233  hashent = hash_search(PrivateRefCountHash,
234  (void *) &(ReservedRefCountEntry->buffer),
235  HASH_ENTER,
236  &found);
237  Assert(!found);
238  hashent->refcount = ReservedRefCountEntry->refcount;
239 
240  /* clear the now free array slot */
241  ReservedRefCountEntry->buffer = InvalidBuffer;
242  ReservedRefCountEntry->refcount = 0;
243 
244  PrivateRefCountOverflowed++;
245  }
246 }
247 
248 /*
249  * Fill a previously reserved refcount entry.
250  */
251 static PrivateRefCountEntry *
252 NewPrivateRefCountEntry(Buffer buffer)
253 {
254  PrivateRefCountEntry *res;
255 
256  /* only allowed to be called when a reservation has been made */
257  Assert(ReservedRefCountEntry != NULL);
258 
259  /* use up the reserved entry */
260  res = ReservedRefCountEntry;
261  ReservedRefCountEntry = NULL;
262 
263  /* and fill it */
264  res->buffer = buffer;
265  res->refcount = 0;
266 
267  return res;
268 }
269 
270 /*
271  * Return the PrivateRefCount entry for the passed buffer.
272  *
273  * Returns NULL if a buffer doesn't have a refcount entry. Otherwise, if
274  * do_move is true, and the entry resides in the hashtable the entry is
275  * optimized for frequent access by moving it to the array.
276  */
277 static PrivateRefCountEntry *
278 GetPrivateRefCountEntry(Buffer buffer, bool do_move)
279 {
280  PrivateRefCountEntry *res;
281  int i;
282 
283  Assert(BufferIsValid(buffer));
284  Assert(!BufferIsLocal(buffer));
285 
286  /*
287  * First search for references in the array; that'll be sufficient in the
288  * majority of cases.
289  */
290  for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
291  {
292  res = &PrivateRefCountArray[i];
293 
294  if (res->buffer == buffer)
295  return res;
296  }
297 
298  /*
299  * By here we know that the buffer, if already pinned, isn't residing in
300  * the array.
301  *
302  * Only look up the buffer in the hashtable if we've previously overflowed
303  * into it.
304  */
305  if (PrivateRefCountOverflowed == 0)
306  return NULL;
307 
308  res = hash_search(PrivateRefCountHash,
309  (void *) &buffer,
310  HASH_FIND,
311  NULL);
312 
313  if (res == NULL)
314  return NULL;
315  else if (!do_move)
316  {
317  /* caller doesn't want us to move the hash entry into the array */
318  return res;
319  }
320  else
321  {
322  /* move buffer from hashtable into the free array slot */
323  bool found;
324  PrivateRefCountEntry *free;
325 
326  /* Ensure there's a free array slot */
327  ReservePrivateRefCountEntry();
328 
329  /* Use up the reserved slot */
330  Assert(ReservedRefCountEntry != NULL);
331  free = ReservedRefCountEntry;
332  ReservedRefCountEntry = NULL;
333  Assert(free->buffer == InvalidBuffer);
334 
335  /* and fill it */
336  free->buffer = buffer;
337  free->refcount = res->refcount;
338 
339  /* delete from hashtable */
340  hash_search(PrivateRefCountHash,
341  (void *) &buffer,
342  HASH_REMOVE,
343  &found);
344  Assert(found);
345  Assert(PrivateRefCountOverflowed > 0);
346  PrivateRefCountOverflowed--;
347 
348  return free;
349  }
350 }
351 
352 /*
353  * Returns how many times the passed buffer is pinned by this backend.
354  *
355  * Only works for shared memory buffers!
356  */
357 static inline int32
358 GetPrivateRefCount(Buffer buffer)
359 {
360  PrivateRefCountEntry *ref;
361 
362  Assert(BufferIsValid(buffer));
363  Assert(!BufferIsLocal(buffer));
364 
365  /*
366  * Not moving the entry - that's ok for the current users, but we might
367  * want to change this one day.
368  */
369  ref = GetPrivateRefCountEntry(buffer, false);
370 
371  if (ref == NULL)
372  return 0;
373  return ref->refcount;
374 }
375 
376 /*
377  * Release resources used to track the reference count of a buffer which we no
378  * longer have pinned and don't want to pin again immediately.
379  */
380 static void
381 ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref)
382 {
383  Assert(ref->refcount == 0);
384 
385  if (ref >= &PrivateRefCountArray[0] &&
386  ref < &PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES])
387  {
388  ref->buffer = InvalidBuffer;
389 
390  /*
391  * Mark the just used entry as reserved - in many scenarios that
392  * allows us to avoid ever having to search the array/hash for free
393  * entries.
394  */
395  ReservedRefCountEntry = ref;
396  }
397  else
398  {
399  bool found;
400  Buffer buffer = ref->buffer;
401 
402  hash_search(PrivateRefCountHash,
403  (void *) &buffer,
404  HASH_REMOVE,
405  &found);
406  Assert(found);
407  Assert(PrivateRefCountOverflowed > 0);
408  PrivateRefCountOverflowed--;
409  }
410 }
411 
412 /*
413  * BufferIsPinned
414  * True iff the buffer is pinned (also checks for valid buffer number).
415  *
416  * NOTE: what we check here is that *this* backend holds a pin on
417  * the buffer. We do not care whether some other backend does.
418  */
419 #define BufferIsPinned(bufnum) \
420 ( \
421  !BufferIsValid(bufnum) ? \
422  false \
423  : \
424  BufferIsLocal(bufnum) ? \
425  (LocalRefCount[-(bufnum) - 1] > 0) \
426  : \
427  (GetPrivateRefCount(bufnum) > 0) \
428 )
429 
430 
431 static Buffer ReadBuffer_common(SMgrRelation reln, char relpersistence,
432  ForkNumber forkNum, BlockNumber blockNum,
433  ReadBufferMode mode, BufferAccessStrategy strategy,
434  bool *hit);
435 static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy);
436 static void PinBuffer_Locked(BufferDesc *buf);
437 static void UnpinBuffer(BufferDesc *buf, bool fixOwner);
438 static void BufferSync(int flags);
439 static uint32 WaitBufHdrUnlocked(BufferDesc *buf);
440 static int SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *flush_context);
441 static void WaitIO(BufferDesc *buf);
442 static bool StartBufferIO(BufferDesc *buf, bool forInput);
443 static void TerminateBufferIO(BufferDesc *buf, bool clear_dirty,
444  uint32 set_flag_bits);
445 static void shared_buffer_write_error_callback(void *arg);
446 static void local_buffer_write_error_callback(void *arg);
447 static BufferDesc *BufferAlloc(SMgrRelation smgr,
448  char relpersistence,
449  ForkNumber forkNum,
450  BlockNumber blockNum,
451  BufferAccessStrategy strategy,
452  bool *foundPtr);
453 static void FlushBuffer(BufferDesc *buf, SMgrRelation reln);
454 static void AtProcExit_Buffers(int code, Datum arg);
455 static void CheckForBufferLeaks(void);
456 static int rnode_comparator(const void *p1, const void *p2);
457 static int buffertag_comparator(const void *p1, const void *p2);
458 static int ckpt_buforder_comparator(const void *pa, const void *pb);
459 static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg);
460 
461 
462 /*
463  * ComputeIoConcurrency -- get the number of pages to prefetch for a given
464  * number of spindles.
465  */
466 bool
467 ComputeIoConcurrency(int io_concurrency, double *target)
468 {
469  double new_prefetch_pages = 0.0;
470  int i;
471 
472  /*
473  * Make sure the io_concurrency value is within valid range; it may have
474  * been forced with a manual pg_tablespace update.
475  */
476  io_concurrency = Min(Max(io_concurrency, 0), MAX_IO_CONCURRENCY);
477 
478  /*----------
479  * The user-visible GUC parameter is the number of drives (spindles),
480  * which we need to translate to a number-of-pages-to-prefetch target.
481  * The target value is stashed in *extra and then assigned to the actual
482  * variable by assign_effective_io_concurrency.
483  *
484  * The expected number of prefetch pages needed to keep N drives busy is:
485  *
486  * drives | I/O requests
487  * -------+----------------
488  * 1 | 1
489  * 2 | 2/1 + 2/2 = 3
490  * 3 | 3/1 + 3/2 + 3/3 = 5 1/2
491  * 4 | 4/1 + 4/2 + 4/3 + 4/4 = 8 1/3
492  * n | n * H(n)
493  *
494  * This is called the "coupon collector problem" and H(n) is called the
495  * harmonic series. This could be approximated by n * ln(n), but for
496  * reasonable numbers of drives we might as well just compute the series.
497  *
498  * Alternatively we could set the target to the number of pages necessary
499  * so that the expected number of active spindles is some arbitrary
500  * percentage of the total. This sounds the same but is actually slightly
501  * different. The result ends up being ln(1-P)/ln((n-1)/n) where P is
502  * that desired fraction.
503  *
504  * Experimental results show that both of these formulas aren't aggressive
505  * enough, but we don't really have any better proposals.
506  *
507  * Note that if io_concurrency = 0 (disabled), we must set target = 0.
508  *----------
509  */
510 
511  for (i = 1; i <= io_concurrency; i++)
512  new_prefetch_pages += (double) io_concurrency / (double) i;
513 
514  *target = new_prefetch_pages;
515 
516  /* This range check shouldn't fail, but let's be paranoid */
517  return (new_prefetch_pages >= 0.0 && new_prefetch_pages < (double) INT_MAX);
518 }
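As a self-contained illustration of the n * H(n) target computed above (an editorial addition, independent of PostgreSQL headers), this small program evaluates the same series for a few drive counts and reproduces the values in the comment's table.

#include <stdio.h>

int
main(void)
{
	int			n;

	for (n = 1; n <= 4; n++)
	{
		double		target = 0.0;
		int			i;

		/* same loop as ComputeIoConcurrency: n/1 + n/2 + ... + n/n */
		for (i = 1; i <= n; i++)
			target += (double) n / (double) i;

		/* prints 1.00, 3.00, 5.50, 8.33 for n = 1..4 */
		printf("%d drive(s) -> prefetch target %.2f\n", n, target);
	}
	return 0;
}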
519 
520 /*
521  * PrefetchBuffer -- initiate asynchronous read of a block of a relation
522  *
523  * This is named by analogy to ReadBuffer but doesn't actually allocate a
524  * buffer. Instead it tries to ensure that a future ReadBuffer for the given
525  * block will not be delayed by the I/O. Prefetching is optional.
526  * No-op if prefetching isn't compiled in.
527  */
528 void
529 PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
530 {
531 #ifdef USE_PREFETCH
532  Assert(RelationIsValid(reln));
533  Assert(BlockNumberIsValid(blockNum));
534 
535  /* Open it at the smgr level if not already done */
536  RelationOpenSmgr(reln);
537 
538  if (RelationUsesLocalBuffers(reln))
539  {
540  /* see comments in ReadBufferExtended */
541  if (RELATION_IS_OTHER_TEMP(reln))
542  ereport(ERROR,
543  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
544  errmsg("cannot access temporary tables of other sessions")));
545 
546  /* pass it off to localbuf.c */
547  LocalPrefetchBuffer(reln->rd_smgr, forkNum, blockNum);
548  }
549  else
550  {
551  BufferTag newTag; /* identity of requested block */
552  uint32 newHash; /* hash value for newTag */
553  LWLock *newPartitionLock; /* buffer partition lock for it */
554  int buf_id;
555 
556  /* create a tag so we can lookup the buffer */
557  INIT_BUFFERTAG(newTag, reln->rd_smgr->smgr_rnode.node,
558  forkNum, blockNum);
559 
560  /* determine its hash code and partition lock ID */
561  newHash = BufTableHashCode(&newTag);
562  newPartitionLock = BufMappingPartitionLock(newHash);
563 
564  /* see if the block is in the buffer pool already */
565  LWLockAcquire(newPartitionLock, LW_SHARED);
566  buf_id = BufTableLookup(&newTag, newHash);
567  LWLockRelease(newPartitionLock);
568 
569  /* If not in buffers, initiate prefetch */
570  if (buf_id < 0)
571  smgrprefetch(reln->rd_smgr, forkNum, blockNum);
572 
573  /*
574  * If the block *is* in buffers, we do nothing. This is not really
575  * ideal: the block might be just about to be evicted, which would be
576  * stupid since we know we are going to need it soon. But the only
577  * easy answer is to bump the usage_count, which does not seem like a
578  * great solution: when the caller does ultimately touch the block,
579  * usage_count would get bumped again, resulting in too much
580  * favoritism for blocks that are involved in a prefetch sequence. A
581  * real fix would involve some additional per-buffer state, and it's
582  * not clear that there's enough of a problem to justify that.
583  */
584  }
585 #endif /* USE_PREFETCH */
586 }
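A hedged sketch of the intended calling pattern (editorial, not from this file): stay a few blocks ahead of the corresponding ReadBuffer calls. The helper name scan_with_prefetch and the fixed look-ahead of 8 are illustrative only; real code derives the distance from effective_io_concurrency / target_prefetch_pages.

#include "postgres.h"

#include "storage/bufmgr.h"
#include "utils/rel.h"

/* Hypothetical helper: scan blocks [0, nblocks) while prefetching ahead. */
static void
scan_with_prefetch(Relation rel, BlockNumber nblocks)
{
	BlockNumber lookahead = 8;	/* illustrative look-ahead distance */
	BlockNumber blkno;

	for (blkno = 0; blkno < nblocks; blkno++)
	{
		Buffer		buf;

		/* hint that we will need a later block soon */
		if (blkno + lookahead < nblocks)
			PrefetchBuffer(rel, MAIN_FORKNUM, blkno + lookahead);

		buf = ReadBuffer(rel, blkno);
		/* ... examine the page ... */
		ReleaseBuffer(buf);
	}
}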
587 
588 
589 /*
590  * ReadBuffer -- a shorthand for ReadBufferExtended, for reading from main
591  * fork with RBM_NORMAL mode and default strategy.
592  */
593 Buffer
594 ReadBuffer(Relation reln, BlockNumber blockNum)
595 {
596  return ReadBufferExtended(reln, MAIN_FORKNUM, blockNum, RBM_NORMAL, NULL);
597 }
598 
599 /*
600  * ReadBufferExtended -- returns a buffer containing the requested
601  * block of the requested relation. If the blknum
602  * requested is P_NEW, extend the relation file and
603  * allocate a new block. (Caller is responsible for
604  * ensuring that only one backend tries to extend a
605  * relation at the same time!)
606  *
607  * Returns: the buffer number for the buffer containing
608  * the block read. The returned buffer has been pinned.
609  * Does not return on error --- elog's instead.
610  *
611  * Assume when this function is called, that reln has been opened already.
612  *
613  * In RBM_NORMAL mode, the page is read from disk, and the page header is
614  * validated. An error is thrown if the page header is not valid. (But
615  * note that an all-zero page is considered "valid"; see PageIsVerified().)
616  *
617  * RBM_ZERO_ON_ERROR is like the normal mode, but if the page header is not
618  * valid, the page is zeroed instead of throwing an error. This is intended
619  * for non-critical data, where the caller is prepared to repair errors.
620  *
621  * In RBM_ZERO_AND_LOCK mode, if the page isn't in buffer cache already, it's
622  * filled with zeros instead of reading it from disk. Useful when the caller
623  * is going to fill the page from scratch, since this saves I/O and avoids
624  * unnecessary failure if the page-on-disk has corrupt page headers.
625  * The page is returned locked to ensure that the caller has a chance to
626  * initialize the page before it's made visible to others.
627  * Caution: do not use this mode to read a page that is beyond the relation's
628  * current physical EOF; that is likely to cause problems in md.c when
629  * the page is modified and written out. P_NEW is OK, though.
630  *
631  * RBM_ZERO_AND_CLEANUP_LOCK is the same as RBM_ZERO_AND_LOCK, but acquires
632  * a cleanup-strength lock on the page.
633  *
634  * RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here.
635  *
636  * If strategy is not NULL, a nondefault buffer access strategy is used.
637  * See buffer/README for details.
638  */
639 Buffer
640 ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum,
641  ReadBufferMode mode, BufferAccessStrategy strategy)
642 {
643  bool hit;
644  Buffer buf;
645 
646  /* Open it at the smgr level if not already done */
647  RelationOpenSmgr(reln);
648 
649  /*
650  * Reject attempts to read non-local temporary relations; we would be
651  * likely to get wrong data since we have no visibility into the owning
652  * session's local buffers.
653  */
654  if (RELATION_IS_OTHER_TEMP(reln))
655  ereport(ERROR,
656  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
657  errmsg("cannot access temporary tables of other sessions")));
658 
659  /*
660  * Read the buffer, and update pgstat counters to reflect a cache hit or
661  * miss.
662  */
663  pgstat_count_buffer_read(reln);
664  buf = ReadBuffer_common(reln->rd_smgr, reln->rd_rel->relpersistence,
665  forkNum, blockNum, mode, strategy, &hit);
666  if (hit)
667  pgstat_count_buffer_hit(reln);
668  return buf;
669 }
670 
671 
672 /*
673  * ReadBufferWithoutRelcache -- like ReadBufferExtended, but doesn't require
674  * a relcache entry for the relation.
675  *
676  * NB: At present, this function may only be used on permanent relations, which
677  * is OK, because we only use it during XLOG replay. If in the future we
678  * want to use it on temporary or unlogged relations, we could pass additional
679  * parameters.
680  */
681 Buffer
682 ReadBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum,
683  BlockNumber blockNum, ReadBufferMode mode,
684  BufferAccessStrategy strategy)
685 {
686  bool hit;
687 
688  SMgrRelation smgr = smgropen(rnode, InvalidBackendId);
689 
690  Assert(InRecovery);
691 
692  return ReadBuffer_common(smgr, RELPERSISTENCE_PERMANENT, forkNum, blockNum,
693  mode, strategy, &hit);
694 }
695 
696 
697 /*
698  * ReadBuffer_common -- common logic for all ReadBuffer variants
699  *
700  * *hit is set to true if the request was satisfied from shared buffer cache.
701  */
702 static Buffer
703 ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
704  BlockNumber blockNum, ReadBufferMode mode,
705  BufferAccessStrategy strategy, bool *hit)
706 {
707  BufferDesc *bufHdr;
708  Block bufBlock;
709  bool found;
710  bool isExtend;
711  bool isLocalBuf = SmgrIsTemp(smgr);
712 
713  *hit = false;
714 
715  /* Make sure we will have room to remember the buffer pin */
716  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
717 
718  isExtend = (blockNum == P_NEW);
719 
720  TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
721  smgr->smgr_rnode.node.spcNode,
722  smgr->smgr_rnode.node.dbNode,
723  smgr->smgr_rnode.node.relNode,
724  smgr->smgr_rnode.backend,
725  isExtend);
726 
727  /* Substitute proper block number if caller asked for P_NEW */
728  if (isExtend)
729  blockNum = smgrnblocks(smgr, forkNum);
730 
731  if (isLocalBuf)
732  {
733  bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, &found);
734  if (found)
735  pgBufferUsage.local_blks_hit++;
736  else
737  pgBufferUsage.local_blks_read++;
738  }
739  else
740  {
741  /*
742  * lookup the buffer. IO_IN_PROGRESS is set if the requested block is
743  * not currently in memory.
744  */
745  bufHdr = BufferAlloc(smgr, relpersistence, forkNum, blockNum,
746  strategy, &found);
747  if (found)
748  pgBufferUsage.shared_blks_hit++;
749  else
750  pgBufferUsage.shared_blks_read++;
751  }
752 
753  /* At this point we do NOT hold any locks. */
754 
755  /* if it was already in the buffer pool, we're done */
756  if (found)
757  {
758  if (!isExtend)
759  {
760  /* Just need to update stats before we exit */
761  *hit = true;
762  VacuumPageHit++;
763 
764  if (VacuumCostActive)
765  VacuumCostBalance += VacuumCostPageHit;
766 
767  TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
768  smgr->smgr_rnode.node.spcNode,
769  smgr->smgr_rnode.node.dbNode,
770  smgr->smgr_rnode.node.relNode,
771  smgr->smgr_rnode.backend,
772  isExtend,
773  found);
774 
775  /*
776  * In RBM_ZERO_AND_LOCK mode the caller expects the page to be
777  * locked on return.
778  */
779  if (!isLocalBuf)
780  {
781  if (mode == RBM_ZERO_AND_LOCK)
782  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr),
783  LW_EXCLUSIVE);
784  else if (mode == RBM_ZERO_AND_CLEANUP_LOCK)
785  LockBufferForCleanup(BufferDescriptorGetBuffer(bufHdr));
786  }
787 
788  return BufferDescriptorGetBuffer(bufHdr);
789  }
790 
791  /*
792  * We get here only in the corner case where we are trying to extend
793  * the relation but we found a pre-existing buffer marked BM_VALID.
794  * This can happen because mdread doesn't complain about reads beyond
795  * EOF (when zero_damaged_pages is ON) and so a previous attempt to
796  * read a block beyond EOF could have left a "valid" zero-filled
797  * buffer. Unfortunately, we have also seen this case occurring
798  * because of buggy Linux kernels that sometimes return an
799  * lseek(SEEK_END) result that doesn't account for a recent write. In
800  * that situation, the pre-existing buffer would contain valid data
801  * that we don't want to overwrite. Since the legitimate case should
802  * always have left a zero-filled buffer, complain if not PageIsNew.
803  */
804  bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
805  if (!PageIsNew((Page) bufBlock))
806  ereport(ERROR,
807  (errmsg("unexpected data beyond EOF in block %u of relation %s",
808  blockNum, relpath(smgr->smgr_rnode, forkNum)),
809  errhint("This has been seen to occur with buggy kernels; consider updating your system.")));
810 
811  /*
812  * We *must* do smgrextend before succeeding, else the page will not
813  * be reserved by the kernel, and the next P_NEW call will decide to
814  * return the same page. Clear the BM_VALID bit, do the StartBufferIO
815  * call that BufferAlloc didn't, and proceed.
816  */
817  if (isLocalBuf)
818  {
819  /* Only need to adjust flags */
820  uint32 buf_state = pg_atomic_read_u32(&bufHdr->state);
821 
822  Assert(buf_state & BM_VALID);
823  buf_state &= ~BM_VALID;
824  pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
825  }
826  else
827  {
828  /*
829  * Loop to handle the very small possibility that someone re-sets
830  * BM_VALID between our clearing it and StartBufferIO inspecting
831  * it.
832  */
833  do
834  {
835  uint32 buf_state = LockBufHdr(bufHdr);
836 
837  Assert(buf_state & BM_VALID);
838  buf_state &= ~BM_VALID;
839  UnlockBufHdr(bufHdr, buf_state);
840  } while (!StartBufferIO(bufHdr, true));
841  }
842  }
843 
844  /*
845  * if we have gotten to this point, we have allocated a buffer for the
846  * page but its contents are not yet valid. IO_IN_PROGRESS is set for it,
847  * if it's a shared buffer.
848  *
849  * Note: if smgrextend fails, we will end up with a buffer that is
850  * allocated but not marked BM_VALID. P_NEW will still select the same
851  * block number (because the relation didn't get any longer on disk) and
852  * so future attempts to extend the relation will find the same buffer (if
853  * it's not been recycled) but come right back here to try smgrextend
854  * again.
855  */
856  Assert(!(pg_atomic_read_u32(&bufHdr->state) & BM_VALID)); /* spinlock not needed */
857 
858  bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
859 
860  if (isExtend)
861  {
862  /* new buffers are zero-filled */
863  MemSet((char *) bufBlock, 0, BLCKSZ);
864  /* don't set checksum for all-zero page */
865  smgrextend(smgr, forkNum, blockNum, (char *) bufBlock, false);
866 
867  /*
868  * NB: we're *not* doing a ScheduleBufferTagForWriteback here,
869  * although we're essentially performing a write. At least on linux
870  * doing so defeats the 'delayed allocation' mechanism, leading to
871  * increased file fragmentation.
872  */
873  }
874  else
875  {
876  /*
877  * Read in the page, unless the caller intends to overwrite it and
878  * just wants us to allocate a buffer.
879  */
880  if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
881  MemSet((char *) bufBlock, 0, BLCKSZ);
882  else
883  {
884  instr_time io_start,
885  io_time;
886 
887  if (track_io_timing)
888  INSTR_TIME_SET_CURRENT(io_start);
889 
890  smgrread(smgr, forkNum, blockNum, (char *) bufBlock);
891 
892  if (track_io_timing)
893  {
894  INSTR_TIME_SET_CURRENT(io_time);
895  INSTR_TIME_SUBTRACT(io_time, io_start);
896  pgstat_count_buffer_read_time(INSTR_TIME_GET_MICROSEC(io_time));
897  INSTR_TIME_ADD(pgBufferUsage.blk_read_time, io_time);
898  }
899 
900  /* check for garbage data */
901  if (!PageIsVerified((Page) bufBlock, blockNum))
902  {
903  if (mode == RBM_ZERO_ON_ERROR || zero_damaged_pages)
904  {
905  ereport(WARNING,
906  (errcode(ERRCODE_DATA_CORRUPTED),
907  errmsg("invalid page in block %u of relation %s; zeroing out page",
908  blockNum,
909  relpath(smgr->smgr_rnode, forkNum))));
910  MemSet((char *) bufBlock, 0, BLCKSZ);
911  }
912  else
913  ereport(ERROR,
914  (errcode(ERRCODE_DATA_CORRUPTED),
915  errmsg("invalid page in block %u of relation %s",
916  blockNum,
917  relpath(smgr->smgr_rnode, forkNum))));
918  }
919  }
920  }
921 
922  /*
923  * In RBM_ZERO_AND_LOCK mode, grab the buffer content lock before marking
924  * the page as valid, to make sure that no other backend sees the zeroed
925  * page before the caller has had a chance to initialize it.
926  *
927  * Since no-one else can be looking at the page contents yet, there is no
928  * difference between an exclusive lock and a cleanup-strength lock. (Note
929  * that we cannot use LockBuffer() or LockBufferForCleanup() here, because
930  * they assert that the buffer is already valid.)
931  */
932  if ((mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) &&
933  !isLocalBuf)
934  {
935  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_EXCLUSIVE);
936  }
937 
938  if (isLocalBuf)
939  {
940  /* Only need to adjust flags */
941  uint32 buf_state = pg_atomic_read_u32(&bufHdr->state);
942 
943  buf_state |= BM_VALID;
944  pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
945  }
946  else
947  {
948  /* Set BM_VALID, terminate IO, and wake up any waiters */
949  TerminateBufferIO(bufHdr, false, BM_VALID);
950  }
951 
952  VacuumPageMiss++;
953  if (VacuumCostActive)
954  VacuumCostBalance += VacuumCostPageMiss;
955 
956  TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
957  smgr->smgr_rnode.node.spcNode,
958  smgr->smgr_rnode.node.dbNode,
959  smgr->smgr_rnode.node.relNode,
960  smgr->smgr_rnode.backend,
961  isExtend,
962  found);
963 
964  return BufferDescriptorGetBuffer(bufHdr);
965 }
966 
967 /*
968  * BufferAlloc -- subroutine for ReadBuffer. Handles lookup of a shared
969  * buffer. If no buffer exists already, selects a replacement
970  * victim and evicts the old page, but does NOT read in new page.
971  *
972  * "strategy" can be a buffer replacement strategy object, or NULL for
973  * the default strategy. The selected buffer's usage_count is advanced when
974  * using the default strategy, but otherwise possibly not (see PinBuffer).
975  *
976  * The returned buffer is pinned and is already marked as holding the
977  * desired page. If it already did have the desired page, *foundPtr is
978  * set true. Otherwise, *foundPtr is set false and the buffer is marked
979  * as IO_IN_PROGRESS; ReadBuffer will now need to do I/O to fill it.
980  *
981  * *foundPtr is actually redundant with the buffer's BM_VALID flag, but
982  * we keep it for simplicity in ReadBuffer.
983  *
984  * No locks are held either at entry or exit.
985  */
986 static BufferDesc *
987 BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
988  BlockNumber blockNum,
989  BufferAccessStrategy strategy,
990  bool *foundPtr)
991 {
992  BufferTag newTag; /* identity of requested block */
993  uint32 newHash; /* hash value for newTag */
994  LWLock *newPartitionLock; /* buffer partition lock for it */
995  BufferTag oldTag; /* previous identity of selected buffer */
996  uint32 oldHash; /* hash value for oldTag */
997  LWLock *oldPartitionLock; /* buffer partition lock for it */
998  uint32 oldFlags;
999  int buf_id;
1000  BufferDesc *buf;
1001  bool valid;
1002  uint32 buf_state;
1003 
1004  /* create a tag so we can lookup the buffer */
1005  INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
1006 
1007  /* determine its hash code and partition lock ID */
1008  newHash = BufTableHashCode(&newTag);
1009  newPartitionLock = BufMappingPartitionLock(newHash);
1010 
1011  /* see if the block is in the buffer pool already */
1012  LWLockAcquire(newPartitionLock, LW_SHARED);
1013  buf_id = BufTableLookup(&newTag, newHash);
1014  if (buf_id >= 0)
1015  {
1016  /*
1017  * Found it. Now, pin the buffer so no one can steal it from the
1018  * buffer pool, and check to see if the correct data has been loaded
1019  * into the buffer.
1020  */
1021  buf = GetBufferDescriptor(buf_id);
1022 
1023  valid = PinBuffer(buf, strategy);
1024 
1025  /* Can release the mapping lock as soon as we've pinned it */
1026  LWLockRelease(newPartitionLock);
1027 
1028  *foundPtr = true;
1029 
1030  if (!valid)
1031  {
1032  /*
1033  * We can only get here if (a) someone else is still reading in
1034  * the page, or (b) a previous read attempt failed. We have to
1035  * wait for any active read attempt to finish, and then set up our
1036  * own read attempt if the page is still not BM_VALID.
1037  * StartBufferIO does it all.
1038  */
1039  if (StartBufferIO(buf, true))
1040  {
1041  /*
1042  * If we get here, previous attempts to read the buffer must
1043  * have failed ... but we shall bravely try again.
1044  */
1045  *foundPtr = false;
1046  }
1047  }
1048 
1049  return buf;
1050  }
1051 
1052  /*
1053  * Didn't find it in the buffer pool. We'll have to initialize a new
1054  * buffer. Remember to unlock the mapping lock while doing the work.
1055  */
1056  LWLockRelease(newPartitionLock);
1057 
1058  /* Loop here in case we have to try another victim buffer */
1059  for (;;)
1060  {
1061  /*
1062  * Ensure, while the spinlock's not yet held, that there's a free
1063  * refcount entry.
1064  */
1065  ReservePrivateRefCountEntry();
1066 
1067  /*
1068  * Select a victim buffer. The buffer is returned with its header
1069  * spinlock still held!
1070  */
1071  buf = StrategyGetBuffer(strategy, &buf_state);
1072 
1073  Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 0);
1074 
1075  /* Must copy buffer flags while we still hold the spinlock */
1076  oldFlags = buf_state & BUF_FLAG_MASK;
1077 
1078  /* Pin the buffer and then release the buffer spinlock */
1079  PinBuffer_Locked(buf);
1080 
1081  /*
1082  * If the buffer was dirty, try to write it out. There is a race
1083  * condition here, in that someone might dirty it after we released it
1084  * above, or even while we are writing it out (since our share-lock
1085  * won't prevent hint-bit updates). We will recheck the dirty bit
1086  * after re-locking the buffer header.
1087  */
1088  if (oldFlags & BM_DIRTY)
1089  {
1090  /*
1091  * We need a share-lock on the buffer contents to write it out
1092  * (else we might write invalid data, eg because someone else is
1093  * compacting the page contents while we write). We must use a
1094  * conditional lock acquisition here to avoid deadlock. Even
1095  * though the buffer was not pinned (and therefore surely not
1096  * locked) when StrategyGetBuffer returned it, someone else could
1097  * have pinned and exclusive-locked it by the time we get here. If
1098  * we try to get the lock unconditionally, we'd block waiting for
1099  * them; if they later block waiting for us, deadlock ensues.
1100  * (This has been observed to happen when two backends are both
1101  * trying to split btree index pages, and the second one just
1102  * happens to be trying to split the page the first one got from
1103  * StrategyGetBuffer.)
1104  */
1105  if (LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
1106  LW_SHARED))
1107  {
1108  /*
1109  * If using a nondefault strategy, and writing the buffer
1110  * would require a WAL flush, let the strategy decide whether
1111  * to go ahead and write/reuse the buffer or to choose another
1112  * victim. We need lock to inspect the page LSN, so this
1113  * can't be done inside StrategyGetBuffer.
1114  */
1115  if (strategy != NULL)
1116  {
1117  XLogRecPtr lsn;
1118 
1119  /* Read the LSN while holding buffer header lock */
1120  buf_state = LockBufHdr(buf);
1121  lsn = BufferGetLSN(buf);
1122  UnlockBufHdr(buf, buf_state);
1123 
1124  if (XLogNeedsFlush(lsn) &&
1125  StrategyRejectBuffer(strategy, buf))
1126  {
1127  /* Drop lock/pin and loop around for another buffer */
1128  LWLockRelease(BufferDescriptorGetContentLock(buf));
1129  UnpinBuffer(buf, true);
1130  continue;
1131  }
1132  }
1133 
1134  /* OK, do the I/O */
1135  TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_START(forkNum, blockNum,
1136  smgr->smgr_rnode.node.spcNode,
1137  smgr->smgr_rnode.node.dbNode,
1138  smgr->smgr_rnode.node.relNode);
1139 
1140  FlushBuffer(buf, NULL);
1141  LWLockRelease(BufferDescriptorGetContentLock(buf));
1142 
1143  ScheduleBufferTagForWriteback(&BackendWritebackContext,
1144  &buf->tag);
1145 
1146  TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_DONE(forkNum, blockNum,
1147  smgr->smgr_rnode.node.spcNode,
1148  smgr->smgr_rnode.node.dbNode,
1149  smgr->smgr_rnode.node.relNode);
1150  }
1151  else
1152  {
1153  /*
1154  * Someone else has locked the buffer, so give it up and loop
1155  * back to get another one.
1156  */
1157  UnpinBuffer(buf, true);
1158  continue;
1159  }
1160  }
1161 
1162  /*
1163  * To change the association of a valid buffer, we'll need to have
1164  * exclusive lock on both the old and new mapping partitions.
1165  */
1166  if (oldFlags & BM_TAG_VALID)
1167  {
1168  /*
1169  * Need to compute the old tag's hashcode and partition lock ID.
1170  * XXX is it worth storing the hashcode in BufferDesc so we need
1171  * not recompute it here? Probably not.
1172  */
1173  oldTag = buf->tag;
1174  oldHash = BufTableHashCode(&oldTag);
1175  oldPartitionLock = BufMappingPartitionLock(oldHash);
1176 
1177  /*
1178  * Must lock the lower-numbered partition first to avoid
1179  * deadlocks.
1180  */
1181  if (oldPartitionLock < newPartitionLock)
1182  {
1183  LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
1184  LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1185  }
1186  else if (oldPartitionLock > newPartitionLock)
1187  {
1188  LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1189  LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
1190  }
1191  else
1192  {
1193  /* only one partition, only one lock */
1194  LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1195  }
1196  }
1197  else
1198  {
1199  /* if it wasn't valid, we need only the new partition */
1200  LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1201  /* remember we have no old-partition lock or tag */
1202  oldPartitionLock = NULL;
1203  /* this just keeps the compiler quiet about uninit variables */
1204  oldHash = 0;
1205  }
1206 
1207  /*
1208  * Try to make a hashtable entry for the buffer under its new tag.
1209  * This could fail because while we were writing someone else
1210  * allocated another buffer for the same block we want to read in.
1211  * Note that we have not yet removed the hashtable entry for the old
1212  * tag.
1213  */
1214  buf_id = BufTableInsert(&newTag, newHash, buf->buf_id);
1215 
1216  if (buf_id >= 0)
1217  {
1218  /*
1219  * Got a collision. Someone has already done what we were about to
1220  * do. We'll just handle this as if it were found in the buffer
1221  * pool in the first place. First, give up the buffer we were
1222  * planning to use.
1223  */
1224  UnpinBuffer(buf, true);
1225 
1226  /* Can give up that buffer's mapping partition lock now */
1227  if (oldPartitionLock != NULL &&
1228  oldPartitionLock != newPartitionLock)
1229  LWLockRelease(oldPartitionLock);
1230 
1231  /* remaining code should match code at top of routine */
1232 
1233  buf = GetBufferDescriptor(buf_id);
1234 
1235  valid = PinBuffer(buf, strategy);
1236 
1237  /* Can release the mapping lock as soon as we've pinned it */
1238  LWLockRelease(newPartitionLock);
1239 
1240  *foundPtr = true;
1241 
1242  if (!valid)
1243  {
1244  /*
1245  * We can only get here if (a) someone else is still reading
1246  * in the page, or (b) a previous read attempt failed. We
1247  * have to wait for any active read attempt to finish, and
1248  * then set up our own read attempt if the page is still not
1249  * BM_VALID. StartBufferIO does it all.
1250  */
1251  if (StartBufferIO(buf, true))
1252  {
1253  /*
1254  * If we get here, previous attempts to read the buffer
1255  * must have failed ... but we shall bravely try again.
1256  */
1257  *foundPtr = false;
1258  }
1259  }
1260 
1261  return buf;
1262  }
1263 
1264  /*
1265  * Need to lock the buffer header too in order to change its tag.
1266  */
1267  buf_state = LockBufHdr(buf);
1268 
1269  /*
1270  * Somebody could have pinned or re-dirtied the buffer while we were
1271  * doing the I/O and making the new hashtable entry. If so, we can't
1272  * recycle this buffer; we must undo everything we've done and start
1273  * over with a new victim buffer.
1274  */
1275  oldFlags = buf_state & BUF_FLAG_MASK;
1276  if (BUF_STATE_GET_REFCOUNT(buf_state) == 1 && !(oldFlags & BM_DIRTY))
1277  break;
1278 
1279  UnlockBufHdr(buf, buf_state);
1280  BufTableDelete(&newTag, newHash);
1281  if (oldPartitionLock != NULL &&
1282  oldPartitionLock != newPartitionLock)
1283  LWLockRelease(oldPartitionLock);
1284  LWLockRelease(newPartitionLock);
1285  UnpinBuffer(buf, true);
1286  }
1287 
1288  /*
1289  * Okay, it's finally safe to rename the buffer.
1290  *
1291  * Clearing BM_VALID here is necessary, clearing the dirtybits is just
1292  * paranoia. We also reset the usage_count since any recency of use of
1293  * the old content is no longer relevant. (The usage_count starts out at
1294  * 1 so that the buffer can survive one clock-sweep pass.)
1295  *
1296  * Make sure BM_PERMANENT is set for buffers that must be written at every
1297  * checkpoint. Unlogged buffers only need to be written at shutdown
1298  * checkpoints, except for their "init" forks, which need to be treated
1299  * just like permanent relations.
1300  */
1301  buf->tag = newTag;
1302  buf_state &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED |
1303  BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT |
1304  BUF_USAGECOUNT_MASK);
1305  if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM)
1306  buf_state |= BM_TAG_VALID | BM_PERMANENT | BUF_USAGECOUNT_ONE;
1307  else
1308  buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
1309 
1310  UnlockBufHdr(buf, buf_state);
1311 
1312  if (oldPartitionLock != NULL)
1313  {
1314  BufTableDelete(&oldTag, oldHash);
1315  if (oldPartitionLock != newPartitionLock)
1316  LWLockRelease(oldPartitionLock);
1317  }
1318 
1319  LWLockRelease(newPartitionLock);
1320 
1321  /*
1322  * Buffer contents are currently invalid. Try to get the io_in_progress
1323  * lock. If StartBufferIO returns false, then someone else managed to
1324  * read it before we did, so there's nothing left for BufferAlloc() to do.
1325  */
1326  if (StartBufferIO(buf, true))
1327  *foundPtr = false;
1328  else
1329  *foundPtr = true;
1330 
1331  return buf;
1332 }
1333 
1334 /*
1335  * InvalidateBuffer -- mark a shared buffer invalid and return it to the
1336  * freelist.
1337  *
1338  * The buffer header spinlock must be held at entry. We drop it before
1339  * returning. (This is sane because the caller must have locked the
1340  * buffer in order to be sure it should be dropped.)
1341  *
1342  * This is used only in contexts such as dropping a relation. We assume
1343  * that no other backend could possibly be interested in using the page,
1344  * so the only reason the buffer might be pinned is if someone else is
1345  * trying to write it out. We have to let them finish before we can
1346  * reclaim the buffer.
1347  *
1348  * The buffer could get reclaimed by someone else while we are waiting
1349  * to acquire the necessary locks; if so, don't mess it up.
1350  */
1351 static void
1352 InvalidateBuffer(BufferDesc *buf)
1353 {
1354  BufferTag oldTag;
1355  uint32 oldHash; /* hash value for oldTag */
1356  LWLock *oldPartitionLock; /* buffer partition lock for it */
1357  uint32 oldFlags;
1358  uint32 buf_state;
1359 
1360  /* Save the original buffer tag before dropping the spinlock */
1361  oldTag = buf->tag;
1362 
1363  buf_state = pg_atomic_read_u32(&buf->state);
1364  Assert(buf_state & BM_LOCKED);
1365  UnlockBufHdr(buf, buf_state);
1366 
1367  /*
1368  * Need to compute the old tag's hashcode and partition lock ID. XXX is it
1369  * worth storing the hashcode in BufferDesc so we need not recompute it
1370  * here? Probably not.
1371  */
1372  oldHash = BufTableHashCode(&oldTag);
1373  oldPartitionLock = BufMappingPartitionLock(oldHash);
1374 
1375 retry:
1376 
1377  /*
1378  * Acquire exclusive mapping lock in preparation for changing the buffer's
1379  * association.
1380  */
1381  LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
1382 
1383  /* Re-lock the buffer header */
1384  buf_state = LockBufHdr(buf);
1385 
1386  /* If it's changed while we were waiting for lock, do nothing */
1387  if (!BUFFERTAGS_EQUAL(buf->tag, oldTag))
1388  {
1389  UnlockBufHdr(buf, buf_state);
1390  LWLockRelease(oldPartitionLock);
1391  return;
1392  }
1393 
1394  /*
1395  * We assume the only reason for it to be pinned is that someone else is
1396  * flushing the page out. Wait for them to finish. (This could be an
1397  * infinite loop if the refcount is messed up... it would be nice to time
1398  * out after awhile, but there seems no way to be sure how many loops may
1399  * be needed. Note that if the other guy has pinned the buffer but not
1400  * yet done StartBufferIO, WaitIO will fall through and we'll effectively
1401  * be busy-looping here.)
1402  */
1403  if (BUF_STATE_GET_REFCOUNT(buf_state) != 0)
1404  {
1405  UnlockBufHdr(buf, buf_state);
1406  LWLockRelease(oldPartitionLock);
1407  /* safety check: should definitely not be our *own* pin */
1408  if (GetPrivateRefCount(BufferDescriptorGetBuffer(buf)) > 0)
1409  elog(ERROR, "buffer is pinned in InvalidateBuffer");
1410  WaitIO(buf);
1411  goto retry;
1412  }
1413 
1414  /*
1415  * Clear out the buffer's tag and flags. We must do this to ensure that
1416  * linear scans of the buffer array don't think the buffer is valid.
1417  */
1418  oldFlags = buf_state & BUF_FLAG_MASK;
1419  CLEAR_BUFFERTAG(buf->tag);
1420  buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
1421  UnlockBufHdr(buf, buf_state);
1422 
1423  /*
1424  * Remove the buffer from the lookup hashtable, if it was in there.
1425  */
1426  if (oldFlags & BM_TAG_VALID)
1427  BufTableDelete(&oldTag, oldHash);
1428 
1429  /*
1430  * Done with mapping lock.
1431  */
1432  LWLockRelease(oldPartitionLock);
1433 
1434  /*
1435  * Insert the buffer at the head of the list of free buffers.
1436  */
1437  StrategyFreeBuffer(buf);
1438 }
1439 
1440 /*
1441  * MarkBufferDirty
1442  *
1443  * Marks buffer contents as dirty (actual write happens later).
1444  *
1445  * Buffer must be pinned and exclusive-locked. (If caller does not hold
1446  * exclusive lock, then somebody could be in process of writing the buffer,
1447  * leading to risk of bad data written to disk.)
1448  */
1449 void
1450 MarkBufferDirty(Buffer buffer)
1451 {
1452  BufferDesc *bufHdr;
1453  uint32 buf_state;
1454  uint32 old_buf_state;
1455 
1456  if (!BufferIsValid(buffer))
1457  elog(ERROR, "bad buffer ID: %d", buffer);
1458 
1459  if (BufferIsLocal(buffer))
1460  {
1461  MarkLocalBufferDirty(buffer);
1462  return;
1463  }
1464 
1465  bufHdr = GetBufferDescriptor(buffer - 1);
1466 
1467  Assert(BufferIsPinned(buffer));
1468  Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
1469  LW_EXCLUSIVE));
1470 
1471  old_buf_state = pg_atomic_read_u32(&bufHdr->state);
1472  for (;;)
1473  {
1474  if (old_buf_state & BM_LOCKED)
1475  old_buf_state = WaitBufHdrUnlocked(bufHdr);
1476 
1477  buf_state = old_buf_state;
1478 
1479  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
1480  buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
1481 
1482  if (pg_atomic_compare_exchange_u32(&bufHdr->state, &old_buf_state,
1483  buf_state))
1484  break;
1485  }
1486 
1487  /*
1488  * If the buffer was not dirty already, do vacuum accounting.
1489  */
1490  if (!(old_buf_state & BM_DIRTY))
1491  {
1492  VacuumPageDirty++;
1493  pgBufferUsage.shared_blks_dirtied++;
1494  if (VacuumCostActive)
1495  VacuumCostBalance += VacuumCostPageDirty;
1496  }
1497 }
1498 
1499 /*
1500  * ReleaseAndReadBuffer -- combine ReleaseBuffer() and ReadBuffer()
1501  *
1502  * Formerly, this saved one cycle of acquiring/releasing the BufMgrLock
1503  * compared to calling the two routines separately. Now it's mainly just
1504  * a convenience function. However, if the passed buffer is valid and
1505  * already contains the desired block, we just return it as-is; and that
1506  * does save considerable work compared to a full release and reacquire.
1507  *
1508  * Note: it is OK to pass buffer == InvalidBuffer, indicating that no old
1509  * buffer actually needs to be released. This case is the same as ReadBuffer,
1510  * but can save some tests in the caller.
1511  */
1512 Buffer
1513 ReleaseAndReadBuffer(Buffer buffer,
1514  Relation relation,
1515  BlockNumber blockNum)
1516 {
1517  ForkNumber forkNum = MAIN_FORKNUM;
1518  BufferDesc *bufHdr;
1519 
1520  if (BufferIsValid(buffer))
1521  {
1522  Assert(BufferIsPinned(buffer));
1523  if (BufferIsLocal(buffer))
1524  {
1525  bufHdr = GetLocalBufferDescriptor(-buffer - 1);
1526  if (bufHdr->tag.blockNum == blockNum &&
1527  RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node) &&
1528  bufHdr->tag.forkNum == forkNum)
1529  return buffer;
1530  ResourceOwnerForgetBuffer(CurrentResourceOwner, buffer);
1531  LocalRefCount[-buffer - 1]--;
1532  }
1533  else
1534  {
1535  bufHdr = GetBufferDescriptor(buffer - 1);
1536  /* we have pin, so it's ok to examine tag without spinlock */
1537  if (bufHdr->tag.blockNum == blockNum &&
1538  RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node) &&
1539  bufHdr->tag.forkNum == forkNum)
1540  return buffer;
1541  UnpinBuffer(bufHdr, true);
1542  }
1543  }
1544 
1545  return ReadBuffer(relation, blockNum);
1546 }
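A short editorial sketch of the saving described above: a sequential reader can carry one buffer variable through its loop and let ReleaseAndReadBuffer return the same buffer untouched whenever the requested block is already held. The helper name walk_blocks is hypothetical.

#include "postgres.h"

#include "storage/bufmgr.h"
#include "utils/rel.h"

/* Hypothetical helper: visit blocks [0, nblocks) reusing one buffer variable. */
static void
walk_blocks(Relation rel, BlockNumber nblocks)
{
	Buffer		buf = InvalidBuffer;
	BlockNumber blkno;

	for (blkno = 0; blkno < nblocks; blkno++)
	{
		/* releases the previous pin unless it already holds this block */
		buf = ReleaseAndReadBuffer(buf, rel, blkno);
		/* ... inspect BufferGetPage(buf) ... */
	}

	if (BufferIsValid(buf))
		ReleaseBuffer(buf);
}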
1547 
1548 /*
1549  * PinBuffer -- make buffer unavailable for replacement.
1550  *
1551  * For the default access strategy, the buffer's usage_count is incremented
1552  * when we first pin it; for other strategies we just make sure the usage_count
1553  * isn't zero. (The idea of the latter is that we don't want synchronized
1554  * heap scans to inflate the count, but we need it to not be zero to discourage
1555  * other backends from stealing buffers from our ring. As long as we cycle
1556  * through the ring faster than the global clock-sweep cycles, buffers in
1557  * our ring won't be chosen as victims for replacement by other backends.)
1558  *
1559  * This should be applied only to shared buffers, never local ones.
1560  *
1561  * Since buffers are pinned/unpinned very frequently, pin buffers without
1562  * taking the buffer header lock; instead update the state variable in loop of
1563  * CAS operations. Hopefully it's just a single CAS.
1564  *
1565  * Note that ResourceOwnerEnlargeBuffers must have been done already.
1566  *
1567  * Returns true if buffer is BM_VALID, else false. This provision allows
1568  * some callers to avoid an extra spinlock cycle.
1569  */
1570 static bool
1571 PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
1572 {
1573  Buffer b = BufferDescriptorGetBuffer(buf);
1574  bool result;
1575  PrivateRefCountEntry *ref;
1576 
1577  ref = GetPrivateRefCountEntry(b, true);
1578 
1579  if (ref == NULL)
1580  {
1581  uint32 buf_state;
1582  uint32 old_buf_state;
1583 
1584  ReservePrivateRefCountEntry();
1585  ref = NewPrivateRefCountEntry(b);
1586 
1587  old_buf_state = pg_atomic_read_u32(&buf->state);
1588  for (;;)
1589  {
1590  if (old_buf_state & BM_LOCKED)
1591  old_buf_state = WaitBufHdrUnlocked(buf);
1592 
1593  buf_state = old_buf_state;
1594 
1595  /* increase refcount */
1596  buf_state += BUF_REFCOUNT_ONE;
1597 
1598  if (strategy == NULL)
1599  {
1600  /* Default case: increase usagecount unless already max. */
1601  if (BUF_STATE_GET_USAGECOUNT(buf_state) != BM_MAX_USAGE_COUNT)
1602  buf_state += BUF_USAGECOUNT_ONE;
1603  }
1604  else
1605  {
1606  /*
1607  * Ring buffers shouldn't evict others from pool. Thus we
1608  * don't make usagecount more than 1.
1609  */
1610  if (BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
1611  buf_state += BUF_USAGECOUNT_ONE;
1612  }
1613 
1614  if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
1615  buf_state))
1616  {
1617  result = (buf_state & BM_VALID) != 0;
1618  break;
1619  }
1620  }
1621  }
1622  else
1623  {
1624  /* If we previously pinned the buffer, it must surely be valid */
1625  result = true;
1626  }
1627 
1628  ref->refcount++;
1629  Assert(ref->refcount > 0);
1630  ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
1631  return result;
1632 }
1633 
1634 /*
1635  * PinBuffer_Locked -- as above, but caller already locked the buffer header.
1636  * The spinlock is released before return.
1637  *
1638  * As this function is called with the spinlock held, the caller has to
1639  * previously call ReservePrivateRefCountEntry().
1640  *
1641  * Currently, no callers of this function want to modify the buffer's
1642  * usage_count at all, so there's no need for a strategy parameter.
1643  * Also we don't bother with a BM_VALID test (the caller could check that for
1644  * itself).
1645  *
1646  * Also all callers only ever use this function when it's known that the
1647  * buffer can't have a preexisting pin by this backend. That allows us to skip
1648  * searching the private refcount array & hash, which is a boon, because the
1649  * spinlock is still held.
1650  *
1651  * Note: use of this routine is frequently mandatory, not just an optimization
1652  * to save a spin lock/unlock cycle, because we need to pin a buffer before
1653  * its state can change under us.
1654  */
1655 static void
1656 PinBuffer_Locked(BufferDesc *buf)
1657 {
1658  Buffer b;
1659  PrivateRefCountEntry *ref;
1660  uint32 buf_state;
1661 
1662  /*
1663  * As explained, we don't expect any preexisting pins. That allows us to
1664  * manipulate the PrivateRefCount after releasing the spinlock.
1665  */
1666  Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
1667 
1668  /*
1669  * Since we hold the buffer spinlock, we can update the buffer state and
1670  * release the lock in one operation.
1671  */
1672  buf_state = pg_atomic_read_u32(&buf->state);
1673  Assert(buf_state & BM_LOCKED);
1674  buf_state += BUF_REFCOUNT_ONE;
1675  UnlockBufHdr(buf, buf_state);
1676 
1677  b = BufferDescriptorGetBuffer(buf);
1678 
1679  ref = NewPrivateRefCountEntry(b);
1680  ref->refcount++;
1681 
1682  ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
1683 }
1684 
1685 /*
1686  * UnpinBuffer -- make buffer available for replacement.
1687  *
1688  * This should be applied only to shared buffers, never local ones.
1689  *
1690  * Most but not all callers want CurrentResourceOwner to be adjusted.
1691  * Those that don't should pass fixOwner = false.
1692  */
1693 static void
1694 UnpinBuffer(BufferDesc *buf, bool fixOwner)
1695 {
1696  PrivateRefCountEntry *ref;
1697  Buffer b = BufferDescriptorGetBuffer(buf);
1698 
1699  /* not moving as we're likely deleting it soon anyway */
1700  ref = GetPrivateRefCountEntry(b, false);
1701  Assert(ref != NULL);
1702 
1703  if (fixOwner)
1704  ResourceOwnerForgetBuffer(CurrentResourceOwner, b);
1705 
1706  Assert(ref->refcount > 0);
1707  ref->refcount--;
1708  if (ref->refcount == 0)
1709  {
1710  uint32 buf_state;
1711  uint32 old_buf_state;
1712 
1713  /* I'd better not still hold any locks on the buffer */
1714  Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf)));
1715  Assert(!LWLockHeldByMe(BufferDescriptorGetIOLock(buf)));
1716 
1717  /*
1718  * Decrement the shared reference count.
1719  *
1720  * Since buffer spinlock holder can update status using just write,
1721  * it's not safe to use atomic decrement here; thus use a CAS loop.
1722  */
1723  old_buf_state = pg_atomic_read_u32(&buf->state);
1724  for (;;)
1725  {
1726  if (old_buf_state & BM_LOCKED)
1727  old_buf_state = WaitBufHdrUnlocked(buf);
1728 
1729  buf_state = old_buf_state;
1730 
1731  buf_state -= BUF_REFCOUNT_ONE;
1732 
1733  if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
1734  buf_state))
1735  break;
1736  }
1737 
1738  /* Support LockBufferForCleanup() */
1739  if (buf_state & BM_PIN_COUNT_WAITER)
1740  {
1741  /*
1742  * Acquire the buffer header lock, re-check that there's a waiter.
1743  * Another backend could have unpinned this buffer, and already
1744  * woken up the waiter. There's no danger of the buffer being
1745  * replaced after we unpinned it above, as it's pinned by the
1746  * waiter.
1747  */
1748  buf_state = LockBufHdr(buf);
1749 
1750  if ((buf_state & BM_PIN_COUNT_WAITER) &&
1751  BUF_STATE_GET_REFCOUNT(buf_state) == 1)
1752  {
1753  /* we just released the last pin other than the waiter's */
1754  int wait_backend_pid = buf->wait_backend_pid;
1755 
1756  buf_state &= ~BM_PIN_COUNT_WAITER;
1757  UnlockBufHdr(buf, buf_state);
1758  ProcSendSignal(wait_backend_pid);
1759  }
1760  else
1761  UnlockBufHdr(buf, buf_state);
1762  }
1763  ForgetPrivateRefCountEntry(ref);
1764  }
1765 }
1766 
1767 /*
1768  * BufferSync -- Write out all dirty buffers in the pool.
1769  *
1770  * This is called at checkpoint time to write out all dirty shared buffers.
1771  * The checkpoint request flags should be passed in. If CHECKPOINT_IMMEDIATE
1772  * is set, we disable delays between writes; if CHECKPOINT_IS_SHUTDOWN,
1773  * CHECKPOINT_END_OF_RECOVERY or CHECKPOINT_FLUSH_ALL is set, we write even
1774  * unlogged buffers, which are otherwise skipped. The remaining flags
1775  * currently have no effect here.
1776  */
1777 static void
1778 BufferSync(int flags)
1779 {
1780  uint32 buf_state;
1781  int buf_id;
1782  int num_to_scan;
1783  int num_spaces;
1784  int num_processed;
1785  int num_written;
1786  CkptTsStatus *per_ts_stat = NULL;
1787  Oid last_tsid;
1788  binaryheap *ts_heap;
1789  int i;
1790  int mask = BM_DIRTY;
1791  WritebackContext wb_context;
1792 
1793  /* Make sure we can handle the pin inside SyncOneBuffer */
1794  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
1795 
1796  /*
1797  * Unless this is a shutdown checkpoint or we have been explicitly told,
1798  * we write only permanent, dirty buffers. But at shutdown or end of
1799  * recovery, we write all dirty buffers.
1800  */
1801  if (!((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
1802  CHECKPOINT_FLUSH_ALL))))
1803  mask |= BM_PERMANENT;
1804 
1805  /*
1806  * Loop over all buffers, and mark the ones that need to be written with
1807  * BM_CHECKPOINT_NEEDED. Count them as we go (num_to_scan), so that we
1808  * can estimate how much work needs to be done.
1809  *
1810  * This allows us to write only those pages that were dirty when the
1811  * checkpoint began, and not those that get dirtied while it proceeds.
1812  * Whenever a page with BM_CHECKPOINT_NEEDED is written out, either by us
1813  * later in this function, or by normal backends or the bgwriter cleaning
1814  * scan, the flag is cleared. Any buffer dirtied after this point won't
1815  * have the flag set.
1816  *
1817  * Note that if we fail to write some buffer, we may leave buffers with
1818  * BM_CHECKPOINT_NEEDED still set. This is OK since any such buffer would
1819  * certainly need to be written for the next checkpoint attempt, too.
1820  */
1821  num_to_scan = 0;
1822  for (buf_id = 0; buf_id < NBuffers; buf_id++)
1823  {
1824  BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
1825 
1826  /*
1827  * Header spinlock is enough to examine BM_DIRTY, see comment in
1828  * SyncOneBuffer.
1829  */
1830  buf_state = LockBufHdr(bufHdr);
1831 
1832  if ((buf_state & mask) == mask)
1833  {
1834  CkptSortItem *item;
1835 
1836  buf_state |= BM_CHECKPOINT_NEEDED;
1837 
1838  item = &CkptBufferIds[num_to_scan++];
1839  item->buf_id = buf_id;
1840  item->tsId = bufHdr->tag.rnode.spcNode;
1841  item->relNode = bufHdr->tag.rnode.relNode;
1842  item->forkNum = bufHdr->tag.forkNum;
1843  item->blockNum = bufHdr->tag.blockNum;
1844  }
1845 
1846  UnlockBufHdr(bufHdr, buf_state);
1847  }
1848 
1849  if (num_to_scan == 0)
1850  return; /* nothing to do */
1851 
1852 	WritebackContextInit(&wb_context, &checkpoint_flush_after);
1853 
1854  TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_scan);
1855 
1856  /*
1857  * Sort buffers that need to be written to reduce the likelihood of random
1858  * IO. The sorting is also important for the implementation of balancing
1859  * writes between tablespaces. Without balancing writes we'd potentially
1860  * end up writing to the tablespaces one-by-one; possibly overloading the
1861  * underlying system.
1862  */
1863  qsort(CkptBufferIds, num_to_scan, sizeof(CkptSortItem),
1864 		  ckpt_buforder_comparator);
1865 
1866  num_spaces = 0;
1867 
1868  /*
1869  * Allocate progress status for each tablespace with buffers that need to
1870  * be flushed. This requires the to-be-flushed array to be sorted.
1871  */
1872  last_tsid = InvalidOid;
1873  for (i = 0; i < num_to_scan; i++)
1874  {
1875  CkptTsStatus *s;
1876  Oid cur_tsid;
1877 
1878  cur_tsid = CkptBufferIds[i].tsId;
1879 
1880  /*
1881  * Grow array of per-tablespace status structs, every time a new
1882  * tablespace is found.
1883  */
1884  if (last_tsid == InvalidOid || last_tsid != cur_tsid)
1885  {
1886  Size sz;
1887 
1888  num_spaces++;
1889 
1890  /*
1891  * Not worth adding grow-by-power-of-2 logic here - even with a
1892  * few hundred tablespaces this should be fine.
1893  */
1894  sz = sizeof(CkptTsStatus) * num_spaces;
1895 
1896  if (per_ts_stat == NULL)
1897  per_ts_stat = (CkptTsStatus *) palloc(sz);
1898  else
1899  per_ts_stat = (CkptTsStatus *) repalloc(per_ts_stat, sz);
1900 
1901  s = &per_ts_stat[num_spaces - 1];
1902  memset(s, 0, sizeof(*s));
1903  s->tsId = cur_tsid;
1904 
1905  /*
1906  * The first buffer in this tablespace. As CkptBufferIds is sorted
1907  * by tablespace all (s->num_to_scan) buffers in this tablespace
1908  * will follow afterwards.
1909  */
1910  s->index = i;
1911 
1912  /*
1913  * progress_slice will be determined once we know how many buffers
1914  * are in each tablespace, i.e. after this loop.
1915  */
1916 
1917  last_tsid = cur_tsid;
1918  }
1919  else
1920  {
1921  s = &per_ts_stat[num_spaces - 1];
1922  }
1923 
1924  s->num_to_scan++;
1925  }
1926 
1927  Assert(num_spaces > 0);
1928 
1929  /*
1930  * Build a min-heap over the write-progress in the individual tablespaces,
1931  * and compute how large a portion of the total progress a single
1932  * processed buffer is.
1933  */
1934  ts_heap = binaryheap_allocate(num_spaces,
1935 								  ts_ckpt_progress_comparator,
1936 								  NULL);
1937 
1938  for (i = 0; i < num_spaces; i++)
1939  {
1940  CkptTsStatus *ts_stat = &per_ts_stat[i];
1941 
1942  ts_stat->progress_slice = (float8) num_to_scan / ts_stat->num_to_scan;
1943 
1944  binaryheap_add_unordered(ts_heap, PointerGetDatum(ts_stat));
1945  }
1946 
1947  binaryheap_build(ts_heap);
1948 
1949  /*
1950  * Iterate through to-be-checkpointed buffers and write the ones (still)
1951  * marked with BM_CHECKPOINT_NEEDED. The writes are balanced between
1952  * tablespaces; otherwise the sorting would lead to only one tablespace
1953  * receiving writes at a time, making inefficient use of the hardware.
1954  */
1955  num_processed = 0;
1956  num_written = 0;
1957  while (!binaryheap_empty(ts_heap))
1958  {
1959  BufferDesc *bufHdr = NULL;
1960  CkptTsStatus *ts_stat = (CkptTsStatus *)
1961 		DatumGetPointer(binaryheap_first(ts_heap));
1962 
1963  buf_id = CkptBufferIds[ts_stat->index].buf_id;
1964  Assert(buf_id != -1);
1965 
1966  bufHdr = GetBufferDescriptor(buf_id);
1967 
1968  num_processed++;
1969 
1970  /*
1971  * We don't need to acquire the lock here, because we're only looking
1972  * at a single bit. It's possible that someone else writes the buffer
1973  * and clears the flag right after we check, but that doesn't matter
1974  * since SyncOneBuffer will then do nothing. However, there is a
1975  * further race condition: it's conceivable that between the time we
1976  * examine the bit here and the time SyncOneBuffer acquires the lock,
1977  * someone else not only wrote the buffer but replaced it with another
1978  * page and dirtied it. In that improbable case, SyncOneBuffer will
1979  * write the buffer though we didn't need to. It doesn't seem worth
1980  * guarding against this, though.
1981  */
1982 		if (pg_atomic_read_u32(&bufHdr->state) & BM_CHECKPOINT_NEEDED)
1983 		{
1984  if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
1985  {
1986  TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
1987 				BgWriterStats.m_buf_written_checkpoints++;
1988 				num_written++;
1989  }
1990  }
1991 
1992  /*
1993 		 * Measure progress independently of whether we actually had to flush
1994 		 * the buffer - otherwise writes become unbalanced.
1995  */
1996  ts_stat->progress += ts_stat->progress_slice;
1997  ts_stat->num_scanned++;
1998  ts_stat->index++;
1999 
2000  /* Have all the buffers from the tablespace been processed? */
2001  if (ts_stat->num_scanned == ts_stat->num_to_scan)
2002  {
2003  binaryheap_remove_first(ts_heap);
2004  }
2005  else
2006  {
2007  /* update heap with the new progress */
2008  binaryheap_replace_first(ts_heap, PointerGetDatum(ts_stat));
2009  }
2010 
2011  /*
2012  * Sleep to throttle our I/O rate.
2013  */
2014  CheckpointWriteDelay(flags, (double) num_processed / num_to_scan);
2015  }
2016 
2017  /* issue all pending flushes */
2018  IssuePendingWritebacks(&wb_context);
2019 
2020  pfree(per_ts_stat);
2021  per_ts_stat = NULL;
2022  binaryheap_free(ts_heap);
2023 
2024  /*
2025  * Update checkpoint statistics. As noted above, this doesn't include
2026  * buffers written by other backends or bgwriter scan.
2027  */
2028  CheckpointStats.ckpt_bufs_written += num_written;
2029 
2030  TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_scan);
2031 }
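
To make the balancing idea above concrete, here is a small, self-contained sketch (not part of bufmgr.c) of how progress_slice = total / num_to_scan lets tablespaces of very different sizes advance toward the same finish line at the same pace. The real code keeps the per-tablespace entries in a binary min-heap; this sketch picks the least-advanced entry with a linear scan, and the tablespace names and counts are invented.

#include <stdio.h>

typedef struct
{
    const char *name;
    int         num_to_scan;    /* dirty buffers in this tablespace */
    int         num_scanned;
    double      progress;       /* 0 .. total */
    double      progress_slice; /* total / num_to_scan */
} ts_progress;

int
main(void)
{
    ts_progress ts[] = {
        {"ts_big", 8, 0, 0.0, 0.0},
        {"ts_small", 2, 0, 0.0, 0.0},
    };
    int         nts = 2;
    int         total = 0;

    for (int i = 0; i < nts; i++)
        total += ts[i].num_to_scan;
    for (int i = 0; i < nts; i++)
        ts[i].progress_slice = (double) total / ts[i].num_to_scan;

    /* Repeatedly write one buffer from the least-advanced tablespace. */
    for (int written = 0; written < total; written++)
    {
        ts_progress *next = NULL;

        for (int i = 0; i < nts; i++)
        {
            if (ts[i].num_scanned == ts[i].num_to_scan)
                continue;       /* this tablespace is finished */
            if (next == NULL || ts[i].progress < next->progress)
                next = &ts[i];
        }

        printf("write #%d from %s\n", written + 1, next->name);
        next->progress += next->progress_slice;
        next->num_scanned++;
    }
    return 0;
}

Because the small tablespace takes larger steps per write, its writes end up interleaved with the large one's instead of being issued all at once.
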
2032 
2033 /*
2034  * BgBufferSync -- Write out some dirty buffers in the pool.
2035  *
2036  * This is called periodically by the background writer process.
2037  *
2038  * Returns true if it's appropriate for the bgwriter process to go into
2039  * low-power hibernation mode. (This happens if the strategy clock sweep
2040  * has been "lapped" and no buffer allocations have occurred recently,
2041  * or if the bgwriter has been effectively disabled by setting
2042  * bgwriter_lru_maxpages to 0.)
2043  */
2044 bool
2045 BgBufferSync(WritebackContext *wb_context)
2046 {
2047  /* info obtained from freelist.c */
2048  int strategy_buf_id;
2049  uint32 strategy_passes;
2050  uint32 recent_alloc;
2051 
2052  /*
2053  * Information saved between calls so we can determine the strategy
2054  * point's advance rate and avoid scanning already-cleaned buffers.
2055  */
2056  static bool saved_info_valid = false;
2057  static int prev_strategy_buf_id;
2058  static uint32 prev_strategy_passes;
2059  static int next_to_clean;
2060  static uint32 next_passes;
2061 
2062  /* Moving averages of allocation rate and clean-buffer density */
2063  static float smoothed_alloc = 0;
2064  static float smoothed_density = 10.0;
2065 
2066  /* Potentially these could be tunables, but for now, not */
2067  float smoothing_samples = 16;
2068  float scan_whole_pool_milliseconds = 120000.0;
2069 
2070  /* Used to compute how far we scan ahead */
2071  long strategy_delta;
2072  int bufs_to_lap;
2073  int bufs_ahead;
2074  float scans_per_alloc;
2075  int reusable_buffers_est;
2076  int upcoming_alloc_est;
2077  int min_scan_buffers;
2078 
2079  /* Variables for the scanning loop proper */
2080  int num_to_scan;
2081  int num_written;
2082  int reusable_buffers;
2083 
2084  /* Variables for final smoothed_density update */
2085  long new_strategy_delta;
2086  uint32 new_recent_alloc;
2087 
2088  /*
2089  * Find out where the freelist clock sweep currently is, and how many
2090  * buffer allocations have happened since our last call.
2091  */
2092  strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc);
2093 
2094  /* Report buffer alloc counts to pgstat */
2095  BgWriterStats.m_buf_alloc += recent_alloc;
2096 
2097  /*
2098  * If we're not running the LRU scan, just stop after doing the stats
2099  * stuff. We mark the saved state invalid so that we can recover sanely
2100  * if LRU scan is turned back on later.
2101  */
2102  if (bgwriter_lru_maxpages <= 0)
2103  {
2104  saved_info_valid = false;
2105  return true;
2106  }
2107 
2108  /*
2109  * Compute strategy_delta = how many buffers have been scanned by the
2110  * clock sweep since last time. If first time through, assume none. Then
2111  * see if we are still ahead of the clock sweep, and if so, how many
2112  * buffers we could scan before we'd catch up with it and "lap" it. Note:
2113 	 * the weird-looking coding of the xxx_passes comparisons is to avoid
2114 	 * bogus behavior when the passes counts wrap around.
2115  */
2116  if (saved_info_valid)
2117  {
2118  int32 passes_delta = strategy_passes - prev_strategy_passes;
2119 
2120  strategy_delta = strategy_buf_id - prev_strategy_buf_id;
2121  strategy_delta += (long) passes_delta * NBuffers;
2122 
2123  Assert(strategy_delta >= 0);
2124 
2125  if ((int32) (next_passes - strategy_passes) > 0)
2126  {
2127  /* we're one pass ahead of the strategy point */
2128  bufs_to_lap = strategy_buf_id - next_to_clean;
2129 #ifdef BGW_DEBUG
2130  elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
2131  next_passes, next_to_clean,
2132  strategy_passes, strategy_buf_id,
2133  strategy_delta, bufs_to_lap);
2134 #endif
2135  }
2136  else if (next_passes == strategy_passes &&
2137  next_to_clean >= strategy_buf_id)
2138  {
2139  /* on same pass, but ahead or at least not behind */
2140  bufs_to_lap = NBuffers - (next_to_clean - strategy_buf_id);
2141 #ifdef BGW_DEBUG
2142  elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
2143  next_passes, next_to_clean,
2144  strategy_passes, strategy_buf_id,
2145  strategy_delta, bufs_to_lap);
2146 #endif
2147  }
2148  else
2149  {
2150  /*
2151  * We're behind, so skip forward to the strategy point and start
2152  * cleaning from there.
2153  */
2154 #ifdef BGW_DEBUG
2155  elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
2156  next_passes, next_to_clean,
2157  strategy_passes, strategy_buf_id,
2158  strategy_delta);
2159 #endif
2160  next_to_clean = strategy_buf_id;
2161  next_passes = strategy_passes;
2162  bufs_to_lap = NBuffers;
2163  }
2164  }
2165  else
2166  {
2167  /*
2168  * Initializing at startup or after LRU scanning had been off. Always
2169  * start at the strategy point.
2170  */
2171 #ifdef BGW_DEBUG
2172  elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
2173  strategy_passes, strategy_buf_id);
2174 #endif
2175  strategy_delta = 0;
2176  next_to_clean = strategy_buf_id;
2177  next_passes = strategy_passes;
2178  bufs_to_lap = NBuffers;
2179  }
2180 
2181  /* Update saved info for next time */
2182  prev_strategy_buf_id = strategy_buf_id;
2183  prev_strategy_passes = strategy_passes;
2184  saved_info_valid = true;
2185 
2186  /*
2187  * Compute how many buffers had to be scanned for each new allocation, ie,
2188  * 1/density of reusable buffers, and track a moving average of that.
2189  *
2190  * If the strategy point didn't move, we don't update the density estimate
2191  */
2192  if (strategy_delta > 0 && recent_alloc > 0)
2193  {
2194  scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
2195  smoothed_density += (scans_per_alloc - smoothed_density) /
2196  smoothing_samples;
2197  }
2198 
2199  /*
2200  * Estimate how many reusable buffers there are between the current
2201  * strategy point and where we've scanned ahead to, based on the smoothed
2202  * density estimate.
2203  */
2204  bufs_ahead = NBuffers - bufs_to_lap;
2205  reusable_buffers_est = (float) bufs_ahead / smoothed_density;
2206 
2207  /*
2208  * Track a moving average of recent buffer allocations. Here, rather than
2209  * a true average we want a fast-attack, slow-decline behavior: we
2210  * immediately follow any increase.
2211  */
2212  if (smoothed_alloc <= (float) recent_alloc)
2213  smoothed_alloc = recent_alloc;
2214  else
2215  smoothed_alloc += ((float) recent_alloc - smoothed_alloc) /
2216  smoothing_samples;
2217 
2218  /* Scale the estimate by a GUC to allow more aggressive tuning. */
2219  upcoming_alloc_est = (int) (smoothed_alloc * bgwriter_lru_multiplier);
2220 
2221  /*
2222  * If recent_alloc remains at zero for many cycles, smoothed_alloc will
2223  * eventually underflow to zero, and the underflows produce annoying
2224  * kernel warnings on some platforms. Once upcoming_alloc_est has gone to
2225  * zero, there's no point in tracking smaller and smaller values of
2226  * smoothed_alloc, so just reset it to exactly zero to avoid this
2227  * syndrome. It will pop back up as soon as recent_alloc increases.
2228  */
2229  if (upcoming_alloc_est == 0)
2230  smoothed_alloc = 0;
2231 
2232  /*
2233  * Even in cases where there's been little or no buffer allocation
2234  * activity, we want to make a small amount of progress through the buffer
2235  * cache so that as many reusable buffers as possible are clean after an
2236  * idle period.
2237  *
2238  * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many times
2239  * the BGW will be called during the scan_whole_pool time; slice the
2240  * buffer pool into that many sections.
2241  */
2242  min_scan_buffers = (int) (NBuffers / (scan_whole_pool_milliseconds / BgWriterDelay));
2243 
2244  if (upcoming_alloc_est < (min_scan_buffers + reusable_buffers_est))
2245  {
2246 #ifdef BGW_DEBUG
2247  elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
2248  upcoming_alloc_est, min_scan_buffers, reusable_buffers_est);
2249 #endif
2250  upcoming_alloc_est = min_scan_buffers + reusable_buffers_est;
2251  }
2252 
2253  /*
2254  * Now write out dirty reusable buffers, working forward from the
2255  * next_to_clean point, until we have lapped the strategy scan, or cleaned
2256  * enough buffers to match our estimate of the next cycle's allocation
2257  * requirements, or hit the bgwriter_lru_maxpages limit.
2258  */
2259 
2260  /* Make sure we can handle the pin inside SyncOneBuffer */
2261 	ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
2262 
2263  num_to_scan = bufs_to_lap;
2264  num_written = 0;
2265  reusable_buffers = reusable_buffers_est;
2266 
2267  /* Execute the LRU scan */
2268  while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
2269  {
2270  int sync_state = SyncOneBuffer(next_to_clean, true,
2271  wb_context);
2272 
2273  if (++next_to_clean >= NBuffers)
2274  {
2275  next_to_clean = 0;
2276  next_passes++;
2277  }
2278  num_to_scan--;
2279 
2280  if (sync_state & BUF_WRITTEN)
2281  {
2282  reusable_buffers++;
2283  if (++num_written >= bgwriter_lru_maxpages)
2284  {
2285 				BgWriterStats.m_maxwritten_clean++;
2286 				break;
2287  }
2288  }
2289  else if (sync_state & BUF_REUSABLE)
2290  reusable_buffers++;
2291  }
2292 
2293  BgWriterStats.m_buf_written_clean += num_written;
2294 
2295 #ifdef BGW_DEBUG
2296  elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
2297  recent_alloc, smoothed_alloc, strategy_delta, bufs_ahead,
2298  smoothed_density, reusable_buffers_est, upcoming_alloc_est,
2299  bufs_to_lap - num_to_scan,
2300  num_written,
2301  reusable_buffers - reusable_buffers_est);
2302 #endif
2303 
2304  /*
2305  * Consider the above scan as being like a new allocation scan.
2306  * Characterize its density and update the smoothed one based on it. This
2307  * effectively halves the moving average period in cases where both the
2308  * strategy and the background writer are doing some useful scanning,
2309  * which is helpful because a long memory isn't as desirable on the
2310  * density estimates.
2311  */
2312  new_strategy_delta = bufs_to_lap - num_to_scan;
2313  new_recent_alloc = reusable_buffers - reusable_buffers_est;
2314  if (new_strategy_delta > 0 && new_recent_alloc > 0)
2315  {
2316  scans_per_alloc = (float) new_strategy_delta / (float) new_recent_alloc;
2317  smoothed_density += (scans_per_alloc - smoothed_density) /
2318  smoothing_samples;
2319 
2320 #ifdef BGW_DEBUG
2321  elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f",
2322  new_recent_alloc, new_strategy_delta,
2323  scans_per_alloc, smoothed_density);
2324 #endif
2325  }
2326 
2327  /* Return true if OK to hibernate */
2328  return (bufs_to_lap == 0 && recent_alloc == 0);
2329 }
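
The two running estimates above can be studied in isolation with the following self-contained sketch (not part of bufmgr.c): an ordinary exponential moving average for the scan density, and a fast-attack, slow-decline average for the allocation rate, scaled by a multiplier that stands in for bgwriter_lru_multiplier. All input numbers are invented.

#include <stdio.h>

#define SMOOTHING_SAMPLES 16.0f

static float smoothed_alloc = 0.0f;
static float smoothed_density = 10.0f;

static int
estimate_upcoming(int recent_alloc, long strategy_delta, float multiplier)
{
    /* Ordinary EMA of buffers scanned per allocation, i.e. 1/density. */
    if (strategy_delta > 0 && recent_alloc > 0)
    {
        float scans_per_alloc = (float) strategy_delta / recent_alloc;

        smoothed_density += (scans_per_alloc - smoothed_density) /
            SMOOTHING_SAMPLES;
    }

    /* Fast-attack, slow-decline average of the allocation rate. */
    if (smoothed_alloc <= (float) recent_alloc)
        smoothed_alloc = (float) recent_alloc;      /* follow increases at once */
    else
        smoothed_alloc += ((float) recent_alloc - smoothed_alloc) /
            SMOOTHING_SAMPLES;                      /* decay slowly otherwise */

    return (int) (smoothed_alloc * multiplier);
}

int
main(void)
{
    int         bursts[] = {0, 0, 400, 50, 50, 0, 0, 0};

    for (int i = 0; i < 8; i++)
        printf("cycle %d: recent_alloc=%d upcoming_est=%d\n",
               i, bursts[i],
               estimate_upcoming(bursts[i], bursts[i] * 12L, 2.0f));
    return 0;
}

The printed estimates jump immediately when the burst arrives and then decay gradually, which is the behavior the comment above describes.
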
2330 
2331 /*
2332  * SyncOneBuffer -- process a single buffer during syncing.
2333  *
2334  * If skip_recently_used is true, we don't write currently-pinned buffers, nor
2335  * buffers marked recently used, as these are not replacement candidates.
2336  *
2337  * Returns a bitmask containing the following flag bits:
2338  * BUF_WRITTEN: we wrote the buffer.
2339  * BUF_REUSABLE: buffer is available for replacement, ie, it has
2340  * pin count 0 and usage count 0.
2341  *
2342  * (BUF_WRITTEN could be set in error if FlushBuffer finds the buffer clean
2343  * after locking it, but we don't care all that much.)
2344  *
2345  * Note: caller must have done ResourceOwnerEnlargeBuffers.
2346  */
2347 static int
2348 SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
2349 {
2350  BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
2351  int result = 0;
2352  uint32 buf_state;
2353  BufferTag tag;
2354 
2355 	ReservePrivateRefCountEntry();
2356 
2357  /*
2358  * Check whether buffer needs writing.
2359  *
2360  * We can make this check without taking the buffer content lock so long
2361  * as we mark pages dirty in access methods *before* logging changes with
2362  * XLogInsert(): if someone marks the buffer dirty just after our check we
2363  * don't worry, because our checkpoint.redo points before the log record for
2364  * the upcoming changes, so we are not required to write such a dirty buffer.
2365  */
2366  buf_state = LockBufHdr(bufHdr);
2367 
2368  if (BUF_STATE_GET_REFCOUNT(buf_state) == 0 &&
2369  BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
2370  {
2371  result |= BUF_REUSABLE;
2372  }
2373  else if (skip_recently_used)
2374  {
2375  /* Caller told us not to write recently-used buffers */
2376  UnlockBufHdr(bufHdr, buf_state);
2377  return result;
2378  }
2379 
2380  if (!(buf_state & BM_VALID) || !(buf_state & BM_DIRTY))
2381  {
2382  /* It's clean, so nothing to do */
2383  UnlockBufHdr(bufHdr, buf_state);
2384  return result;
2385  }
2386 
2387  /*
2388  * Pin it, share-lock it, write it. (FlushBuffer will do nothing if the
2389  * buffer is clean by the time we've locked it.)
2390  */
2391  PinBuffer_Locked(bufHdr);
2392 	LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
2393 
2394  FlushBuffer(bufHdr, NULL);
2395 
2396 	LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
2397 
2398  tag = bufHdr->tag;
2399 
2400  UnpinBuffer(bufHdr, true);
2401 
2402  ScheduleBufferTagForWriteback(wb_context, &tag);
2403 
2404  return result | BUF_WRITTEN;
2405 }
2406 
2407 /*
2408  * AtEOXact_Buffers - clean up at end of transaction.
2409  *
2410  * As of PostgreSQL 8.0, buffer pins should get released by the
2411  * ResourceOwner mechanism. This routine is just a debugging
2412  * cross-check that no pins remain.
2413  */
2414 void
2415 AtEOXact_Buffers(bool isCommit)
2416 {
2417 	CheckForBufferLeaks();
2418 
2419  AtEOXact_LocalBuffers(isCommit);
2420 
2421 	Assert(PrivateRefCountOverflowed == 0);
2422 }
2423 
2424 /*
2425  * Initialize access to shared buffer pool
2426  *
2427  * This is called during backend startup (whether standalone or under the
2428  * postmaster). It sets up for this backend's access to the already-existing
2429  * buffer pool.
2430  *
2431  * NB: this is called before InitProcess(), so we do not have a PGPROC and
2432  * cannot do LWLockAcquire; hence we can't actually access stuff in
2433  * shared memory yet. We are only initializing local data here.
2434  * (See also InitBufferPoolBackend)
2435  */
2436 void
2437 InitBufferPoolAccess(void)
2438 {
2439  HASHCTL hash_ctl;
2440 
2441  memset(&PrivateRefCountArray, 0, sizeof(PrivateRefCountArray));
2442 
2443  MemSet(&hash_ctl, 0, sizeof(hash_ctl));
2444  hash_ctl.keysize = sizeof(int32);
2445  hash_ctl.entrysize = sizeof(PrivateRefCountEntry);
2446 
2447  PrivateRefCountHash = hash_create("PrivateRefCount", 100, &hash_ctl,
2448  HASH_ELEM | HASH_BLOBS);
2449 }
2450 
2451 /*
2452  * InitBufferPoolBackend --- second-stage initialization of a new backend
2453  *
2454  * This is called after we have acquired a PGPROC and so can safely get
2455  * LWLocks. We don't currently need to do anything at this stage ...
2456  * except register a shmem-exit callback. AtProcExit_Buffers needs LWLock
2457  * access, and thereby has to be called at the corresponding phase of
2458  * backend shutdown.
2459  */
2460 void
2461 InitBufferPoolBackend(void)
2462 {
2463 	on_shmem_exit(AtProcExit_Buffers, 0);
2464 }
2465 
2466 /*
2467  * During backend exit, ensure that we released all shared-buffer locks and
2468  * assert that we have no remaining pins.
2469  */
2470 static void
2471 AtProcExit_Buffers(int code, Datum arg)
2472 {
2473  AbortBufferIO();
2474  UnlockBuffers();
2475 
2476 	CheckForBufferLeaks();
2477 
2478  /* localbuf.c needs a chance too */
2479 	AtEOXact_LocalBuffers(false);
2480 }
2481 
2482 /*
2483  * CheckForBufferLeaks - ensure this backend holds no buffer pins
2484  *
2485  * As of PostgreSQL 8.0, buffer pins should get released by the
2486  * ResourceOwner mechanism. This routine is just a debugging
2487  * cross-check that no pins remain.
2488  */
2489 static void
2490 CheckForBufferLeaks(void)
2491 {
2492 #ifdef USE_ASSERT_CHECKING
2493  int RefCountErrors = 0;
2494  PrivateRefCountEntry *res;
2495  int i;
2496 
2497  /* check the array */
2498  for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
2499  {
2500  res = &PrivateRefCountArray[i];
2501 
2502  if (res->buffer != InvalidBuffer)
2503  {
2504 			PrintBufferLeakWarning(res->buffer);
2505 			RefCountErrors++;
2506  }
2507  }
2508 
2509  /* if necessary search the hash */
2510 	if (PrivateRefCountOverflowed)
2511 	{
2512  HASH_SEQ_STATUS hstat;
2513 
2514  hash_seq_init(&hstat, PrivateRefCountHash);
2515  while ((res = (PrivateRefCountEntry *) hash_seq_search(&hstat)) != NULL)
2516  {
2517 			PrintBufferLeakWarning(res->buffer);
2518 			RefCountErrors++;
2519  }
2520 
2521  }
2522 
2523  Assert(RefCountErrors == 0);
2524 #endif
2525 }
2526 
2527 /*
2528  * Helper routine to issue warnings when a buffer is unexpectedly pinned
2529  */
2530 void
2531 PrintBufferLeakWarning(Buffer buffer)
2532 {
2533  BufferDesc *buf;
2534  int32 loccount;
2535  char *path;
2536  BackendId backend;
2537  uint32 buf_state;
2538 
2539  Assert(BufferIsValid(buffer));
2540  if (BufferIsLocal(buffer))
2541  {
2542  buf = GetLocalBufferDescriptor(-buffer - 1);
2543  loccount = LocalRefCount[-buffer - 1];
2544  backend = MyBackendId;
2545  }
2546  else
2547  {
2548  buf = GetBufferDescriptor(buffer - 1);
2549  loccount = GetPrivateRefCount(buffer);
2550  backend = InvalidBackendId;
2551  }
2552 
2553  /* theoretically we should lock the bufhdr here */
2554  path = relpathbackend(buf->tag.rnode, backend, buf->tag.forkNum);
2555  buf_state = pg_atomic_read_u32(&buf->state);
2556  elog(WARNING,
2557  "buffer refcount leak: [%03d] "
2558  "(rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
2559  buffer, path,
2560  buf->tag.blockNum, buf_state & BUF_FLAG_MASK,
2561  BUF_STATE_GET_REFCOUNT(buf_state), loccount);
2562  pfree(path);
2563 }
2564 
2565 /*
2566  * CheckPointBuffers
2567  *
2568  * Flush all dirty blocks in buffer pool to disk at checkpoint time.
2569  *
2570  * Note: temporary relations do not participate in checkpoints, so they don't
2571  * need to be flushed.
2572  */
2573 void
2574 CheckPointBuffers(int flags)
2575 {
2576  TRACE_POSTGRESQL_BUFFER_CHECKPOINT_START(flags);
2577 	CheckpointStats.ckpt_write_t = GetCurrentTimestamp();
2578 	BufferSync(flags);
2579 	CheckpointStats.ckpt_sync_t = GetCurrentTimestamp();
2580 	TRACE_POSTGRESQL_BUFFER_CHECKPOINT_SYNC_START();
2581  smgrsync();
2582 	CheckpointStats.ckpt_sync_end_t = GetCurrentTimestamp();
2583 	TRACE_POSTGRESQL_BUFFER_CHECKPOINT_DONE();
2584 }
2585 
2586 
2587 /*
2588  * Do whatever is needed to prepare for commit at the bufmgr and smgr levels
2589  */
2590 void
2591 BufmgrCommit(void)
2592 {
2593  /* Nothing to do in bufmgr anymore... */
2594 }
2595 
2596 /*
2597  * BufferGetBlockNumber
2598  * Returns the block number associated with a buffer.
2599  *
2600  * Note:
2601  * Assumes that the buffer is valid and pinned, else the
2602  * value may be obsolete immediately...
2603  */
2604 BlockNumber
2605 BufferGetBlockNumber(Buffer buffer)
2606 {
2607  BufferDesc *bufHdr;
2608 
2609  Assert(BufferIsPinned(buffer));
2610 
2611  if (BufferIsLocal(buffer))
2612  bufHdr = GetLocalBufferDescriptor(-buffer - 1);
2613  else
2614  bufHdr = GetBufferDescriptor(buffer - 1);
2615 
2616  /* pinned, so OK to read tag without spinlock */
2617  return bufHdr->tag.blockNum;
2618 }
2619 
2620 /*
2621  * BufferGetTag
2622  * Returns the relfilenode, fork number and block number associated with
2623  * a buffer.
2624  */
2625 void
2626 BufferGetTag(Buffer buffer, RelFileNode *rnode, ForkNumber *forknum,
2627 			 BlockNumber *blknum)
2628 {
2629  BufferDesc *bufHdr;
2630 
2631  /* Do the same checks as BufferGetBlockNumber. */
2632  Assert(BufferIsPinned(buffer));
2633 
2634  if (BufferIsLocal(buffer))
2635  bufHdr = GetLocalBufferDescriptor(-buffer - 1);
2636  else
2637  bufHdr = GetBufferDescriptor(buffer - 1);
2638 
2639  /* pinned, so OK to read tag without spinlock */
2640  *rnode = bufHdr->tag.rnode;
2641  *forknum = bufHdr->tag.forkNum;
2642  *blknum = bufHdr->tag.blockNum;
2643 }
2644 
2645 /*
2646  * FlushBuffer
2647  * Physically write out a shared buffer.
2648  *
2649  * NOTE: this actually just passes the buffer contents to the kernel; the
2650  * real write to disk won't happen until the kernel feels like it. This
2651  * is okay from our point of view since we can redo the changes from WAL.
2652  * However, we will need to force the changes to disk via fsync before
2653  * we can checkpoint WAL.
2654  *
2655  * The caller must hold a pin on the buffer and have share-locked the
2656  * buffer contents. (Note: a share-lock does not prevent updates of
2657  * hint bits in the buffer, so the page could change while the write
2658  * is in progress, but we assume that that will not invalidate the data
2659  * written.)
2660  *
2661  * If the caller has an smgr reference for the buffer's relation, pass it
2662  * as the second parameter. If not, pass NULL.
2663  */
2664 static void
2665 FlushBuffer(BufferDesc *buf, SMgrRelation reln)
2666 {
2667  XLogRecPtr recptr;
2668  ErrorContextCallback errcallback;
2669  instr_time io_start,
2670  io_time;
2671  Block bufBlock;
2672  char *bufToWrite;
2673  uint32 buf_state;
2674 
2675  /*
2676  * Acquire the buffer's io_in_progress lock. If StartBufferIO returns
2677  * false, then someone else flushed the buffer before we could, so we need
2678  * not do anything.
2679  */
2680  if (!StartBufferIO(buf, false))
2681  return;
2682 
2683  /* Setup error traceback support for ereport() */
2684 	errcallback.callback = shared_buffer_write_error_callback;
2685 	errcallback.arg = (void *) buf;
2686  errcallback.previous = error_context_stack;
2687  error_context_stack = &errcallback;
2688 
2689  /* Find smgr relation for buffer */
2690  if (reln == NULL)
2691  reln = smgropen(buf->tag.rnode, InvalidBackendId);
2692 
2693  TRACE_POSTGRESQL_BUFFER_FLUSH_START(buf->tag.forkNum,
2694  buf->tag.blockNum,
2695  reln->smgr_rnode.node.spcNode,
2696  reln->smgr_rnode.node.dbNode,
2697  reln->smgr_rnode.node.relNode);
2698 
2699  buf_state = LockBufHdr(buf);
2700 
2701  /*
2702  * Run PageGetLSN while holding header lock, since we don't have the
2703  * buffer locked exclusively in all cases.
2704  */
2705  recptr = BufferGetLSN(buf);
2706 
2707  /* To check if block content changes while flushing. - vadim 01/17/97 */
2708  buf_state &= ~BM_JUST_DIRTIED;
2709  UnlockBufHdr(buf, buf_state);
2710 
2711  /*
2712  * Force XLOG flush up to buffer's LSN. This implements the basic WAL
2713  * rule that log updates must hit disk before any of the data-file changes
2714  * they describe do.
2715  *
2716  * However, this rule does not apply to unlogged relations, which will be
2717  * lost after a crash anyway. Most unlogged relation pages do not bear
2718  * LSNs since we never emit WAL records for them, and therefore flushing
2719  * up through the buffer LSN would be useless, but harmless. However,
2720  * GiST indexes use LSNs internally to track page-splits, and therefore
2721  * unlogged GiST pages bear "fake" LSNs generated by
2722  * GetFakeLSNForUnloggedRel. It is unlikely but possible that the fake
2723  * LSN counter could advance past the WAL insertion point; and if it did
2724  * happen, attempting to flush WAL through that location would fail, with
2725  * disastrous system-wide consequences. To make sure that can't happen,
2726  * skip the flush if the buffer isn't permanent.
2727  */
2728  if (buf_state & BM_PERMANENT)
2729  XLogFlush(recptr);
2730 
2731  /*
2732  * Now it's safe to write buffer to disk. Note that no one else should
2733  * have been able to write it while we were busy with log flushing because
2734  * we have the io_in_progress lock.
2735  */
2736  bufBlock = BufHdrGetBlock(buf);
2737 
2738  /*
2739  * Update page checksum if desired. Since we have only shared lock on the
2740  * buffer, other processes might be updating hint bits in it, so we must
2741  * copy the page to private storage if we do checksumming.
2742  */
2743  bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
2744 
2745  if (track_io_timing)
2746  INSTR_TIME_SET_CURRENT(io_start);
2747 
2748  /*
2749  * bufToWrite is either the shared buffer or a copy, as appropriate.
2750  */
2751  smgrwrite(reln,
2752  buf->tag.forkNum,
2753  buf->tag.blockNum,
2754  bufToWrite,
2755  false);
2756 
2757  if (track_io_timing)
2758  {
2759  INSTR_TIME_SET_CURRENT(io_time);
2760  INSTR_TIME_SUBTRACT(io_time, io_start);
2761 		pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time));
2762 		INSTR_TIME_ADD(pgBufferUsage.blk_write_time, io_time);
2763 	}
2764 
2765 	pgBufferUsage.shared_blks_written++;
2766 
2767  /*
2768  * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
2769  * end the io_in_progress state.
2770  */
2771  TerminateBufferIO(buf, true, 0);
2772 
2773  TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(buf->tag.forkNum,
2774  buf->tag.blockNum,
2775  reln->smgr_rnode.node.spcNode,
2776  reln->smgr_rnode.node.dbNode,
2777  reln->smgr_rnode.node.relNode);
2778 
2779  /* Pop the error context stack */
2780  error_context_stack = errcallback.previous;
2781 }
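
The essential ordering enforced above is the basic WAL rule: the log covering a page must be durable up to the page's LSN before the page itself may be written. A minimal sketch follows (not part of bufmgr.c); wal_flush_upto() and write_page_to_disk() are hypothetical stand-ins for XLogFlush() and smgrwrite().

#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef uint64_t lsn_t;

static lsn_t wal_durable_upto = 0;

static void
wal_flush_upto(lsn_t lsn)
{
    if (lsn > wal_durable_upto)
        wal_durable_upto = lsn;          /* pretend the WAL was fsync'd here */
}

static void
write_page_to_disk(const char *page, size_t len)
{
    (void) page;
    (void) len;                          /* pretend the data file was written */
}

static void
flush_page(const char *page, size_t len, lsn_t page_lsn)
{
    /* WAL first: redo information must reach disk before the data page. */
    wal_flush_upto(page_lsn);
    write_page_to_disk(page, len);
}

int
main(void)
{
    char        page[8192];

    memset(page, 0, sizeof(page));
    flush_page(page, sizeof(page), 42);
    printf("WAL durable through LSN %llu before the page write\n",
           (unsigned long long) wal_durable_upto);
    return 0;
}
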
2782 
2783 /*
2784  * RelationGetNumberOfBlocksInFork
2785  * Determines the current number of pages in the specified relation fork.
2786  */
2787 BlockNumber
2788 RelationGetNumberOfBlocksInFork(Relation relation, ForkNumber forkNum)
2789 {
2790  /* Open it at the smgr level if not already done */
2791  RelationOpenSmgr(relation);
2792 
2793  return smgrnblocks(relation->rd_smgr, forkNum);
2794 }
2795 
2796 /*
2797  * BufferIsPermanent
2798  * Determines whether a buffer will potentially still be around after
2799  * a crash. Caller must hold a buffer pin.
2800  */
2801 bool
2802 BufferIsPermanent(Buffer buffer)
2803 {
2804  BufferDesc *bufHdr;
2805 
2806  /* Local buffers are used only for temp relations. */
2807  if (BufferIsLocal(buffer))
2808  return false;
2809 
2810  /* Make sure we've got a real buffer, and that we hold a pin on it. */
2811  Assert(BufferIsValid(buffer));
2812  Assert(BufferIsPinned(buffer));
2813 
2814  /*
2815  * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
2816  * need not bother with the buffer header spinlock. Even if someone else
2817  * changes the buffer header state while we're doing this, the state is
2818  * changed atomically, so we'll read the old value or the new value, but
2819  * not random garbage.
2820  */
2821  bufHdr = GetBufferDescriptor(buffer - 1);
2822  return (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT) != 0;
2823 }
2824 
2825 /*
2826  * BufferGetLSNAtomic
2827  * Retrieves the LSN of the buffer atomically using a buffer header lock.
2828  * This is necessary for some callers who may not have an exclusive lock
2829  * on the buffer.
2830  */
2831 XLogRecPtr
2832 BufferGetLSNAtomic(Buffer buffer)
2833 {
2834  BufferDesc *bufHdr = GetBufferDescriptor(buffer - 1);
2835  char *page = BufferGetPage(buffer);
2836  XLogRecPtr lsn;
2837  uint32 buf_state;
2838 
2839  /*
2840  * If we don't need locking for correctness, fastpath out.
2841  */
2842  if (!XLogHintBitIsNeeded() || BufferIsLocal(buffer))
2843  return PageGetLSN(page);
2844 
2845  /* Make sure we've got a real buffer, and that we hold a pin on it. */
2846  Assert(BufferIsValid(buffer));
2847  Assert(BufferIsPinned(buffer));
2848 
2849  buf_state = LockBufHdr(bufHdr);
2850  lsn = PageGetLSN(page);
2851  UnlockBufHdr(bufHdr, buf_state);
2852 
2853  return lsn;
2854 }
2855 
2856 /* ---------------------------------------------------------------------
2857  * DropRelFileNodeBuffers
2858  *
2859  * This function removes from the buffer pool all the pages of the
2860  * specified relation fork that have block numbers >= firstDelBlock.
2861  * (In particular, with firstDelBlock = 0, all pages are removed.)
2862  * Dirty pages are simply dropped, without bothering to write them
2863  * out first. Therefore, this is NOT rollback-able, and so should be
2864  * used only with extreme caution!
2865  *
2866  * Currently, this is called only from smgr.c when the underlying file
2867  * is about to be deleted or truncated (firstDelBlock is needed for
2868  * the truncation case). The data in the affected pages would therefore
2869  * be deleted momentarily anyway, and there is no point in writing it.
2870  * It is the responsibility of higher-level code to ensure that the
2871  * deletion or truncation does not lose any data that could be needed
2872  * later. It is also the responsibility of higher-level code to ensure
2873  * that no other process could be trying to load more pages of the
2874  * relation into buffers.
2875  *
2876  * XXX currently it sequentially searches the buffer pool, should be
2877  * changed to more clever ways of searching. However, this routine
2878  * is used only in code paths that aren't very performance-critical,
2879  * and we shouldn't slow down the hot paths to make it faster ...
2880  * --------------------------------------------------------------------
2881  */
2882 void
2883 DropRelFileNodeBuffers(RelFileNodeBackend rnode, ForkNumber forkNum,
2884 					   BlockNumber firstDelBlock)
2885 {
2886  int i;
2887 
2888  /* If it's a local relation, it's localbuf.c's problem. */
2889  if (RelFileNodeBackendIsTemp(rnode))
2890  {
2891  if (rnode.backend == MyBackendId)
2892  DropRelFileNodeLocalBuffers(rnode.node, forkNum, firstDelBlock);
2893  return;
2894  }
2895 
2896  for (i = 0; i < NBuffers; i++)
2897  {
2898  BufferDesc *bufHdr = GetBufferDescriptor(i);
2899  uint32 buf_state;
2900 
2901  /*
2902  * We can make this a tad faster by prechecking the buffer tag before
2903  * we attempt to lock the buffer; this saves a lot of lock
2904  * acquisitions in typical cases. It should be safe because the
2905  * caller must have AccessExclusiveLock on the relation, or some other
2906  * reason to be certain that no one is loading new pages of the rel
2907  * into the buffer pool. (Otherwise we might well miss such pages
2908  * entirely.) Therefore, while the tag might be changing while we
2909  * look at it, it can't be changing *to* a value we care about, only
2910  * *away* from such a value. So false negatives are impossible, and
2911  * false positives are safe because we'll recheck after getting the
2912  * buffer lock.
2913  *
2914  * We could check forkNum and blockNum as well as the rnode, but the
2915  * incremental win from doing so seems small.
2916  */
2917  if (!RelFileNodeEquals(bufHdr->tag.rnode, rnode.node))
2918  continue;
2919 
2920  buf_state = LockBufHdr(bufHdr);
2921  if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node) &&
2922  bufHdr->tag.forkNum == forkNum &&
2923  bufHdr->tag.blockNum >= firstDelBlock)
2924  InvalidateBuffer(bufHdr); /* releases spinlock */
2925  else
2926  UnlockBufHdr(bufHdr, buf_state);
2927  }
2928 }
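
The unlocked precheck followed by a locked recheck, as described in the comment above, is a general pattern. Here is a small self-contained sketch of it (not part of bufmgr.c) using a mutex-protected slot array; the slot type and invalidate_slots_for() are invented for the example, and in bufmgr.c the "owner" corresponds to the buffer tag's RelFileNode and the irreversible action is InvalidateBuffer().

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

typedef struct
{
    pthread_mutex_t lock;
    int             owner;      /* which "relation" this slot belongs to */
    bool            valid;
} slot;

static void
invalidate_slots_for(slot *slots, int nslots, int target)
{
    for (int i = 0; i < nslots; i++)
    {
        /* Unlocked precheck: cheaply skip slots that clearly belong elsewhere. */
        if (slots[i].owner != target)
            continue;

        pthread_mutex_lock(&slots[i].lock);
        /* Recheck under the lock before doing anything irreversible. */
        if (slots[i].owner == target)
            slots[i].valid = false;
        pthread_mutex_unlock(&slots[i].lock);
    }
}

int
main(void)
{
    slot        slots[3] = {
        {PTHREAD_MUTEX_INITIALIZER, 1, true},
        {PTHREAD_MUTEX_INITIALIZER, 2, true},
        {PTHREAD_MUTEX_INITIALIZER, 1, true},
    };

    invalidate_slots_for(slots, 3, 1);
    for (int i = 0; i < 3; i++)
        printf("slot %d valid=%d\n", i, (int) slots[i].valid);
    return 0;
}

A stale unlocked read can only produce a false positive here, which the locked recheck catches; false negatives are ruled out by the caller's guarantee that no slot is concurrently changing *to* the target owner.
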
2929 
2930 /* ---------------------------------------------------------------------
2931  * DropRelFileNodesAllBuffers
2932  *
2933  * This function removes from the buffer pool all the pages of all
2934  * forks of the specified relations. It's equivalent to calling
2935  * DropRelFileNodeBuffers once per fork per relation with
2936  * firstDelBlock = 0.
2937  * --------------------------------------------------------------------
2938  */
2939 void
2940 DropRelFileNodesAllBuffers(RelFileNodeBackend *rnodes, int nnodes)
2941 {
2942  int i,
2943  n = 0;
2944  RelFileNode *nodes;
2945  bool use_bsearch;
2946 
2947  if (nnodes == 0)
2948  return;
2949 
2950  nodes = palloc(sizeof(RelFileNode) * nnodes); /* non-local relations */
2951 
2952  /* If it's a local relation, it's localbuf.c's problem. */
2953  for (i = 0; i < nnodes; i++)
2954  {
2955  if (RelFileNodeBackendIsTemp(rnodes[i]))
2956  {
2957  if (rnodes[i].backend == MyBackendId)
2958  DropRelFileNodeAllLocalBuffers(rnodes[i].node);
2959  }
2960  else
2961  nodes[n++] = rnodes[i].node;
2962  }
2963 
2964  /*
2965  * If there are no non-local relations, then we're done. Release the
2966  * memory and return.
2967  */
2968  if (n == 0)
2969  {
2970  pfree(nodes);
2971  return;
2972  }
2973 
2974  /*
2975  * For a small number of relations to drop, just use a simple walk-through
2976  * to save the bsearch overhead. The threshold is more of a guess than
2977  * an exactly determined value, as it depends on many factors (CPU and RAM
2978  * speeds, amount of shared buffers etc.).
2979  */
2980  use_bsearch = n > DROP_RELS_BSEARCH_THRESHOLD;
2981 
2982  /* sort the list of rnodes if necessary */
2983  if (use_bsearch)
2984  pg_qsort(nodes, n, sizeof(RelFileNode), rnode_comparator);
2985 
2986  for (i = 0; i < NBuffers; i++)
2987  {
2988  RelFileNode *rnode = NULL;
2989  BufferDesc *bufHdr = GetBufferDescriptor(i);
2990  uint32 buf_state;
2991 
2992  /*
2993  * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
2994  * and saves some cycles.
2995  */
2996 
2997  if (!use_bsearch)
2998  {
2999  int j;
3000 
3001  for (j = 0; j < n; j++)
3002  {
3003  if (RelFileNodeEquals(bufHdr->tag.rnode, nodes[j]))
3004  {
3005  rnode = &nodes[j];
3006  break;
3007  }
3008  }
3009  }
3010  else
3011  {
3012  rnode = bsearch((const void *) &(bufHdr->tag.rnode),
3013  nodes, n, sizeof(RelFileNode),
3014 							rnode_comparator);
3015 		}
3016 
3017  /* buffer doesn't belong to any of the given relfilenodes; skip it */
3018  if (rnode == NULL)
3019  continue;
3020 
3021  buf_state = LockBufHdr(bufHdr);
3022  if (RelFileNodeEquals(bufHdr->tag.rnode, (*rnode)))
3023  InvalidateBuffer(bufHdr); /* releases spinlock */
3024  else
3025  UnlockBufHdr(bufHdr, buf_state);
3026  }
3027 
3028  pfree(nodes);
3029 }
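
A compact sketch of the linear-scan-versus-bsearch choice made above (not part of bufmgr.c): plain integers stand in for RelFileNodes, and the threshold value is just as arbitrary as the DROP_RELS_BSEARCH_THRESHOLD guess.

#include <stdio.h>
#include <stdlib.h>

#define BSEARCH_THRESHOLD 20

static int
int_cmp(const void *a, const void *b)
{
    int         ia = *(const int *) a;
    int         ib = *(const int *) b;

    return (ia > ib) - (ia < ib);
}

static int *
lookup(int key, int *keys, int nkeys, int use_bsearch)
{
    if (!use_bsearch)
    {
        /* Few keys: a simple walk-through beats sorting first. */
        for (int i = 0; i < nkeys; i++)
            if (keys[i] == key)
                return &keys[i];
        return NULL;
    }
    /* Many keys: the array was sorted once, so binary search pays off. */
    return bsearch(&key, keys, nkeys, sizeof(int), int_cmp);
}

int
main(void)
{
    int         keys[] = {42, 7, 19, 3};
    int         nkeys = 4;
    int         use_bsearch = nkeys > BSEARCH_THRESHOLD;

    if (use_bsearch)
        qsort(keys, nkeys, sizeof(int), int_cmp);

    printf("7 %sfound\n", lookup(7, keys, nkeys, use_bsearch) ? "" : "not ");
    printf("8 %sfound\n", lookup(8, keys, nkeys, use_bsearch) ? "" : "not ");
    return 0;
}
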
3030 
3031 /* ---------------------------------------------------------------------
3032  * DropDatabaseBuffers
3033  *
3034  * This function removes all the buffers in the buffer cache for a
3035  * particular database. Dirty pages are simply dropped, without
3036  * bothering to write them out first. This is used when we destroy a
3037  * database, to avoid trying to flush data to disk when the directory
3038  * tree no longer exists. Implementation is pretty similar to
3039  * DropRelFileNodeBuffers() which is for destroying just one relation.
3040  * --------------------------------------------------------------------
3041  */
3042 void
3043 DropDatabaseBuffers(Oid dbid)
3044 {
3045  int i;
3046 
3047  /*
3048  * We needn't consider local buffers, since by assumption the target
3049  * database isn't our own.
3050  */
3051 
3052  for (i = 0; i < NBuffers; i++)
3053  {
3054  BufferDesc *bufHdr = GetBufferDescriptor(i);
3055  uint32 buf_state;
3056 
3057  /*
3058  * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
3059  * and saves some cycles.
3060  */
3061  if (bufHdr->tag.rnode.dbNode != dbid)
3062  continue;
3063 
3064  buf_state = LockBufHdr(bufHdr);
3065  if (bufHdr->tag.rnode.dbNode == dbid)
3066  InvalidateBuffer(bufHdr); /* releases spinlock */
3067  else
3068  UnlockBufHdr(bufHdr, buf_state);
3069  }
3070 }
3071 
3072 /* -----------------------------------------------------------------
3073  * PrintBufferDescs
3074  *
3075  * this function prints all the buffer descriptors, for debugging
3076  * use only.
3077  * -----------------------------------------------------------------
3078  */
3079 #ifdef NOT_USED
3080 void
3081 PrintBufferDescs(void)
3082 {
3083  int i;
3084 
3085  for (i = 0; i < NBuffers; ++i)
3086  {
3089 
3090  /* theoretically we should lock the bufhdr here */
3091  elog(LOG,
3092  "[%02d] (freeNext=%d, rel=%s, "
3093  "blockNum=%u, flags=0x%x, refcount=%u %d)",
3094  i, buf->freeNext,
3096  buf->tag.blockNum, buf->flags,
3097  buf->refcount, GetPrivateRefCount(b));
3098  }
3099 }
3100 #endif
3101 
3102 #ifdef NOT_USED
3103 void
3104 PrintPinnedBufs(void)
3105 {
3106  int i;
3107 
3108  for (i = 0; i < NBuffers; ++i)
3109  {
3112 
3113  if (GetPrivateRefCount(b) > 0)
3114  {
3115  /* theoretically we should lock the bufhdr here */
3116  elog(LOG,
3117  "[%02d] (freeNext=%d, rel=%s, "
3118  "blockNum=%u, flags=0x%x, refcount=%u %d)",
3119  i, buf->freeNext,
3120  relpathperm(buf->tag.rnode, buf->tag.forkNum),
3121  buf->tag.blockNum, buf->flags,
3122  buf->refcount, GetPrivateRefCount(b));
3123  }
3124  }
3125 }
3126 #endif
3127 
3128 /* ---------------------------------------------------------------------
3129  * FlushRelationBuffers
3130  *
3131  * This function writes all dirty pages of a relation out to disk
3132  * (or more accurately, out to kernel disk buffers), ensuring that the
3133  * kernel has an up-to-date view of the relation.
3134  *
3135  * Generally, the caller should be holding AccessExclusiveLock on the
3136  * target relation to ensure that no other backend is busy dirtying
3137  * more blocks of the relation; the effects can't be expected to last
3138  * after the lock is released.
3139  *
3140  * XXX currently it sequentially searches the buffer pool, should be
3141  * changed to more clever ways of searching. This routine is not
3142  * used in any performance-critical code paths, so it's not worth
3143  * adding additional overhead to normal paths to make it go faster;
3144  * but see also DropRelFileNodeBuffers.
3145  * --------------------------------------------------------------------
3146  */
3147 void
3148 FlushRelationBuffers(Relation rel)
3149 {
3150  int i;
3151  BufferDesc *bufHdr;
3152 
3153  /* Open rel at the smgr level if not already done */
3154  RelationOpenSmgr(rel);
3155 
3156  if (RelationUsesLocalBuffers(rel))
3157  {
3158  for (i = 0; i < NLocBuffer; i++)
3159  {
3160  uint32 buf_state;
3161 
3162  bufHdr = GetLocalBufferDescriptor(i);
3163  if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
3164  ((buf_state = pg_atomic_read_u32(&bufHdr->state)) &
3165  (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
3166  {
3167  ErrorContextCallback errcallback;
3168  Page localpage;
3169 
3170  localpage = (char *) LocalBufHdrGetBlock(bufHdr);
3171 
3172  /* Setup error traceback support for ereport() */
3173 				errcallback.callback = local_buffer_write_error_callback;
3174 				errcallback.arg = (void *) bufHdr;
3175  errcallback.previous = error_context_stack;
3176  error_context_stack = &errcallback;
3177 
3178  PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
3179 
3180  smgrwrite(rel->rd_smgr,
3181  bufHdr->tag.forkNum,
3182  bufHdr->tag.blockNum,
3183  localpage,
3184  false);
3185 
3186  buf_state &= ~(BM_DIRTY | BM_JUST_DIRTIED);
3187  pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
3188 
3189  /* Pop the error context stack */
3190  error_context_stack = errcallback.previous;
3191  }
3192  }
3193 
3194  return;
3195  }
3196 
3197  /* Make sure we can handle the pin inside the loop */
3198 	ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
3199 
3200  for (i = 0; i < NBuffers; i++)
3201  {
3202  uint32 buf_state;
3203 
3204  bufHdr = GetBufferDescriptor(i);
3205 
3206  /*
3207  * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
3208  * and saves some cycles.
3209  */
3210  if (!RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node))
3211  continue;
3212 
3213 		ReservePrivateRefCountEntry();
3214 
3215  buf_state = LockBufHdr(bufHdr);
3216  if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
3217  (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
3218  {
3219  PinBuffer_Locked(bufHdr);
3220 			LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
3221 			FlushBuffer(bufHdr, rel->rd_smgr);
3222 			LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
3223 			UnpinBuffer(bufHdr, true);
3224  }
3225  else
3226  UnlockBufHdr(bufHdr, buf_state);
3227  }
3228 }
3229 
3230 /* ---------------------------------------------------------------------
3231  * FlushDatabaseBuffers
3232  *
3233  * This function writes all dirty pages of a database out to disk
3234  * (or more accurately, out to kernel disk buffers), ensuring that the
3235  * kernel has an up-to-date view of the database.
3236  *
3237  * Generally, the caller should be holding an appropriate lock to ensure
3238  * no other backend is active in the target database; otherwise more
3239  * pages could get dirtied.
3240  *
3241  * Note we don't worry about flushing any pages of temporary relations.
3242  * It's assumed these wouldn't be interesting.
3243  * --------------------------------------------------------------------
3244  */
3245 void
3246 FlushDatabaseBuffers(Oid dbid)
3247 {
3248  int i;
3249  BufferDesc *bufHdr;
3250 
3251  /* Make sure we can handle the pin inside the loop */
3252 	ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
3253 
3254  for (i = 0; i < NBuffers; i++)
3255  {
3256  uint32 buf_state;
3257 
3258  bufHdr = GetBufferDescriptor(i);
3259 
3260  /*
3261  * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
3262  * and saves some cycles.
3263  */
3264  if (bufHdr->tag.rnode.dbNode != dbid)
3265  continue;
3266 
3267 		ReservePrivateRefCountEntry();
3268 
3269  buf_state = LockBufHdr(bufHdr);
3270  if (bufHdr->tag.rnode.dbNode == dbid &&
3271  (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
3272  {
3273  PinBuffer_Locked(bufHdr);
3274 			LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
3275 			FlushBuffer(bufHdr, NULL);
3276 			LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
3277 			UnpinBuffer(bufHdr, true);
3278  }
3279  else
3280  UnlockBufHdr(bufHdr, buf_state);
3281  }
3282 }
3283 
3284 /*
3285  * Flush a previously locked (shared or exclusive) and pinned buffer to the
3286  * OS.
3287  */
3288 void
3289 FlushOneBuffer(Buffer buffer)
3290 {
3291  BufferDesc *bufHdr;
3292 
3293  /* currently not needed, but no fundamental reason not to support */
3294  Assert(!BufferIsLocal(buffer));
3295 
3296  Assert(BufferIsPinned(buffer));
3297 
3298  bufHdr = GetBufferDescriptor(buffer - 1);
3299 
3300 	Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
3301 
3302  FlushBuffer(bufHdr, NULL);
3303 }
3304 
3305 /*
3306  * ReleaseBuffer -- release the pin on a buffer
3307  */
3308 void
3309 ReleaseBuffer(Buffer buffer)
3310 {
3311  if (!BufferIsValid(buffer))
3312  elog(ERROR, "bad buffer ID: %d", buffer);
3313 
3314  if (BufferIsLocal(buffer))
3315  {
3316 		ResourceOwnerForgetBuffer(CurrentResourceOwner, buffer);
3317 
3318  Assert(LocalRefCount[-buffer - 1] > 0);
3319  LocalRefCount[-buffer - 1]--;
3320  return;
3321  }
3322 
3323  UnpinBuffer(GetBufferDescriptor(buffer - 1), true);
3324 }
3325 
3326 /*
3327  * UnlockReleaseBuffer -- release the content lock and pin on a buffer
3328  *
3329  * This is just a shorthand for a common combination.
3330  */
3331 void
3332 UnlockReleaseBuffer(Buffer buffer)
3333 {
3334  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3335  ReleaseBuffer(buffer);
3336 }
3337 
3338 /*
3339  * IncrBufferRefCount
3340  * Increment the pin count on a buffer that we have *already* pinned
3341  * at least once.
3342  *
3343  * This function cannot be used on a buffer we do not have pinned,
3344  * because it doesn't change the shared buffer state.
3345  */
3346 void
3347 IncrBufferRefCount(Buffer buffer)
3348 {
3349  Assert(BufferIsPinned(buffer));
3350 	ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
3351 	if (BufferIsLocal(buffer))
3352  LocalRefCount[-buffer - 1]++;
3353  else
3354  {
3355  PrivateRefCountEntry *ref;
3356 
3357  ref = GetPrivateRefCountEntry(buffer, true);
3358  Assert(ref != NULL);
3359  ref->refcount++;
3360  }
3361 	ResourceOwnerRememberBuffer(CurrentResourceOwner, buffer);
3362 }
3363 
3364 /*
3365  * MarkBufferDirtyHint
3366  *
3367  * Mark a buffer dirty for non-critical changes.
3368  *
3369  * This is essentially the same as MarkBufferDirty, except:
3370  *
3371  * 1. The caller does not write WAL; so if checksums are enabled, we may need
3372  * to write an XLOG_FPI WAL record to protect against torn pages.
3373  * 2. The caller might have only share-lock instead of exclusive-lock on the
3374  * buffer's content lock.
3375  * 3. This function does not guarantee that the buffer is always marked dirty
3376  * (due to a race condition), so it cannot be used for important changes.
3377  */
3378 void
3379 MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
3380 {
3381  BufferDesc *bufHdr;
3382  Page page = BufferGetPage(buffer);
3383 
3384  if (!BufferIsValid(buffer))
3385  elog(ERROR, "bad buffer ID: %d", buffer);
3386 
3387  if (BufferIsLocal(buffer))
3388  {
3389  MarkLocalBufferDirty(buffer);
3390  return;
3391  }
3392 
3393  bufHdr = GetBufferDescriptor(buffer - 1);
3394 
3395  Assert(GetPrivateRefCount(buffer) > 0);
3396  /* here, either share or exclusive lock is OK */
3397 	Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
3398 
3399  /*
3400  * This routine might get called many times on the same page, if we are
3401  * making the first scan after commit of an xact that added/deleted many
3402  * tuples. So, be as quick as we can if the buffer is already dirty. We
3403  * do this by not acquiring spinlock if it looks like the status bits are
3404  * already set. Since we make this test unlocked, there's a chance we
3405  * might fail to notice that the flags have just been cleared, and fail
3406  * to reset them, due to memory-ordering issues. But since this function
3407  * is only intended to be used in cases where failing to write out the
3408  * data would be harmless anyway, it doesn't really matter.
3409  */
3410  if ((pg_atomic_read_u32(&bufHdr->state) & (BM_DIRTY | BM_JUST_DIRTIED)) !=
3411 		(BM_DIRTY | BM_JUST_DIRTIED))
3412 	{
3413 		XLogRecPtr	lsn = InvalidXLogRecPtr;
3414 		bool		dirtied = false;
3415  bool delayChkpt = false;
3416  uint32 buf_state;
3417 
3418  /*
3419  * If we need to protect hint bit updates from torn writes, WAL-log a
3420  * full page image of the page. This full page image is only necessary
3421  * if the hint bit update is the first change to the page since the
3422  * last checkpoint.
3423  *
3424  * We don't check full_page_writes here because that logic is included
3425  * when we call XLogInsert() since the value changes dynamically.
3426  */
3427  if (XLogHintBitIsNeeded() &&
3428  (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT))
3429  {
3430  /*
3431  * If we're in recovery we cannot dirty a page because of a hint.
3432  * We can set the hint, just not dirty the page as a result so the
3433  * hint is lost when we evict the page or shutdown.
3434  *
3435  * See src/backend/storage/page/README for longer discussion.
3436  */
3437  if (RecoveryInProgress())
3438  return;
3439 
3440  /*
3441  * If the block is already dirty because we either made a change
3442  * or set a hint already, then we don't need to write a full page
3443  * image. Note that aggressive cleaning of blocks dirtied by hint
3444  * bit setting would increase the call rate. Bulk setting of hint
3445  * bits would reduce the call rate...
3446  *
3447  * We must issue the WAL record before we mark the buffer dirty.
3448  * Otherwise we might write the page before we write the WAL. That
3449  * causes a race condition, since a checkpoint might occur between
3450  * writing the WAL record and marking the buffer dirty. We solve
3451  * that with a kluge, but one that is already in use during
3452  * transaction commit to prevent race conditions. Basically, we
3453  * simply prevent the checkpoint WAL record from being written
3454  * until we have marked the buffer dirty. We don't start the
3455  * checkpoint flush until we have marked dirty, so our checkpoint
3456  * must flush the change to disk successfully or the checkpoint
3457 			 * never gets written, in which case crash recovery will fix things up.
3458  *
3459  * It's possible we may enter here without an xid, so it is
3460  * essential that CreateCheckpoint waits for virtual transactions
3461  * rather than full transactionids.
3462  */
3463  MyPgXact->delayChkpt = delayChkpt = true;
3464  lsn = XLogSaveBufferForHint(buffer, buffer_std);
3465  }
3466 
3467  buf_state = LockBufHdr(bufHdr);
3468 
3469  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
3470 
3471  if (!(buf_state & BM_DIRTY))
3472  {
3473  dirtied = true; /* Means "will be dirtied by this action" */
3474 
3475  /*
3476  * Set the page LSN if we wrote a backup block. We aren't supposed
3477  * to set this when only holding a share lock but as long as we
3478  * serialise it somehow we're OK. We choose to set LSN while
3479  * holding the buffer header lock, which causes any reader of an
3480  * LSN who holds only a share lock to also obtain a buffer header
3481  * lock before using PageGetLSN(), which is enforced in
3482  * BufferGetLSNAtomic().
3483  *
3484  * If checksums are enabled, you might think we should reset the
3485  * checksum here. That will happen when the page is written
3486  * sometime later in this checkpoint cycle.
3487  */
3488  if (!XLogRecPtrIsInvalid(lsn))
3489  PageSetLSN(page, lsn);
3490  }
3491 
3492  buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
3493  UnlockBufHdr(bufHdr, buf_state);
3494 
3495  if (delayChkpt)
3496  MyPgXact->delayChkpt = false;
3497 
3498  if (dirtied)
3499  {
3500  VacuumPageDirty++;
3501 			pgBufferUsage.shared_blks_dirtied++;
3502 			if (VacuumCostActive)
3503 				VacuumCostBalance += VacuumCostPageDirty;
3504 		}
3505  }
3506 }
3507 
3508 /*
3509  * Release buffer content locks for shared buffers.
3510  *
3511  * Used to clean up after errors.
3512  *
3513  * Currently, we can expect that lwlock.c's LWLockReleaseAll() took care
3514  * of releasing buffer content locks per se; the only thing we need to deal
3515  * with here is clearing any PIN_COUNT request that was in progress.
3516  */
3517 void
3518 UnlockBuffers(void)
3519 {
3520 	BufferDesc *buf = PinCountWaitBuf;
3521 
3522  if (buf)
3523  {
3524  uint32 buf_state;
3525 
3526  buf_state = LockBufHdr(buf);
3527 
3528  /*
3529  * Don't complain if flag bit not set; it could have been reset but we
3530  * got a cancel/die interrupt before getting the signal.
3531  */
3532  if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
3533  buf->wait_backend_pid == MyProcPid)
3534  buf_state &= ~BM_PIN_COUNT_WAITER;
3535 
3536  UnlockBufHdr(buf, buf_state);
3537 
3538  PinCountWaitBuf = NULL;
3539  }
3540 }
3541 
3542 /*
3543  * Acquire or release the content_lock for the buffer.
3544  */
3545 void
3546 LockBuffer(Buffer buffer, int mode)
3547 {
3548  BufferDesc *buf;
3549 
3550  Assert(BufferIsValid(buffer));
3551  if (BufferIsLocal(buffer))
3552  return; /* local buffers need no lock */
3553 
3554  buf = GetBufferDescriptor(buffer - 1);
3555 
3556 	if (mode == BUFFER_LOCK_UNLOCK)
3557 		LWLockRelease(BufferDescriptorGetContentLock(buf));
3558 	else if (mode == BUFFER_LOCK_SHARE)
3559 		LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_SHARED);
3560 	else if (mode == BUFFER_LOCK_EXCLUSIVE)
3561 		LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_EXCLUSIVE);
3562 	else
3563  elog(ERROR, "unrecognized buffer lock mode: %d", mode);
3564 }
3565 
3566 /*
3567  * Acquire the content_lock for the buffer, but only if we don't have to wait.
3568  *
3569  * This assumes the caller wants BUFFER_LOCK_EXCLUSIVE mode.
3570  */
3571 bool
3572 ConditionalLockBuffer(Buffer buffer)
3573 {
3574  BufferDesc *buf;
3575 
3576  Assert(BufferIsValid(buffer));
3577  if (BufferIsLocal(buffer))
3578  return true; /* act as though we got it */
3579 
3580  buf = GetBufferDescriptor(buffer - 1);
3581 
3582 	return LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
3583 									LW_EXCLUSIVE);
3584 }
3585 
3586 /*
3587  * LockBufferForCleanup - lock a buffer in preparation for deleting items
3588  *
3589  * Items may be deleted from a disk page only when the caller (a) holds an
3590  * exclusive lock on the buffer and (b) has observed that no other backend
3591  * holds a pin on the buffer. If there is a pin, then the other backend
3592  * might have a pointer into the buffer (for example, a heapscan reference
3593  * to an item --- see README for more details). It's OK if a pin is added
3594  * after the cleanup starts, however; the newly-arrived backend will be
3595  * unable to look at the page until we release the exclusive lock.
3596  *
3597  * To implement this protocol, a would-be deleter must pin the buffer and
3598  * then call LockBufferForCleanup(). LockBufferForCleanup() is similar to
3599  * LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE), except that it loops until
3600  * it has successfully observed pin count = 1.
3601  */
3602 void
3603 LockBufferForCleanup(Buffer buffer)
3604 {
3605  BufferDesc *bufHdr;
3606 
3607  Assert(BufferIsValid(buffer));
3608  Assert(PinCountWaitBuf == NULL);
3609 
3610  if (BufferIsLocal(buffer))
3611  {
3612  /* There should be exactly one pin */
3613  if (LocalRefCount[-buffer - 1] != 1)
3614  elog(ERROR, "incorrect local pin count: %d",
3615  LocalRefCount[-buffer - 1]);
3616  /* Nobody else to wait for */
3617  return;
3618  }
3619 
3620  /* There should be exactly one local pin */
3621  if (GetPrivateRefCount(buffer) != 1)
3622  elog(ERROR, "incorrect local pin count: %d",
3623  GetPrivateRefCount(buffer));
3624 
3625  bufHdr = GetBufferDescriptor(buffer - 1);
3626 
3627  for (;;)
3628  {
3629  uint32 buf_state;
3630 
3631  /* Try to acquire lock */
3632 		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3633 		buf_state = LockBufHdr(bufHdr);
3634 
3635  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
3636  if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
3637  {
3638  /* Successfully acquired exclusive lock with pincount 1 */
3639  UnlockBufHdr(bufHdr, buf_state);
3640  return;
3641  }
3642  /* Failed, so mark myself as waiting for pincount 1 */
3643  if (buf_state & BM_PIN_COUNT_WAITER)
3644  {
3645  UnlockBufHdr(bufHdr, buf_state);
3646  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3647  elog(ERROR, "multiple backends attempting to wait for pincount 1");
3648  }
3649  bufHdr->wait_backend_pid = MyProcPid;
3650  PinCountWaitBuf = bufHdr;
3651  buf_state |= BM_PIN_COUNT_WAITER;
3652  UnlockBufHdr(bufHdr, buf_state);
3653  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3654 
3655  /* Wait to be signaled by UnpinBuffer() */
3656  if (InHotStandby)
3657  {
3658  /* Publish the bufid that Startup process waits on */
3659  SetStartupBufferPinWaitBufId(buffer - 1);
3660  /* Set alarm and then wait to be signaled by UnpinBuffer() */
3661  ResolveRecoveryConflictWithBufferPin();
3662  /* Reset the published bufid */
3663  SetStartupBufferPinWaitBufId(-1);
3664  }
3665  else
3666  ProcWaitForSignal(PG_WAIT_BUFFER_PIN);
3667 
3668  /*
3669  * Remove flag marking us as waiter. Normally this will not be set
3670  * anymore, but ProcWaitForSignal() can return for other signals as
3671  * well. We take care to only reset the flag if we're the waiter, as
3672  * theoretically another backend could have started waiting. That's
3673  * impossible with the current usages due to table level locking, but
3674  * better be safe.
3675  */
3676  buf_state = LockBufHdr(bufHdr);
3677  if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
3678  bufHdr->wait_backend_pid == MyProcPid)
3679  buf_state &= ~BM_PIN_COUNT_WAITER;
3680  UnlockBufHdr(bufHdr, buf_state);
3681 
3682  PinCountWaitBuf = NULL;
3683  /* Loop back and try again */
3684  }
3685 }
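/*
 * Illustrative sketch (hypothetical caller, e.g. vacuum-like cleanup): the
 * protocol described above is pin first, then LockBufferForCleanup(), which
 * returns only once we hold the exclusive content lock and ours is the sole
 * pin on the buffer.  "rel" and "blkno" are assumed caller arguments.
 */
static void
example_cleanup_page(Relation rel, BlockNumber blkno)
{
	Buffer		buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
										 RBM_NORMAL, NULL);

	LockBufferForCleanup(buf);

	/* safe to delete items: no other backend has a pointer into the page */
	MarkBufferDirty(buf);

	UnlockReleaseBuffer(buf);
}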
3686 
3687 /*
3688  * Check called from RecoveryConflictInterrupt handler when Startup
3689  * process requests cancellation of all pin holders that are blocking it.
3690  */
3691 bool
3692 HoldingBufferPinThatDelaysRecovery(void)
3693 {
3694  int bufid = GetStartupBufferPinWaitBufId();
3695 
3696  /*
3697  * If we get woken slowly then it's possible that the Startup process was
3698  * already woken by other backends before we got here. It's also possible
3699  * that we get here via multiple interrupts or interrupts at inappropriate
3700  * times, so make sure we do nothing if the bufid is not set.
3701  */
3702  if (bufid < 0)
3703  return false;
3704 
3705  if (GetPrivateRefCount(bufid + 1) > 0)
3706  return true;
3707 
3708  return false;
3709 }
3710 
3711 /*
3712  * ConditionalLockBufferForCleanup - as above, but don't wait to get the lock
3713  *
3714  * We won't loop, but just check once to see if the pin count is OK. If
3715  * not, return false with no lock held.
3716  */
3717 bool
3718 ConditionalLockBufferForCleanup(Buffer buffer)
3719 {
3720  BufferDesc *bufHdr;
3721  uint32 buf_state,
3722  refcount;
3723 
3724  Assert(BufferIsValid(buffer));
3725 
3726  if (BufferIsLocal(buffer))
3727  {
3728  refcount = LocalRefCount[-buffer - 1];
3729  /* There should be exactly one pin */
3730  Assert(refcount > 0);
3731  if (refcount != 1)
3732  return false;
3733  /* Nobody else to wait for */
3734  return true;
3735  }
3736 
3737  /* There should be exactly one local pin */
3738  refcount = GetPrivateRefCount(buffer);
3739  Assert(refcount);
3740  if (refcount != 1)
3741  return false;
3742 
3743  /* Try to acquire lock */
3744  if (!ConditionalLockBuffer(buffer))
3745  return false;
3746 
3747  bufHdr = GetBufferDescriptor(buffer - 1);
3748  buf_state = LockBufHdr(bufHdr);
3749  refcount = BUF_STATE_GET_REFCOUNT(buf_state);
3750 
3751  Assert(refcount > 0);
3752  if (refcount == 1)
3753  {
3754  /* Successfully acquired exclusive lock with pincount 1 */
3755  UnlockBufHdr(bufHdr, buf_state);
3756  return true;
3757  }
3758 
3759  /* Failed, so release the lock */
3760  UnlockBufHdr(bufHdr, buf_state);
3761  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3762  return false;
3763 }
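/*
 * Illustrative sketch (hypothetical caller): opportunistic cleanup in the
 * style of lazy vacuum -- if the cleanup lock isn't immediately available,
 * skip the page rather than stall behind other backends' pins.
 */
static bool
example_try_cleanup(Buffer buf)
{
	if (!ConditionalLockBufferForCleanup(buf))
		return false;			/* page is in use elsewhere; skip it */

	/* ... prune or delete items under the cleanup lock ... */

	LockBuffer(buf, BUFFER_LOCK_UNLOCK);
	return true;
}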
3764 
3765 /*
3766  * IsBufferCleanupOK - as above, but we already have the lock
3767  *
3768  * Check whether it's OK to perform cleanup on a buffer we've already
3769  * locked. If we observe that the pin count is 1, our exclusive lock
3770  * happens to be a cleanup lock, and we can proceed with anything that
3771  * would have been allowable had we sought a cleanup lock originally.
3772  */
3773 bool
3774 IsBufferCleanupOK(Buffer buffer)
3775 {
3776  BufferDesc *bufHdr;
3777  uint32 buf_state;
3778 
3779  Assert(BufferIsValid(buffer));
3780 
3781  if (BufferIsLocal(buffer))
3782  {
3783  /* There should be exactly one pin */
3784  if (LocalRefCount[-buffer - 1] != 1)
3785  return false;
3786  /* Nobody else to wait for */
3787  return true;
3788  }
3789 
3790  /* There should be exactly one local pin */
3791  if (GetPrivateRefCount(buffer) != 1)
3792  return false;
3793 
3794  bufHdr = GetBufferDescriptor(buffer - 1);
3795 
3796  /* caller must hold exclusive lock on buffer */
3797  Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
3798  LW_EXCLUSIVE));
3799 
3800  buf_state = LockBufHdr(bufHdr);
3801 
3802  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
3803  if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
3804  {
3805  /* pincount is OK. */
3806  UnlockBufHdr(bufHdr, buf_state);
3807  return true;
3808  }
3809 
3810  UnlockBufHdr(bufHdr, buf_state);
3811  return false;
3812 }
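/*
 * Illustrative sketch (hypothetical caller): when the exclusive content lock
 * is already held, IsBufferCleanupOK() tells the caller whether that lock
 * happens to double as a cleanup lock.
 */
static void
example_maybe_prune(Buffer buf)
{
	/* assumes BUFFER_LOCK_EXCLUSIVE is already held on buf */
	if (IsBufferCleanupOK(buf))
	{
		/* pin count is 1: cleanup-level actions are permitted */
	}
	else
	{
		/* other pins exist: restrict ourselves to ordinary modifications */
	}
}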
3813 
3814 
3815 /*
3816  * Functions for buffer I/O handling
3817  *
3818  * Note: We assume that nested buffer I/O never occurs.
3819  * i.e., at most one io_in_progress lock is held per proc.
3820  *
3821  * Also note that these are used only for shared buffers, not local ones.
3822  */
3823 
3824 /*
3825  * WaitIO -- Block until the IO_IN_PROGRESS flag on 'buf' is cleared.
3826  */
3827 static void
3828 WaitIO(BufferDesc *buf)
3829 {
3830  /*
3831  * Changed to wait until there's no IO - Inoue 01/13/2000
3832  *
3833  * Note this is *necessary* because an error abort in the process doing
3834  * I/O could release the io_in_progress_lock prematurely. See
3835  * AbortBufferIO.
3836  */
3837  for (;;)
3838  {
3839  uint32 buf_state;
3840 
3841  /*
3842  * It may not be necessary to acquire the spinlock to check the flag
3843  * here, but since this test is essential for correctness, we'd better
3844  * play it safe.
3845  */
3846  buf_state = LockBufHdr(buf);
3847  UnlockBufHdr(buf, buf_state);
3848 
3849  if (!(buf_state & BM_IO_IN_PROGRESS))
3850  break;
3851  LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_SHARED);
3852  LWLockRelease(BufferDescriptorGetIOLock(buf));
3853  }
3854 }
3855 
3856 /*
3857  * StartBufferIO: begin I/O on this buffer
3858  * (Assumptions)
3859  * My process is executing no IO
3860  * The buffer is Pinned
3861  *
3862  * In some scenarios there are race conditions in which multiple backends
3863  * could attempt the same I/O operation concurrently. If someone else
3864  * has already started I/O on this buffer then we will block on the
3865  * io_in_progress lock until he's done.
3866  *
3867  * Input operations are only attempted on buffers that are not BM_VALID,
3868  * and output operations only on buffers that are BM_VALID and BM_DIRTY,
3869  * so we can always tell if the work is already done.
3870  *
3871  * Returns true if we successfully marked the buffer as I/O busy,
3872  * false if someone else already did the work.
3873  */
3874 static bool
3875 StartBufferIO(BufferDesc *buf, bool forInput)
3876 {
3877  uint32 buf_state;
3878 
3879  Assert(!InProgressBuf);
3880 
3881  for (;;)
3882  {
3883  /*
3884  * Grab the io_in_progress lock so that other processes can wait for
3885  * me to finish the I/O.
3886  */
3887  LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
3888 
3889  buf_state = LockBufHdr(buf);
3890 
3891  if (!(buf_state & BM_IO_IN_PROGRESS))
3892  break;
3893 
3894  /*
3895  * The only way BM_IO_IN_PROGRESS could be set when the io_in_progress
3896  * lock isn't held is if the process doing the I/O is recovering from
3897  * an error (see AbortBufferIO). If that's the case, we must wait for
3898  * him to get unwedged.
3899  */
3900  UnlockBufHdr(buf, buf_state);
3901  LWLockRelease(BufferDescriptorGetIOLock(buf));
3902  WaitIO(buf);
3903  }
3904 
3905  /* Once we get here, there is definitely no I/O active on this buffer */
3906 
3907  if (forInput ? (buf_state & BM_VALID) : !(buf_state & BM_DIRTY))
3908  {
3909  /* someone else already did the I/O */
3910  UnlockBufHdr(buf, buf_state);
3911  LWLockRelease(BufferDescriptorGetIOLock(buf));
3912  return false;
3913  }
3914 
3915  buf_state |= BM_IO_IN_PROGRESS;
3916  UnlockBufHdr(buf, buf_state);
3917 
3918  InProgressBuf = buf;
3919  IsForInput = forInput;
3920 
3921  return true;
3922 }
3923 
3924 /*
3925  * TerminateBufferIO: release a buffer we were doing I/O on
3926  * (Assumptions)
3927  * My process is executing IO for the buffer
3928  * BM_IO_IN_PROGRESS bit is set for the buffer
3929  * We hold the buffer's io_in_progress lock
3930  * The buffer is Pinned
3931  *
3932  * If clear_dirty is true and BM_JUST_DIRTIED is not set, we clear the
3933  * buffer's BM_DIRTY flag. This is appropriate when terminating a
3934  * successful write. The check on BM_JUST_DIRTIED is necessary to avoid
3935  * marking the buffer clean if it was re-dirtied while we were writing.
3936  *
3937  * set_flag_bits gets ORed into the buffer's flags. It must include
3938  * BM_IO_ERROR in a failure case. For successful completion it could
3939  * be 0, or BM_VALID if we just finished reading in the page.
3940  */
3941 static void
3942 TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits)
3943 {
3944  uint32 buf_state;
3945 
3946  Assert(buf == InProgressBuf);
3947 
3948  buf_state = LockBufHdr(buf);
3949 
3950  Assert(buf_state & BM_IO_IN_PROGRESS);
3951 
3952  buf_state &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR);
3953  if (clear_dirty && !(buf_state & BM_JUST_DIRTIED))
3954  buf_state &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
3955 
3956  buf_state |= set_flag_bits;
3957  UnlockBufHdr(buf, buf_state);
3958 
3959  InProgressBuf = NULL;
3960 
3961  LWLockRelease(BufferDescriptorGetIOLock(buf));
3962 }
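/*
 * Illustrative sketch (not a real bufmgr.c function): the I/O protocol as
 * seen by a writer.  StartBufferIO() returns false if somebody else already
 * did the work; otherwise the caller performs the smgr call and closes the
 * bracket with TerminateBufferIO() (error paths go through AbortBufferIO()).
 * "buf" is assumed pinned, and "bufBlock" is assumed to point at the page
 * image to write; details such as checksums and WAL flushing, handled by
 * FlushBuffer, are omitted here.
 */
static void
example_write_bracket(BufferDesc *buf, SMgrRelation reln, char *bufBlock)
{
	if (!StartBufferIO(buf, false))
		return;					/* somebody else already wrote it out */

	smgrwrite(reln, buf->tag.forkNum, buf->tag.blockNum, bufBlock, false);

	/* clear BM_DIRTY unless the page was re-dirtied while we wrote */
	TerminateBufferIO(buf, true, 0);
}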
3963 
3964 /*
3965  * AbortBufferIO: Clean up any active buffer I/O after an error.
3966  *
3967  * All LWLocks we might have held have been released,
3968  * but we haven't yet released buffer pins, so the buffer is still pinned.
3969  *
3970  * If I/O was in progress, we always set BM_IO_ERROR, even though it's
3971  * possible the error condition wasn't related to the I/O.
3972  */
3973 void
3974 AbortBufferIO(void)
3975 {
3976  BufferDesc *buf = InProgressBuf;
3977 
3978  if (buf)
3979  {
3980  uint32 buf_state;
3981 
3982  /*
3983  * Since LWLockReleaseAll has already been called, we're not holding
3984  * the buffer's io_in_progress_lock. We have to re-acquire it so that
3985  * we can use TerminateBufferIO. Anyone who's executing WaitIO on the
3986  * buffer will be in a busy spin until we succeed in doing this.
3987  */
3988  LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
3989 
3990  buf_state = LockBufHdr(buf);
3991  Assert(buf_state & BM_IO_IN_PROGRESS);
3992  if (IsForInput)
3993  {
3994  Assert(!(buf_state & BM_DIRTY));
3995 
3996  /* We'd better not think buffer is valid yet */
3997  Assert(!(buf_state & BM_VALID));
3998  UnlockBufHdr(buf, buf_state);
3999  }
4000  else
4001  {
4002  Assert(buf_state & BM_DIRTY);
4003  UnlockBufHdr(buf, buf_state);
4004  /* Issue notice if this is not the first failure... */
4005  if (buf_state & BM_IO_ERROR)
4006  {
4007  /* Buffer is pinned, so we can read tag without spinlock */
4008  char *path;
4009 
4010  path = relpathperm(buf->tag.rnode, buf->tag.forkNum);
4011  ereport(WARNING,
4012  (errcode(ERRCODE_IO_ERROR),
4013  errmsg("could not write block %u of %s",
4014  buf->tag.blockNum, path),
4015  errdetail("Multiple failures --- write error might be permanent.")));
4016  pfree(path);
4017  }
4018  }
4019  TerminateBufferIO(buf, false, BM_IO_ERROR);
4020  }
4021 }
4022 
4023 /*
4024  * Error context callback for errors occurring during shared buffer writes.
4025  */
4026 static void
4027 shared_buffer_write_error_callback(void *arg)
4028 {
4029  BufferDesc *bufHdr = (BufferDesc *) arg;
4030 
4031  /* Buffer is pinned, so we can read the tag without locking the spinlock */
4032  if (bufHdr != NULL)
4033  {
4034  char *path = relpathperm(bufHdr->tag.rnode, bufHdr->tag.forkNum);
4035 
4036  errcontext("writing block %u of relation %s",
4037  bufHdr->tag.blockNum, path);
4038  pfree(path);
4039  }
4040 }
4041 
4042 /*
4043  * Error context callback for errors occurring during local buffer writes.
4044  */
4045 static void
4046 local_buffer_write_error_callback(void *arg)
4047 {
4048  BufferDesc *bufHdr = (BufferDesc *) arg;
4049 
4050  if (bufHdr != NULL)
4051  {
4052  char *path = relpathbackend(bufHdr->tag.rnode, MyBackendId,
4053  bufHdr->tag.forkNum);
4054 
4055  errcontext("writing block %u of relation %s",
4056  bufHdr->tag.blockNum, path);
4057  pfree(path);
4058  }
4059 }
4060 
4061 /*
4062  * RelFileNode qsort/bsearch comparator; see RelFileNodeEquals.
4063  */
4064 static int
4065 rnode_comparator(const void *p1, const void *p2)
4066 {
4067  RelFileNode n1 = *(RelFileNode *) p1;
4068  RelFileNode n2 = *(RelFileNode *) p2;
4069 
4070  if (n1.relNode < n2.relNode)
4071  return -1;
4072  else if (n1.relNode > n2.relNode)
4073  return 1;
4074 
4075  if (n1.dbNode < n2.dbNode)
4076  return -1;
4077  else if (n1.dbNode > n2.dbNode)
4078  return 1;
4079 
4080  if (n1.spcNode < n2.spcNode)
4081  return -1;
4082  else if (n1.spcNode > n2.spcNode)
4083  return 1;
4084  else
4085  return 0;
4086 }
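/*
 * Illustrative sketch: rnode_comparator is intended for qsort/bsearch over
 * arrays of RelFileNode, as done when dropping buffers for many relations
 * at once.  "nodes" and "nnodes" are hypothetical inputs.
 */
static bool
example_node_is_listed(RelFileNode target, RelFileNode *nodes, int nnodes)
{
	/* sort once, then binary-search for the target node */
	pg_qsort(nodes, nnodes, sizeof(RelFileNode), rnode_comparator);

	return bsearch(&target, nodes, nnodes,
				   sizeof(RelFileNode), rnode_comparator) != NULL;
}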
4087 
4088 /*
4089  * Lock buffer header - set BM_LOCKED in buffer state.
4090  */
4091 uint32
4092 LockBufHdr(BufferDesc *desc)
4093 {
4094  SpinDelayStatus delayStatus;
4095  uint32 old_buf_state;
4096 
4097  init_local_spin_delay(&delayStatus);
4098 
4099  while (true)
4100  {
4101  /* set BM_LOCKED flag */
4102  old_buf_state = pg_atomic_fetch_or_u32(&desc->state, BM_LOCKED);
4103  /* if it wasn't set before we're OK */
4104  if (!(old_buf_state & BM_LOCKED))
4105  break;
4106  perform_spin_delay(&delayStatus);
4107  }
4108  finish_spin_delay(&delayStatus);
4109  return old_buf_state | BM_LOCKED;
4110 }
4111 
4112 /*
4113  * Wait until the BM_LOCKED flag isn't set anymore and return the buffer's
4114  * state at that point.
4115  *
4116  * Obviously the buffer could be locked by the time the value is returned, so
4117  * this is primarily useful in CAS style loops.
4118  */
4119 static uint32
4120 WaitBufHdrUnlocked(BufferDesc *buf)
4121 {
4122  SpinDelayStatus delayStatus;
4123  uint32 buf_state;
4124 
4125  init_local_spin_delay(&delayStatus);
4126 
4127  buf_state = pg_atomic_read_u32(&buf->state);
4128 
4129  while (buf_state & BM_LOCKED)
4130  {
4131  perform_spin_delay(&delayStatus);
4132  buf_state = pg_atomic_read_u32(&buf->state);
4133  }
4134 
4135  finish_spin_delay(&delayStatus);
4136 
4137  return buf_state;
4138 }
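/*
 * Illustrative sketch of the CAS-style loop WaitBufHdrUnlocked() is meant
 * for (compare PinBuffer): read the state, wait out BM_LOCKED without taking
 * the spinlock ourselves, then retry the compare-and-exchange until it
 * succeeds.  The usage-count bump is just an example of an update; the
 * function itself is hypothetical.
 */
static void
example_cas_update(BufferDesc *buf)
{
	uint32		old_buf_state = pg_atomic_read_u32(&buf->state);
	uint32		buf_state;

	for (;;)
	{
		if (old_buf_state & BM_LOCKED)
			old_buf_state = WaitBufHdrUnlocked(buf);

		buf_state = old_buf_state;
		if (BUF_STATE_GET_USAGECOUNT(buf_state) < BM_MAX_USAGE_COUNT)
			buf_state += BUF_USAGECOUNT_ONE;

		/* on failure, old_buf_state is refreshed and we loop around */
		if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
										   buf_state))
			break;
	}
}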
4139 
4140 /*
4141  * BufferTag comparator.
4142  */
4143 static int
4144 buffertag_comparator(const void *a, const void *b)
4145 {
4146  const BufferTag *ba = (const BufferTag *) a;
4147  const BufferTag *bb = (const BufferTag *) b;
4148  int ret;
4149 
4150  ret = rnode_comparator(&ba->rnode, &bb->rnode);
4151 
4152  if (ret != 0)
4153  return ret;
4154 
4155  if (ba->forkNum < bb->forkNum)
4156  return -1;
4157  if (ba->forkNum > bb->forkNum)
4158  return 1;
4159 
4160  if (ba->blockNum < bb->blockNum)
4161  return -1;
4162  if (ba->blockNum > bb->blockNum)
4163  return 1;
4164 
4165  return 0;
4166 }
4167 
4168 /*
4169  * Comparator determining the writeout order in a checkpoint.
4170  *
4171  * It is important that tablespaces are compared first; the logic balancing
4172  * writes between tablespaces relies on it.
4173  */
4174 static int
4175 ckpt_buforder_comparator(const void *pa, const void *pb)
4176 {
4177  const CkptSortItem *a = (CkptSortItem *) pa;
4178  const CkptSortItem *b = (CkptSortItem *) pb;
4179 
4180  /* compare tablespace */
4181  if (a->tsId < b->tsId)
4182  return -1;
4183  else if (a->tsId > b->tsId)
4184  return 1;
4185  /* compare relation */
4186  if (a->relNode < b->relNode)
4187  return -1;
4188  else if (a->relNode > b->relNode)
4189  return 1;
4190  /* compare fork */
4191  else if (a->forkNum < b->forkNum)
4192  return -1;
4193  else if (a->forkNum > b->forkNum)
4194  return 1;
4195  /* compare block number */
4196  else if (a->blockNum < b->blockNum)
4197  return -1;
4198  else /* should not be the same block ... */
4199  return 1;
4200 }
4201 
4202 /*
4203  * Comparator for a Min-Heap over the per-tablespace checkpoint completion
4204  * progress.
4205  */
4206 static int
4207 ts_ckpt_progress_comparator(Datum a, Datum b, void *arg)
4208 {
4209  CkptTsStatus *sa = (CkptTsStatus *) a;
4210  CkptTsStatus *sb = (CkptTsStatus *) b;
4211 
4212  /* we want a min-heap, so return 1 if a < b */
4213  if (sa->progress < sb->progress)
4214  return 1;
4215  else if (sa->progress == sb->progress)
4216  return 0;
4217  else
4218  return -1;
4219 }
4220 
4221 /*
4222  * Initialize a writeback context, discarding potential previous state.
4223  *
4224  * *max_pending is a pointer instead of an immediate value, so the coalesce
4225  * limit can easily be changed by the GUC mechanism, and calling code does
4226  * not have to check the current configuration. A value of 0 means that no
4227  * writeback control will be performed.
4228  */
4229 void
4230 WritebackContextInit(WritebackContext *context, int *max_pending)
4231 {
4232  Assert(*max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
4233 
4234  context->max_pending = max_pending;
4235  context->nr_pending = 0;
4236 }
4237 
4238 /*
4239  * Add buffer to list of pending writeback requests.
4240  */
4241 void
4242 ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag)
4243 {
4244  PendingWriteback *pending;
4245 
4246  /*
4247  * Add buffer to the pending writeback array, unless writeback control is
4248  * disabled.
4249  */
4250  if (*context->max_pending > 0)
4251  {
4252  Assert(*context->max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
4253 
4254  pending = &context->pending_writebacks[context->nr_pending++];
4255 
4256  pending->tag = *tag;
4257  }
4258 
4259  /*
4260  * Perform pending flushes if the writeback limit is exceeded. This
4261  * includes the case where previously an item has been added, but control
4262  * is now disabled.
4263  */
4264  if (context->nr_pending >= *context->max_pending)
4265  IssuePendingWritebacks(context);
4266 }
4267 
4268 /*
4269  * Issue all pending writeback requests, previously scheduled with
4270  * ScheduleBufferTagForWriteback, to the OS.
4271  *
4272  * Because this is only used to improve the OS's I/O scheduling, we try never
4273  * to error out - it's just a hint.
4274  */
4275 void
4276 IssuePendingWritebacks(WritebackContext *context)
4277 {
4278  int i;
4279 
4280  if (context->nr_pending == 0)
4281  return;
4282 
4283  /*
4284  * Executing the writes in-order can make them a lot faster, and allows
4285  * merging writeback requests for consecutive blocks into larger writebacks.
4286  */
4287  qsort(&context->pending_writebacks, context->nr_pending,
4288  sizeof(PendingWriteback), buffertag_comparator);
4289 
4290  /*
4291  * Coalesce neighbouring writes, but nothing else. For that we iterate
4292  * through the now-sorted array of pending flushes, and look ahead to
4293  * find all neighbouring (or identical) writes.
4294  */
4295  for (i = 0; i < context->nr_pending; i++)
4296  {
4297  PendingWriteback *cur;
4298  PendingWriteback *next;
4299  SMgrRelation reln;
4300  int ahead;
4301  BufferTag tag;
4302  Size nblocks = 1;
4303 
4304  cur = &context->pending_writebacks[i];
4305  tag = cur->tag;
4306 
4307  /*
4308  * Peek ahead, into following writeback requests, to see if they can
4309  * be combined with the current one.
4310  */
4311  for (ahead = 0; i + ahead + 1 < context->nr_pending; ahead++)
4312  {
4313  next = &context->pending_writebacks[i + ahead + 1];
4314 
4315  /* different file, stop */
4316  if (!RelFileNodeEquals(cur->tag.rnode, next->tag.rnode) ||
4317  cur->tag.forkNum != next->tag.forkNum)
4318  break;
4319 
4320  /* ok, block queued twice, skip */
4321  if (cur->tag.blockNum == next->tag.blockNum)
4322  continue;
4323 
4324  /* only merge consecutive writes */
4325  if (cur->tag.blockNum + 1 != next->tag.blockNum)
4326  break;
4327 
4328  nblocks++;
4329  cur = next;
4330  }
4331 
4332  i += ahead;
4333 
4334  /* and finally tell the kernel to write the data to storage */
4335  reln = smgropen(tag.rnode, InvalidBackendId);
4336  smgrwriteback(reln, tag.forkNum, tag.blockNum, nblocks);
4337  }
4338 
4339  context->nr_pending = 0;
4340 }
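/*
 * Worked example of the coalescing above (illustrative block numbers): if
 * the sorted queue holds blocks 10, 11, 11, 12 of one fork plus block 400 of
 * another relation, the loop issues smgrwriteback() once for blocks 10-12
 * (nblocks = 3, the duplicate 11 is skipped) and once for block 400
 * (nblocks = 1).
 */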
4341 
4342 
4343 /*
4344  * Implement slower/larger portions of TestForOldSnapshot
4345  *
4346  * Smaller/faster portions are put inline, but the entire set of logic is too
4347  * big for that.
4348  */
4349 void
4350 TestForOldSnapshot_impl(Snapshot snapshot, Relation relation)
4351 {
4352  if (RelationAllowsEarlyPruning(relation)
4353  && (snapshot)->whenTaken < GetOldSnapshotThresholdTimestamp())
4354  ereport(ERROR,
4355  (errcode(ERRCODE_SNAPSHOT_TOO_OLD),
4356  errmsg("snapshot too old")));
4357 }
Definition: bufmgr.c:2471