bufmgr.c
1 /*-------------------------------------------------------------------------
2  *
3  * bufmgr.c
4  * buffer manager interface routines
5  *
6  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/storage/buffer/bufmgr.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 /*
16  * Principal entry points:
17  *
18  * ReadBuffer() -- find or create a buffer holding the requested page,
19  * and pin it so that no one can destroy it while this process
20  * is using it.
21  *
22  * ReleaseBuffer() -- unpin a buffer
23  *
24  * MarkBufferDirty() -- mark a pinned buffer's contents as "dirty".
25  * The disk write is delayed until buffer replacement or checkpoint.
26  *
27  * See also these files:
28  * freelist.c -- chooses victim for buffer replacement
29  * buf_table.c -- manages the buffer lookup table
30  */
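/*
 * A minimal usage sketch of those entry points, assuming the caller already
 * has a valid Relation "rel" and block number "blkno" (hypothetical names):
 *
 *		buf = ReadBuffer(rel, blkno);
 *		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
 *		page = BufferGetPage(buf);
 *		... modify the page ...
 *		MarkBufferDirty(buf);
 *		UnlockReleaseBuffer(buf);
 */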
31 #include "postgres.h"
32 
33 #include <sys/file.h>
34 #include <unistd.h>
35 
36 #include "access/xlog.h"
37 #include "catalog/catalog.h"
38 #include "catalog/storage.h"
39 #include "executor/instrument.h"
40 #include "lib/binaryheap.h"
41 #include "miscadmin.h"
42 #include "pg_trace.h"
43 #include "pgstat.h"
44 #include "postmaster/bgwriter.h"
45 #include "storage/buf_internals.h"
46 #include "storage/bufmgr.h"
47 #include "storage/ipc.h"
48 #include "storage/proc.h"
49 #include "storage/smgr.h"
50 #include "storage/standby.h"
51 #include "utils/rel.h"
52 #include "utils/resowner_private.h"
53 #include "utils/timestamp.h"
54 
55 
56 /* Note: these two macros only work on shared buffers, not local ones! */
57 #define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
58 #define BufferGetLSN(bufHdr) (PageGetLSN(BufHdrGetBlock(bufHdr)))
59 
60 /* Note: this macro only works on local buffers, not shared ones! */
61 #define LocalBufHdrGetBlock(bufHdr) \
62  LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
63 
64 /* Bits in SyncOneBuffer's return value */
65 #define BUF_WRITTEN 0x01
66 #define BUF_REUSABLE 0x02
67 
68 #define DROP_RELS_BSEARCH_THRESHOLD 20
69 
70 typedef struct PrivateRefCountEntry
71 {
72  Buffer buffer;
73  int32 refcount;
74 } PrivateRefCountEntry;
75 
76 /* 64 bytes, about the size of a cache line on common systems */
77 #define REFCOUNT_ARRAY_ENTRIES 8
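/*
 * (Arithmetic check: each entry holds a 4-byte Buffer and a 4-byte int32
 * refcount, so 8 entries * 8 bytes = 64 bytes.)
 */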
78 
79 /*
80  * Status of buffers to checkpoint for a particular tablespace, used
81  * internally in BufferSync.
82  */
83 typedef struct CkptTsStatus
84 {
85  /* oid of the tablespace */
86  Oid tsId;
87 
88  /*
89  * Checkpoint progress for this tablespace. To make progress comparable
90  * between tablespaces the progress is, for each tablespace, measured as a
91  * number between 0 and the total number of to-be-checkpointed pages. Each
92  * page checkpointed in this tablespace increments this space's progress
93  * by progress_slice.
94  */
95  float8 progress;
96  float8 progress_slice;
97 
98  /* number of to-be checkpointed pages in this tablespace */
99  int num_to_scan;
100  /* already processed pages in this tablespace */
101  int num_scanned;
102 
103  /* current offset in CkptBufferIds for this tablespace */
104  int index;
105 } CkptTsStatus;
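/*
 * Worked example (sketch): if a checkpoint has 1000 pages to write in total,
 * 250 of them in this tablespace, then progress_slice = 1000 / 250 = 4 and
 * this tablespace's progress reaches 1000 exactly when its 250th page is
 * written, keeping progress values comparable across tablespaces.
 */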
106 
107 /* GUC variables */
108 bool zero_damaged_pages = false;
109 int bgwriter_lru_maxpages = 100;
110 double bgwriter_lru_multiplier = 2.0;
111 bool track_io_timing = false;
112 int effective_io_concurrency = 0;
113 
114 /*
115  * GUC variables about triggering kernel writeback for buffers written; OS
116  * dependent defaults are set via the GUC mechanism.
117  */
118 int checkpoint_flush_after = 0;
119 int bgwriter_flush_after = 0;
120 int backend_flush_after = 0;
121 
122 /*
123  * How many buffers PrefetchBuffer callers should try to stay ahead of their
124  * ReadBuffer calls by. This is maintained by the assign hook for
125  * effective_io_concurrency. Zero means "never prefetch". This value is
126  * only used for buffers not belonging to tablespaces that have their
127  * effective_io_concurrency parameter set.
128  */
129 int target_prefetch_pages = 0;
130 
131 /* local state for StartBufferIO and related functions */
132 static BufferDesc *InProgressBuf = NULL;
133 static bool IsForInput;
134 
135 /* local state for LockBufferForCleanup */
136 static BufferDesc *PinCountWaitBuf = NULL;
137 
138 /*
139  * Backend-Private refcount management:
140  *
141  * Each buffer also has a private refcount that keeps track of the number of
142  * times the buffer is pinned in the current process. This is so that the
143  * shared refcount needs to be modified only once if a buffer is pinned more
144  * than once by an individual backend. It's also used to check that no buffers
145  * are still pinned at the end of transactions and when exiting.
146  *
147  *
148  * To avoid - as we used to - requiring an array with NBuffers entries to keep
149  * track of local buffers, we use a small sequentially searched array
150  * (PrivateRefCountArray) and an overflow hash table (PrivateRefCountHash) to
151  * keep track of backend local pins.
152  *
153  * Until no more than REFCOUNT_ARRAY_ENTRIES buffers are pinned at once, all
154  * refcounts are kept track of in the array; after that, new array entries
155  * displace old ones into the hash table. That way a frequently used entry
156  * can't get "stuck" in the hashtable while infrequent ones clog the array.
157  *
158  * Note that in most scenarios the number of pinned buffers will not exceed
159  * REFCOUNT_ARRAY_ENTRIES.
160  *
161  *
162  * To enter a buffer into the refcount tracking mechanism first reserve a free
163  * entry using ReservePrivateRefCountEntry() and then later, if necessary,
164  * fill it with NewPrivateRefCountEntry(). That split lets us avoid doing
165  * memory allocations in NewPrivateRefCountEntry() which can be important
166  * because in some scenarios it's called with a spinlock held...
167  */
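/*
 * Sketch of that reserve-then-fill pattern as it appears later in this file
 * (see BufferAlloc and PinBuffer_Locked; "strategy" and "buf_state" stand for
 * the caller's local variables):
 *
 *		ReservePrivateRefCountEntry();					no spinlock held yet
 *		buf = StrategyGetBuffer(strategy, &buf_state);	returns with header spinlock held
 *		PinBuffer_Locked(buf);							consumes the reserved entry
 */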
168 static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES];
169 static HTAB *PrivateRefCountHash = NULL;
170 static int32 PrivateRefCountOverflowed = 0;
171 static uint32 PrivateRefCountClock = 0;
172 static PrivateRefCountEntry *ReservedRefCountEntry = NULL;
173 
174 static void ReservePrivateRefCountEntry(void);
175 static PrivateRefCountEntry *NewPrivateRefCountEntry(Buffer buffer);
176 static PrivateRefCountEntry *GetPrivateRefCountEntry(Buffer buffer, bool do_move);
177 static inline int32 GetPrivateRefCount(Buffer buffer);
178 static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref);
179 
180 /*
181  * Ensure that the PrivateRefCountArray has sufficient space to store one more
182  * entry. This has to be called before using NewPrivateRefCountEntry() to fill
183  * a new entry - but it's perfectly fine to not use a reserved entry.
184  */
185 static void
186 ReservePrivateRefCountEntry(void)
187 {
188  /* Already reserved (or freed), nothing to do */
189  if (ReservedRefCountEntry != NULL)
190  return;
191 
192  /*
193  * First search for a free entry in the array, that'll be sufficient in the
194  * majority of cases.
195  */
196  {
197  int i;
198 
199  for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
200  {
201  PrivateRefCountEntry *res;
202 
203  res = &PrivateRefCountArray[i];
204 
205  if (res->buffer == InvalidBuffer)
206  {
207  ReservedRefCountEntry = res;
208  return;
209  }
210  }
211  }
212 
213  /*
214  * No luck. All array entries are full. Move one array entry into the hash
215  * table.
216  */
217  {
218  /*
219  * Move entry from the current clock position in the array into the
220  * hashtable. Use that slot.
221  */
222  PrivateRefCountEntry *hashent;
223  bool found;
224 
225  /* select victim slot */
226  ReservedRefCountEntry =
227  &PrivateRefCountArray[PrivateRefCountClock++ % REFCOUNT_ARRAY_ENTRIES];
228 
229  /* Better be used, otherwise we shouldn't get here. */
230  Assert(ReservedRefCountEntry->buffer != InvalidBuffer);
231 
232  /* enter victim array entry into hashtable */
233  hashent = hash_search(PrivateRefCountHash,
234  (void *) &(ReservedRefCountEntry->buffer),
235  HASH_ENTER,
236  &found);
237  Assert(!found);
238  hashent->refcount = ReservedRefCountEntry->refcount;
239 
240  /* clear the now free array slot */
241  ReservedRefCountEntry->buffer = InvalidBuffer;
242  ReservedRefCountEntry->refcount = 0;
243 
244  PrivateRefCountOverflowed++;
245  }
246 }
247 
248 /*
249  * Fill a previously reserved refcount entry.
250  */
251 static PrivateRefCountEntry *
252 NewPrivateRefCountEntry(Buffer buffer)
253 {
254  PrivateRefCountEntry *res;
255 
256  /* only allowed to be called when a reservation has been made */
257  Assert(ReservedRefCountEntry != NULL);
258 
259  /* use up the reserved entry */
260  res = ReservedRefCountEntry;
261  ReservedRefCountEntry = NULL;
262 
263  /* and fill it */
264  res->buffer = buffer;
265  res->refcount = 0;
266 
267  return res;
268 }
269 
270 /*
271  * Return the PrivateRefCount entry for the passed buffer.
272  *
273  * Returns NULL if a buffer doesn't have a refcount entry. Otherwise, if
274  * do_move is true, and the entry resides in the hashtable the entry is
275  * optimized for frequent access by moving it to the array.
276  */
277 static PrivateRefCountEntry *
278 GetPrivateRefCountEntry(Buffer buffer, bool do_move)
279 {
280  PrivateRefCountEntry *res;
281  int i;
282 
283  Assert(BufferIsValid(buffer));
284  Assert(!BufferIsLocal(buffer));
285 
286  /*
287  * First search for references in the array, that'll be sufficient in the
288  * majority of cases.
289  */
290  for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
291  {
292  res = &PrivateRefCountArray[i];
293 
294  if (res->buffer == buffer)
295  return res;
296  }
297 
298  /*
299  * By here we know that the buffer, if already pinned, isn't residing in
300  * the array.
301  *
302  * Only look up the buffer in the hashtable if we've previously overflowed
303  * into it.
304  */
305  if (PrivateRefCountOverflowed == 0)
306  return NULL;
307 
308  res = hash_search(PrivateRefCountHash,
309  (void *) &buffer,
310  HASH_FIND,
311  NULL);
312 
313  if (res == NULL)
314  return NULL;
315  else if (!do_move)
316  {
317  /* caller doesn't want us to move the hash entry into the array */
318  return res;
319  }
320  else
321  {
322  /* move buffer from hashtable into the free array slot */
323  bool found;
324  PrivateRefCountEntry *free;
325 
326  /* Ensure there's a free array slot */
327  ReservePrivateRefCountEntry();
328 
329  /* Use up the reserved slot */
330  Assert(ReservedRefCountEntry != NULL);
331  free = ReservedRefCountEntry;
332  ReservedRefCountEntry = NULL;
333  Assert(free->buffer == InvalidBuffer);
334 
335  /* and fill it */
336  free->buffer = buffer;
337  free->refcount = res->refcount;
338 
339  /* delete from hashtable */
340  hash_search(PrivateRefCountHash,
341  (void *) &buffer,
342  HASH_REMOVE,
343  &found);
344  Assert(found);
345  Assert(PrivateRefCountOverflowed > 0);
346  PrivateRefCountOverflowed--;
347 
348  return free;
349  }
350 }
351 
352 /*
353  * Returns how many times the passed buffer is pinned by this backend.
354  *
355  * Only works for shared memory buffers!
356  */
357 static inline int32
358 GetPrivateRefCount(Buffer buffer)
359 {
360  PrivateRefCountEntry *ref;
361 
362  Assert(BufferIsValid(buffer));
363  Assert(!BufferIsLocal(buffer));
364 
365  /*
366  * Not moving the entry - that's ok for the current users, but we might
367  * want to change this one day.
368  */
369  ref = GetPrivateRefCountEntry(buffer, false);
370 
371  if (ref == NULL)
372  return 0;
373  return ref->refcount;
374 }
375 
376 /*
377  * Release resources used to track the reference count of a buffer which we no
378  * longer have pinned and don't want to pin again immediately.
379  */
380 static void
381 ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref)
382 {
383  Assert(ref->refcount == 0);
384 
385  if (ref >= &PrivateRefCountArray[0] &&
386  ref < &PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES])
387  {
388  ref->buffer = InvalidBuffer;
389 
390  /*
391  * Mark the just used entry as reserved - in many scenarios that
392  * allows us to avoid ever having to search the array/hash for free
393  * entries.
394  */
395  ReservedRefCountEntry = ref;
396  }
397  else
398  {
399  bool found;
400  Buffer buffer = ref->buffer;
401 
402  hash_search(PrivateRefCountHash,
403  (void *) &buffer,
404  HASH_REMOVE,
405  &found);
406  Assert(found);
407  Assert(PrivateRefCountOverflowed > 0);
408  PrivateRefCountOverflowed--;
409  }
410 }
411 
412 /*
413  * BufferIsPinned
414  * True iff the buffer is pinned (also checks for valid buffer number).
415  *
416  * NOTE: what we check here is that *this* backend holds a pin on
417  * the buffer. We do not care whether some other backend does.
418  */
419 #define BufferIsPinned(bufnum) \
420 ( \
421  !BufferIsValid(bufnum) ? \
422  false \
423  : \
424  BufferIsLocal(bufnum) ? \
425  (LocalRefCount[-(bufnum) - 1] > 0) \
426  : \
427  (GetPrivateRefCount(bufnum) > 0) \
428 )
429 
430 
431 static Buffer ReadBuffer_common(SMgrRelation reln, char relpersistence,
432  ForkNumber forkNum, BlockNumber blockNum,
433  ReadBufferMode mode, BufferAccessStrategy strategy,
434  bool *hit);
435 static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy);
436 static void PinBuffer_Locked(BufferDesc *buf);
437 static void UnpinBuffer(BufferDesc *buf, bool fixOwner);
438 static void BufferSync(int flags);
439 static uint32 WaitBufHdrUnlocked(BufferDesc *buf);
440 static int SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *flush_context);
441 static void WaitIO(BufferDesc *buf);
442 static bool StartBufferIO(BufferDesc *buf, bool forInput);
443 static void TerminateBufferIO(BufferDesc *buf, bool clear_dirty,
444  uint32 set_flag_bits);
445 static void shared_buffer_write_error_callback(void *arg);
446 static void local_buffer_write_error_callback(void *arg);
447 static BufferDesc *BufferAlloc(SMgrRelation smgr,
448  char relpersistence,
449  ForkNumber forkNum,
450  BlockNumber blockNum,
451  BufferAccessStrategy strategy,
452  bool *foundPtr);
453 static void FlushBuffer(BufferDesc *buf, SMgrRelation reln);
454 static void AtProcExit_Buffers(int code, Datum arg);
455 static void CheckForBufferLeaks(void);
456 static int rnode_comparator(const void *p1, const void *p2);
457 static int buffertag_comparator(const void *p1, const void *p2);
458 static int ckpt_buforder_comparator(const void *pa, const void *pb);
459 static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg);
460 
461 
462 /*
463  * ComputeIoConcurrency -- get the number of pages to prefetch for a given
464  * number of spindles.
465  */
466 bool
467 ComputeIoConcurrency(int io_concurrency, double *target)
468 {
469  double new_prefetch_pages = 0.0;
470  int i;
471 
472  /*
473  * Make sure the io_concurrency value is within valid range; it may have
474  * been forced with a manual pg_tablespace update.
475  */
476  io_concurrency = Min(Max(io_concurrency, 0), MAX_IO_CONCURRENCY);
477 
478  /*----------
479  * The user-visible GUC parameter is the number of drives (spindles),
480  * which we need to translate to a number-of-pages-to-prefetch target.
481  * The target value is stashed in *extra and then assigned to the actual
482  * variable by assign_effective_io_concurrency.
483  *
484  * The expected number of prefetch pages needed to keep N drives busy is:
485  *
486  * drives | I/O requests
487  * -------+----------------
488  * 1 | 1
489  * 2 | 2/1 + 2/2 = 3
490  * 3 | 3/1 + 3/2 + 3/3 = 5 1/2
491  * 4 | 4/1 + 4/2 + 4/3 + 4/4 = 8 1/3
492  * n | n * H(n)
493  *
494  * This is called the "coupon collector problem" and H(n) is called the
495  * harmonic series. This could be approximated by n * ln(n), but for
496  * reasonable numbers of drives we might as well just compute the series.
497  *
498  * Alternatively we could set the target to the number of pages necessary
499  * so that the expected number of active spindles is some arbitrary
500  * percentage of the total. This sounds the same but is actually slightly
501  * different. The result ends up being ln(1-P)/ln((n-1)/n) where P is
502  * that desired fraction.
503  *
504  * Experimental results show that both of these formulas aren't aggressive
505  * enough, but we don't really have any better proposals.
506  *
507  * Note that if io_concurrency = 0 (disabled), we must set target = 0.
508  *----------
509  */
510 
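 /*
  * Worked example: with io_concurrency = 4 the loop below sums
  * 4/1 + 4/2 + 4/3 + 4/4 = 8.33..., i.e. n * H(n) from the table above.
  */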
511  for (i = 1; i <= io_concurrency; i++)
512  new_prefetch_pages += (double) io_concurrency / (double) i;
513 
514  *target = new_prefetch_pages;
515 
516  /* This range check shouldn't fail, but let's be paranoid */
517  return (new_prefetch_pages >= 0.0 && new_prefetch_pages < (double) INT_MAX);
518 }
519 
520 /*
521  * PrefetchBuffer -- initiate asynchronous read of a block of a relation
522  *
523  * This is named by analogy to ReadBuffer but doesn't actually allocate a
524  * buffer. Instead it tries to ensure that a future ReadBuffer for the given
525  * block will not be delayed by the I/O. Prefetching is optional.
526  * No-op if prefetching isn't compiled in.
527  */
528 void
529 PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
530 {
531 #ifdef USE_PREFETCH
532  Assert(RelationIsValid(reln));
533  Assert(BlockNumberIsValid(blockNum));
534 
535  /* Open it at the smgr level if not already done */
536  RelationOpenSmgr(reln);
537 
538  if (RelationUsesLocalBuffers(reln))
539  {
540  /* see comments in ReadBufferExtended */
541  if (RELATION_IS_OTHER_TEMP(reln))
542  ereport(ERROR,
543  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
544  errmsg("cannot access temporary tables of other sessions")));
545 
546  /* pass it off to localbuf.c */
547  LocalPrefetchBuffer(reln->rd_smgr, forkNum, blockNum);
548  }
549  else
550  {
551  BufferTag newTag; /* identity of requested block */
552  uint32 newHash; /* hash value for newTag */
553  LWLock *newPartitionLock; /* buffer partition lock for it */
554  int buf_id;
555 
556  /* create a tag so we can lookup the buffer */
557  INIT_BUFFERTAG(newTag, reln->rd_smgr->smgr_rnode.node,
558  forkNum, blockNum);
559 
560  /* determine its hash code and partition lock ID */
561  newHash = BufTableHashCode(&newTag);
562  newPartitionLock = BufMappingPartitionLock(newHash);
563 
564  /* see if the block is in the buffer pool already */
565  LWLockAcquire(newPartitionLock, LW_SHARED);
566  buf_id = BufTableLookup(&newTag, newHash);
567  LWLockRelease(newPartitionLock);
568 
569  /* If not in buffers, initiate prefetch */
570  if (buf_id < 0)
571  smgrprefetch(reln->rd_smgr, forkNum, blockNum);
572 
573  /*
574  * If the block *is* in buffers, we do nothing. This is not really
575  * ideal: the block might be just about to be evicted, which would be
576  * stupid since we know we are going to need it soon. But the only
577  * easy answer is to bump the usage_count, which does not seem like a
578  * great solution: when the caller does ultimately touch the block,
579  * usage_count would get bumped again, resulting in too much
580  * favoritism for blocks that are involved in a prefetch sequence. A
581  * real fix would involve some additional per-buffer state, and it's
582  * not clear that there's enough of a problem to justify that.
583  */
584  }
585 #endif /* USE_PREFETCH */
586 }
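/*
 * Usage sketch: a caller that knows its access pattern (bitmap heap scans,
 * for example) issues PrefetchBuffer(rel, MAIN_FORKNUM, blkno) some distance
 * ahead of the matching ReadBuffer(rel, blkno), so that the later read
 * hopefully finds the block already cached ("rel" and "blkno" here are the
 * caller's variables).
 */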
587 
588 
589 /*
590  * ReadBuffer -- a shorthand for ReadBufferExtended, for reading from main
591  * fork with RBM_NORMAL mode and default strategy.
592  */
593 Buffer
594 ReadBuffer(Relation reln, BlockNumber blockNum)
595 {
596  return ReadBufferExtended(reln, MAIN_FORKNUM, blockNum, RBM_NORMAL, NULL);
597 }
598 
599 /*
600  * ReadBufferExtended -- returns a buffer containing the requested
601  * block of the requested relation. If the blknum
602  * requested is P_NEW, extend the relation file and
603  * allocate a new block. (Caller is responsible for
604  * ensuring that only one backend tries to extend a
605  * relation at the same time!)
606  *
607  * Returns: the buffer number for the buffer containing
608  * the block read. The returned buffer has been pinned.
609  * Does not return on error --- elog's instead.
610  *
611  * Assume when this function is called, that reln has been opened already.
612  *
613  * In RBM_NORMAL mode, the page is read from disk, and the page header is
614  * validated. An error is thrown if the page header is not valid. (But
615  * note that an all-zero page is considered "valid"; see PageIsVerified().)
616  *
617  * RBM_ZERO_ON_ERROR is like the normal mode, but if the page header is not
618  * valid, the page is zeroed instead of throwing an error. This is intended
619  * for non-critical data, where the caller is prepared to repair errors.
620  *
621  * In RBM_ZERO_AND_LOCK mode, if the page isn't in buffer cache already, it's
622  * filled with zeros instead of reading it from disk. Useful when the caller
623  * is going to fill the page from scratch, since this saves I/O and avoids
624  * unnecessary failure if the page-on-disk has corrupt page headers.
625  * The page is returned locked to ensure that the caller has a chance to
626  * initialize the page before it's made visible to others.
627  * Caution: do not use this mode to read a page that is beyond the relation's
628  * current physical EOF; that is likely to cause problems in md.c when
629  * the page is modified and written out. P_NEW is OK, though.
630  *
631  * RBM_ZERO_AND_CLEANUP_LOCK is the same as RBM_ZERO_AND_LOCK, but acquires
632  * a cleanup-strength lock on the page.
633  *
634  * RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here.
635  *
636  * If strategy is not NULL, a nondefault buffer access strategy is used.
637  * See buffer/README for details.
638  */
639 Buffer
640 ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum,
641  ReadBufferMode mode, BufferAccessStrategy strategy)
642 {
643  bool hit;
644  Buffer buf;
645 
646  /* Open it at the smgr level if not already done */
647  RelationOpenSmgr(reln);
648 
649  /*
650  * Reject attempts to read non-local temporary relations; we would be
651  * likely to get wrong data since we have no visibility into the owning
652  * session's local buffers.
653  */
654  if (RELATION_IS_OTHER_TEMP(reln))
655  ereport(ERROR,
656  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
657  errmsg("cannot access temporary tables of other sessions")));
658 
659  /*
660  * Read the buffer, and update pgstat counters to reflect a cache hit or
661  * miss.
662  */
663  pgstat_count_buffer_read(reln);
664  buf = ReadBuffer_common(reln->rd_smgr, reln->rd_rel->relpersistence,
665  forkNum, blockNum, mode, strategy, &hit);
666  if (hit)
667  pgstat_count_buffer_hit(reln);
668  return buf;
669 }
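/*
 * Usage sketch, assuming an already-open Relation "rel": a large sequential
 * read might combine this with a nondefault strategy, e.g.
 *
 *		strategy = GetAccessStrategy(BAS_BULKREAD);
 *		buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, strategy);
 *
 * ("rel", "blkno", "strategy" and "buf" are the caller's variables.)
 */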
670 
671 
672 /*
673  * ReadBufferWithoutRelcache -- like ReadBufferExtended, but doesn't require
674  * a relcache entry for the relation.
675  *
676  * NB: At present, this function may only be used on permanent relations, which
677  * is OK, because we only use it during XLOG replay. If in the future we
678  * want to use it on temporary or unlogged relations, we could pass additional
679  * parameters.
680  */
681 Buffer
682 ReadBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum,
683  BlockNumber blockNum, ReadBufferMode mode,
684  BufferAccessStrategy strategy)
685 {
686  bool hit;
687 
688  SMgrRelation smgr = smgropen(rnode, InvalidBackendId);
689 
690  Assert(InRecovery);
691 
692  return ReadBuffer_common(smgr, RELPERSISTENCE_PERMANENT, forkNum, blockNum,
693  mode, strategy, &hit);
694 }
695 
696 
697 /*
698  * ReadBuffer_common -- common logic for all ReadBuffer variants
699  *
700  * *hit is set to true if the request was satisfied from shared buffer cache.
701  */
702 static Buffer
703 ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
704  BlockNumber blockNum, ReadBufferMode mode,
705  BufferAccessStrategy strategy, bool *hit)
706 {
707  BufferDesc *bufHdr;
708  Block bufBlock;
709  bool found;
710  bool isExtend;
711  bool isLocalBuf = SmgrIsTemp(smgr);
712 
713  *hit = false;
714 
715  /* Make sure we will have room to remember the buffer pin */
716  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
717 
718  isExtend = (blockNum == P_NEW);
719 
720  TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
721  smgr->smgr_rnode.node.spcNode,
722  smgr->smgr_rnode.node.dbNode,
723  smgr->smgr_rnode.node.relNode,
724  smgr->smgr_rnode.backend,
725  isExtend);
726 
727  /* Substitute proper block number if caller asked for P_NEW */
728  if (isExtend)
729  blockNum = smgrnblocks(smgr, forkNum);
730 
731  if (isLocalBuf)
732  {
733  bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, &found);
734  if (found)
735  pgBufferUsage.local_blks_hit++;
736  else
737  pgBufferUsage.local_blks_read++;
738  }
739  else
740  {
741  /*
742  * lookup the buffer. IO_IN_PROGRESS is set if the requested block is
743  * not currently in memory.
744  */
745  bufHdr = BufferAlloc(smgr, relpersistence, forkNum, blockNum,
746  strategy, &found);
747  if (found)
748  pgBufferUsage.shared_blks_hit++;
749  else
750  pgBufferUsage.shared_blks_read++;
751  }
752 
753  /* At this point we do NOT hold any locks. */
754 
755  /* if it was already in the buffer pool, we're done */
756  if (found)
757  {
758  if (!isExtend)
759  {
760  /* Just need to update stats before we exit */
761  *hit = true;
762  VacuumPageHit++;
763 
764  if (VacuumCostActive)
765  VacuumCostBalance += VacuumCostPageHit;
766 
767  TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
768  smgr->smgr_rnode.node.spcNode,
769  smgr->smgr_rnode.node.dbNode,
770  smgr->smgr_rnode.node.relNode,
771  smgr->smgr_rnode.backend,
772  isExtend,
773  found);
774 
775  /*
776  * In RBM_ZERO_AND_LOCK mode the caller expects the page to be
777  * locked on return.
778  */
779  if (!isLocalBuf)
780  {
781  if (mode == RBM_ZERO_AND_LOCK)
782  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr),
783  LW_EXCLUSIVE);
784  else if (mode == RBM_ZERO_AND_CLEANUP_LOCK)
785  LockBufferForCleanup(BufferDescriptorGetBuffer(bufHdr));
786  }
787 
788  return BufferDescriptorGetBuffer(bufHdr);
789  }
790 
791  /*
792  * We get here only in the corner case where we are trying to extend
793  * the relation but we found a pre-existing buffer marked BM_VALID.
794  * This can happen because mdread doesn't complain about reads beyond
795  * EOF (when zero_damaged_pages is ON) and so a previous attempt to
796  * read a block beyond EOF could have left a "valid" zero-filled
797  * buffer. Unfortunately, we have also seen this case occurring
798  * because of buggy Linux kernels that sometimes return an
799  * lseek(SEEK_END) result that doesn't account for a recent write. In
800  * that situation, the pre-existing buffer would contain valid data
801  * that we don't want to overwrite. Since the legitimate case should
802  * always have left a zero-filled buffer, complain if not PageIsNew.
803  */
804  bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
805  if (!PageIsNew((Page) bufBlock))
806  ereport(ERROR,
807  (errmsg("unexpected data beyond EOF in block %u of relation %s",
808  blockNum, relpath(smgr->smgr_rnode, forkNum)),
809  errhint("This has been seen to occur with buggy kernels; consider updating your system.")));
810 
811  /*
812  * We *must* do smgrextend before succeeding, else the page will not
813  * be reserved by the kernel, and the next P_NEW call will decide to
814  * return the same page. Clear the BM_VALID bit, do the StartBufferIO
815  * call that BufferAlloc didn't, and proceed.
816  */
817  if (isLocalBuf)
818  {
819  /* Only need to adjust flags */
820  uint32 buf_state = pg_atomic_read_u32(&bufHdr->state);
821 
822  Assert(buf_state & BM_VALID);
823  buf_state &= ~BM_VALID;
824  pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
825  }
826  else
827  {
828  /*
829  * Loop to handle the very small possibility that someone re-sets
830  * BM_VALID between our clearing it and StartBufferIO inspecting
831  * it.
832  */
833  do
834  {
835  uint32 buf_state = LockBufHdr(bufHdr);
836 
837  Assert(buf_state & BM_VALID);
838  buf_state &= ~BM_VALID;
839  UnlockBufHdr(bufHdr, buf_state);
840  } while (!StartBufferIO(bufHdr, true));
841  }
842  }
843 
844  /*
845  * if we have gotten to this point, we have allocated a buffer for the
846  * page but its contents are not yet valid. IO_IN_PROGRESS is set for it,
847  * if it's a shared buffer.
848  *
849  * Note: if smgrextend fails, we will end up with a buffer that is
850  * allocated but not marked BM_VALID. P_NEW will still select the same
851  * block number (because the relation didn't get any longer on disk) and
852  * so future attempts to extend the relation will find the same buffer (if
853  * it's not been recycled) but come right back here to try smgrextend
854  * again.
855  */
856  Assert(!(pg_atomic_read_u32(&bufHdr->state) & BM_VALID)); /* spinlock not needed */
857 
858  bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
859 
860  if (isExtend)
861  {
862  /* new buffers are zero-filled */
863  MemSet((char *) bufBlock, 0, BLCKSZ);
864  /* don't set checksum for all-zero page */
865  smgrextend(smgr, forkNum, blockNum, (char *) bufBlock, false);
866 
867  /*
868  * NB: we're *not* doing a ScheduleBufferTagForWriteback here;
869  * although we're essentially performing a write. At least on linux
870  * doing so defeats the 'delayed allocation' mechanism, leading to
871  * increased file fragmentation.
872  */
873  }
874  else
875  {
876  /*
877  * Read in the page, unless the caller intends to overwrite it and
878  * just wants us to allocate a buffer.
879  */
880  if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
881  MemSet((char *) bufBlock, 0, BLCKSZ);
882  else
883  {
884  instr_time io_start,
885  io_time;
886 
887  if (track_io_timing)
888  INSTR_TIME_SET_CURRENT(io_start);
889 
890  smgrread(smgr, forkNum, blockNum, (char *) bufBlock);
891 
892  if (track_io_timing)
893  {
894  INSTR_TIME_SET_CURRENT(io_time);
895  INSTR_TIME_SUBTRACT(io_time, io_start);
896  pgstat_count_buffer_read_time(INSTR_TIME_GET_MICROSEC(io_time));
897  INSTR_TIME_ADD(pgBufferUsage.blk_read_time, io_time);
898  }
899 
900  /* check for garbage data */
901  if (!PageIsVerified((Page) bufBlock, blockNum))
902  {
903  if (mode == RBM_ZERO_ON_ERROR || zero_damaged_pages)
904  {
905  ereport(WARNING,
906  (errcode(ERRCODE_DATA_CORRUPTED),
907  errmsg("invalid page in block %u of relation %s; zeroing out page",
908  blockNum,
909  relpath(smgr->smgr_rnode, forkNum))));
910  MemSet((char *) bufBlock, 0, BLCKSZ);
911  }
912  else
913  ereport(ERROR,
914  (errcode(ERRCODE_DATA_CORRUPTED),
915  errmsg("invalid page in block %u of relation %s",
916  blockNum,
917  relpath(smgr->smgr_rnode, forkNum))));
918  }
919  }
920  }
921 
922  /*
923  * In RBM_ZERO_AND_LOCK mode, grab the buffer content lock before marking
924  * the page as valid, to make sure that no other backend sees the zeroed
925  * page before the caller has had a chance to initialize it.
926  *
927  * Since no-one else can be looking at the page contents yet, there is no
928  * difference between an exclusive lock and a cleanup-strength lock. (Note
929  * that we cannot use LockBuffer() or LockBufferForCleanup() here, because
930  * they assert that the buffer is already valid.)
931  */
932  if ((mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) &&
933  !isLocalBuf)
934  {
935  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_EXCLUSIVE);
936  }
937 
938  if (isLocalBuf)
939  {
940  /* Only need to adjust flags */
941  uint32 buf_state = pg_atomic_read_u32(&bufHdr->state);
942 
943  buf_state |= BM_VALID;
944  pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
945  }
946  else
947  {
948  /* Set BM_VALID, terminate IO, and wake up any waiters */
949  TerminateBufferIO(bufHdr, false, BM_VALID);
950  }
951 
952  VacuumPageMiss++;
953  if (VacuumCostActive)
954  VacuumCostBalance += VacuumCostPageMiss;
955 
956  TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
957  smgr->smgr_rnode.node.spcNode,
958  smgr->smgr_rnode.node.dbNode,
959  smgr->smgr_rnode.node.relNode,
960  smgr->smgr_rnode.backend,
961  isExtend,
962  found);
963 
964  return BufferDescriptorGetBuffer(bufHdr);
965 }
966 
967 /*
968  * BufferAlloc -- subroutine for ReadBuffer. Handles lookup of a shared
969  * buffer. If no buffer exists already, selects a replacement
970  * victim and evicts the old page, but does NOT read in new page.
971  *
972  * "strategy" can be a buffer replacement strategy object, or NULL for
973  * the default strategy. The selected buffer's usage_count is advanced when
974  * using the default strategy, but otherwise possibly not (see PinBuffer).
975  *
976  * The returned buffer is pinned and is already marked as holding the
977  * desired page. If it already did have the desired page, *foundPtr is
978  * set TRUE. Otherwise, *foundPtr is set FALSE and the buffer is marked
979  * as IO_IN_PROGRESS; ReadBuffer will now need to do I/O to fill it.
980  *
981  * *foundPtr is actually redundant with the buffer's BM_VALID flag, but
982  * we keep it for simplicity in ReadBuffer.
983  *
984  * No locks are held either at entry or exit.
985  */
986 static BufferDesc *
987 BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
988  BlockNumber blockNum,
989  BufferAccessStrategy strategy,
990  bool *foundPtr)
991 {
992  BufferTag newTag; /* identity of requested block */
993  uint32 newHash; /* hash value for newTag */
994  LWLock *newPartitionLock; /* buffer partition lock for it */
995  BufferTag oldTag; /* previous identity of selected buffer */
996  uint32 oldHash; /* hash value for oldTag */
997  LWLock *oldPartitionLock; /* buffer partition lock for it */
998  uint32 oldFlags;
999  int buf_id;
1000  BufferDesc *buf;
1001  bool valid;
1002  uint32 buf_state;
1003 
1004  /* create a tag so we can lookup the buffer */
1005  INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
1006 
1007  /* determine its hash code and partition lock ID */
1008  newHash = BufTableHashCode(&newTag);
1009  newPartitionLock = BufMappingPartitionLock(newHash);
1010 
1011  /* see if the block is in the buffer pool already */
1012  LWLockAcquire(newPartitionLock, LW_SHARED);
1013  buf_id = BufTableLookup(&newTag, newHash);
1014  if (buf_id >= 0)
1015  {
1016  /*
1017  * Found it. Now, pin the buffer so no one can steal it from the
1018  * buffer pool, and check to see if the correct data has been loaded
1019  * into the buffer.
1020  */
1021  buf = GetBufferDescriptor(buf_id);
1022 
1023  valid = PinBuffer(buf, strategy);
1024 
1025  /* Can release the mapping lock as soon as we've pinned it */
1026  LWLockRelease(newPartitionLock);
1027 
1028  *foundPtr = TRUE;
1029 
1030  if (!valid)
1031  {
1032  /*
1033  * We can only get here if (a) someone else is still reading in
1034  * the page, or (b) a previous read attempt failed. We have to
1035  * wait for any active read attempt to finish, and then set up our
1036  * own read attempt if the page is still not BM_VALID.
1037  * StartBufferIO does it all.
1038  */
1039  if (StartBufferIO(buf, true))
1040  {
1041  /*
1042  * If we get here, previous attempts to read the buffer must
1043  * have failed ... but we shall bravely try again.
1044  */
1045  *foundPtr = FALSE;
1046  }
1047  }
1048 
1049  return buf;
1050  }
1051 
1052  /*
1053  * Didn't find it in the buffer pool. We'll have to initialize a new
1054  * buffer. Remember to unlock the mapping lock while doing the work.
1055  */
1056  LWLockRelease(newPartitionLock);
1057 
1058  /* Loop here in case we have to try another victim buffer */
1059  for (;;)
1060  {
1061  /*
1062  * Ensure, while the spinlock's not yet held, that there's a free
1063  * refcount entry.
1064  */
1065  ReservePrivateRefCountEntry();
1066 
1067  /*
1068  * Select a victim buffer. The buffer is returned with its header
1069  * spinlock still held!
1070  */
1071  buf = StrategyGetBuffer(strategy, &buf_state);
1072 
1073  Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 0);
1074 
1075  /* Must copy buffer flags while we still hold the spinlock */
1076  oldFlags = buf_state & BUF_FLAG_MASK;
1077 
1078  /* Pin the buffer and then release the buffer spinlock */
1079  PinBuffer_Locked(buf);
1080 
1081  /*
1082  * If the buffer was dirty, try to write it out. There is a race
1083  * condition here, in that someone might dirty it after we released it
1084  * above, or even while we are writing it out (since our share-lock
1085  * won't prevent hint-bit updates). We will recheck the dirty bit
1086  * after re-locking the buffer header.
1087  */
1088  if (oldFlags & BM_DIRTY)
1089  {
1090  /*
1091  * We need a share-lock on the buffer contents to write it out
1092  * (else we might write invalid data, eg because someone else is
1093  * compacting the page contents while we write). We must use a
1094  * conditional lock acquisition here to avoid deadlock. Even
1095  * though the buffer was not pinned (and therefore surely not
1096  * locked) when StrategyGetBuffer returned it, someone else could
1097  * have pinned and exclusive-locked it by the time we get here. If
1098  * we try to get the lock unconditionally, we'd block waiting for
1099  * them; if they later block waiting for us, deadlock ensues.
1100  * (This has been observed to happen when two backends are both
1101  * trying to split btree index pages, and the second one just
1102  * happens to be trying to split the page the first one got from
1103  * StrategyGetBuffer.)
1104  */
1105  if (LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
1106  LW_SHARED))
1107  {
1108  /*
1109  * If using a nondefault strategy, and writing the buffer
1110  * would require a WAL flush, let the strategy decide whether
1111  * to go ahead and write/reuse the buffer or to choose another
1112  * victim. We need lock to inspect the page LSN, so this
1113  * can't be done inside StrategyGetBuffer.
1114  */
1115  if (strategy != NULL)
1116  {
1117  XLogRecPtr lsn;
1118 
1119  /* Read the LSN while holding buffer header lock */
1120  buf_state = LockBufHdr(buf);
1121  lsn = BufferGetLSN(buf);
1122  UnlockBufHdr(buf, buf_state);
1123 
1124  if (XLogNeedsFlush(lsn) &&
1125  StrategyRejectBuffer(strategy, buf))
1126  {
1127  /* Drop lock/pin and loop around for another buffer */
1128  LWLockRelease(BufferDescriptorGetContentLock(buf));
1129  UnpinBuffer(buf, true);
1130  continue;
1131  }
1132  }
1133 
1134  /* OK, do the I/O */
1135  TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_START(forkNum, blockNum,
1136  smgr->smgr_rnode.node.spcNode,
1137  smgr->smgr_rnode.node.dbNode,
1138  smgr->smgr_rnode.node.relNode);
1139 
1140  FlushBuffer(buf, NULL);
1141  LWLockRelease(BufferDescriptorGetContentLock(buf));
1142 
1143  ScheduleBufferTagForWriteback(&BackendWritebackContext,
1144  &buf->tag);
1145 
1146  TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_DONE(forkNum, blockNum,
1147  smgr->smgr_rnode.node.spcNode,
1148  smgr->smgr_rnode.node.dbNode,
1149  smgr->smgr_rnode.node.relNode);
1150  }
1151  else
1152  {
1153  /*
1154  * Someone else has locked the buffer, so give it up and loop
1155  * back to get another one.
1156  */
1157  UnpinBuffer(buf, true);
1158  continue;
1159  }
1160  }
1161 
1162  /*
1163  * To change the association of a valid buffer, we'll need to have
1164  * exclusive lock on both the old and new mapping partitions.
1165  */
1166  if (oldFlags & BM_TAG_VALID)
1167  {
1168  /*
1169  * Need to compute the old tag's hashcode and partition lock ID.
1170  * XXX is it worth storing the hashcode in BufferDesc so we need
1171  * not recompute it here? Probably not.
1172  */
1173  oldTag = buf->tag;
1174  oldHash = BufTableHashCode(&oldTag);
1175  oldPartitionLock = BufMappingPartitionLock(oldHash);
1176 
1177  /*
1178  * Must lock the lower-numbered partition first to avoid
1179  * deadlocks.
1180  */
1181  if (oldPartitionLock < newPartitionLock)
1182  {
1183  LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
1184  LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1185  }
1186  else if (oldPartitionLock > newPartitionLock)
1187  {
1188  LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1189  LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
1190  }
1191  else
1192  {
1193  /* only one partition, only one lock */
1194  LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1195  }
1196  }
1197  else
1198  {
1199  /* if it wasn't valid, we need only the new partition */
1200  LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1201  /* remember we have no old-partition lock or tag */
1202  oldPartitionLock = NULL;
1203  /* this just keeps the compiler quiet about uninit variables */
1204  oldHash = 0;
1205  }
1206 
1207  /*
1208  * Try to make a hashtable entry for the buffer under its new tag.
1209  * This could fail because while we were writing someone else
1210  * allocated another buffer for the same block we want to read in.
1211  * Note that we have not yet removed the hashtable entry for the old
1212  * tag.
1213  */
1214  buf_id = BufTableInsert(&newTag, newHash, buf->buf_id);
1215 
1216  if (buf_id >= 0)
1217  {
1218  /*
1219  * Got a collision. Someone has already done what we were about to
1220  * do. We'll just handle this as if it were found in the buffer
1221  * pool in the first place. First, give up the buffer we were
1222  * planning to use.
1223  */
1224  UnpinBuffer(buf, true);
1225 
1226  /* Can give up that buffer's mapping partition lock now */
1227  if (oldPartitionLock != NULL &&
1228  oldPartitionLock != newPartitionLock)
1229  LWLockRelease(oldPartitionLock);
1230 
1231  /* remaining code should match code at top of routine */
1232 
1233  buf = GetBufferDescriptor(buf_id);
1234 
1235  valid = PinBuffer(buf, strategy);
1236 
1237  /* Can release the mapping lock as soon as we've pinned it */
1238  LWLockRelease(newPartitionLock);
1239 
1240  *foundPtr = TRUE;
1241 
1242  if (!valid)
1243  {
1244  /*
1245  * We can only get here if (a) someone else is still reading
1246  * in the page, or (b) a previous read attempt failed. We
1247  * have to wait for any active read attempt to finish, and
1248  * then set up our own read attempt if the page is still not
1249  * BM_VALID. StartBufferIO does it all.
1250  */
1251  if (StartBufferIO(buf, true))
1252  {
1253  /*
1254  * If we get here, previous attempts to read the buffer
1255  * must have failed ... but we shall bravely try again.
1256  */
1257  *foundPtr = FALSE;
1258  }
1259  }
1260 
1261  return buf;
1262  }
1263 
1264  /*
1265  * Need to lock the buffer header too in order to change its tag.
1266  */
1267  buf_state = LockBufHdr(buf);
1268 
1269  /*
1270  * Somebody could have pinned or re-dirtied the buffer while we were
1271  * doing the I/O and making the new hashtable entry. If so, we can't
1272  * recycle this buffer; we must undo everything we've done and start
1273  * over with a new victim buffer.
1274  */
1275  oldFlags = buf_state & BUF_FLAG_MASK;
1276  if (BUF_STATE_GET_REFCOUNT(buf_state) == 1 && !(oldFlags & BM_DIRTY))
1277  break;
1278 
1279  UnlockBufHdr(buf, buf_state);
1280  BufTableDelete(&newTag, newHash);
1281  if (oldPartitionLock != NULL &&
1282  oldPartitionLock != newPartitionLock)
1283  LWLockRelease(oldPartitionLock);
1284  LWLockRelease(newPartitionLock);
1285  UnpinBuffer(buf, true);
1286  }
1287 
1288  /*
1289  * Okay, it's finally safe to rename the buffer.
1290  *
1291  * Clearing BM_VALID here is necessary, clearing the dirtybits is just
1292  * paranoia. We also reset the usage_count since any recency of use of
1293  * the old content is no longer relevant. (The usage_count starts out at
1294  * 1 so that the buffer can survive one clock-sweep pass.)
1295  */
1296  buf->tag = newTag;
1297  buf_state &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED |
1298  BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT |
1299  BUF_USAGECOUNT_MASK);
1300  if (relpersistence == RELPERSISTENCE_PERMANENT)
1301  buf_state |= BM_TAG_VALID | BM_PERMANENT | BUF_USAGECOUNT_ONE;
1302  else
1303  buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
1304 
1305  UnlockBufHdr(buf, buf_state);
1306 
1307  if (oldPartitionLock != NULL)
1308  {
1309  BufTableDelete(&oldTag, oldHash);
1310  if (oldPartitionLock != newPartitionLock)
1311  LWLockRelease(oldPartitionLock);
1312  }
1313 
1314  LWLockRelease(newPartitionLock);
1315 
1316  /*
1317  * Buffer contents are currently invalid. Try to get the io_in_progress
1318  * lock. If StartBufferIO returns false, then someone else managed to
1319  * read it before we did, so there's nothing left for BufferAlloc() to do.
1320  */
1321  if (StartBufferIO(buf, true))
1322  *foundPtr = FALSE;
1323  else
1324  *foundPtr = TRUE;
1325 
1326  return buf;
1327 }
1328 
1329 /*
1330  * InvalidateBuffer -- mark a shared buffer invalid and return it to the
1331  * freelist.
1332  *
1333  * The buffer header spinlock must be held at entry. We drop it before
1334  * returning. (This is sane because the caller must have locked the
1335  * buffer in order to be sure it should be dropped.)
1336  *
1337  * This is used only in contexts such as dropping a relation. We assume
1338  * that no other backend could possibly be interested in using the page,
1339  * so the only reason the buffer might be pinned is if someone else is
1340  * trying to write it out. We have to let them finish before we can
1341  * reclaim the buffer.
1342  *
1343  * The buffer could get reclaimed by someone else while we are waiting
1344  * to acquire the necessary locks; if so, don't mess it up.
1345  */
1346 static void
1347 InvalidateBuffer(BufferDesc *buf)
1348 {
1349  BufferTag oldTag;
1350  uint32 oldHash; /* hash value for oldTag */
1351  LWLock *oldPartitionLock; /* buffer partition lock for it */
1352  uint32 oldFlags;
1353  uint32 buf_state;
1354 
1355  /* Save the original buffer tag before dropping the spinlock */
1356  oldTag = buf->tag;
1357 
1358  buf_state = pg_atomic_read_u32(&buf->state);
1359  Assert(buf_state & BM_LOCKED);
1360  UnlockBufHdr(buf, buf_state);
1361 
1362  /*
1363  * Need to compute the old tag's hashcode and partition lock ID. XXX is it
1364  * worth storing the hashcode in BufferDesc so we need not recompute it
1365  * here? Probably not.
1366  */
1367  oldHash = BufTableHashCode(&oldTag);
1368  oldPartitionLock = BufMappingPartitionLock(oldHash);
1369 
1370 retry:
1371 
1372  /*
1373  * Acquire exclusive mapping lock in preparation for changing the buffer's
1374  * association.
1375  */
1376  LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
1377 
1378  /* Re-lock the buffer header */
1379  buf_state = LockBufHdr(buf);
1380 
1381  /* If it's changed while we were waiting for lock, do nothing */
1382  if (!BUFFERTAGS_EQUAL(buf->tag, oldTag))
1383  {
1384  UnlockBufHdr(buf, buf_state);
1385  LWLockRelease(oldPartitionLock);
1386  return;
1387  }
1388 
1389  /*
1390  * We assume the only reason for it to be pinned is that someone else is
1391  * flushing the page out. Wait for them to finish. (This could be an
1392  * infinite loop if the refcount is messed up... it would be nice to time
1393  * out after awhile, but there seems no way to be sure how many loops may
1394  * be needed. Note that if the other guy has pinned the buffer but not
1395  * yet done StartBufferIO, WaitIO will fall through and we'll effectively
1396  * be busy-looping here.)
1397  */
1398  if (BUF_STATE_GET_REFCOUNT(buf_state) != 0)
1399  {
1400  UnlockBufHdr(buf, buf_state);
1401  LWLockRelease(oldPartitionLock);
1402  /* safety check: should definitely not be our *own* pin */
1403  if (GetPrivateRefCount(BufferDescriptorGetBuffer(buf)) > 0)
1404  elog(ERROR, "buffer is pinned in InvalidateBuffer");
1405  WaitIO(buf);
1406  goto retry;
1407  }
1408 
1409  /*
1410  * Clear out the buffer's tag and flags. We must do this to ensure that
1411  * linear scans of the buffer array don't think the buffer is valid.
1412  */
1413  oldFlags = buf_state & BUF_FLAG_MASK;
1414  CLEAR_BUFFERTAG(buf->tag);
1415  buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
1416  UnlockBufHdr(buf, buf_state);
1417 
1418  /*
1419  * Remove the buffer from the lookup hashtable, if it was in there.
1420  */
1421  if (oldFlags & BM_TAG_VALID)
1422  BufTableDelete(&oldTag, oldHash);
1423 
1424  /*
1425  * Done with mapping lock.
1426  */
1427  LWLockRelease(oldPartitionLock);
1428 
1429  /*
1430  * Insert the buffer at the head of the list of free buffers.
1431  */
1432  StrategyFreeBuffer(buf);
1433 }
1434 
1435 /*
1436  * MarkBufferDirty
1437  *
1438  * Marks buffer contents as dirty (actual write happens later).
1439  *
1440  * Buffer must be pinned and exclusive-locked. (If caller does not hold
1441  * exclusive lock, then somebody could be in process of writing the buffer,
1442  * leading to risk of bad data written to disk.)
1443  */
1444 void
1445 MarkBufferDirty(Buffer buffer)
1446 {
1447  BufferDesc *bufHdr;
1448  uint32 buf_state;
1449  uint32 old_buf_state;
1450 
1451  if (!BufferIsValid(buffer))
1452  elog(ERROR, "bad buffer ID: %d", buffer);
1453 
1454  if (BufferIsLocal(buffer))
1455  {
1456  MarkLocalBufferDirty(buffer);
1457  return;
1458  }
1459 
1460  bufHdr = GetBufferDescriptor(buffer - 1);
1461 
1462  Assert(BufferIsPinned(buffer));
1463  Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
1464  LW_EXCLUSIVE));
1465 
1466  old_buf_state = pg_atomic_read_u32(&bufHdr->state);
1467  for (;;)
1468  {
1469  if (old_buf_state & BM_LOCKED)
1470  old_buf_state = WaitBufHdrUnlocked(bufHdr);
1471 
1472  buf_state = old_buf_state;
1473 
1474  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
1475  buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
1476 
1477  if (pg_atomic_compare_exchange_u32(&bufHdr->state, &old_buf_state,
1478  buf_state))
1479  break;
1480  }
1481 
1482  /*
1483  * If the buffer was not dirty already, do vacuum accounting.
1484  */
1485  if (!(old_buf_state & BM_DIRTY))
1486  {
1487  VacuumPageDirty++;
1488  pgBufferUsage.shared_blks_dirtied++;
1489  if (VacuumCostActive)
1490  VacuumCostBalance += VacuumCostPageDirty;
1491  }
1492 }
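/*
 * Usage sketch: callers typically dirty a buffer inside a critical section,
 * together with WAL logging, roughly
 *
 *		START_CRIT_SECTION();
 *		... modify the page ...
 *		MarkBufferDirty(buf);
 *		... XLogInsert(), then PageSetLSN() for permanent relations ...
 *		END_CRIT_SECTION();
 */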
1493 
1494 /*
1495  * ReleaseAndReadBuffer -- combine ReleaseBuffer() and ReadBuffer()
1496  *
1497  * Formerly, this saved one cycle of acquiring/releasing the BufMgrLock
1498  * compared to calling the two routines separately. Now it's mainly just
1499  * a convenience function. However, if the passed buffer is valid and
1500  * already contains the desired block, we just return it as-is; and that
1501  * does save considerable work compared to a full release and reacquire.
1502  *
1503  * Note: it is OK to pass buffer == InvalidBuffer, indicating that no old
1504  * buffer actually needs to be released. This case is the same as ReadBuffer,
1505  * but can save some tests in the caller.
1506  */
1507 Buffer
1508 ReleaseAndReadBuffer(Buffer buffer,
1509  Relation relation,
1510  BlockNumber blockNum)
1511 {
1512  ForkNumber forkNum = MAIN_FORKNUM;
1513  BufferDesc *bufHdr;
1514 
1515  if (BufferIsValid(buffer))
1516  {
1517  Assert(BufferIsPinned(buffer));
1518  if (BufferIsLocal(buffer))
1519  {
1520  bufHdr = GetLocalBufferDescriptor(-buffer - 1);
1521  if (bufHdr->tag.blockNum == blockNum &&
1522  RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node) &&
1523  bufHdr->tag.forkNum == forkNum)
1524  return buffer;
1525  ResourceOwnerForgetBuffer(CurrentResourceOwner, buffer);
1526  LocalRefCount[-buffer - 1]--;
1527  }
1528  else
1529  {
1530  bufHdr = GetBufferDescriptor(buffer - 1);
1531  /* we have pin, so it's ok to examine tag without spinlock */
1532  if (bufHdr->tag.blockNum == blockNum &&
1533  RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node) &&
1534  bufHdr->tag.forkNum == forkNum)
1535  return buffer;
1536  UnpinBuffer(bufHdr, true);
1537  }
1538  }
1539 
1540  return ReadBuffer(relation, blockNum);
1541 }
1542 
1543 /*
1544  * PinBuffer -- make buffer unavailable for replacement.
1545  *
1546  * For the default access strategy, the buffer's usage_count is incremented
1547  * when we first pin it; for other strategies we just make sure the usage_count
1548  * isn't zero. (The idea of the latter is that we don't want synchronized
1549  * heap scans to inflate the count, but we need it to not be zero to discourage
1550  * other backends from stealing buffers from our ring. As long as we cycle
1551  * through the ring faster than the global clock-sweep cycles, buffers in
1552  * our ring won't be chosen as victims for replacement by other backends.)
1553  *
1554  * This should be applied only to shared buffers, never local ones.
1555  *
1556  * Since buffers are pinned/unpinned very frequently, pin buffers without
1557  * taking the buffer header lock; instead update the state variable in loop of
1558  * CAS operations. Hopefully it's just a single CAS.
1559  *
1560  * Note that ResourceOwnerEnlargeBuffers must have been done already.
1561  *
1562  * Returns TRUE if buffer is BM_VALID, else FALSE. This provision allows
1563  * some callers to avoid an extra spinlock cycle.
1564  */
1565 static bool
1566 PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
1567 {
1568  Buffer b = BufferDescriptorGetBuffer(buf);
1569  bool result;
1570  PrivateRefCountEntry *ref;
1571 
1572  ref = GetPrivateRefCountEntry(b, true);
1573 
1574  if (ref == NULL)
1575  {
1576  uint32 buf_state;
1577  uint32 old_buf_state;
1578 
1579  ReservePrivateRefCountEntry();
1580  ref = NewPrivateRefCountEntry(b);
1581 
1582  old_buf_state = pg_atomic_read_u32(&buf->state);
1583  for (;;)
1584  {
1585  if (old_buf_state & BM_LOCKED)
1586  old_buf_state = WaitBufHdrUnlocked(buf);
1587 
1588  buf_state = old_buf_state;
1589 
1590  /* increase refcount */
1591  buf_state += BUF_REFCOUNT_ONE;
1592 
1593  /* increase usagecount unless already max */
1594  if (BUF_STATE_GET_USAGECOUNT(buf_state) != BM_MAX_USAGE_COUNT)
1595  buf_state += BUF_USAGECOUNT_ONE;
1596 
1597  if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
1598  buf_state))
1599  {
1600  result = (buf_state & BM_VALID) != 0;
1601  break;
1602  }
1603  }
1604  }
1605  else
1606  {
1607  /* If we previously pinned the buffer, it must surely be valid */
1608  result = true;
1609  }
1610 
1611  ref->refcount++;
1612  Assert(ref->refcount > 0);
1613  ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
1614  return result;
1615 }
1616 
1617 /*
1618  * PinBuffer_Locked -- as above, but caller already locked the buffer header.
1619  * The spinlock is released before return.
1620  *
1621  * As this function is called with the spinlock held, the caller has to
1622  * previously call ReservePrivateRefCountEntry().
1623  *
1624  * Currently, no callers of this function want to modify the buffer's
1625  * usage_count at all, so there's no need for a strategy parameter.
1626  * Also we don't bother with a BM_VALID test (the caller could check that for
1627  * itself).
1628  *
1629  * Also all callers only ever use this function when it's known that the
1630  * buffer can't have a preexisting pin by this backend. That allows us to skip
1631  * searching the private refcount array & hash, which is a boon, because the
1632  * spinlock is still held.
1633  *
1634  * Note: use of this routine is frequently mandatory, not just an optimization
1635  * to save a spin lock/unlock cycle, because we need to pin a buffer before
1636  * its state can change under us.
1637  */
1638 static void
1639 PinBuffer_Locked(BufferDesc *buf)
1640 {
1641  Buffer b;
1642  PrivateRefCountEntry *ref;
1643  uint32 buf_state;
1644 
1645  /*
1646  * As explained, we don't expect any preexisting pins. That allows us to
1647  * manipulate the PrivateRefCount after releasing the spinlock.
1648  */
1649  Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
1650 
1651  /*
1652  * Since we hold the buffer spinlock, we can update the buffer state and
1653  * release the lock in one operation.
1654  */
1655  buf_state = pg_atomic_read_u32(&buf->state);
1656  Assert(buf_state & BM_LOCKED);
1657  buf_state += BUF_REFCOUNT_ONE;
1658  UnlockBufHdr(buf, buf_state);
1659 
1660  b = BufferDescriptorGetBuffer(buf);
1661 
1662  ref = NewPrivateRefCountEntry(b);
1663  ref->refcount++;
1664 
1665  ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
1666 }
1667 
1668 /*
1669  * UnpinBuffer -- make buffer available for replacement.
1670  *
1671  * This should be applied only to shared buffers, never local ones.
1672  *
1673  * Most but not all callers want CurrentResourceOwner to be adjusted.
1674  * Those that don't should pass fixOwner = FALSE.
1675  */
1676 static void
1677 UnpinBuffer(BufferDesc *buf, bool fixOwner)
1678 {
1679  PrivateRefCountEntry *ref;
1680  Buffer b = BufferDescriptorGetBuffer(buf);
1681 
1682  /* not moving as we're likely deleting it soon anyway */
1683  ref = GetPrivateRefCountEntry(b, false);
1684  Assert(ref != NULL);
1685 
1686  if (fixOwner)
1687  ResourceOwnerForgetBuffer(CurrentResourceOwner, b);
1688 
1689  Assert(ref->refcount > 0);
1690  ref->refcount--;
1691  if (ref->refcount == 0)
1692  {
1693  uint32 buf_state;
1694  uint32 old_buf_state;
1695 
1696  /* I'd better not still hold any locks on the buffer */
1697  Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf)));
1698  Assert(!LWLockHeldByMe(BufferDescriptorGetIOLock(buf)));
1699 
1700  /*
1701  * Decrement the shared reference count.
1702  *
1703  * Since buffer spinlock holder can update status using just write,
1704  * it's not safe to use atomic decrement here; thus use a CAS loop.
1705  */
1706  old_buf_state = pg_atomic_read_u32(&buf->state);
1707  for (;;)
1708  {
1709  if (old_buf_state & BM_LOCKED)
1710  old_buf_state = WaitBufHdrUnlocked(buf);
1711 
1712  buf_state = old_buf_state;
1713 
1714  buf_state -= BUF_REFCOUNT_ONE;
1715 
1716  if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
1717  buf_state))
1718  break;
1719  }
1720 
1721  /* Support LockBufferForCleanup() */
1722  if (buf_state & BM_PIN_COUNT_WAITER)
1723  {
1724  /*
1725  * Acquire the buffer header lock, re-check that there's a waiter.
1726  * Another backend could have unpinned this buffer, and already
1727  * woken up the waiter. There's no danger of the buffer being
1728  * replaced after we unpinned it above, as it's pinned by the
1729  * waiter.
1730  */
1731  buf_state = LockBufHdr(buf);
1732 
1733  if ((buf_state & BM_PIN_COUNT_WAITER) &&
1734  BUF_STATE_GET_REFCOUNT(buf_state) == 1)
1735  {
1736  /* we just released the last pin other than the waiter's */
1737  int wait_backend_pid = buf->wait_backend_pid;
1738 
1739  buf_state &= ~BM_PIN_COUNT_WAITER;
1740  UnlockBufHdr(buf, buf_state);
1741  ProcSendSignal(wait_backend_pid);
1742  }
1743  else
1744  UnlockBufHdr(buf, buf_state);
1745  }
1746  ForgetPrivateRefCountEntry(ref);
1747  }
1748 }
1749 
1750 /*
1751  * BufferSync -- Write out all dirty buffers in the pool.
1752  *
1753  * This is called at checkpoint time to write out all dirty shared buffers.
1754  * The checkpoint request flags should be passed in. If CHECKPOINT_IMMEDIATE
1755  * is set, we disable delays between writes; if CHECKPOINT_IS_SHUTDOWN,
1756  * CHECKPOINT_END_OF_RECOVERY or CHECKPOINT_FLUSH_ALL is set, we write even
1757  * unlogged buffers, which are otherwise skipped. The remaining flags
1758  * currently have no effect here.
1759  */
1760 static void
1761 BufferSync(int flags)
1762 {
1763  uint32 buf_state;
1764  int buf_id;
1765  int num_to_scan;
1766  int num_spaces;
1767  int num_processed;
1768  int num_written;
1769  CkptTsStatus *per_ts_stat = NULL;
1770  Oid last_tsid;
1771  binaryheap *ts_heap;
1772  int i;
1773  int mask = BM_DIRTY;
1774  WritebackContext wb_context;
1775 
1776  /* Make sure we can handle the pin inside SyncOneBuffer */
1777  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
1778 
1779  /*
1780  * Unless this is a shutdown checkpoint or we have been explicitly told,
1781  * we write only permanent, dirty buffers. But at shutdown or end of
1782  * recovery, we write all dirty buffers.
1783  */
1784  if (!((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
1785  CHECKPOINT_FLUSH_ALL))))
1786  mask |= BM_PERMANENT;
1787 
1788  /*
1789  * Loop over all buffers, and mark the ones that need to be written with
1790  * BM_CHECKPOINT_NEEDED. Count them as we go (num_to_scan), so that we
1791  * can estimate how much work needs to be done.
1792  *
1793  * This allows us to write only those pages that were dirty when the
1794  * checkpoint began, and not those that get dirtied while it proceeds.
1795  * Whenever a page with BM_CHECKPOINT_NEEDED is written out, either by us
1796  * later in this function, or by normal backends or the bgwriter cleaning
1797  * scan, the flag is cleared. Any buffer dirtied after this point won't
1798  * have the flag set.
1799  *
1800  * Note that if we fail to write some buffer, we may leave buffers with
1801  * BM_CHECKPOINT_NEEDED still set. This is OK since any such buffer would
1802  * certainly need to be written for the next checkpoint attempt, too.
1803  */
1804  num_to_scan = 0;
1805  for (buf_id = 0; buf_id < NBuffers; buf_id++)
1806  {
1807  BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
1808 
1809  /*
1810  * Header spinlock is enough to examine BM_DIRTY, see comment in
1811  * SyncOneBuffer.
1812  */
1813  buf_state = LockBufHdr(bufHdr);
1814 
1815  if ((buf_state & mask) == mask)
1816  {
1817  CkptSortItem *item;
1818 
1819  buf_state |= BM_CHECKPOINT_NEEDED;
1820 
1821  item = &CkptBufferIds[num_to_scan++];
1822  item->buf_id = buf_id;
1823  item->tsId = bufHdr->tag.rnode.spcNode;
1824  item->relNode = bufHdr->tag.rnode.relNode;
1825  item->forkNum = bufHdr->tag.forkNum;
1826  item->blockNum = bufHdr->tag.blockNum;
1827  }
1828 
1829  UnlockBufHdr(bufHdr, buf_state);
1830  }
1831 
1832  if (num_to_scan == 0)
1833  return; /* nothing to do */
1834 
1836 
1837  TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_scan);
1838 
1839  /*
1840  * Sort buffers that need to be written to reduce the likelihood of random
1841  * IO. The sorting is also important for the implementation of balancing
1842  * writes between tablespaces. Without balancing writes we'd potentially
1843  * end up writing to the tablespaces one-by-one, possibly overloading the
1844  * underlying system.
1845  */
1846  qsort(CkptBufferIds, num_to_scan, sizeof(CkptSortItem),
1848 
1849  num_spaces = 0;
1850 
1851  /*
1852  * Allocate progress status for each tablespace with buffers that need to
1853  * be flushed. This requires the to-be-flushed array to be sorted.
1854  */
1855  last_tsid = InvalidOid;
1856  for (i = 0; i < num_to_scan; i++)
1857  {
1858  CkptTsStatus *s;
1859  Oid cur_tsid;
1860 
1861  cur_tsid = CkptBufferIds[i].tsId;
1862 
1863  /*
1864  * Grow array of per-tablespace status structs, every time a new
1865  * tablespace is found.
1866  */
1867  if (last_tsid == InvalidOid || last_tsid != cur_tsid)
1868  {
1869  Size sz;
1870 
1871  num_spaces++;
1872 
1873  /*
1874  * Not worth adding grow-by-power-of-2 logic here - even with a
1875  * few hundred tablespaces this should be fine.
1876  */
1877  sz = sizeof(CkptTsStatus) * num_spaces;
1878 
1879  if (per_ts_stat == NULL)
1880  per_ts_stat = (CkptTsStatus *) palloc(sz);
1881  else
1882  per_ts_stat = (CkptTsStatus *) repalloc(per_ts_stat, sz);
1883 
1884  s = &per_ts_stat[num_spaces - 1];
1885  memset(s, 0, sizeof(*s));
1886  s->tsId = cur_tsid;
1887 
1888  /*
1889  * The first buffer in this tablespace. As CkptBufferIds is sorted
1890  * by tablespace all (s->num_to_scan) buffers in this tablespace
1891  * will follow afterwards.
1892  */
1893  s->index = i;
1894 
1895  /*
1896  * progress_slice will be determined once we know how many buffers
1897  * are in each tablespace, i.e. after this loop.
1898  */
1899 
1900  last_tsid = cur_tsid;
1901  }
1902  else
1903  {
1904  s = &per_ts_stat[num_spaces - 1];
1905  }
1906 
1907  s->num_to_scan++;
1908  }
1909 
1910  Assert(num_spaces > 0);
1911 
1912  /*
1913  * Build a min-heap over the write-progress in the individual tablespaces,
1914  * and compute how large a portion of the total progress a single
1915  * processed buffer is.
1916  */
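 /*
  * For illustration only, with made-up numbers: if num_to_scan is 1000
  * buffers in total and one tablespace owns 250 of them, that tablespace
  * gets
  *
  *     progress_slice = 1000 / 250 = 4.0
  *
  * while a tablespace owning only 10 buffers gets a slice of 100.0. Each
  * processed buffer advances its tablespace's progress by its slice, so
  * every tablespace approaches the common finish line of 1000 at the same
  * relative rate, and the min-heap below always hands the next write to
  * whichever tablespace is proportionally furthest behind.
  */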
1917  ts_heap = binaryheap_allocate(num_spaces,
1919  NULL);
1920 
1921  for (i = 0; i < num_spaces; i++)
1922  {
1923  CkptTsStatus *ts_stat = &per_ts_stat[i];
1924 
1925  ts_stat->progress_slice = (float8) num_to_scan / ts_stat->num_to_scan;
1926 
1927  binaryheap_add_unordered(ts_heap, PointerGetDatum(ts_stat));
1928  }
1929 
1930  binaryheap_build(ts_heap);
1931 
1932  /*
1933  * Iterate through to-be-checkpointed buffers and write the ones (still)
1934  * marked with BM_CHECKPOINT_NEEDED. The writes are balanced between
1935  * tablespaces; otherwise the sorting would lead to only one tablespace
1936  * receiving writes at a time, making inefficient use of the hardware.
1937  */
1938  num_processed = 0;
1939  num_written = 0;
1940  while (!binaryheap_empty(ts_heap))
1941  {
1942  BufferDesc *bufHdr = NULL;
1943  CkptTsStatus *ts_stat = (CkptTsStatus *)
1945 
1946  buf_id = CkptBufferIds[ts_stat->index].buf_id;
1947  Assert(buf_id != -1);
1948 
1949  bufHdr = GetBufferDescriptor(buf_id);
1950 
1951  num_processed++;
1952 
1953  /*
1954  * We don't need to acquire the lock here, because we're only looking
1955  * at a single bit. It's possible that someone else writes the buffer
1956  * and clears the flag right after we check, but that doesn't matter
1957  * since SyncOneBuffer will then do nothing. However, there is a
1958  * further race condition: it's conceivable that between the time we
1959  * examine the bit here and the time SyncOneBuffer acquires the lock,
1960  * someone else not only wrote the buffer but replaced it with another
1961  * page and dirtied it. In that improbable case, SyncOneBuffer will
1962  * write the buffer though we didn't need to. It doesn't seem worth
1963  * guarding against this, though.
1964  */
1965  if (pg_atomic_read_u32(&bufHdr->state) & BM_CHECKPOINT_NEEDED)
1966  {
1967  if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
1968  {
1969  TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
1971  num_written++;
1972  }
1973  }
1974 
1975  /*
1976  * Measure progress independently of whether we actually had to flush
1977  * the buffer - otherwise the writes would become unbalanced.
1978  */
1979  ts_stat->progress += ts_stat->progress_slice;
1980  ts_stat->num_scanned++;
1981  ts_stat->index++;
1982 
1983  /* Have all the buffers from the tablespace been processed? */
1984  if (ts_stat->num_scanned == ts_stat->num_to_scan)
1985  {
1986  binaryheap_remove_first(ts_heap);
1987  }
1988  else
1989  {
1990  /* update heap with the new progress */
1991  binaryheap_replace_first(ts_heap, PointerGetDatum(ts_stat));
1992  }
1993 
1994  /*
1995  * Sleep to throttle our I/O rate.
1996  */
1997  CheckpointWriteDelay(flags, (double) num_processed / num_to_scan);
1998  }
1999 
2000  /* issue all pending flushes */
2001  IssuePendingWritebacks(&wb_context);
2002 
2003  pfree(per_ts_stat);
2004  per_ts_stat = NULL;
2005  binaryheap_free(ts_heap);
2006 
2007  /*
2008  * Update checkpoint statistics. As noted above, this doesn't include
2009  * buffers written by other backends or bgwriter scan.
2010  */
2011  CheckpointStats.ckpt_bufs_written += num_written;
2012 
2013  TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_scan);
2014 }
2015 
2016 /*
2017  * BgBufferSync -- Write out some dirty buffers in the pool.
2018  *
2019  * This is called periodically by the background writer process.
2020  *
2021  * Returns true if it's appropriate for the bgwriter process to go into
2022  * low-power hibernation mode. (This happens if the strategy clock sweep
2023  * has been "lapped" and no buffer allocations have occurred recently,
2024  * or if the bgwriter has been effectively disabled by setting
2025  * bgwriter_lru_maxpages to 0.)
2026  */
2027 bool
2029 {
2030  /* info obtained from freelist.c */
2031  int strategy_buf_id;
2032  uint32 strategy_passes;
2033  uint32 recent_alloc;
2034 
2035  /*
2036  * Information saved between calls so we can determine the strategy
2037  * point's advance rate and avoid scanning already-cleaned buffers.
2038  */
2039  static bool saved_info_valid = false;
2040  static int prev_strategy_buf_id;
2041  static uint32 prev_strategy_passes;
2042  static int next_to_clean;
2043  static uint32 next_passes;
2044 
2045  /* Moving averages of allocation rate and clean-buffer density */
2046  static float smoothed_alloc = 0;
2047  static float smoothed_density = 10.0;
2048 
2049  /* Potentially these could be tunables, but for now, not */
2050  float smoothing_samples = 16;
2051  float scan_whole_pool_milliseconds = 120000.0;
2052 
2053  /* Used to compute how far we scan ahead */
2054  long strategy_delta;
2055  int bufs_to_lap;
2056  int bufs_ahead;
2057  float scans_per_alloc;
2058  int reusable_buffers_est;
2059  int upcoming_alloc_est;
2060  int min_scan_buffers;
2061 
2062  /* Variables for the scanning loop proper */
2063  int num_to_scan;
2064  int num_written;
2065  int reusable_buffers;
2066 
2067  /* Variables for final smoothed_density update */
2068  long new_strategy_delta;
2069  uint32 new_recent_alloc;
2070 
2071  /*
2072  * Find out where the freelist clock sweep currently is, and how many
2073  * buffer allocations have happened since our last call.
2074  */
2075  strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc);
2076 
2077  /* Report buffer alloc counts to pgstat */
2078  BgWriterStats.m_buf_alloc += recent_alloc;
2079 
2080  /*
2081  * If we're not running the LRU scan, just stop after doing the stats
2082  * stuff. We mark the saved state invalid so that we can recover sanely
2083  * if LRU scan is turned back on later.
2084  */
2085  if (bgwriter_lru_maxpages <= 0)
2086  {
2087  saved_info_valid = false;
2088  return true;
2089  }
2090 
2091  /*
2092  * Compute strategy_delta = how many buffers have been scanned by the
2093  * clock sweep since last time. If first time through, assume none. Then
2094  * see if we are still ahead of the clock sweep, and if so, how many
2095  * buffers we could scan before we'd catch up with it and "lap" it. Note:
2096  * the weird-looking coding of the xxx_passes comparisons is there to avoid
2097  * bogus behavior when the passes counts wrap around.
2098  */
2099  if (saved_info_valid)
2100  {
2101  int32 passes_delta = strategy_passes - prev_strategy_passes;
2102 
2103  strategy_delta = strategy_buf_id - prev_strategy_buf_id;
2104  strategy_delta += (long) passes_delta *NBuffers;
2105 
2106  Assert(strategy_delta >= 0);
2107 
2108  if ((int32) (next_passes - strategy_passes) > 0)
2109  {
2110  /* we're one pass ahead of the strategy point */
2111  bufs_to_lap = strategy_buf_id - next_to_clean;
2112 #ifdef BGW_DEBUG
2113  elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
2114  next_passes, next_to_clean,
2115  strategy_passes, strategy_buf_id,
2116  strategy_delta, bufs_to_lap);
2117 #endif
2118  }
2119  else if (next_passes == strategy_passes &&
2120  next_to_clean >= strategy_buf_id)
2121  {
2122  /* on same pass, but ahead or at least not behind */
2123  bufs_to_lap = NBuffers - (next_to_clean - strategy_buf_id);
2124 #ifdef BGW_DEBUG
2125  elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
2126  next_passes, next_to_clean,
2127  strategy_passes, strategy_buf_id,
2128  strategy_delta, bufs_to_lap);
2129 #endif
2130  }
2131  else
2132  {
2133  /*
2134  * We're behind, so skip forward to the strategy point and start
2135  * cleaning from there.
2136  */
2137 #ifdef BGW_DEBUG
2138  elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
2139  next_passes, next_to_clean,
2140  strategy_passes, strategy_buf_id,
2141  strategy_delta);
2142 #endif
2143  next_to_clean = strategy_buf_id;
2144  next_passes = strategy_passes;
2145  bufs_to_lap = NBuffers;
2146  }
2147  }
2148  else
2149  {
2150  /*
2151  * Initializing at startup or after LRU scanning had been off. Always
2152  * start at the strategy point.
2153  */
2154 #ifdef BGW_DEBUG
2155  elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
2156  strategy_passes, strategy_buf_id);
2157 #endif
2158  strategy_delta = 0;
2159  next_to_clean = strategy_buf_id;
2160  next_passes = strategy_passes;
2161  bufs_to_lap = NBuffers;
2162  }
2163 
2164  /* Update saved info for next time */
2165  prev_strategy_buf_id = strategy_buf_id;
2166  prev_strategy_passes = strategy_passes;
2167  saved_info_valid = true;
2168 
2169  /*
2170  * Compute how many buffers had to be scanned for each new allocation, ie,
2171  * 1/density of reusable buffers, and track a moving average of that.
2172  *
2173  * If the strategy point didn't move, we don't update the density estimate
2174  */
2175  if (strategy_delta > 0 && recent_alloc > 0)
2176  {
2177  scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
2178  smoothed_density += (scans_per_alloc - smoothed_density) /
2179  smoothing_samples;
2180  }
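 /*
  * For illustration only, with invented numbers: if the strategy point
  * advanced by strategy_delta = 300 buffers while recent_alloc = 30
  * buffers were handed out, then
  *
  *     scans_per_alloc = 300 / 30 = 10.0
  *
  * and with smoothing_samples = 16 a previous smoothed_density of, say,
  * 7.0 only moves to 7.0 + (10.0 - 7.0) / 16 = 7.1875, so one noisy
  * interval cannot swing the density estimate very far.
  */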
2181 
2182  /*
2183  * Estimate how many reusable buffers there are between the current
2184  * strategy point and where we've scanned ahead to, based on the smoothed
2185  * density estimate.
2186  */
2187  bufs_ahead = NBuffers - bufs_to_lap;
2188  reusable_buffers_est = (float) bufs_ahead / smoothed_density;
2189 
2190  /*
2191  * Track a moving average of recent buffer allocations. Here, rather than
2192  * a true average we want a fast-attack, slow-decline behavior: we
2193  * immediately follow any increase.
2194  */
2195  if (smoothed_alloc <= (float) recent_alloc)
2196  smoothed_alloc = recent_alloc;
2197  else
2198  smoothed_alloc += ((float) recent_alloc - smoothed_alloc) /
2199  smoothing_samples;
2200 
2201  /* Scale the estimate by a GUC to allow more aggressive tuning. */
2202  upcoming_alloc_est = (int) (smoothed_alloc * bgwriter_lru_multiplier);
2203 
2204  /*
2205  * If recent_alloc remains at zero for many cycles, smoothed_alloc will
2206  * eventually underflow to zero, and the underflows produce annoying
2207  * kernel warnings on some platforms. Once upcoming_alloc_est has gone to
2208  * zero, there's no point in tracking smaller and smaller values of
2209  * smoothed_alloc, so just reset it to exactly zero to avoid this
2210  * syndrome. It will pop back up as soon as recent_alloc increases.
2211  */
2212  if (upcoming_alloc_est == 0)
2213  smoothed_alloc = 0;
2214 
2215  /*
2216  * Even in cases where there's been little or no buffer allocation
2217  * activity, we want to make a small amount of progress through the buffer
2218  * cache so that as many reusable buffers as possible are clean after an
2219  * idle period.
2220  *
2221  * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many times
2222  * the BGW will be called during the scan_whole_pool time; slice the
2223  * buffer pool into that many sections.
2224  */
2225  min_scan_buffers = (int) (NBuffers / (scan_whole_pool_milliseconds / BgWriterDelay));
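 /*
  * As a rough worked example (assuming the default shared_buffers of
  * 128MB, i.e. NBuffers = 16384, and the default bgwriter_delay of 200ms):
  *
  *     min_scan_buffers = 16384 / (120000.0 / 200) = 16384 / 600 = ~27
  *
  * so each bgwriter round covers at least about 27 buffers, and the whole
  * pool is visited roughly every two minutes even when nothing is being
  * allocated.
  */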
2226 
2227  if (upcoming_alloc_est < (min_scan_buffers + reusable_buffers_est))
2228  {
2229 #ifdef BGW_DEBUG
2230  elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
2231  upcoming_alloc_est, min_scan_buffers, reusable_buffers_est);
2232 #endif
2233  upcoming_alloc_est = min_scan_buffers + reusable_buffers_est;
2234  }
2235 
2236  /*
2237  * Now write out dirty reusable buffers, working forward from the
2238  * next_to_clean point, until we have lapped the strategy scan, or cleaned
2239  * enough buffers to match our estimate of the next cycle's allocation
2240  * requirements, or hit the bgwriter_lru_maxpages limit.
2241  */
2242 
2243  /* Make sure we can handle the pin inside SyncOneBuffer */
2245 
2246  num_to_scan = bufs_to_lap;
2247  num_written = 0;
2248  reusable_buffers = reusable_buffers_est;
2249 
2250  /* Execute the LRU scan */
2251  while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
2252  {
2253  int sync_state = SyncOneBuffer(next_to_clean, true,
2254  wb_context);
2255 
2256  if (++next_to_clean >= NBuffers)
2257  {
2258  next_to_clean = 0;
2259  next_passes++;
2260  }
2261  num_to_scan--;
2262 
2263  if (sync_state & BUF_WRITTEN)
2264  {
2265  reusable_buffers++;
2266  if (++num_written >= bgwriter_lru_maxpages)
2267  {
2269  break;
2270  }
2271  }
2272  else if (sync_state & BUF_REUSABLE)
2273  reusable_buffers++;
2274  }
2275 
2276  BgWriterStats.m_buf_written_clean += num_written;
2277 
2278 #ifdef BGW_DEBUG
2279  elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
2280  recent_alloc, smoothed_alloc, strategy_delta, bufs_ahead,
2281  smoothed_density, reusable_buffers_est, upcoming_alloc_est,
2282  bufs_to_lap - num_to_scan,
2283  num_written,
2284  reusable_buffers - reusable_buffers_est);
2285 #endif
2286 
2287  /*
2288  * Consider the above scan as being like a new allocation scan.
2289  * Characterize its density and update the smoothed one based on it. This
2290  * effectively halves the moving average period in cases where both the
2291  * strategy and the background writer are doing some useful scanning,
2292  * which is helpful because a long memory isn't as desirable on the
2293  * density estimates.
2294  */
2295  new_strategy_delta = bufs_to_lap - num_to_scan;
2296  new_recent_alloc = reusable_buffers - reusable_buffers_est;
2297  if (new_strategy_delta > 0 && new_recent_alloc > 0)
2298  {
2299  scans_per_alloc = (float) new_strategy_delta / (float) new_recent_alloc;
2300  smoothed_density += (scans_per_alloc - smoothed_density) /
2301  smoothing_samples;
2302 
2303 #ifdef BGW_DEBUG
2304  elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f",
2305  new_recent_alloc, new_strategy_delta,
2306  scans_per_alloc, smoothed_density);
2307 #endif
2308  }
2309 
2310  /* Return true if OK to hibernate */
2311  return (bufs_to_lap == 0 && recent_alloc == 0);
2312 }
2313 
2314 /*
2315  * SyncOneBuffer -- process a single buffer during syncing.
2316  *
2317  * If skip_recently_used is true, we don't write currently-pinned buffers, nor
2318  * buffers marked recently used, as these are not replacement candidates.
2319  *
2320  * Returns a bitmask containing the following flag bits:
2321  * BUF_WRITTEN: we wrote the buffer.
2322  * BUF_REUSABLE: buffer is available for replacement, ie, it has
2323  * pin count 0 and usage count 0.
2324  *
2325  * (BUF_WRITTEN could be set in error if FlushBuffer finds the buffer clean
2326  * after locking it, but we don't care all that much.)
2327  *
2328  * Note: caller must have done ResourceOwnerEnlargeBuffers.
2329  */
2330 static int
2331 SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
2332 {
2333  BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
2334  int result = 0;
2335  uint32 buf_state;
2336  BufferTag tag;
2337 
2339 
2340  /*
2341  * Check whether buffer needs writing.
2342  *
2343  * We can make this check without taking the buffer content lock so long
2344  * as we mark pages dirty in access methods *before* logging changes with
2345  * XLogInsert(): if someone marks the buffer dirty just after our check, we
2346  * don't worry, because the checkpoint's redo pointer precedes the WAL record
2347  * for the upcoming changes, so we are not required to write such a buffer.
2348  */
2349  buf_state = LockBufHdr(bufHdr);
2350 
2351  if (BUF_STATE_GET_REFCOUNT(buf_state) == 0 &&
2352  BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
2353  {
2354  result |= BUF_REUSABLE;
2355  }
2356  else if (skip_recently_used)
2357  {
2358  /* Caller told us not to write recently-used buffers */
2359  UnlockBufHdr(bufHdr, buf_state);
2360  return result;
2361  }
2362 
2363  if (!(buf_state & BM_VALID) || !(buf_state & BM_DIRTY))
2364  {
2365  /* It's clean, so nothing to do */
2366  UnlockBufHdr(bufHdr, buf_state);
2367  return result;
2368  }
2369 
2370  /*
2371  * Pin it, share-lock it, write it. (FlushBuffer will do nothing if the
2372  * buffer is clean by the time we've locked it.)
2373  */
2374  PinBuffer_Locked(bufHdr);
2376 
2377  FlushBuffer(bufHdr, NULL);
2378 
2380 
2381  tag = bufHdr->tag;
2382 
2383  UnpinBuffer(bufHdr, true);
2384 
2385  ScheduleBufferTagForWriteback(wb_context, &tag);
2386 
2387  return result | BUF_WRITTEN;
2388 }
2389 
2390 /*
2391  * AtEOXact_Buffers - clean up at end of transaction.
2392  *
2393  * As of PostgreSQL 8.0, buffer pins should get released by the
2394  * ResourceOwner mechanism. This routine is just a debugging
2395  * cross-check that no pins remain.
2396  */
2397 void
2398 AtEOXact_Buffers(bool isCommit)
2399 {
2401 
2402  AtEOXact_LocalBuffers(isCommit);
2403 
2405 }
2406 
2407 /*
2408  * Initialize access to shared buffer pool
2409  *
2410  * This is called during backend startup (whether standalone or under the
2411  * postmaster). It sets up for this backend's access to the already-existing
2412  * buffer pool.
2413  *
2414  * NB: this is called before InitProcess(), so we do not have a PGPROC and
2415  * cannot do LWLockAcquire; hence we can't actually access stuff in
2416  * shared memory yet. We are only initializing local data here.
2417  * (See also InitBufferPoolBackend)
2418  */
2419 void
2421 {
2422  HASHCTL hash_ctl;
2423 
2424  memset(&PrivateRefCountArray, 0, sizeof(PrivateRefCountArray));
2425 
2426  MemSet(&hash_ctl, 0, sizeof(hash_ctl));
2427  hash_ctl.keysize = sizeof(int32);
2428  hash_ctl.entrysize = sizeof(PrivateRefCountEntry);
2429 
2430  PrivateRefCountHash = hash_create("PrivateRefCount", 100, &hash_ctl,
2431  HASH_ELEM | HASH_BLOBS);
2432 }
2433 
2434 /*
2435  * InitBufferPoolBackend --- second-stage initialization of a new backend
2436  *
2437  * This is called after we have acquired a PGPROC and so can safely get
2438  * LWLocks. We don't currently need to do anything at this stage ...
2439  * except register a shmem-exit callback. AtProcExit_Buffers needs LWLock
2440  * access, and thereby has to be called at the corresponding phase of
2441  * backend shutdown.
2442  */
2443 void
2445 {
2447 }
2448 
2449 /*
2450  * During backend exit, ensure that we released all shared-buffer locks and
2451  * assert that we have no remaining pins.
2452  */
2453 static void
2455 {
2456  AbortBufferIO();
2457  UnlockBuffers();
2458 
2460 
2461  /* localbuf.c needs a chance too */
2463 }
2464 
2465 /*
2466  * CheckForBufferLeaks - ensure this backend holds no buffer pins
2467  *
2468  * As of PostgreSQL 8.0, buffer pins should get released by the
2469  * ResourceOwner mechanism. This routine is just a debugging
2470  * cross-check that no pins remain.
2471  */
2472 static void
2474 {
2475 #ifdef USE_ASSERT_CHECKING
2476  int RefCountErrors = 0;
2477  PrivateRefCountEntry *res;
2478  int i;
2479 
2480  /* check the array */
2481  for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
2482  {
2483  res = &PrivateRefCountArray[i];
2484 
2485  if (res->buffer != InvalidBuffer)
2486  {
2488  RefCountErrors++;
2489  }
2490  }
2491 
2492  /* if necessary search the hash */
2494  {
2495  HASH_SEQ_STATUS hstat;
2496 
2497  hash_seq_init(&hstat, PrivateRefCountHash);
2498  while ((res = (PrivateRefCountEntry *) hash_seq_search(&hstat)) != NULL)
2499  {
2501  RefCountErrors++;
2502  }
2503 
2504  }
2505 
2506  Assert(RefCountErrors == 0);
2507 #endif
2508 }
2509 
2510 /*
2511  * Helper routine to issue warnings when a buffer is unexpectedly pinned
2512  */
2513 void
2515 {
2516  BufferDesc *buf;
2517  int32 loccount;
2518  char *path;
2519  BackendId backend;
2520  uint32 buf_state;
2521 
2522  Assert(BufferIsValid(buffer));
2523  if (BufferIsLocal(buffer))
2524  {
2525  buf = GetLocalBufferDescriptor(-buffer - 1);
2526  loccount = LocalRefCount[-buffer - 1];
2527  backend = MyBackendId;
2528  }
2529  else
2530  {
2531  buf = GetBufferDescriptor(buffer - 1);
2532  loccount = GetPrivateRefCount(buffer);
2533  backend = InvalidBackendId;
2534  }
2535 
2536  /* theoretically we should lock the bufhdr here */
2537  path = relpathbackend(buf->tag.rnode, backend, buf->tag.forkNum);
2538  buf_state = pg_atomic_read_u32(&buf->state);
2539  elog(WARNING,
2540  "buffer refcount leak: [%03d] "
2541  "(rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
2542  buffer, path,
2543  buf->tag.blockNum, buf_state & BUF_FLAG_MASK,
2544  BUF_STATE_GET_REFCOUNT(buf_state), loccount);
2545  pfree(path);
2546 }
2547 
2548 /*
2549  * CheckPointBuffers
2550  *
2551  * Flush all dirty blocks in buffer pool to disk at checkpoint time.
2552  *
2553  * Note: temporary relations do not participate in checkpoints, so they don't
2554  * need to be flushed.
2555  */
2556 void
2558 {
2559  TRACE_POSTGRESQL_BUFFER_CHECKPOINT_START(flags);
2561  BufferSync(flags);
2563  TRACE_POSTGRESQL_BUFFER_CHECKPOINT_SYNC_START();
2564  smgrsync();
2566  TRACE_POSTGRESQL_BUFFER_CHECKPOINT_DONE();
2567 }
2568 
2569 
2570 /*
2571  * Do whatever is needed to prepare for commit at the bufmgr and smgr levels
2572  */
2573 void
2575 {
2576  /* Nothing to do in bufmgr anymore... */
2577 }
2578 
2579 /*
2580  * BufferGetBlockNumber
2581  * Returns the block number associated with a buffer.
2582  *
2583  * Note:
2584  * Assumes that the buffer is valid and pinned, else the
2585  * value may be obsolete immediately...
2586  */
2589 {
2590  BufferDesc *bufHdr;
2591 
2592  Assert(BufferIsPinned(buffer));
2593 
2594  if (BufferIsLocal(buffer))
2595  bufHdr = GetLocalBufferDescriptor(-buffer - 1);
2596  else
2597  bufHdr = GetBufferDescriptor(buffer - 1);
2598 
2599  /* pinned, so OK to read tag without spinlock */
2600  return bufHdr->tag.blockNum;
2601 }
2602 
2603 /*
2604  * BufferGetTag
2605  * Returns the relfilenode, fork number and block number associated with
2606  * a buffer.
2607  */
2608 void
2610  BlockNumber *blknum)
2611 {
2612  BufferDesc *bufHdr;
2613 
2614  /* Do the same checks as BufferGetBlockNumber. */
2615  Assert(BufferIsPinned(buffer));
2616 
2617  if (BufferIsLocal(buffer))
2618  bufHdr = GetLocalBufferDescriptor(-buffer - 1);
2619  else
2620  bufHdr = GetBufferDescriptor(buffer - 1);
2621 
2622  /* pinned, so OK to read tag without spinlock */
2623  *rnode = bufHdr->tag.rnode;
2624  *forknum = bufHdr->tag.forkNum;
2625  *blknum = bufHdr->tag.blockNum;
2626 }
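/*
 * A minimal usage sketch (hypothetical caller, for illustration): with a
 * pinned buffer in hand, BufferGetTag() is the way to find out which page it
 * holds, e.g. to build an error message or queue a writeback request:
 *
 *		RelFileNode rnode;
 *		ForkNumber	forknum;
 *		BlockNumber blkno;
 *
 *		BufferGetTag(buf, &rnode, &forknum, &blkno);
 */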
2627 
2628 /*
2629  * FlushBuffer
2630  * Physically write out a shared buffer.
2631  *
2632  * NOTE: this actually just passes the buffer contents to the kernel; the
2633  * real write to disk won't happen until the kernel feels like it. This
2634  * is okay from our point of view since we can redo the changes from WAL.
2635  * However, we will need to force the changes to disk via fsync before
2636  * we can checkpoint WAL.
2637  *
2638  * The caller must hold a pin on the buffer and have share-locked the
2639  * buffer contents. (Note: a share-lock does not prevent updates of
2640  * hint bits in the buffer, so the page could change while the write
2641  * is in progress, but we assume that that will not invalidate the data
2642  * written.)
2643  *
2644  * If the caller has an smgr reference for the buffer's relation, pass it
2645  * as the second parameter. If not, pass NULL.
2646  */
2647 static void
2649 {
2650  XLogRecPtr recptr;
2651  ErrorContextCallback errcallback;
2652  instr_time io_start,
2653  io_time;
2654  Block bufBlock;
2655  char *bufToWrite;
2656  uint32 buf_state;
2657 
2658  /*
2659  * Acquire the buffer's io_in_progress lock. If StartBufferIO returns
2660  * false, then someone else flushed the buffer before we could, so we need
2661  * not do anything.
2662  */
2663  if (!StartBufferIO(buf, false))
2664  return;
2665 
2666  /* Setup error traceback support for ereport() */
2668  errcallback.arg = (void *) buf;
2669  errcallback.previous = error_context_stack;
2670  error_context_stack = &errcallback;
2671 
2672  /* Find smgr relation for buffer */
2673  if (reln == NULL)
2674  reln = smgropen(buf->tag.rnode, InvalidBackendId);
2675 
2676  TRACE_POSTGRESQL_BUFFER_FLUSH_START(buf->tag.forkNum,
2677  buf->tag.blockNum,
2678  reln->smgr_rnode.node.spcNode,
2679  reln->smgr_rnode.node.dbNode,
2680  reln->smgr_rnode.node.relNode);
2681 
2682  buf_state = LockBufHdr(buf);
2683 
2684  /*
2685  * Run PageGetLSN while holding header lock, since we don't have the
2686  * buffer locked exclusively in all cases.
2687  */
2688  recptr = BufferGetLSN(buf);
2689 
2690  /* To check if block content changes while flushing. - vadim 01/17/97 */
2691  buf_state &= ~BM_JUST_DIRTIED;
2692  UnlockBufHdr(buf, buf_state);
2693 
2694  /*
2695  * Force XLOG flush up to buffer's LSN. This implements the basic WAL
2696  * rule that log updates must hit disk before any of the data-file changes
2697  * they describe do.
2698  *
2699  * However, this rule does not apply to unlogged relations, which will be
2700  * lost after a crash anyway. Most unlogged relation pages do not bear
2701  * LSNs since we never emit WAL records for them, and therefore flushing
2702  * up through the buffer LSN would be useless, but harmless. However,
2703  * GiST indexes use LSNs internally to track page-splits, and therefore
2704  * unlogged GiST pages bear "fake" LSNs generated by
2705  * GetFakeLSNForUnloggedRel. It is unlikely but possible that the fake
2706  * LSN counter could advance past the WAL insertion point; and if it did
2707  * happen, attempting to flush WAL through that location would fail, with
2708  * disastrous system-wide consequences. To make sure that can't happen,
2709  * skip the flush if the buffer isn't permanent.
2710  */
2711  if (buf_state & BM_PERMANENT)
2712  XLogFlush(recptr);
2713 
2714  /*
2715  * Now it's safe to write buffer to disk. Note that no one else should
2716  * have been able to write it while we were busy with log flushing because
2717  * we have the io_in_progress lock.
2718  */
2719  bufBlock = BufHdrGetBlock(buf);
2720 
2721  /*
2722  * Update page checksum if desired. Since we have only shared lock on the
2723  * buffer, other processes might be updating hint bits in it, so we must
2724  * copy the page to private storage if we do checksumming.
2725  */
2726  bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
2727 
2728  if (track_io_timing)
2729  INSTR_TIME_SET_CURRENT(io_start);
2730 
2731  /*
2732  * bufToWrite is either the shared buffer or a copy, as appropriate.
2733  */
2734  smgrwrite(reln,
2735  buf->tag.forkNum,
2736  buf->tag.blockNum,
2737  bufToWrite,
2738  false);
2739 
2740  if (track_io_timing)
2741  {
2742  INSTR_TIME_SET_CURRENT(io_time);
2743  INSTR_TIME_SUBTRACT(io_time, io_start);
2746  }
2747 
2749 
2750  /*
2751  * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
2752  * end the io_in_progress state.
2753  */
2754  TerminateBufferIO(buf, true, 0);
2755 
2756  TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(buf->tag.forkNum,
2757  buf->tag.blockNum,
2758  reln->smgr_rnode.node.spcNode,
2759  reln->smgr_rnode.node.dbNode,
2760  reln->smgr_rnode.node.relNode);
2761 
2762  /* Pop the error context stack */
2763  error_context_stack = errcallback.previous;
2764 }
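/*
 * Sketch of the calling convention described above (illustrative only, not
 * an excerpt of an actual caller): FlushBuffer() expects the buffer to be
 * pinned and share-locked, roughly as done by the sync routines earlier in
 * this file:
 *
 *		buf_state = LockBufHdr(bufHdr);
 *		... decide the buffer must be written ...
 *		PinBuffer_Locked(bufHdr);				-- also drops the header lock
 *		LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
 *		FlushBuffer(bufHdr, NULL);
 *		LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
 *		UnpinBuffer(bufHdr, true);
 */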
2765 
2766 /*
2767  * RelationGetNumberOfBlocksInFork
2768  * Determines the current number of pages in the specified relation fork.
2769  */
2772 {
2773  /* Open it at the smgr level if not already done */
2774  RelationOpenSmgr(relation);
2775 
2776  return smgrnblocks(relation->rd_smgr, forkNum);
2777 }
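/*
 * Most callers reach this through the RelationGetNumberOfBlocks() macro in
 * bufmgr.h, which supplies MAIN_FORKNUM; naming a fork explicitly is mainly
 * useful when looking at another fork, for example (illustrative only):
 *
 *		BlockNumber heap_blocks = RelationGetNumberOfBlocks(rel);
 *		BlockNumber vm_blocks = RelationGetNumberOfBlocksInFork(rel,
 *										VISIBILITYMAP_FORKNUM);
 */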
2778 
2779 /*
2780  * BufferIsPermanent
2781  * Determines whether a buffer will potentially still be around after
2782  * a crash. Caller must hold a buffer pin.
2783  */
2784 bool
2786 {
2787  BufferDesc *bufHdr;
2788 
2789  /* Local buffers are used only for temp relations. */
2790  if (BufferIsLocal(buffer))
2791  return false;
2792 
2793  /* Make sure we've got a real buffer, and that we hold a pin on it. */
2794  Assert(BufferIsValid(buffer));
2795  Assert(BufferIsPinned(buffer));
2796 
2797  /*
2798  * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
2799  * need not bother with the buffer header spinlock. Even if someone else
2800  * changes the buffer header state while we're doing this, the state is
2801  * changed atomically, so we'll read the old value or the new value, but
2802  * not random garbage.
2803  */
2804  bufHdr = GetBufferDescriptor(buffer - 1);
2805  return (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT) != 0;
2806 }
2807 
2808 /*
2809  * BufferGetLSNAtomic
2810  * Retrieves the LSN of the buffer atomically using a buffer header lock.
2811  * This is necessary for some callers who may not have an exclusive lock
2812  * on the buffer.
2813  */
2814 XLogRecPtr
2816 {
2817  BufferDesc *bufHdr = GetBufferDescriptor(buffer - 1);
2818  char *page = BufferGetPage(buffer);
2819  XLogRecPtr lsn;
2820  uint32 buf_state;
2821 
2822  /*
2823  * If we don't need locking for correctness, fastpath out.
2824  */
2825  if (!XLogHintBitIsNeeded() || BufferIsLocal(buffer))
2826  return PageGetLSN(page);
2827 
2828  /* Make sure we've got a real buffer, and that we hold a pin on it. */
2829  Assert(BufferIsValid(buffer));
2830  Assert(BufferIsPinned(buffer));
2831 
2832  buf_state = LockBufHdr(bufHdr);
2833  lsn = PageGetLSN(page);
2834  UnlockBufHdr(bufHdr, buf_state);
2835 
2836  return lsn;
2837 }
2838 
2839 /* ---------------------------------------------------------------------
2840  * DropRelFileNodeBuffers
2841  *
2842  * This function removes from the buffer pool all the pages of the
2843  * specified relation fork that have block numbers >= firstDelBlock.
2844  * (In particular, with firstDelBlock = 0, all pages are removed.)
2845  * Dirty pages are simply dropped, without bothering to write them
2846  * out first. Therefore, this is NOT rollback-able, and so should be
2847  * used only with extreme caution!
2848  *
2849  * Currently, this is called only from smgr.c when the underlying file
2850  * is about to be deleted or truncated (firstDelBlock is needed for
2851  * the truncation case). The data in the affected pages would therefore
2852  * be deleted momentarily anyway, and there is no point in writing it.
2853  * It is the responsibility of higher-level code to ensure that the
2854  * deletion or truncation does not lose any data that could be needed
2855  * later. It is also the responsibility of higher-level code to ensure
2856  * that no other process could be trying to load more pages of the
2857  * relation into buffers.
2858  *
2859  * XXX currently it sequentially searches the buffer pool, should be
2860  * changed to more clever ways of searching. However, this routine
2861  * is used only in code paths that aren't very performance-critical,
2862  * and we shouldn't slow down the hot paths to make it faster ...
2863  * --------------------------------------------------------------------
2864  */
2865 void
2867  BlockNumber firstDelBlock)
2868 {
2869  int i;
2870 
2871  /* If it's a local relation, it's localbuf.c's problem. */
2872  if (RelFileNodeBackendIsTemp(rnode))
2873  {
2874  if (rnode.backend == MyBackendId)
2875  DropRelFileNodeLocalBuffers(rnode.node, forkNum, firstDelBlock);
2876  return;
2877  }
2878 
2879  for (i = 0; i < NBuffers; i++)
2880  {
2881  BufferDesc *bufHdr = GetBufferDescriptor(i);
2882  uint32 buf_state;
2883 
2884  /*
2885  * We can make this a tad faster by prechecking the buffer tag before
2886  * we attempt to lock the buffer; this saves a lot of lock
2887  * acquisitions in typical cases. It should be safe because the
2888  * caller must have AccessExclusiveLock on the relation, or some other
2889  * reason to be certain that no one is loading new pages of the rel
2890  * into the buffer pool. (Otherwise we might well miss such pages
2891  * entirely.) Therefore, while the tag might be changing while we
2892  * look at it, it can't be changing *to* a value we care about, only
2893  * *away* from such a value. So false negatives are impossible, and
2894  * false positives are safe because we'll recheck after getting the
2895  * buffer lock.
2896  *
2897  * We could check forkNum and blockNum as well as the rnode, but the
2898  * incremental win from doing so seems small.
2899  */
2900  if (!RelFileNodeEquals(bufHdr->tag.rnode, rnode.node))
2901  continue;
2902 
2903  buf_state = LockBufHdr(bufHdr);
2904  if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node) &&
2905  bufHdr->tag.forkNum == forkNum &&
2906  bufHdr->tag.blockNum >= firstDelBlock)
2907  InvalidateBuffer(bufHdr); /* releases spinlock */
2908  else
2909  UnlockBufHdr(bufHdr, buf_state);
2910  }
2911 }
2912 
2913 /* ---------------------------------------------------------------------
2914  * DropRelFileNodesAllBuffers
2915  *
2916  * This function removes from the buffer pool all the pages of all
2917  * forks of the specified relations. It's equivalent to calling
2918  * DropRelFileNodeBuffers once per fork per relation with
2919  * firstDelBlock = 0.
2920  * --------------------------------------------------------------------
2921  */
2922 void
2924 {
2925  int i,
2926  n = 0;
2927  RelFileNode *nodes;
2928  bool use_bsearch;
2929 
2930  if (nnodes == 0)
2931  return;
2932 
2933  nodes = palloc(sizeof(RelFileNode) * nnodes); /* non-local relations */
2934 
2935  /* If it's a local relation, it's localbuf.c's problem. */
2936  for (i = 0; i < nnodes; i++)
2937  {
2938  if (RelFileNodeBackendIsTemp(rnodes[i]))
2939  {
2940  if (rnodes[i].backend == MyBackendId)
2941  DropRelFileNodeAllLocalBuffers(rnodes[i].node);
2942  }
2943  else
2944  nodes[n++] = rnodes[i].node;
2945  }
2946 
2947  /*
2948  * If there are no non-local relations, then we're done. Release the
2949  * memory and return.
2950  */
2951  if (n == 0)
2952  {
2953  pfree(nodes);
2954  return;
2955  }
2956 
2957  /*
2958  * For a small number of relations to drop, just use a simple linear walk,
2959  * to save the bsearch overhead. The threshold used is more a guess than
2960  * an exactly determined value, as it depends on many factors (CPU and RAM
2961  * speeds, amount of shared buffers etc.).
2962  */
2963  use_bsearch = n > DROP_RELS_BSEARCH_THRESHOLD;
2964 
2965  /* sort the list of rnodes if necessary */
2966  if (use_bsearch)
2967  pg_qsort(nodes, n, sizeof(RelFileNode), rnode_comparator);
2968 
2969  for (i = 0; i < NBuffers; i++)
2970  {
2971  RelFileNode *rnode = NULL;
2972  BufferDesc *bufHdr = GetBufferDescriptor(i);
2973  uint32 buf_state;
2974 
2975  /*
2976  * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
2977  * and saves some cycles.
2978  */
2979 
2980  if (!use_bsearch)
2981  {
2982  int j;
2983 
2984  for (j = 0; j < n; j++)
2985  {
2986  if (RelFileNodeEquals(bufHdr->tag.rnode, nodes[j]))
2987  {
2988  rnode = &nodes[j];
2989  break;
2990  }
2991  }
2992  }
2993  else
2994  {
2995  rnode = bsearch((const void *) &(bufHdr->tag.rnode),
2996  nodes, n, sizeof(RelFileNode),
2998  }
2999 
3000  /* buffer doesn't belong to any of the given relfilenodes; skip it */
3001  if (rnode == NULL)
3002  continue;
3003 
3004  buf_state = LockBufHdr(bufHdr);
3005  if (RelFileNodeEquals(bufHdr->tag.rnode, (*rnode)))
3006  InvalidateBuffer(bufHdr); /* releases spinlock */
3007  else
3008  UnlockBufHdr(bufHdr, buf_state);
3009  }
3010 
3011  pfree(nodes);
3012 }
3013 
3014 /* ---------------------------------------------------------------------
3015  * DropDatabaseBuffers
3016  *
3017  * This function removes all the buffers in the buffer cache for a
3018  * particular database. Dirty pages are simply dropped, without
3019  * bothering to write them out first. This is used when we destroy a
3020  * database, to avoid trying to flush data to disk when the directory
3021  * tree no longer exists. Implementation is pretty similar to
3022  * DropRelFileNodeBuffers() which is for destroying just one relation.
3023  * --------------------------------------------------------------------
3024  */
3025 void
3027 {
3028  int i;
3029 
3030  /*
3031  * We needn't consider local buffers, since by assumption the target
3032  * database isn't our own.
3033  */
3034 
3035  for (i = 0; i < NBuffers; i++)
3036  {
3037  BufferDesc *bufHdr = GetBufferDescriptor(i);
3038  uint32 buf_state;
3039 
3040  /*
3041  * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
3042  * and saves some cycles.
3043  */
3044  if (bufHdr->tag.rnode.dbNode != dbid)
3045  continue;
3046 
3047  buf_state = LockBufHdr(bufHdr);
3048  if (bufHdr->tag.rnode.dbNode == dbid)
3049  InvalidateBuffer(bufHdr); /* releases spinlock */
3050  else
3051  UnlockBufHdr(bufHdr, buf_state);
3052  }
3053 }
3054 
3055 /* -----------------------------------------------------------------
3056  * PrintBufferDescs
3057  *
3058  * this function prints all the buffer descriptors, for debugging
3059  * use only.
3060  * -----------------------------------------------------------------
3061  */
3062 #ifdef NOT_USED
3063 void
3064 PrintBufferDescs(void)
3065 {
3066  int i;
3067 
3068  for (i = 0; i < NBuffers; ++i)
3069  {
3072 
3073  /* theoretically we should lock the bufhdr here */
3074  elog(LOG,
3075  "[%02d] (freeNext=%d, rel=%s, "
3076  "blockNum=%u, flags=0x%x, refcount=%u %d)",
3077  i, buf->freeNext,
3079  buf->tag.blockNum, buf->flags,
3080  buf->refcount, GetPrivateRefCount(b));
3081  }
3082 }
3083 #endif
3084 
3085 #ifdef NOT_USED
3086 void
3087 PrintPinnedBufs(void)
3088 {
3089  int i;
3090 
3091  for (i = 0; i < NBuffers; ++i)
3092  {
3093  BufferDesc *buf = GetBufferDescriptor(i);
3095 
3096  if (GetPrivateRefCount(b) > 0)
3097  {
3098  /* theoretically we should lock the bufhdr here */
3099  elog(LOG,
3100  "[%02d] (freeNext=%d, rel=%s, "
3101  "blockNum=%u, flags=0x%x, refcount=%u %d)",
3102  i, buf->freeNext,
3103  relpathperm(buf->tag.rnode, buf->tag.forkNum),
3104  buf->tag.blockNum, buf->flags,
3105  buf->refcount, GetPrivateRefCount(b));
3106  }
3107  }
3108 }
3109 #endif
3110 
3111 /* ---------------------------------------------------------------------
3112  * FlushRelationBuffers
3113  *
3114  * This function writes all dirty pages of a relation out to disk
3115  * (or more accurately, out to kernel disk buffers), ensuring that the
3116  * kernel has an up-to-date view of the relation.
3117  *
3118  * Generally, the caller should be holding AccessExclusiveLock on the
3119  * target relation to ensure that no other backend is busy dirtying
3120  * more blocks of the relation; the effects can't be expected to last
3121  * after the lock is released.
3122  *
3123  * XXX currently it sequentially searches the buffer pool, should be
3124  * changed to more clever ways of searching. This routine is not
3125  * used in any performance-critical code paths, so it's not worth
3126  * adding additional overhead to normal paths to make it go faster;
3127  * but see also DropRelFileNodeBuffers.
3128  * --------------------------------------------------------------------
3129  */
3130 void
3132 {
3133  int i;
3134  BufferDesc *bufHdr;
3135 
3136  /* Open rel at the smgr level if not already done */
3137  RelationOpenSmgr(rel);
3138 
3139  if (RelationUsesLocalBuffers(rel))
3140  {
3141  for (i = 0; i < NLocBuffer; i++)
3142  {
3143  uint32 buf_state;
3144 
3145  bufHdr = GetLocalBufferDescriptor(i);
3146  if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
3147  ((buf_state = pg_atomic_read_u32(&bufHdr->state)) &
3148  (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
3149  {
3150  ErrorContextCallback errcallback;
3151  Page localpage;
3152 
3153  localpage = (char *) LocalBufHdrGetBlock(bufHdr);
3154 
3155  /* Setup error traceback support for ereport() */
3157  errcallback.arg = (void *) bufHdr;
3158  errcallback.previous = error_context_stack;
3159  error_context_stack = &errcallback;
3160 
3161  PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
3162 
3163  smgrwrite(rel->rd_smgr,
3164  bufHdr->tag.forkNum,
3165  bufHdr->tag.blockNum,
3166  localpage,
3167  false);
3168 
3169  buf_state &= ~(BM_DIRTY | BM_JUST_DIRTIED);
3170  pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
3171 
3172  /* Pop the error context stack */
3173  error_context_stack = errcallback.previous;
3174  }
3175  }
3176 
3177  return;
3178  }
3179 
3180  /* Make sure we can handle the pin inside the loop */
3182 
3183  for (i = 0; i < NBuffers; i++)
3184  {
3185  uint32 buf_state;
3186 
3187  bufHdr = GetBufferDescriptor(i);
3188 
3189  /*
3190  * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
3191  * and saves some cycles.
3192  */
3193  if (!RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node))
3194  continue;
3195 
3197 
3198  buf_state = LockBufHdr(bufHdr);
3199  if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
3200  (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
3201  {
3202  PinBuffer_Locked(bufHdr);
3204  FlushBuffer(bufHdr, rel->rd_smgr);
3206  UnpinBuffer(bufHdr, true);
3207  }
3208  else
3209  UnlockBufHdr(bufHdr, buf_state);
3210  }
3211 }
3212 
3213 /* ---------------------------------------------------------------------
3214  * FlushDatabaseBuffers
3215  *
3216  * This function writes all dirty pages of a database out to disk
3217  * (or more accurately, out to kernel disk buffers), ensuring that the
3218  * kernel has an up-to-date view of the database.
3219  *
3220  * Generally, the caller should be holding an appropriate lock to ensure
3221  * no other backend is active in the target database; otherwise more
3222  * pages could get dirtied.
3223  *
3224  * Note we don't worry about flushing any pages of temporary relations.
3225  * It's assumed these wouldn't be interesting.
3226  * --------------------------------------------------------------------
3227  */
3228 void
3230 {
3231  int i;
3232  BufferDesc *bufHdr;
3233 
3234  /* Make sure we can handle the pin inside the loop */
3236 
3237  for (i = 0; i < NBuffers; i++)
3238  {
3239  uint32 buf_state;
3240 
3241  bufHdr = GetBufferDescriptor(i);
3242 
3243  /*
3244  * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
3245  * and saves some cycles.
3246  */
3247  if (bufHdr->tag.rnode.dbNode != dbid)
3248  continue;
3249 
3251 
3252  buf_state = LockBufHdr(bufHdr);
3253  if (bufHdr->tag.rnode.dbNode == dbid &&
3254  (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
3255  {
3256  PinBuffer_Locked(bufHdr);
3258  FlushBuffer(bufHdr, NULL);
3260  UnpinBuffer(bufHdr, true);
3261  }
3262  else
3263  UnlockBufHdr(bufHdr, buf_state);
3264  }
3265 }
3266 
3267 /*
3268  * Flush a pinned buffer, already locked in either shared or exclusive mode,
3269  * out to the OS.
3270  */
3271 void
3273 {
3274  BufferDesc *bufHdr;
3275 
3276  /* currently not needed, but no fundamental reason not to support */
3277  Assert(!BufferIsLocal(buffer));
3278 
3279  Assert(BufferIsPinned(buffer));
3280 
3281  bufHdr = GetBufferDescriptor(buffer - 1);
3282 
3284 
3285  FlushBuffer(bufHdr, NULL);
3286 }
3287 
3288 /*
3289  * ReleaseBuffer -- release the pin on a buffer
3290  */
3291 void
3293 {
3294  if (!BufferIsValid(buffer))
3295  elog(ERROR, "bad buffer ID: %d", buffer);
3296 
3297  if (BufferIsLocal(buffer))
3298  {
3300 
3301  Assert(LocalRefCount[-buffer - 1] > 0);
3302  LocalRefCount[-buffer - 1]--;
3303  return;
3304  }
3305 
3306  UnpinBuffer(GetBufferDescriptor(buffer - 1), true);
3307 }
3308 
3309 /*
3310  * UnlockReleaseBuffer -- release the content lock and pin on a buffer
3311  *
3312  * This is just a shorthand for a common combination.
3313  */
3314 void
3316 {
3317  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3318  ReleaseBuffer(buffer);
3319 }
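/*
 * Typical read-only access pattern built from these entry points (a
 * hypothetical caller, for illustration):
 *
 *		Buffer	buf;
 *		Page	page;
 *
 *		buf = ReadBuffer(rel, blkno);
 *		LockBuffer(buf, BUFFER_LOCK_SHARE);
 *		page = BufferGetPage(buf);
 *		... examine the page contents ...
 *		UnlockReleaseBuffer(buf);		-- drops content lock and pin together
 */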
3320 
3321 /*
3322  * IncrBufferRefCount
3323  * Increment the pin count on a buffer that we have *already* pinned
3324  * at least once.
3325  *
3326  * This function cannot be used on a buffer we do not have pinned,
3327  * because it doesn't change the shared buffer state.
3328  */
3329 void
3331 {
3332  Assert(BufferIsPinned(buffer));
3335  if (BufferIsLocal(buffer))
3336  LocalRefCount[-buffer - 1]++;
3337  else
3338  {
3339  PrivateRefCountEntry *ref;
3340 
3341  ref = GetPrivateRefCountEntry(buffer, true);
3342  Assert(ref != NULL);
3343  ref->refcount++;
3344  }
3345 }
3346 
3347 /*
3348  * MarkBufferDirtyHint
3349  *
3350  * Mark a buffer dirty for non-critical changes.
3351  *
3352  * This is essentially the same as MarkBufferDirty, except:
3353  *
3354  * 1. The caller does not write WAL; so if checksums are enabled, we may need
3355  * to write an XLOG_FPI WAL record to protect against torn pages.
3356  * 2. The caller might have only share-lock instead of exclusive-lock on the
3357  * buffer's content lock.
3358  * 3. This function does not guarantee that the buffer is always marked dirty
3359  * (due to a race condition), so it cannot be used for important changes.
3360  */
3361 void
3363 {
3364  BufferDesc *bufHdr;
3365  Page page = BufferGetPage(buffer);
3366 
3367  if (!BufferIsValid(buffer))
3368  elog(ERROR, "bad buffer ID: %d", buffer);
3369 
3370  if (BufferIsLocal(buffer))
3371  {
3372  MarkLocalBufferDirty(buffer);
3373  return;
3374  }
3375 
3376  bufHdr = GetBufferDescriptor(buffer - 1);
3377 
3378  Assert(GetPrivateRefCount(buffer) > 0);
3379  /* here, either share or exclusive lock is OK */
3381 
3382  /*
3383  * This routine might get called many times on the same page, if we are
3384  * making the first scan after commit of an xact that added/deleted many
3385  * tuples. So, be as quick as we can if the buffer is already dirty. We
3386  * do this by not acquiring spinlock if it looks like the status bits are
3387  * already set. Since we make this test unlocked, there's a chance we
3388  * might fail to notice that the flags have just been cleared, and so fail
3389  * to set them again, due to memory-ordering issues. But since this function
3390  * is only intended to be used in cases where failing to write out the
3391  * data would be harmless anyway, it doesn't really matter.
3392  */
3393  if ((pg_atomic_read_u32(&bufHdr->state) & (BM_DIRTY | BM_JUST_DIRTIED)) !=
3395  {
3397  bool dirtied = false;
3398  bool delayChkpt = false;
3399  uint32 buf_state;
3400 
3401  /*
3402  * If we need to protect hint bit updates from torn writes, WAL-log a
3403  * full page image of the page. This full page image is only necessary
3404  * if the hint bit update is the first change to the page since the
3405  * last checkpoint.
3406  *
3407  * We don't check full_page_writes here because that logic is included
3408  * when we call XLogInsert() since the value changes dynamically.
3409  */
3410  if (XLogHintBitIsNeeded() &&
3411  (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT))
3412  {
3413  /*
3414  * If we're in recovery we cannot dirty a page because of a hint.
3415  * We can set the hint, just not dirty the page as a result, so the
3416  * hint is lost when we evict the page or shut down.
3417  *
3418  * See src/backend/storage/page/README for longer discussion.
3419  */
3420  if (RecoveryInProgress())
3421  return;
3422 
3423  /*
3424  * If the block is already dirty because we either made a change
3425  * or set a hint already, then we don't need to write a full page
3426  * image. Note that aggressive cleaning of blocks dirtied by hint
3427  * bit setting would increase the call rate. Bulk setting of hint
3428  * bits would reduce the call rate...
3429  *
3430  * We must issue the WAL record before we mark the buffer dirty.
3431  * Otherwise we might write the page before we write the WAL. That
3432  * causes a race condition, since a checkpoint might occur between
3433  * writing the WAL record and marking the buffer dirty. We solve
3434  * that with a kluge, but one that is already in use during
3435  * transaction commit to prevent race conditions. Basically, we
3436  * simply prevent the checkpoint WAL record from being written
3437  * until we have marked the buffer dirty. We don't start the
3438  * checkpoint flush until we have marked dirty, so our checkpoint
3439  * must flush the change to disk successfully or the checkpoint
3440  * never gets written, in which case crash recovery will fix things up.
3441  *
3442  * It's possible we may enter here without an xid, so it is
3443  * essential that CreateCheckpoint waits for virtual transactions
3444  * rather than full transactionids.
3445  */
3446  MyPgXact->delayChkpt = delayChkpt = true;
3447  lsn = XLogSaveBufferForHint(buffer, buffer_std);
3448  }
3449 
3450  buf_state = LockBufHdr(bufHdr);
3451 
3452  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
3453 
3454  if (!(buf_state & BM_DIRTY))
3455  {
3456  dirtied = true; /* Means "will be dirtied by this action" */
3457 
3458  /*
3459  * Set the page LSN if we wrote a backup block. We aren't supposed
3460  * to set this when only holding a share lock but as long as we
3461  * serialise it somehow we're OK. We choose to set LSN while
3462  * holding the buffer header lock, which causes any reader of an
3463  * LSN who holds only a share lock to also obtain a buffer header
3464  * lock before using PageGetLSN(), which is enforced in
3465  * BufferGetLSNAtomic().
3466  *
3467  * If checksums are enabled, you might think we should reset the
3468  * checksum here. That will happen when the page is written
3469  * sometime later in this checkpoint cycle.
3470  */
3471  if (!XLogRecPtrIsInvalid(lsn))
3472  PageSetLSN(page, lsn);
3473  }
3474 
3475  buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
3476  UnlockBufHdr(bufHdr, buf_state);
3477 
3478  if (delayChkpt)
3479  MyPgXact->delayChkpt = false;
3480 
3481  if (dirtied)
3482  {
3483  VacuumPageDirty++;
3485  if (VacuumCostActive)
3487  }
3488  }
3489 }
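/*
 * Illustrative caller, loosely modeled on hint-bit setting in the heap code
 * (simplified, not the actual implementation): the non-critical data change
 * is made first, then the buffer is marked dirty on a best-effort basis:
 *
 *		tuple->t_infomask |= HEAP_XMIN_COMMITTED;	-- non-critical change
 *		MarkBufferDirtyHint(buf, true);				-- true: standard page layout
 */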
3490 
3491 /*
3492  * Release buffer content locks for shared buffers.
3493  *
3494  * Used to clean up after errors.
3495  *
3496  * Currently, we can expect that lwlock.c's LWLockReleaseAll() took care
3497  * of releasing buffer content locks per se; the only thing we need to deal
3498  * with here is clearing any PIN_COUNT request that was in progress.
3499  */
3500 void
3502 {
3503  BufferDesc *buf = PinCountWaitBuf;
3504 
3505  if (buf)
3506  {
3507  uint32 buf_state;
3508 
3509  buf_state = LockBufHdr(buf);
3510 
3511  /*
3512  * Don't complain if flag bit not set; it could have been reset but we
3513  * got a cancel/die interrupt before getting the signal.
3514  */
3515  if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
3516  buf->wait_backend_pid == MyProcPid)
3517  buf_state &= ~BM_PIN_COUNT_WAITER;
3518 
3519  UnlockBufHdr(buf, buf_state);
3520 
3521  PinCountWaitBuf = NULL;
3522  }
3523 }
3524 
3525 /*
3526  * Acquire or release the content_lock for the buffer.
3527  */
3528 void
3530 {
3531  BufferDesc *buf;
3532 
3533  Assert(BufferIsValid(buffer));
3534  if (BufferIsLocal(buffer))
3535  return; /* local buffers need no lock */
3536 
3537  buf = GetBufferDescriptor(buffer - 1);
3538 
3539  if (mode == BUFFER_LOCK_UNLOCK)
3541  else if (mode == BUFFER_LOCK_SHARE)
3543  else if (mode == BUFFER_LOCK_EXCLUSIVE)
3545  else
3546  elog(ERROR, "unrecognized buffer lock mode: %d", mode);
3547 }
3548 
3549 /*
3550  * Acquire the content_lock for the buffer, but only if we don't have to wait.
3551  *
3552  * This assumes the caller wants BUFFER_LOCK_EXCLUSIVE mode.
3553  */
3554 bool
3556 {
3557  BufferDesc *buf;
3558 
3559  Assert(BufferIsValid(buffer));
3560  if (BufferIsLocal(buffer))
3561  return true; /* act as though we got it */
3562 
3563  buf = GetBufferDescriptor(buffer - 1);
3564 
3566  LW_EXCLUSIVE);
3567 }
3568 
3569 /*
3570  * LockBufferForCleanup - lock a buffer in preparation for deleting items
3571  *
3572  * Items may be deleted from a disk page only when the caller (a) holds an
3573  * exclusive lock on the buffer and (b) has observed that no other backend
3574  * holds a pin on the buffer. If there is a pin, then the other backend
3575  * might have a pointer into the buffer (for example, a heapscan reference
3576  * to an item --- see README for more details). It's OK if a pin is added
3577  * after the cleanup starts, however; the newly-arrived backend will be
3578  * unable to look at the page until we release the exclusive lock.
3579  *
3580  * To implement this protocol, a would-be deleter must pin the buffer and
3581  * then call LockBufferForCleanup(). LockBufferForCleanup() is similar to
3582  * LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE), except that it loops until
3583  * it has successfully observed pin count = 1.
3584  */
3585 void
3587 {
3588  BufferDesc *bufHdr;
3589 
3590  Assert(BufferIsValid(buffer));
3591  Assert(PinCountWaitBuf == NULL);
3592 
3593  if (BufferIsLocal(buffer))
3594  {
3595  /* There should be exactly one pin */
3596  if (LocalRefCount[-buffer - 1] != 1)
3597  elog(ERROR, "incorrect local pin count: %d",
3598  LocalRefCount[-buffer - 1]);
3599  /* Nobody else to wait for */
3600  return;
3601  }
3602 
3603  /* There should be exactly one local pin */
3604  if (GetPrivateRefCount(buffer) != 1)
3605  elog(ERROR, "incorrect local pin count: %d",
3606  GetPrivateRefCount(buffer));
3607 
3608  bufHdr = GetBufferDescriptor(buffer - 1);
3609 
3610  for (;;)
3611  {
3612  uint32 buf_state;
3613 
3614  /* Try to acquire lock */
3616  buf_state = LockBufHdr(bufHdr);
3617 
3618  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
3619  if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
3620  {
3621  /* Successfully acquired exclusive lock with pincount 1 */
3622  UnlockBufHdr(bufHdr, buf_state);
3623  return;
3624  }
3625  /* Failed, so mark myself as waiting for pincount 1 */
3626  if (buf_state & BM_PIN_COUNT_WAITER)
3627  {
3628  UnlockBufHdr(bufHdr, buf_state);
3629  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3630  elog(ERROR, "multiple backends attempting to wait for pincount 1");
3631  }
3632  bufHdr->wait_backend_pid = MyProcPid;
3633  PinCountWaitBuf = bufHdr;
3634  buf_state |= BM_PIN_COUNT_WAITER;
3635  UnlockBufHdr(bufHdr, buf_state);
3636  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3637 
3638  /* Wait to be signaled by UnpinBuffer() */
3639  if (InHotStandby)
3640  {
3641  /* Publish the bufid that Startup process waits on */
3642  SetStartupBufferPinWaitBufId(buffer - 1);
3643  /* Set alarm and then wait to be signaled by UnpinBuffer() */
3644  ResolveRecoveryConflictWithBufferPin();
3645  /* Reset the published bufid */
3646  SetStartupBufferPinWaitBufId(-1);
3647  }
3648  else
3649  ProcWaitForSignal(PG_WAIT_BUFFER_PIN);
3650 
3651  /*
3652  * Remove flag marking us as waiter. Normally this will not be set
3653  * anymore, but ProcWaitForSignal() can return for other signals as
3654  * well. We take care to only reset the flag if we're the waiter, as
3655  * theoretically another backend could have started waiting. That's
3656  * impossible with the current usages due to table level locking, but
3657  * better be safe.
3658  */
3659  buf_state = LockBufHdr(bufHdr);
3660  if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
3661  bufHdr->wait_backend_pid == MyProcPid)
3662  buf_state &= ~BM_PIN_COUNT_WAITER;
3663  UnlockBufHdr(bufHdr, buf_state);
3664 
3665  PinCountWaitBuf = NULL;
3666  /* Loop back and try again */
3667  }
3668 }
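
/*
 * Editor's note: a sketch (not part of bufmgr.c) of the would-be deleter
 * protocol described above: pin the buffer first, then wait for a cleanup
 * lock.  The helper name and the elided pruning step are hypothetical.
 */
static void
delete_page_items_sketch(Relation rel, BlockNumber blkno)
{
    Buffer      buf = ReadBuffer(rel, blkno);   /* take our own pin */

    LockBufferForCleanup(buf);  /* exclusive lock, and no other pins remain */

    /* ... delete items from BufferGetPage(buf), WAL-log, MarkBufferDirty ... */

    UnlockReleaseBuffer(buf);
}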
3669 
3670 /*
3671  * Check called from RecoveryConflictInterrupt handler when Startup
3672  * process requests cancellation of all pin holders that are blocking it.
3673  */
3674 bool
3675 HoldingBufferPinThatDelaysRecovery(void)
3676 {
3677  int bufid = GetStartupBufferPinWaitBufId();
3678 
3679  /*
3680  * If we get woken slowly then it's possible that the Startup process was
3681  * already woken by other backends before we got here. It's also possible
3682  * that we got here via multiple interrupts or interrupts at inappropriate
3683  * times, so make sure we do nothing if the bufid is not set.
3684  */
3685  if (bufid < 0)
3686  return false;
3687 
3688  if (GetPrivateRefCount(bufid + 1) > 0)
3689  return true;
3690 
3691  return false;
3692 }
3693 
3694 /*
3695  * ConditionalLockBufferForCleanup - as above, but don't wait to get the lock
3696  *
3697  * We won't loop, but just check once to see if the pin count is OK. If
3698  * not, return FALSE with no lock held.
3699  */
3700 bool
3701 ConditionalLockBufferForCleanup(Buffer buffer)
3702 {
3703  BufferDesc *bufHdr;
3704  uint32 buf_state,
3705  refcount;
3706 
3707  Assert(BufferIsValid(buffer));
3708 
3709  if (BufferIsLocal(buffer))
3710  {
3711  refcount = LocalRefCount[-buffer - 1];
3712  /* There should be exactly one pin */
3713  Assert(refcount > 0);
3714  if (refcount != 1)
3715  return false;
3716  /* Nobody else to wait for */
3717  return true;
3718  }
3719 
3720  /* There should be exactly one local pin */
3721  refcount = GetPrivateRefCount(buffer);
3722  Assert(refcount);
3723  if (refcount != 1)
3724  return false;
3725 
3726  /* Try to acquire lock */
3727  if (!ConditionalLockBuffer(buffer))
3728  return false;
3729 
3730  bufHdr = GetBufferDescriptor(buffer - 1);
3731  buf_state = LockBufHdr(bufHdr);
3732  refcount = BUF_STATE_GET_REFCOUNT(buf_state);
3733 
3734  Assert(refcount > 0);
3735  if (refcount == 1)
3736  {
3737  /* Successfully acquired exclusive lock with pincount 1 */
3738  UnlockBufHdr(bufHdr, buf_state);
3739  return true;
3740  }
3741 
3742  /* Failed, so release the lock */
3743  UnlockBufHdr(bufHdr, buf_state);
3744  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3745  return false;
3746 }
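
/*
 * Editor's note: a sketch (not part of bufmgr.c) of opportunistic cleanup
 * with the conditional variant.  "buf" is assumed to carry exactly one local
 * pin, as the function requires; the helper name is hypothetical.
 */
static void
cleanup_if_convenient_sketch(Buffer buf)
{
    if (ConditionalLockBufferForCleanup(buf))
    {
        /* cleanup lock obtained: prune the page, then release lock and pin */
        UnlockReleaseBuffer(buf);
    }
    else
    {
        /* no cleanup lock available; just drop our pin and move on */
        ReleaseBuffer(buf);
    }
}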
3747 
3748 /*
3749  * IsBufferCleanupOK - as above, but we already have the lock
3750  *
3751  * Check whether it's OK to perform cleanup on a buffer we've already
3752  * locked. If we observe that the pin count is 1, our exclusive lock
3753  * happens to be a cleanup lock, and we can proceed with anything that
3754  * would have been allowable had we sought a cleanup lock originally.
3755  */
3756 bool
3757 IsBufferCleanupOK(Buffer buffer)
3758 {
3759  BufferDesc *bufHdr;
3760  uint32 buf_state;
3761 
3762  Assert(BufferIsValid(buffer));
3763 
3764  if (BufferIsLocal(buffer))
3765  {
3766  /* There should be exactly one pin */
3767  if (LocalRefCount[-buffer - 1] != 1)
3768  return false;
3769  /* Nobody else to wait for */
3770  return true;
3771  }
3772 
3773  /* There should be exactly one local pin */
3774  if (GetPrivateRefCount(buffer) != 1)
3775  return false;
3776 
3777  bufHdr = GetBufferDescriptor(buffer - 1);
3778 
3779  /* caller must hold exclusive lock on buffer */
3780  Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
3781  LW_EXCLUSIVE));
3782 
3783  buf_state = LockBufHdr(bufHdr);
3784 
3785  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
3786  if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
3787  {
3788  /* pincount is OK. */
3789  UnlockBufHdr(bufHdr, buf_state);
3790  return true;
3791  }
3792 
3793  UnlockBufHdr(bufHdr, buf_state);
3794  return false;
3795 }
3796 
3797 
3798 /*
3799  * Functions for buffer I/O handling
3800  *
3801  * Note: We assume that nested buffer I/O never occurs;
3802  * i.e., at most one io_in_progress lock is held per proc.
3803  *
3804  * Also note that these are used only for shared buffers, not local ones.
3805  */
3806 
3807 /*
3808  * WaitIO -- Block until the IO_IN_PROGRESS flag on 'buf' is cleared.
3809  */
3810 static void
3811 WaitIO(BufferDesc *buf)
3812 {
3813  /*
3814  * Changed to wait until there's no IO - Inoue 01/13/2000
3815  *
3816  * Note this is *necessary* because an error abort in the process doing
3817  * I/O could release the io_in_progress_lock prematurely. See
3818  * AbortBufferIO.
3819  */
3820  for (;;)
3821  {
3822  uint32 buf_state;
3823 
3824  /*
3825  * It may not be necessary to acquire the spinlock to check the flag
3826  * here, but since this test is essential for correctness, we'd better
3827  * play it safe.
3828  */
3829  buf_state = LockBufHdr(buf);
3830  UnlockBufHdr(buf, buf_state);
3831 
3832  if (!(buf_state & BM_IO_IN_PROGRESS))
3833  break;
3834  LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_SHARED);
3835  LWLockRelease(BufferDescriptorGetIOLock(buf));
3836  }
3837 }
3838 
3839 /*
3840  * StartBufferIO: begin I/O on this buffer
3841  * (Assumptions)
3842  * My process is executing no IO
3843  * The buffer is Pinned
3844  *
3845  * In some scenarios there are race conditions in which multiple backends
3846  * could attempt the same I/O operation concurrently. If someone else
3847  * has already started I/O on this buffer then we will block on the
3848  * io_in_progress lock until he's done.
3849  *
3850  * Input operations are only attempted on buffers that are not BM_VALID,
3851  * and output operations only on buffers that are BM_VALID and BM_DIRTY,
3852  * so we can always tell if the work is already done.
3853  *
3854  * Returns TRUE if we successfully marked the buffer as I/O busy,
3855  * FALSE if someone else already did the work.
3856  */
3857 static bool
3858 StartBufferIO(BufferDesc *buf, bool forInput)
3859 {
3860  uint32 buf_state;
3861 
3862  Assert(!InProgressBuf);
3863 
3864  for (;;)
3865  {
3866  /*
3867  * Grab the io_in_progress lock so that other processes can wait for
3868  * me to finish the I/O.
3869  */
3870  LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_SHARED);
3871 
3872  buf_state = LockBufHdr(buf);
3873 
3874  if (!(buf_state & BM_IO_IN_PROGRESS))
3875  break;
3876 
3877  /*
3878  * The only way BM_IO_IN_PROGRESS could be set when the io_in_progress
3879  * lock isn't held is if the process doing the I/O is recovering from
3880  * an error (see AbortBufferIO). If that's the case, we must wait for
3881  * him to get unwedged.
3882  */
3883  UnlockBufHdr(buf, buf_state);
3884  LWLockRelease(BufferDescriptorGetIOLock(buf));
3885  WaitIO(buf);
3886  }
3887 
3888  /* Once we get here, there is definitely no I/O active on this buffer */
3889 
3890  if (forInput ? (buf_state & BM_VALID) : !(buf_state & BM_DIRTY))
3891  {
3892  /* someone else already did the I/O */
3893  UnlockBufHdr(buf, buf_state);
3894  LWLockRelease(BufferDescriptorGetIOLock(buf));
3895  return false;
3896  }
3897 
3898  buf_state |= BM_IO_IN_PROGRESS;
3899  UnlockBufHdr(buf, buf_state);
3900 
3901  InProgressBuf = buf;
3902  IsForInput = forInput;
3903 
3904  return true;
3905 }
3906 
3907 /*
3908  * TerminateBufferIO: release a buffer we were doing I/O on
3909  * (Assumptions)
3910  * My process is executing IO for the buffer
3911  * BM_IO_IN_PROGRESS bit is set for the buffer
3912  * We hold the buffer's io_in_progress lock
3913  * The buffer is Pinned
3914  *
3915  * If clear_dirty is TRUE and BM_JUST_DIRTIED is not set, we clear the
3916  * buffer's BM_DIRTY flag. This is appropriate when terminating a
3917  * successful write. The check on BM_JUST_DIRTIED is necessary to avoid
3918  * marking the buffer clean if it was re-dirtied while we were writing.
3919  *
3920  * set_flag_bits gets ORed into the buffer's flags. It must include
3921  * BM_IO_ERROR in a failure case. For successful completion it could
3922  * be 0, or BM_VALID if we just finished reading in the page.
3923  */
3924 static void
3925 TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits)
3926 {
3927  uint32 buf_state;
3928 
3929  Assert(buf == InProgressBuf);
3930 
3931  buf_state = LockBufHdr(buf);
3932 
3933  Assert(buf_state & BM_IO_IN_PROGRESS);
3934 
3935  buf_state &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR);
3936  if (clear_dirty && !(buf_state & BM_JUST_DIRTIED))
3937  buf_state &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
3938 
3939  buf_state |= set_flag_bits;
3940  UnlockBufHdr(buf, buf_state);
3941 
3942  InProgressBuf = NULL;
3943 
3944  LWLockRelease(BufferDescriptorGetIOLock(buf));
3945 }
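
/*
 * Editor's note: a simplified sketch (not part of bufmgr.c) of the input-side
 * bracket that the read path in this file builds around StartBufferIO and
 * TerminateBufferIO.  Error handling, page verification and statistics are
 * omitted; the helper name is hypothetical.
 */
static void
read_block_sketch(BufferDesc *bufHdr, SMgrRelation smgr,
                  ForkNumber forkNum, BlockNumber blockNum)
{
    if (!StartBufferIO(bufHdr, true))
        return;                 /* another backend already read the page in */

    smgrread(smgr, forkNum, blockNum, (char *) BufHdrGetBlock(bufHdr));

    /* mark the buffer valid and release the io_in_progress lock */
    TerminateBufferIO(bufHdr, false, BM_VALID);
}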
3946 
3947 /*
3948  * AbortBufferIO: Clean up any active buffer I/O after an error.
3949  *
3950  * All LWLocks we might have held have been released,
3951  * but we haven't yet released buffer pins, so the buffer is still pinned.
3952  *
3953  * If I/O was in progress, we always set BM_IO_ERROR, even though it's
3954  * possible the error condition wasn't related to the I/O.
3955  */
3956 void
3957 AbortBufferIO(void)
3958 {
3959  BufferDesc *buf = InProgressBuf;
3960 
3961  if (buf)
3962  {
3963  uint32 buf_state;
3964 
3965  /*
3966  * Since LWLockReleaseAll has already been called, we're not holding
3967  * the buffer's io_in_progress_lock. We have to re-acquire it so that
3968  * we can use TerminateBufferIO. Anyone who's executing WaitIO on the
3969  * buffer will be in a busy spin until we succeed in doing this.
3970  */
3971  LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
3972 
3973  buf_state = LockBufHdr(buf);
3974  Assert(buf_state & BM_IO_IN_PROGRESS);
3975  if (IsForInput)
3976  {
3977  Assert(!(buf_state & BM_DIRTY));
3978 
3979  /* We'd better not think buffer is valid yet */
3980  Assert(!(buf_state & BM_VALID));
3981  UnlockBufHdr(buf, buf_state);
3982  }
3983  else
3984  {
3985  Assert(buf_state & BM_DIRTY);
3986  UnlockBufHdr(buf, buf_state);
3987  /* Issue notice if this is not the first failure... */
3988  if (buf_state & BM_IO_ERROR)
3989  {
3990  /* Buffer is pinned, so we can read tag without spinlock */
3991  char *path;
3992 
3993  path = relpathperm(buf->tag.rnode, buf->tag.forkNum);
3994  ereport(WARNING,
3995  (errcode(ERRCODE_IO_ERROR),
3996  errmsg("could not write block %u of %s",
3997  buf->tag.blockNum, path),
3998  errdetail("Multiple failures --- write error might be permanent.")));
3999  pfree(path);
4000  }
4001  }
4002  TerminateBufferIO(buf, false, BM_IO_ERROR);
4003  }
4004 }
4005 
4006 /*
4007  * Error context callback for errors occurring during shared buffer writes.
4008  */
4009 static void
4010 shared_buffer_write_error_callback(void *arg)
4011 {
4012  BufferDesc *bufHdr = (BufferDesc *) arg;
4013 
4014  /* Buffer is pinned, so we can read the tag without locking the spinlock */
4015  if (bufHdr != NULL)
4016  {
4017  char *path = relpathperm(bufHdr->tag.rnode, bufHdr->tag.forkNum);
4018 
4019  errcontext("writing block %u of relation %s",
4020  bufHdr->tag.blockNum, path);
4021  pfree(path);
4022  }
4023 }
4024 
4025 /*
4026  * Error context callback for errors occurring during local buffer writes.
4027  */
4028 static void
4029 local_buffer_write_error_callback(void *arg)
4030 {
4031  BufferDesc *bufHdr = (BufferDesc *) arg;
4032 
4033  if (bufHdr != NULL)
4034  {
4035  char *path = relpathbackend(bufHdr->tag.rnode, MyBackendId,
4036  bufHdr->tag.forkNum);
4037 
4038  errcontext("writing block %u of relation %s",
4039  bufHdr->tag.blockNum, path);
4040  pfree(path);
4041  }
4042 }
4043 
4044 /*
4045  * RelFileNode qsort/bsearch comparator; see RelFileNodeEquals.
4046  */
4047 static int
4048 rnode_comparator(const void *p1, const void *p2)
4049 {
4050  RelFileNode n1 = *(RelFileNode *) p1;
4051  RelFileNode n2 = *(RelFileNode *) p2;
4052 
4053  if (n1.relNode < n2.relNode)
4054  return -1;
4055  else if (n1.relNode > n2.relNode)
4056  return 1;
4057 
4058  if (n1.dbNode < n2.dbNode)
4059  return -1;
4060  else if (n1.dbNode > n2.dbNode)
4061  return 1;
4062 
4063  if (n1.spcNode < n2.spcNode)
4064  return -1;
4065  else if (n1.spcNode > n2.spcNode)
4066  return 1;
4067  else
4068  return 0;
4069 }
4070 
4071 /*
4072  * Lock buffer header - set BM_LOCKED in buffer state.
4073  */
4074 uint32
4075 LockBufHdr(BufferDesc *desc)
4076 {
4077  SpinDelayStatus delayStatus;
4078  uint32 old_buf_state;
4079 
4080  init_local_spin_delay(&delayStatus);
4081 
4082  while (true)
4083  {
4084  /* set BM_LOCKED flag */
4085  old_buf_state = pg_atomic_fetch_or_u32(&desc->state, BM_LOCKED);
4086  /* if it wasn't set before we're OK */
4087  if (!(old_buf_state & BM_LOCKED))
4088  break;
4089  perform_spin_delay(&delayStatus);
4090  }
4091  finish_spin_delay(&delayStatus);
4092  return old_buf_state | BM_LOCKED;
4093 }
4094 
4095 /*
4096  * Wait until the BM_LOCKED flag isn't set anymore and return the buffer's
4097  * state at that point.
4098  *
4099  * Obviously the buffer could be locked by the time the value is returned, so
4100  * this is primarily useful in CAS style loops.
4101  */
4102 static uint32
4103 WaitBufHdrUnlocked(BufferDesc *buf)
4104 {
4105  SpinDelayStatus delayStatus;
4106  uint32 buf_state;
4107 
4108  init_local_spin_delay(&delayStatus);
4109 
4110  buf_state = pg_atomic_read_u32(&buf->state);
4111 
4112  while (buf_state & BM_LOCKED)
4113  {
4114  perform_spin_delay(&delayStatus);
4115  buf_state = pg_atomic_read_u32(&buf->state);
4116  }
4117 
4118  finish_spin_delay(&delayStatus);
4119 
4120  return buf_state;
4121 }
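
/*
 * Editor's note: a sketch (not part of bufmgr.c) of the CAS-style loop the
 * comment above refers to, in the shape used by PinBuffer(): read the state,
 * back off while the header spinlock bit is set, then retry the
 * compare-and-swap until the update sticks.  The helper name is hypothetical.
 */
static void
bump_usage_count_sketch(BufferDesc *buf)
{
    uint32      old_buf_state = pg_atomic_read_u32(&buf->state);
    uint32      buf_state;

    for (;;)
    {
        if (old_buf_state & BM_LOCKED)
            old_buf_state = WaitBufHdrUnlocked(buf);

        buf_state = old_buf_state;
        if (BUF_STATE_GET_USAGECOUNT(buf_state) < BM_MAX_USAGE_COUNT)
            buf_state += BUF_USAGECOUNT_ONE;

        if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
                                           buf_state))
            break;              /* success */
        /* CAS failed; old_buf_state now holds the current value, retry */
    }
}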
4122 
4123 /*
4124  * BufferTag comparator.
4125  */
4126 static int
4127 buffertag_comparator(const void *a, const void *b)
4128 {
4129  const BufferTag *ba = (const BufferTag *) a;
4130  const BufferTag *bb = (const BufferTag *) b;
4131  int ret;
4132 
4133  ret = rnode_comparator(&ba->rnode, &bb->rnode);
4134 
4135  if (ret != 0)
4136  return ret;
4137 
4138  if (ba->forkNum < bb->forkNum)
4139  return -1;
4140  if (ba->forkNum > bb->forkNum)
4141  return 1;
4142 
4143  if (ba->blockNum < bb->blockNum)
4144  return -1;
4145  if (ba->blockNum > bb->blockNum)
4146  return 1;
4147 
4148  return 0;
4149 }
4150 
4151 /*
4152  * Comparator determining the writeout order in a checkpoint.
4153  *
4154  * It is important that tablespaces are compared first; the logic balancing
4155  * writes between tablespaces relies on it.
4156  */
4157 static int
4158 ckpt_buforder_comparator(const void *pa, const void *pb)
4159 {
4160  const CkptSortItem *a = (CkptSortItem *) pa;
4161  const CkptSortItem *b = (CkptSortItem *) pb;
4162 
4163  /* compare tablespace */
4164  if (a->tsId < b->tsId)
4165  return -1;
4166  else if (a->tsId > b->tsId)
4167  return 1;
4168  /* compare relation */
4169  if (a->relNode < b->relNode)
4170  return -1;
4171  else if (a->relNode > b->relNode)
4172  return 1;
4173  /* compare fork */
4174  else if (a->forkNum < b->forkNum)
4175  return -1;
4176  else if (a->forkNum > b->forkNum)
4177  return 1;
4178  /* compare block number */
4179  else if (a->blockNum < b->blockNum)
4180  return -1;
4181  else /* should not be the same block ... */
4182  return 1;
4183 }
4184 
4185 /*
4186  * Comparator for a Min-Heap over the per-tablespace checkpoint completion
4187  * progress.
4188  */
4189 static int
4190 ts_ckpt_progress_comparator(Datum a, Datum b, void *arg)
4191 {
4192  CkptTsStatus *sa = (CkptTsStatus *) a;
4193  CkptTsStatus *sb = (CkptTsStatus *) b;
4194 
4195  /* we want a min-heap, so return 1 if a < b */
4196  if (sa->progress < sb->progress)
4197  return 1;
4198  else if (sa->progress == sb->progress)
4199  return 0;
4200  else
4201  return -1;
4202 }
4203 
4204 /*
4205  * Initialize a writeback context, discarding potential previous state.
4206  *
4207  * *max_pending is a pointer instead of an immediate value, so the coalesce
4208  * limits can easily be changed by the GUC mechanism, and so calling code does
4209  * not have to check the current configuration. A value of 0 means that no
4210  * writeback control will be performed.
4211  */
4212 void
4213 WritebackContextInit(WritebackContext *context, int *max_pending)
4214 {
4215  Assert(*max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
4216 
4217  context->max_pending = max_pending;
4218  context->nr_pending = 0;
4219 }
4220 
4221 /*
4222  * Add buffer to list of pending writeback requests.
4223  */
4224 void
4225 ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag)
4226 {
4227  PendingWriteback *pending;
4228 
4229  /*
4230  * Add buffer to the pending writeback array, unless writeback control is
4231  * disabled.
4232  */
4233  if (*context->max_pending > 0)
4234  {
4235  Assert(*context->max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
4236 
4237  pending = &context->pending_writebacks[context->nr_pending++];
4238 
4239  pending->tag = *tag;
4240  }
4241 
4242  /*
4243  * Perform pending flushes if the writeback limit is exceeded. This
4244  * includes the case where an item was previously added, but control is
4245  * now disabled.
4246  */
4247  if (context->nr_pending >= *context->max_pending)
4248  IssuePendingWritebacks(context);
4249 }
4250 
4251 /*
4252  * Issue all pending writeback requests, previously scheduled with
4253  * ScheduleBufferTagForWriteback, to the OS.
4254  *
4255  * Because this is only used to improve the OS's I/O scheduling, we try to never
4256  * error out - it's just a hint.
4257  */
4258 void
4259 IssuePendingWritebacks(WritebackContext *context)
4260 {
4261  int i;
4262 
4263  if (context->nr_pending == 0)
4264  return;
4265 
4266  /*
4267  * Executing the writes in-order can make them a lot faster, and allows us to
4268  * merge writeback requests for consecutive blocks into larger writebacks.
4269  */
4270  qsort(&context->pending_writebacks, context->nr_pending,
4271  sizeof(PendingWriteback), buffertag_comparator);
4272 
4273  /*
4274  * Coalesce neighbouring writes, but nothing else. For that we iterate
4275  * through the, now sorted, array of pending flushes, and look forward to
4276  * find all neighbouring (or identical) writes.
4277  */
4278  for (i = 0; i < context->nr_pending; i++)
4279  {
4280  PendingWriteback *cur;
4281  PendingWriteback *next;
4282  SMgrRelation reln;
4283  int ahead;
4284  BufferTag tag;
4285  Size nblocks = 1;
4286 
4287  cur = &context->pending_writebacks[i];
4288  tag = cur->tag;
4289 
4290  /*
4291  * Peek ahead, into following writeback requests, to see if they can
4292  * be combined with the current one.
4293  */
4294  for (ahead = 0; i + ahead + 1 < context->nr_pending; ahead++)
4295  {
4296  next = &context->pending_writebacks[i + ahead + 1];
4297 
4298  /* different file, stop */
4299  if (!RelFileNodeEquals(cur->tag.rnode, next->tag.rnode) ||
4300  cur->tag.forkNum != next->tag.forkNum)
4301  break;
4302 
4303  /* ok, block queued twice, skip */
4304  if (cur->tag.blockNum == next->tag.blockNum)
4305  continue;
4306 
4307  /* only merge consecutive writes */
4308  if (cur->tag.blockNum + 1 != next->tag.blockNum)
4309  break;
4310 
4311  nblocks++;
4312  cur = next;
4313  }
4314 
4315  i += ahead;
4316 
4317  /* and finally tell the kernel to write the data to storage */
4318  reln = smgropen(tag.rnode, InvalidBackendId);
4319  smgrwriteback(reln, tag.forkNum, tag.blockNum, nblocks);
4320  }
4321 
4322  context->nr_pending = 0;
4323 }
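
/*
 * Editor's note: a sketch (not part of bufmgr.c) of how the writeback
 * machinery above is driven.  "flush_after_guc" stands in for one of the
 * *_flush_after GUCs, and the tags array is assumed to describe buffers that
 * were just written with smgrwrite(); the helper name is hypothetical.
 */
static void
flush_and_hint_sketch(int *flush_after_guc, BufferTag *tags, int ntags)
{
    WritebackContext wb_context;
    int         i;

    WritebackContextInit(&wb_context, flush_after_guc);

    for (i = 0; i < ntags; i++)
        ScheduleBufferTagForWriteback(&wb_context, &tags[i]);

    /* pass any remaining hints to the kernel in one sorted, coalesced batch */
    IssuePendingWritebacks(&wb_context);
}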
4324 
4325 
4326 /*
4327  * Implement slower/larger portions of TestForOldSnapshot
4328  *
4329  * Smaller/faster portions are put inline, but the entire set of logic is too
4330  * big for that.
4331  */
4332 void
4333 TestForOldSnapshot_impl(Snapshot snapshot, Relation relation)
4334 {
4335  if (RelationAllowsEarlyPruning(relation)
4335  if (RelationAllowsEarlyPruning(relation)
4336  && (snapshot)->whenTaken < GetOldSnapshotThresholdTimestamp())
4337  ereport(ERROR,
4338  (errcode(ERRCODE_SNAPSHOT_TOO_OLD),
4339  errmsg("snapshot too old")));
4340 }
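
/*
 * Editor's note: a sketch (not part of bufmgr.c) of a caller-side check,
 * assuming the inline fast path TestForOldSnapshot(snapshot, relation, page)
 * declared in bufmgr.h; the helper name is hypothetical.
 */
static void
check_page_for_old_snapshot_sketch(Snapshot snapshot, Relation relation,
                                   Buffer buf)
{
    Page        page = BufferGetPage(buf);

    /* errors out with "snapshot too old" if early pruning may have occurred */
    TestForOldSnapshot(snapshot, relation, page);
}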