bufmgr.c
1 /*-------------------------------------------------------------------------
2  *
3  * bufmgr.c
4  * buffer manager interface routines
5  *
6  * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/storage/buffer/bufmgr.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 /*
16  * Principal entry points:
17  *
18  * ReadBuffer() -- find or create a buffer holding the requested page,
19  * and pin it so that no one can destroy it while this process
20  * is using it.
21  *
22  * ReleaseBuffer() -- unpin a buffer
23  *
24  * MarkBufferDirty() -- mark a pinned buffer's contents as "dirty".
25  * The disk write is delayed until buffer replacement or checkpoint.
26  *
27  * See also these files:
28  * freelist.c -- chooses victim for buffer replacement
29  * buf_table.c -- manages the buffer lookup table
30  */
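/*
 * A minimal usage sketch of these entry points, assuming "rel" and "blkno"
 * identify an existing heap block and that the caller writes appropriate
 * WAL for its changes:
 *
 *		Buffer		buf = ReadBuffer(rel, blkno);
 *
 *		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
 *		... modify the page via BufferGetPage(buf) ...
 *		MarkBufferDirty(buf);
 *		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
 *		ReleaseBuffer(buf);
 */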
31 #include "postgres.h"
32 
33 #include <sys/file.h>
34 #include <unistd.h>
35 
36 #include "access/tableam.h"
37 #include "access/xlog.h"
38 #include "catalog/catalog.h"
39 #include "catalog/storage.h"
40 #include "executor/instrument.h"
41 #include "lib/binaryheap.h"
42 #include "miscadmin.h"
43 #include "pg_trace.h"
44 #include "pgstat.h"
45 #include "postmaster/bgwriter.h"
46 #include "storage/buf_internals.h"
47 #include "storage/bufmgr.h"
48 #include "storage/ipc.h"
49 #include "storage/proc.h"
50 #include "storage/smgr.h"
51 #include "storage/standby.h"
52 #include "utils/ps_status.h"
53 #include "utils/rel.h"
54 #include "utils/resowner_private.h"
55 #include "utils/timestamp.h"
56 
57 
58 /* Note: these two macros only work on shared buffers, not local ones! */
59 #define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
60 #define BufferGetLSN(bufHdr) (PageGetLSN(BufHdrGetBlock(bufHdr)))
61 
62 /* Note: this macro only works on local buffers, not shared ones! */
63 #define LocalBufHdrGetBlock(bufHdr) \
64  LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
65 
66 /* Bits in SyncOneBuffer's return value */
67 #define BUF_WRITTEN 0x01
68 #define BUF_REUSABLE 0x02
69 
70 #define DROP_RELS_BSEARCH_THRESHOLD 20
71 
72 typedef struct PrivateRefCountEntry
73 {
74  Buffer buffer; /* Buffer id (1-based) */
75  int32 refcount; /* number of pins by this backend */
76 } PrivateRefCountEntry;
77 
78 /* 64 bytes, about the size of a cache line on common systems */
79 #define REFCOUNT_ARRAY_ENTRIES 8
80 
81 /*
82  * Status of buffers to checkpoint for a particular tablespace, used
83  * internally in BufferSync.
84  */
85 typedef struct CkptTsStatus
86 {
87  /* oid of the tablespace */
88  Oid tsId;
89 
90  /*
91  * Checkpoint progress for this tablespace. To make progress comparable
92  * between tablespaces the progress is, for each tablespace, measured as a
93  * number between 0 and the total number of to-be-checkpointed pages. Each
94  * page checkpointed in this tablespace increments this space's progress
95  * by progress_slice.
96  */
97  float8 progress;
98  float8 progress_slice;
99 
100  /* number of to-be checkpointed pages in this tablespace */
101  int num_to_scan;
102  /* already processed pages in this tablespace */
103  int num_scanned;
104 
105  /* current offset in CkptBufferIds for this tablespace */
106  int index;
107 } CkptTsStatus;
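/*
 * Worked example of the scheme above (values are illustrative): if the
 * checkpoint must write 1000 pages in total and 250 of them belong to this
 * tablespace, progress_slice is 1000/250 = 4.0, so each page written here
 * advances this tablespace's progress by 4 on the shared 0..1000 scale;
 * see the progress_slice computation in BufferSync().
 */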
108 
109 /* GUC variables */
110 bool zero_damaged_pages = false;
111 int bgwriter_lru_maxpages = 100;
112 double bgwriter_lru_multiplier = 2.0;
113 bool track_io_timing = false;
114 
115 /*
116  * How many buffers PrefetchBuffer callers should try to stay ahead of their
117  * ReadBuffer calls by. Zero means "never prefetch". This value is only used
118  * for buffers not belonging to tablespaces that have their
119  * effective_io_concurrency parameter set.
120  */
121 int effective_io_concurrency = 0;
122 
123 /*
124  * Like effective_io_concurrency, but used by maintenance code paths that might
125  * benefit from a higher setting because they work on behalf of many sessions.
126  * Overridden by the tablespace setting of the same name.
127  */
128 int maintenance_io_concurrency = 0;
129 
130 /*
131  * GUC variables about triggering kernel writeback for buffers written; OS
132  * dependent defaults are set via the GUC mechanism.
133  */
134 int checkpoint_flush_after = 0;
135 int bgwriter_flush_after = 0;
136 int backend_flush_after = 0;
137 
138 /* local state for StartBufferIO and related functions */
139 static BufferDesc *InProgressBuf = NULL;
140 static bool IsForInput;
141 
142 /* local state for LockBufferForCleanup */
143 static BufferDesc *PinCountWaitBuf = NULL;
144 
145 /*
146  * Backend-Private refcount management:
147  *
148  * Each buffer also has a private refcount that keeps track of the number of
149  * times the buffer is pinned in the current process. This is so that the
150  * shared refcount needs to be modified only once if a buffer is pinned more
151  * than once by an individual backend. It's also used to check that no buffers
152  * are still pinned at the end of transactions and when exiting.
153  *
154  *
155  * To avoid - as we used to - requiring an array with NBuffers entries to keep
156  * track of local buffers, we use a small sequentially searched array
157  * (PrivateRefCountArray) and an overflow hash table (PrivateRefCountHash) to
158  * keep track of backend local pins.
159  *
160  * Until no more than REFCOUNT_ARRAY_ENTRIES buffers are pinned at once, all
161  * refcounts are kept track of in the array; after that, new array entries
162  * displace old ones into the hash table. That way a frequently used entry
163  * can't get "stuck" in the hashtable while infrequent ones clog the array.
164  *
165  * Note that in most scenarios the number of pinned buffers will not exceed
166  * REFCOUNT_ARRAY_ENTRIES.
167  *
168  *
169  * To enter a buffer into the refcount tracking mechanism first reserve a free
170  * entry using ReservePrivateRefCountEntry() and then later, if necessary,
171  * fill it with NewPrivateRefCountEntry(). That split lets us avoid doing
172  * memory allocations in NewPrivateRefCountEntry() which can be important
173  * because in some scenarios it's called with a spinlock held...
174  */
175 static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES];
176 static HTAB *PrivateRefCountHash = NULL;
177 static int32 PrivateRefCountOverflowed = 0;
178 static uint32 PrivateRefCountClock = 0;
179 static PrivateRefCountEntry *ReservedRefCountEntry = NULL;
180 
181 static void ReservePrivateRefCountEntry(void);
182 static PrivateRefCountEntry *NewPrivateRefCountEntry(Buffer buffer);
183 static PrivateRefCountEntry *GetPrivateRefCountEntry(Buffer buffer, bool do_move);
184 static inline int32 GetPrivateRefCount(Buffer buffer);
185 static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref);
186 
187 /*
188  * Ensure that the PrivateRefCountArray has sufficient space to store one more
189  * entry. This has to be called before using NewPrivateRefCountEntry() to fill
190  * a new entry - but it's perfectly fine to not use a reserved entry.
191  */
192 static void
193 ReservePrivateRefCountEntry(void)
194 {
195  /* Already reserved (or freed), nothing to do */
196  if (ReservedRefCountEntry != NULL)
197  return;
198 
199  /*
200  * First search for a free entry in the array; that'll be sufficient in
201  * the majority of cases.
202  */
203  {
204  int i;
205 
206  for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
207  {
208  PrivateRefCountEntry *res;
209 
210  res = &PrivateRefCountArray[i];
211 
212  if (res->buffer == InvalidBuffer)
213  {
214  ReservedRefCountEntry = res;
215  return;
216  }
217  }
218  }
219 
220  /*
221  * No luck. All array entries are full. Move one array entry into the hash
222  * table.
223  */
224  {
225  /*
226  * Move entry from the current clock position in the array into the
227  * hashtable. Use that slot.
228  */
229  PrivateRefCountEntry *hashent;
230  bool found;
231 
232  /* select victim slot */
233  ReservedRefCountEntry =
234  &PrivateRefCountArray[PrivateRefCountClock++ % REFCOUNT_ARRAY_ENTRIES];
235 
236  /* Better be used, otherwise we shouldn't get here. */
237  Assert(ReservedRefCountEntry->buffer != InvalidBuffer);
238 
239  /* enter victim array entry into hashtable */
240  hashent = hash_search(PrivateRefCountHash,
241  (void *) &(ReservedRefCountEntry->buffer),
242  HASH_ENTER,
243  &found);
244  Assert(!found);
245  hashent->refcount = ReservedRefCountEntry->refcount;
246 
247  /* clear the now free array slot */
248  ReservedRefCountEntry->buffer = InvalidBuffer;
249  ReservedRefCountEntry->refcount = 0;
250 
251  PrivateRefCountOverflowed++;
252  }
253 }
254 
255 /*
256  * Fill a previously reserved refcount entry.
257  */
258 static PrivateRefCountEntry *
259 NewPrivateRefCountEntry(Buffer buffer)
260 {
261  PrivateRefCountEntry *res;
262 
263  /* only allowed to be called when a reservation has been made */
264  Assert(ReservedRefCountEntry != NULL);
265 
266  /* use up the reserved entry */
267  res = ReservedRefCountEntry;
268  ReservedRefCountEntry = NULL;
269 
270  /* and fill it */
271  res->buffer = buffer;
272  res->refcount = 0;
273 
274  return res;
275 }
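/*
 * A sketch of the intended reserve-then-fill pattern (see PinBuffer() and
 * PinBuffer_Locked() below for real callers); the point is that any
 * allocation happens in the reservation step, before a spinlock is taken:
 *
 *		ReservePrivateRefCountEntry();		-- no lock held yet, may allocate
 *		buf_state = LockBufHdr(buf);		-- spinlock now held
 *		...
 *		ref = NewPrivateRefCountEntry(b);	-- guaranteed not to allocate
 */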
276 
277 /*
278  * Return the PrivateRefCount entry for the passed buffer.
279  *
280  * Returns NULL if a buffer doesn't have a refcount entry. Otherwise, if
281  * do_move is true and the entry resides in the hashtable, the entry is
282  * moved to the array to optimize it for frequent access.
283  */
284 static PrivateRefCountEntry *
285 GetPrivateRefCountEntry(Buffer buffer, bool do_move)
286 {
287  PrivateRefCountEntry *res;
288 
289 
290  Assert(BufferIsValid(buffer));
291  Assert(!BufferIsLocal(buffer));
292 
293  /*
294  * First search for references in the array, that'll be sufficient in the
295  * majority of cases.
296  */
297  for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
298  {
299  res = &PrivateRefCountArray[i];
300 
301  if (res->buffer == buffer)
302  return res;
303  }
304 
305  /*
306  * By here we know that the buffer, if already pinned, isn't residing in
307  * the array.
308  *
309  * Only look up the buffer in the hashtable if we've previously overflowed
310  * into it.
311  */
312  if (PrivateRefCountOverflowed == 0)
313  return NULL;
314 
315  res = hash_search(PrivateRefCountHash,
316  (void *) &buffer,
317  HASH_FIND,
318  NULL);
319 
320  if (res == NULL)
321  return NULL;
322  else if (!do_move)
323  {
324  /* caller doesn't want us to move the hash entry into the array */
325  return res;
326  }
327  else
328  {
329  /* move buffer from hashtable into the free array slot */
330  bool found;
331  PrivateRefCountEntry *free;
332 
333  /* Ensure there's a free array slot */
334  ReservePrivateRefCountEntry();
335 
336  /* Use up the reserved slot */
337  Assert(ReservedRefCountEntry != NULL);
338  free = ReservedRefCountEntry;
339  ReservedRefCountEntry = NULL;
340  Assert(free->buffer == InvalidBuffer);
341 
342  /* and fill it */
343  free->buffer = buffer;
344  free->refcount = res->refcount;
345 
346  /* delete from hashtable */
347  hash_search(PrivateRefCountHash,
348  (void *) &buffer,
349  HASH_REMOVE,
350  &found);
351  Assert(found);
352  Assert(PrivateRefCountOverflowed > 0);
353  PrivateRefCountOverflowed--;
354 
355  return free;
356  }
357 }
358 
359 /*
360  * Returns how many times the passed buffer is pinned by this backend.
361  *
362  * Only works for shared memory buffers!
363  */
364 static inline int32
365 GetPrivateRefCount(Buffer buffer)
366 {
367  PrivateRefCountEntry *ref;
368 
369  Assert(BufferIsValid(buffer));
370  Assert(!BufferIsLocal(buffer));
371 
372  /*
373  * Not moving the entry - that's ok for the current users, but we might
374  * want to change this one day.
375  */
376  ref = GetPrivateRefCountEntry(buffer, false);
377 
378  if (ref == NULL)
379  return 0;
380  return ref->refcount;
381 }
382 
383 /*
384  * Release resources used to track the reference count of a buffer which we no
385  * longer have pinned and don't want to pin again immediately.
386  */
387 static void
388 ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref)
389 {
390  Assert(ref->refcount == 0);
391 
392  if (ref >= &PrivateRefCountArray[0] &&
393  ref < &PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES])
394  {
395  ref->buffer = InvalidBuffer;
396 
397  /*
398  * Mark the just used entry as reserved - in many scenarios that
399  * allows us to avoid ever having to search the array/hash for free
400  * entries.
401  */
402  ReservedRefCountEntry = ref;
403  }
404  else
405  {
406  bool found;
407  Buffer buffer = ref->buffer;
408 
409  hash_search(PrivateRefCountHash,
410  (void *) &buffer,
411  HASH_REMOVE,
412  &found);
413  Assert(found);
414  Assert(PrivateRefCountOverflowed > 0);
415  PrivateRefCountOverflowed--;
416  }
417 }
418 
419 /*
420  * BufferIsPinned
421  * True iff the buffer is pinned (also checks for valid buffer number).
422  *
423  * NOTE: what we check here is that *this* backend holds a pin on
424  * the buffer. We do not care whether some other backend does.
425  */
426 #define BufferIsPinned(bufnum) \
427 ( \
428  !BufferIsValid(bufnum) ? \
429  false \
430  : \
431  BufferIsLocal(bufnum) ? \
432  (LocalRefCount[-(bufnum) - 1] > 0) \
433  : \
434  (GetPrivateRefCount(bufnum) > 0) \
435 )
436 
437 
438 static Buffer ReadBuffer_common(SMgrRelation reln, char relpersistence,
439  ForkNumber forkNum, BlockNumber blockNum,
440  ReadBufferMode mode, BufferAccessStrategy strategy,
441  bool *hit);
442 static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy);
443 static void PinBuffer_Locked(BufferDesc *buf);
444 static void UnpinBuffer(BufferDesc *buf, bool fixOwner);
445 static void BufferSync(int flags);
446 static uint32 WaitBufHdrUnlocked(BufferDesc *buf);
447 static int SyncOneBuffer(int buf_id, bool skip_recently_used,
448  WritebackContext *wb_context);
449 static void WaitIO(BufferDesc *buf);
450 static bool StartBufferIO(BufferDesc *buf, bool forInput);
451 static void TerminateBufferIO(BufferDesc *buf, bool clear_dirty,
452  uint32 set_flag_bits);
453 static void shared_buffer_write_error_callback(void *arg);
454 static void local_buffer_write_error_callback(void *arg);
455 static BufferDesc *BufferAlloc(SMgrRelation smgr,
456  char relpersistence,
457  ForkNumber forkNum,
458  BlockNumber blockNum,
459  BufferAccessStrategy strategy,
460  bool *foundPtr);
461 static void FlushBuffer(BufferDesc *buf, SMgrRelation reln);
462 static void AtProcExit_Buffers(int code, Datum arg);
463 static void CheckForBufferLeaks(void);
464 static int rnode_comparator(const void *p1, const void *p2);
465 static int buffertag_comparator(const void *p1, const void *p2);
466 static int ckpt_buforder_comparator(const void *pa, const void *pb);
467 static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg);
468 
469 
470 /*
471  * PrefetchBuffer -- initiate asynchronous read of a block of a relation
472  *
473  * This is named by analogy to ReadBuffer but doesn't actually allocate a
474  * buffer. Instead it tries to ensure that a future ReadBuffer for the given
475  * block will not be delayed by the I/O. Prefetching is optional.
476  * No-op if prefetching isn't compiled in.
477  */
478 void
479 PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
480 {
481 #ifdef USE_PREFETCH
482  Assert(RelationIsValid(reln));
483  Assert(BlockNumberIsValid(blockNum));
484 
485  /* Open it at the smgr level if not already done */
486  RelationOpenSmgr(reln);
487 
488  if (RelationUsesLocalBuffers(reln))
489  {
490  /* see comments in ReadBufferExtended */
491  if (RELATION_IS_OTHER_TEMP(reln))
492  ereport(ERROR,
493  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
494  errmsg("cannot access temporary tables of other sessions")));
495 
496  /* pass it off to localbuf.c */
497  LocalPrefetchBuffer(reln->rd_smgr, forkNum, blockNum);
498  }
499  else
500  {
501  BufferTag newTag; /* identity of requested block */
502  uint32 newHash; /* hash value for newTag */
503  LWLock *newPartitionLock; /* buffer partition lock for it */
504  int buf_id;
505 
506  /* create a tag so we can lookup the buffer */
507  INIT_BUFFERTAG(newTag, reln->rd_smgr->smgr_rnode.node,
508  forkNum, blockNum);
509 
510  /* determine its hash code and partition lock ID */
511  newHash = BufTableHashCode(&newTag);
512  newPartitionLock = BufMappingPartitionLock(newHash);
513 
514  /* see if the block is in the buffer pool already */
515  LWLockAcquire(newPartitionLock, LW_SHARED);
516  buf_id = BufTableLookup(&newTag, newHash);
517  LWLockRelease(newPartitionLock);
518 
519  /* If not in buffers, initiate prefetch */
520  if (buf_id < 0)
521  smgrprefetch(reln->rd_smgr, forkNum, blockNum);
522 
523  /*
524  * If the block *is* in buffers, we do nothing. This is not really
525  * ideal: the block might be just about to be evicted, which would be
526  * stupid since we know we are going to need it soon. But the only
527  * easy answer is to bump the usage_count, which does not seem like a
528  * great solution: when the caller does ultimately touch the block,
529  * usage_count would get bumped again, resulting in too much
530  * favoritism for blocks that are involved in a prefetch sequence. A
531  * real fix would involve some additional per-buffer state, and it's
532  * not clear that there's enough of a problem to justify that.
533  */
534  }
535 #endif /* USE_PREFETCH */
536 }
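/*
 * A minimal sketch of the prefetch-ahead pattern this function enables,
 * assuming a caller that scans blocks 0..nblocks-1 of "rel" and stays
 * "distance" blocks ahead; the function and parameter names here are
 * illustrative, not part of bufmgr's API.
 */
static void
prefetch_then_read_sketch(Relation rel, BlockNumber nblocks, int distance)
{
	BlockNumber blkno;

	for (blkno = 0; blkno < nblocks; blkno++)
	{
		Buffer		buf;

		/* hint the kernel about a block we expect to need soon */
		if (blkno + distance < nblocks)
			PrefetchBuffer(rel, MAIN_FORKNUM, blkno + distance);

		/* the synchronous read; ideally the page is already in the OS cache */
		buf = ReadBuffer(rel, blkno);

		/* ... lock and examine the page here ... */

		ReleaseBuffer(buf);
	}
}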
537 
538 
539 /*
540  * ReadBuffer -- a shorthand for ReadBufferExtended, for reading from main
541  * fork with RBM_NORMAL mode and default strategy.
542  */
543 Buffer
544 ReadBuffer(Relation reln, BlockNumber blockNum)
545 {
546  return ReadBufferExtended(reln, MAIN_FORKNUM, blockNum, RBM_NORMAL, NULL);
547 }
548 
549 /*
550  * ReadBufferExtended -- returns a buffer containing the requested
551  * block of the requested relation. If the blknum
552  * requested is P_NEW, extend the relation file and
553  * allocate a new block. (Caller is responsible for
554  * ensuring that only one backend tries to extend a
555  * relation at the same time!)
556  *
557  * Returns: the buffer number for the buffer containing
558  * the block read. The returned buffer has been pinned.
559  * Does not return on error --- elog's instead.
560  *
561  * Assume when this function is called, that reln has been opened already.
562  *
563  * In RBM_NORMAL mode, the page is read from disk, and the page header is
564  * validated. An error is thrown if the page header is not valid. (But
565  * note that an all-zero page is considered "valid"; see PageIsVerified().)
566  *
567  * RBM_ZERO_ON_ERROR is like the normal mode, but if the page header is not
568  * valid, the page is zeroed instead of throwing an error. This is intended
569  * for non-critical data, where the caller is prepared to repair errors.
570  *
571  * In RBM_ZERO_AND_LOCK mode, if the page isn't in buffer cache already, it's
572  * filled with zeros instead of reading it from disk. Useful when the caller
573  * is going to fill the page from scratch, since this saves I/O and avoids
574  * unnecessary failure if the page-on-disk has corrupt page headers.
575  * The page is returned locked to ensure that the caller has a chance to
576  * initialize the page before it's made visible to others.
577  * Caution: do not use this mode to read a page that is beyond the relation's
578  * current physical EOF; that is likely to cause problems in md.c when
579  * the page is modified and written out. P_NEW is OK, though.
580  *
581  * RBM_ZERO_AND_CLEANUP_LOCK is the same as RBM_ZERO_AND_LOCK, but acquires
582  * a cleanup-strength lock on the page.
583  *
584  * RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here.
585  *
586  * If strategy is not NULL, a nondefault buffer access strategy is used.
587  * See buffer/README for details.
588  */
589 Buffer
590 ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum,
591  ReadBufferMode mode, BufferAccessStrategy strategy)
592 {
593  bool hit;
594  Buffer buf;
595 
596  /* Open it at the smgr level if not already done */
597  RelationOpenSmgr(reln);
598 
599  /*
600  * Reject attempts to read non-local temporary relations; we would be
601  * likely to get wrong data since we have no visibility into the owning
602  * session's local buffers.
603  */
604  if (RELATION_IS_OTHER_TEMP(reln))
605  ereport(ERROR,
606  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
607  errmsg("cannot access temporary tables of other sessions")));
608 
609  /*
610  * Read the buffer, and update pgstat counters to reflect a cache hit or
611  * miss.
612  */
613  pgstat_count_buffer_read(reln);
614  buf = ReadBuffer_common(reln->rd_smgr, reln->rd_rel->relpersistence,
615  forkNum, blockNum, mode, strategy, &hit);
616  if (hit)
617  pgstat_count_buffer_hit(reln);
618  return buf;
619 }
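/*
 * A sketch of calling ReadBufferExtended() with a nondefault strategy and
 * the RBM_ZERO_ON_ERROR mode described above; "rel" and "blkno" are assumed
 * to come from the caller, and the per-page work is elided.
 */
static void
read_with_strategy_sketch(Relation rel, BlockNumber blkno)
{
	BufferAccessStrategy strategy = GetAccessStrategy(BAS_BULKREAD);
	Buffer		buf;

	/* a page with a damaged header is returned zeroed rather than erroring */
	buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
							 RBM_ZERO_ON_ERROR, strategy);

	/* ... lock, inspect or repair the page, unlock ... */

	ReleaseBuffer(buf);
	FreeAccessStrategy(strategy);
}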
620 
621 
622 /*
623  * ReadBufferWithoutRelcache -- like ReadBufferExtended, but doesn't require
624  * a relcache entry for the relation.
625  *
626  * NB: At present, this function may only be used on permanent relations, which
627  * is OK, because we only use it during XLOG replay. If in the future we
628  * want to use it on temporary or unlogged relations, we could pass additional
629  * parameters.
630  */
631 Buffer
632 ReadBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum,
633  BlockNumber blockNum, ReadBufferMode mode,
634  BufferAccessStrategy strategy)
635 {
636  bool hit;
637 
638  SMgrRelation smgr = smgropen(rnode, InvalidBackendId);
639 
641 
642  return ReadBuffer_common(smgr, RELPERSISTENCE_PERMANENT, forkNum, blockNum,
643  mode, strategy, &hit);
644 }
645 
646 
647 /*
648  * ReadBuffer_common -- common logic for all ReadBuffer variants
649  *
650  * *hit is set to true if the request was satisfied from shared buffer cache.
651  */
652 static Buffer
653 ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
654  BlockNumber blockNum, ReadBufferMode mode,
655  BufferAccessStrategy strategy, bool *hit)
656 {
657  BufferDesc *bufHdr;
658  Block bufBlock;
659  bool found;
660  bool isExtend;
661  bool isLocalBuf = SmgrIsTemp(smgr);
662 
663  *hit = false;
664 
665  /* Make sure we will have room to remember the buffer pin */
666  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
667 
668  isExtend = (blockNum == P_NEW);
669 
670  TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
671  smgr->smgr_rnode.node.spcNode,
672  smgr->smgr_rnode.node.dbNode,
673  smgr->smgr_rnode.node.relNode,
674  smgr->smgr_rnode.backend,
675  isExtend);
676 
677  /* Substitute proper block number if caller asked for P_NEW */
678  if (isExtend)
679  blockNum = smgrnblocks(smgr, forkNum);
680 
681  if (isLocalBuf)
682  {
683  bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, &found);
684  if (found)
685  pgBufferUsage.local_blks_hit++;
686  else if (isExtend)
687  pgBufferUsage.local_blks_written++;
688  else if (mode == RBM_NORMAL || mode == RBM_NORMAL_NO_LOG ||
689  mode == RBM_ZERO_ON_ERROR)
690  pgBufferUsage.local_blks_read++;
691  }
692  else
693  {
694  /*
695  * lookup the buffer. IO_IN_PROGRESS is set if the requested block is
696  * not currently in memory.
697  */
698  bufHdr = BufferAlloc(smgr, relpersistence, forkNum, blockNum,
699  strategy, &found);
700  if (found)
701  pgBufferUsage.shared_blks_hit++;
702  else if (isExtend)
703  pgBufferUsage.shared_blks_written++;
704  else if (mode == RBM_NORMAL || mode == RBM_NORMAL_NO_LOG ||
705  mode == RBM_ZERO_ON_ERROR)
706  pgBufferUsage.shared_blks_read++;
707  }
708 
709  /* At this point we do NOT hold any locks. */
710 
711  /* if it was already in the buffer pool, we're done */
712  if (found)
713  {
714  if (!isExtend)
715  {
716  /* Just need to update stats before we exit */
717  *hit = true;
718  VacuumPageHit++;
719 
720  if (VacuumCostActive)
721  VacuumCostBalance += VacuumCostPageHit;
722 
723  TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
724  smgr->smgr_rnode.node.spcNode,
725  smgr->smgr_rnode.node.dbNode,
726  smgr->smgr_rnode.node.relNode,
727  smgr->smgr_rnode.backend,
728  isExtend,
729  found);
730 
731  /*
732  * In RBM_ZERO_AND_LOCK mode the caller expects the page to be
733  * locked on return.
734  */
735  if (!isLocalBuf)
736  {
737  if (mode == RBM_ZERO_AND_LOCK)
738  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr),
739  LW_EXCLUSIVE);
740  else if (mode == RBM_ZERO_AND_CLEANUP_LOCK)
741  LockBufferForCleanup(BufferDescriptorGetBuffer(bufHdr));
742  }
743 
744  return BufferDescriptorGetBuffer(bufHdr);
745  }
746 
747  /*
748  * We get here only in the corner case where we are trying to extend
749  * the relation but we found a pre-existing buffer marked BM_VALID.
750  * This can happen because mdread doesn't complain about reads beyond
751  * EOF (when zero_damaged_pages is ON) and so a previous attempt to
752  * read a block beyond EOF could have left a "valid" zero-filled
753  * buffer. Unfortunately, we have also seen this case occurring
754  * because of buggy Linux kernels that sometimes return an
755  * lseek(SEEK_END) result that doesn't account for a recent write. In
756  * that situation, the pre-existing buffer would contain valid data
757  * that we don't want to overwrite. Since the legitimate case should
758  * always have left a zero-filled buffer, complain if not PageIsNew.
759  */
760  bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
761  if (!PageIsNew((Page) bufBlock))
762  ereport(ERROR,
763  (errmsg("unexpected data beyond EOF in block %u of relation %s",
764  blockNum, relpath(smgr->smgr_rnode, forkNum)),
765  errhint("This has been seen to occur with buggy kernels; consider updating your system.")));
766 
767  /*
768  * We *must* do smgrextend before succeeding, else the page will not
769  * be reserved by the kernel, and the next P_NEW call will decide to
770  * return the same page. Clear the BM_VALID bit, do the StartBufferIO
771  * call that BufferAlloc didn't, and proceed.
772  */
773  if (isLocalBuf)
774  {
775  /* Only need to adjust flags */
776  uint32 buf_state = pg_atomic_read_u32(&bufHdr->state);
777 
778  Assert(buf_state & BM_VALID);
779  buf_state &= ~BM_VALID;
780  pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
781  }
782  else
783  {
784  /*
785  * Loop to handle the very small possibility that someone re-sets
786  * BM_VALID between our clearing it and StartBufferIO inspecting
787  * it.
788  */
789  do
790  {
791  uint32 buf_state = LockBufHdr(bufHdr);
792 
793  Assert(buf_state & BM_VALID);
794  buf_state &= ~BM_VALID;
795  UnlockBufHdr(bufHdr, buf_state);
796  } while (!StartBufferIO(bufHdr, true));
797  }
798  }
799 
800  /*
801  * if we have gotten to this point, we have allocated a buffer for the
802  * page but its contents are not yet valid. IO_IN_PROGRESS is set for it,
803  * if it's a shared buffer.
804  *
805  * Note: if smgrextend fails, we will end up with a buffer that is
806  * allocated but not marked BM_VALID. P_NEW will still select the same
807  * block number (because the relation didn't get any longer on disk) and
808  * so future attempts to extend the relation will find the same buffer (if
809  * it's not been recycled) but come right back here to try smgrextend
810  * again.
811  */
812  Assert(!(pg_atomic_read_u32(&bufHdr->state) & BM_VALID)); /* spinlock not needed */
813 
814  bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
815 
816  if (isExtend)
817  {
818  /* new buffers are zero-filled */
819  MemSet((char *) bufBlock, 0, BLCKSZ);
820  /* don't set checksum for all-zero page */
821  smgrextend(smgr, forkNum, blockNum, (char *) bufBlock, false);
822 
823  /*
824  * NB: we're *not* doing a ScheduleBufferTagForWriteback here,
825  * although we're essentially performing a write. At least on Linux,
826  * doing so defeats the 'delayed allocation' mechanism, leading to
827  * increased file fragmentation.
828  */
829  }
830  else
831  {
832  /*
833  * Read in the page, unless the caller intends to overwrite it and
834  * just wants us to allocate a buffer.
835  */
836  if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
837  MemSet((char *) bufBlock, 0, BLCKSZ);
838  else
839  {
840  instr_time io_start,
841  io_time;
842 
843  if (track_io_timing)
844  INSTR_TIME_SET_CURRENT(io_start);
845 
846  smgrread(smgr, forkNum, blockNum, (char *) bufBlock);
847 
848  if (track_io_timing)
849  {
850  INSTR_TIME_SET_CURRENT(io_time);
851  INSTR_TIME_SUBTRACT(io_time, io_start);
852  pgstat_count_buffer_read_time(INSTR_TIME_GET_MICROSEC(io_time));
853  INSTR_TIME_ADD(pgBufferUsage.blk_read_time, io_time);
854  }
855 
856  /* check for garbage data */
857  if (!PageIsVerified((Page) bufBlock, blockNum))
858  {
859  if (mode == RBM_ZERO_ON_ERROR || zero_damaged_pages)
860  {
861  ereport(WARNING,
862  (errcode(ERRCODE_DATA_CORRUPTED),
863  errmsg("invalid page in block %u of relation %s; zeroing out page",
864  blockNum,
865  relpath(smgr->smgr_rnode, forkNum))));
866  MemSet((char *) bufBlock, 0, BLCKSZ);
867  }
868  else
869  ereport(ERROR,
870  (errcode(ERRCODE_DATA_CORRUPTED),
871  errmsg("invalid page in block %u of relation %s",
872  blockNum,
873  relpath(smgr->smgr_rnode, forkNum))));
874  }
875  }
876  }
877 
878  /*
879  * In RBM_ZERO_AND_LOCK mode, grab the buffer content lock before marking
880  * the page as valid, to make sure that no other backend sees the zeroed
881  * page before the caller has had a chance to initialize it.
882  *
883  * Since no-one else can be looking at the page contents yet, there is no
884  * difference between an exclusive lock and a cleanup-strength lock. (Note
885  * that we cannot use LockBuffer() or LockBufferForCleanup() here, because
886  * they assert that the buffer is already valid.)
887  */
888  if ((mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) &&
889  !isLocalBuf)
890  {
891  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_EXCLUSIVE);
892  }
893 
894  if (isLocalBuf)
895  {
896  /* Only need to adjust flags */
897  uint32 buf_state = pg_atomic_read_u32(&bufHdr->state);
898 
899  buf_state |= BM_VALID;
900  pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
901  }
902  else
903  {
904  /* Set BM_VALID, terminate IO, and wake up any waiters */
905  TerminateBufferIO(bufHdr, false, BM_VALID);
906  }
907 
908  VacuumPageMiss++;
909  if (VacuumCostActive)
910  VacuumCostBalance += VacuumCostPageMiss;
911 
912  TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
913  smgr->smgr_rnode.node.spcNode,
914  smgr->smgr_rnode.node.dbNode,
915  smgr->smgr_rnode.node.relNode,
916  smgr->smgr_rnode.backend,
917  isExtend,
918  found);
919 
920  return BufferDescriptorGetBuffer(bufHdr);
921 }
922 
923 /*
924  * BufferAlloc -- subroutine for ReadBuffer. Handles lookup of a shared
925  * buffer. If no buffer exists already, selects a replacement
926  * victim and evicts the old page, but does NOT read in new page.
927  *
928  * "strategy" can be a buffer replacement strategy object, or NULL for
929  * the default strategy. The selected buffer's usage_count is advanced when
930  * using the default strategy, but otherwise possibly not (see PinBuffer).
931  *
932  * The returned buffer is pinned and is already marked as holding the
933  * desired page. If it already did have the desired page, *foundPtr is
934  * set true. Otherwise, *foundPtr is set false and the buffer is marked
935  * as IO_IN_PROGRESS; ReadBuffer will now need to do I/O to fill it.
936  *
937  * *foundPtr is actually redundant with the buffer's BM_VALID flag, but
938  * we keep it for simplicity in ReadBuffer.
939  *
940  * No locks are held either at entry or exit.
941  */
942 static BufferDesc *
943 BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
944  BlockNumber blockNum,
945  BufferAccessStrategy strategy,
946  bool *foundPtr)
947 {
948  BufferTag newTag; /* identity of requested block */
949  uint32 newHash; /* hash value for newTag */
950  LWLock *newPartitionLock; /* buffer partition lock for it */
951  BufferTag oldTag; /* previous identity of selected buffer */
952  uint32 oldHash; /* hash value for oldTag */
953  LWLock *oldPartitionLock; /* buffer partition lock for it */
954  uint32 oldFlags;
955  int buf_id;
956  BufferDesc *buf;
957  bool valid;
958  uint32 buf_state;
959 
960  /* create a tag so we can lookup the buffer */
961  INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
962 
963  /* determine its hash code and partition lock ID */
964  newHash = BufTableHashCode(&newTag);
965  newPartitionLock = BufMappingPartitionLock(newHash);
966 
967  /* see if the block is in the buffer pool already */
968  LWLockAcquire(newPartitionLock, LW_SHARED);
969  buf_id = BufTableLookup(&newTag, newHash);
970  if (buf_id >= 0)
971  {
972  /*
973  * Found it. Now, pin the buffer so no one can steal it from the
974  * buffer pool, and check to see if the correct data has been loaded
975  * into the buffer.
976  */
977  buf = GetBufferDescriptor(buf_id);
978 
979  valid = PinBuffer(buf, strategy);
980 
981  /* Can release the mapping lock as soon as we've pinned it */
982  LWLockRelease(newPartitionLock);
983 
984  *foundPtr = true;
985 
986  if (!valid)
987  {
988  /*
989  * We can only get here if (a) someone else is still reading in
990  * the page, or (b) a previous read attempt failed. We have to
991  * wait for any active read attempt to finish, and then set up our
992  * own read attempt if the page is still not BM_VALID.
993  * StartBufferIO does it all.
994  */
995  if (StartBufferIO(buf, true))
996  {
997  /*
998  * If we get here, previous attempts to read the buffer must
999  * have failed ... but we shall bravely try again.
1000  */
1001  *foundPtr = false;
1002  }
1003  }
1004 
1005  return buf;
1006  }
1007 
1008  /*
1009  * Didn't find it in the buffer pool. We'll have to initialize a new
1010  * buffer. Remember to unlock the mapping lock while doing the work.
1011  */
1012  LWLockRelease(newPartitionLock);
1013 
1014  /* Loop here in case we have to try another victim buffer */
1015  for (;;)
1016  {
1017  /*
1018  * Ensure, while the spinlock's not yet held, that there's a free
1019  * refcount entry.
1020  */
1021  ReservePrivateRefCountEntry();
1022 
1023  /*
1024  * Select a victim buffer. The buffer is returned with its header
1025  * spinlock still held!
1026  */
1027  buf = StrategyGetBuffer(strategy, &buf_state);
1028 
1029  Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 0);
1030 
1031  /* Must copy buffer flags while we still hold the spinlock */
1032  oldFlags = buf_state & BUF_FLAG_MASK;
1033 
1034  /* Pin the buffer and then release the buffer spinlock */
1035  PinBuffer_Locked(buf);
1036 
1037  /*
1038  * If the buffer was dirty, try to write it out. There is a race
1039  * condition here, in that someone might dirty it after we released it
1040  * above, or even while we are writing it out (since our share-lock
1041  * won't prevent hint-bit updates). We will recheck the dirty bit
1042  * after re-locking the buffer header.
1043  */
1044  if (oldFlags & BM_DIRTY)
1045  {
1046  /*
1047  * We need a share-lock on the buffer contents to write it out
1048  * (else we might write invalid data, eg because someone else is
1049  * compacting the page contents while we write). We must use a
1050  * conditional lock acquisition here to avoid deadlock. Even
1051  * though the buffer was not pinned (and therefore surely not
1052  * locked) when StrategyGetBuffer returned it, someone else could
1053  * have pinned and exclusive-locked it by the time we get here. If
1054  * we try to get the lock unconditionally, we'd block waiting for
1055  * them; if they later block waiting for us, deadlock ensues.
1056  * (This has been observed to happen when two backends are both
1057  * trying to split btree index pages, and the second one just
1058  * happens to be trying to split the page the first one got from
1059  * StrategyGetBuffer.)
1060  */
1061  if (LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
1062  LW_SHARED))
1063  {
1064  /*
1065  * If using a nondefault strategy, and writing the buffer
1066  * would require a WAL flush, let the strategy decide whether
1067  * to go ahead and write/reuse the buffer or to choose another
1068  * victim. We need lock to inspect the page LSN, so this
1069  * can't be done inside StrategyGetBuffer.
1070  */
1071  if (strategy != NULL)
1072  {
1073  XLogRecPtr lsn;
1074 
1075  /* Read the LSN while holding buffer header lock */
1076  buf_state = LockBufHdr(buf);
1077  lsn = BufferGetLSN(buf);
1078  UnlockBufHdr(buf, buf_state);
1079 
1080  if (XLogNeedsFlush(lsn) &&
1081  StrategyRejectBuffer(strategy, buf))
1082  {
1083  /* Drop lock/pin and loop around for another buffer */
1084  LWLockRelease(BufferDescriptorGetContentLock(buf));
1085  UnpinBuffer(buf, true);
1086  continue;
1087  }
1088  }
1089 
1090  /* OK, do the I/O */
1091  TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_START(forkNum, blockNum,
1092  smgr->smgr_rnode.node.spcNode,
1093  smgr->smgr_rnode.node.dbNode,
1094  smgr->smgr_rnode.node.relNode);
1095 
1096  FlushBuffer(buf, NULL);
1097  LWLockRelease(BufferDescriptorGetContentLock(buf));
1098 
1099  ScheduleBufferTagForWriteback(&BackendWritebackContext,
1100  &buf->tag);
1101 
1102  TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_DONE(forkNum, blockNum,
1103  smgr->smgr_rnode.node.spcNode,
1104  smgr->smgr_rnode.node.dbNode,
1105  smgr->smgr_rnode.node.relNode);
1106  }
1107  else
1108  {
1109  /*
1110  * Someone else has locked the buffer, so give it up and loop
1111  * back to get another one.
1112  */
1113  UnpinBuffer(buf, true);
1114  continue;
1115  }
1116  }
1117 
1118  /*
1119  * To change the association of a valid buffer, we'll need to have
1120  * exclusive lock on both the old and new mapping partitions.
1121  */
1122  if (oldFlags & BM_TAG_VALID)
1123  {
1124  /*
1125  * Need to compute the old tag's hashcode and partition lock ID.
1126  * XXX is it worth storing the hashcode in BufferDesc so we need
1127  * not recompute it here? Probably not.
1128  */
1129  oldTag = buf->tag;
1130  oldHash = BufTableHashCode(&oldTag);
1131  oldPartitionLock = BufMappingPartitionLock(oldHash);
1132 
1133  /*
1134  * Must lock the lower-numbered partition first to avoid
1135  * deadlocks.
1136  */
1137  if (oldPartitionLock < newPartitionLock)
1138  {
1139  LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
1140  LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1141  }
1142  else if (oldPartitionLock > newPartitionLock)
1143  {
1144  LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1145  LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
1146  }
1147  else
1148  {
1149  /* only one partition, only one lock */
1150  LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1151  }
1152  }
1153  else
1154  {
1155  /* if it wasn't valid, we need only the new partition */
1156  LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1157  /* remember we have no old-partition lock or tag */
1158  oldPartitionLock = NULL;
1159  /* keep the compiler quiet about uninitialized variables */
1160  oldHash = 0;
1161  }
1162 
1163  /*
1164  * Try to make a hashtable entry for the buffer under its new tag.
1165  * This could fail because while we were writing someone else
1166  * allocated another buffer for the same block we want to read in.
1167  * Note that we have not yet removed the hashtable entry for the old
1168  * tag.
1169  */
1170  buf_id = BufTableInsert(&newTag, newHash, buf->buf_id);
1171 
1172  if (buf_id >= 0)
1173  {
1174  /*
1175  * Got a collision. Someone has already done what we were about to
1176  * do. We'll just handle this as if it were found in the buffer
1177  * pool in the first place. First, give up the buffer we were
1178  * planning to use.
1179  */
1180  UnpinBuffer(buf, true);
1181 
1182  /* Can give up that buffer's mapping partition lock now */
1183  if (oldPartitionLock != NULL &&
1184  oldPartitionLock != newPartitionLock)
1185  LWLockRelease(oldPartitionLock);
1186 
1187  /* remaining code should match code at top of routine */
1188 
1189  buf = GetBufferDescriptor(buf_id);
1190 
1191  valid = PinBuffer(buf, strategy);
1192 
1193  /* Can release the mapping lock as soon as we've pinned it */
1194  LWLockRelease(newPartitionLock);
1195 
1196  *foundPtr = true;
1197 
1198  if (!valid)
1199  {
1200  /*
1201  * We can only get here if (a) someone else is still reading
1202  * in the page, or (b) a previous read attempt failed. We
1203  * have to wait for any active read attempt to finish, and
1204  * then set up our own read attempt if the page is still not
1205  * BM_VALID. StartBufferIO does it all.
1206  */
1207  if (StartBufferIO(buf, true))
1208  {
1209  /*
1210  * If we get here, previous attempts to read the buffer
1211  * must have failed ... but we shall bravely try again.
1212  */
1213  *foundPtr = false;
1214  }
1215  }
1216 
1217  return buf;
1218  }
1219 
1220  /*
1221  * Need to lock the buffer header too in order to change its tag.
1222  */
1223  buf_state = LockBufHdr(buf);
1224 
1225  /*
1226  * Somebody could have pinned or re-dirtied the buffer while we were
1227  * doing the I/O and making the new hashtable entry. If so, we can't
1228  * recycle this buffer; we must undo everything we've done and start
1229  * over with a new victim buffer.
1230  */
1231  oldFlags = buf_state & BUF_FLAG_MASK;
1232  if (BUF_STATE_GET_REFCOUNT(buf_state) == 1 && !(oldFlags & BM_DIRTY))
1233  break;
1234 
1235  UnlockBufHdr(buf, buf_state);
1236  BufTableDelete(&newTag, newHash);
1237  if (oldPartitionLock != NULL &&
1238  oldPartitionLock != newPartitionLock)
1239  LWLockRelease(oldPartitionLock);
1240  LWLockRelease(newPartitionLock);
1241  UnpinBuffer(buf, true);
1242  }
1243 
1244  /*
1245  * Okay, it's finally safe to rename the buffer.
1246  *
1247  * Clearing BM_VALID here is necessary, clearing the dirtybits is just
1248  * paranoia. We also reset the usage_count since any recency of use of
1249  * the old content is no longer relevant. (The usage_count starts out at
1250  * 1 so that the buffer can survive one clock-sweep pass.)
1251  *
1252  * Make sure BM_PERMANENT is set for buffers that must be written at every
1253  * checkpoint. Unlogged buffers only need to be written at shutdown
1254  * checkpoints, except for their "init" forks, which need to be treated
1255  * just like permanent relations.
1256  */
1257  buf->tag = newTag;
1258  buf_state &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED |
1259  BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT |
1260  BUF_USAGECOUNT_MASK);
1261  if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM)
1262  buf_state |= BM_TAG_VALID | BM_PERMANENT | BUF_USAGECOUNT_ONE;
1263  else
1264  buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
1265 
1266  UnlockBufHdr(buf, buf_state);
1267 
1268  if (oldPartitionLock != NULL)
1269  {
1270  BufTableDelete(&oldTag, oldHash);
1271  if (oldPartitionLock != newPartitionLock)
1272  LWLockRelease(oldPartitionLock);
1273  }
1274 
1275  LWLockRelease(newPartitionLock);
1276 
1277  /*
1278  * Buffer contents are currently invalid. Try to get the io_in_progress
1279  * lock. If StartBufferIO returns false, then someone else managed to
1280  * read it before we did, so there's nothing left for BufferAlloc() to do.
1281  */
1282  if (StartBufferIO(buf, true))
1283  *foundPtr = false;
1284  else
1285  *foundPtr = true;
1286 
1287  return buf;
1288 }
1289 
1290 /*
1291  * InvalidateBuffer -- mark a shared buffer invalid and return it to the
1292  * freelist.
1293  *
1294  * The buffer header spinlock must be held at entry. We drop it before
1295  * returning. (This is sane because the caller must have locked the
1296  * buffer in order to be sure it should be dropped.)
1297  *
1298  * This is used only in contexts such as dropping a relation. We assume
1299  * that no other backend could possibly be interested in using the page,
1300  * so the only reason the buffer might be pinned is if someone else is
1301  * trying to write it out. We have to let them finish before we can
1302  * reclaim the buffer.
1303  *
1304  * The buffer could get reclaimed by someone else while we are waiting
1305  * to acquire the necessary locks; if so, don't mess it up.
1306  */
1307 static void
1308 InvalidateBuffer(BufferDesc *buf)
1309 {
1310  BufferTag oldTag;
1311  uint32 oldHash; /* hash value for oldTag */
1312  LWLock *oldPartitionLock; /* buffer partition lock for it */
1313  uint32 oldFlags;
1314  uint32 buf_state;
1315 
1316  /* Save the original buffer tag before dropping the spinlock */
1317  oldTag = buf->tag;
1318 
1319  buf_state = pg_atomic_read_u32(&buf->state);
1320  Assert(buf_state & BM_LOCKED);
1321  UnlockBufHdr(buf, buf_state);
1322 
1323  /*
1324  * Need to compute the old tag's hashcode and partition lock ID. XXX is it
1325  * worth storing the hashcode in BufferDesc so we need not recompute it
1326  * here? Probably not.
1327  */
1328  oldHash = BufTableHashCode(&oldTag);
1329  oldPartitionLock = BufMappingPartitionLock(oldHash);
1330 
1331 retry:
1332 
1333  /*
1334  * Acquire exclusive mapping lock in preparation for changing the buffer's
1335  * association.
1336  */
1337  LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
1338 
1339  /* Re-lock the buffer header */
1340  buf_state = LockBufHdr(buf);
1341 
1342  /* If it's changed while we were waiting for lock, do nothing */
1343  if (!BUFFERTAGS_EQUAL(buf->tag, oldTag))
1344  {
1345  UnlockBufHdr(buf, buf_state);
1346  LWLockRelease(oldPartitionLock);
1347  return;
1348  }
1349 
1350  /*
1351  * We assume the only reason for it to be pinned is that someone else is
1352  * flushing the page out. Wait for them to finish. (This could be an
1353  * infinite loop if the refcount is messed up... it would be nice to time
1354  * out after a while, but there seems no way to be sure how many loops may
1355  * be needed. Note that if the other guy has pinned the buffer but not
1356  * yet done StartBufferIO, WaitIO will fall through and we'll effectively
1357  * be busy-looping here.)
1358  */
1359  if (BUF_STATE_GET_REFCOUNT(buf_state) != 0)
1360  {
1361  UnlockBufHdr(buf, buf_state);
1362  LWLockRelease(oldPartitionLock);
1363  /* safety check: should definitely not be our *own* pin */
1364  if (GetPrivateRefCount(BufferDescriptorGetBuffer(buf)) > 0)
1365  elog(ERROR, "buffer is pinned in InvalidateBuffer");
1366  WaitIO(buf);
1367  goto retry;
1368  }
1369 
1370  /*
1371  * Clear out the buffer's tag and flags. We must do this to ensure that
1372  * linear scans of the buffer array don't think the buffer is valid.
1373  */
1374  oldFlags = buf_state & BUF_FLAG_MASK;
1375  CLEAR_BUFFERTAG(buf->tag);
1376  buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
1377  UnlockBufHdr(buf, buf_state);
1378 
1379  /*
1380  * Remove the buffer from the lookup hashtable, if it was in there.
1381  */
1382  if (oldFlags & BM_TAG_VALID)
1383  BufTableDelete(&oldTag, oldHash);
1384 
1385  /*
1386  * Done with mapping lock.
1387  */
1388  LWLockRelease(oldPartitionLock);
1389 
1390  /*
1391  * Insert the buffer at the head of the list of free buffers.
1392  */
1393  StrategyFreeBuffer(buf);
1394 }
1395 
1396 /*
1397  * MarkBufferDirty
1398  *
1399  * Marks buffer contents as dirty (actual write happens later).
1400  *
1401  * Buffer must be pinned and exclusive-locked. (If caller does not hold
1402  * exclusive lock, then somebody could be in process of writing the buffer,
1403  * leading to risk of bad data written to disk.)
1404  */
1405 void
1406 MarkBufferDirty(Buffer buffer)
1407 {
1408  BufferDesc *bufHdr;
1409  uint32 buf_state;
1410  uint32 old_buf_state;
1411 
1412  if (!BufferIsValid(buffer))
1413  elog(ERROR, "bad buffer ID: %d", buffer);
1414 
1415  if (BufferIsLocal(buffer))
1416  {
1417  MarkLocalBufferDirty(buffer);
1418  return;
1419  }
1420 
1421  bufHdr = GetBufferDescriptor(buffer - 1);
1422 
1423  Assert(BufferIsPinned(buffer));
1424  Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
1425  LW_EXCLUSIVE));
1426 
1427  old_buf_state = pg_atomic_read_u32(&bufHdr->state);
1428  for (;;)
1429  {
1430  if (old_buf_state & BM_LOCKED)
1431  old_buf_state = WaitBufHdrUnlocked(bufHdr);
1432 
1433  buf_state = old_buf_state;
1434 
1435  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
1436  buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
1437 
1438  if (pg_atomic_compare_exchange_u32(&bufHdr->state, &old_buf_state,
1439  buf_state))
1440  break;
1441  }
1442 
1443  /*
1444  * If the buffer was not dirty already, do vacuum accounting.
1445  */
1446  if (!(old_buf_state & BM_DIRTY))
1447  {
1448  VacuumPageDirty++;
1449  pgBufferUsage.shared_blks_dirtied++;
1450  if (VacuumCostActive)
1451  VacuumCostBalance += VacuumCostPageDirty;
1452  }
1453 }
1454 
1455 /*
1456  * ReleaseAndReadBuffer -- combine ReleaseBuffer() and ReadBuffer()
1457  *
1458  * Formerly, this saved one cycle of acquiring/releasing the BufMgrLock
1459  * compared to calling the two routines separately. Now it's mainly just
1460  * a convenience function. However, if the passed buffer is valid and
1461  * already contains the desired block, we just return it as-is; and that
1462  * does save considerable work compared to a full release and reacquire.
1463  *
1464  * Note: it is OK to pass buffer == InvalidBuffer, indicating that no old
1465  * buffer actually needs to be released. This case is the same as ReadBuffer,
1466  * but can save some tests in the caller.
1467  */
1468 Buffer
1469 ReleaseAndReadBuffer(Buffer buffer,
1470  Relation relation,
1471  BlockNumber blockNum)
1472 {
1473  ForkNumber forkNum = MAIN_FORKNUM;
1474  BufferDesc *bufHdr;
1475 
1476  if (BufferIsValid(buffer))
1477  {
1478  Assert(BufferIsPinned(buffer));
1479  if (BufferIsLocal(buffer))
1480  {
1481  bufHdr = GetLocalBufferDescriptor(-buffer - 1);
1482  if (bufHdr->tag.blockNum == blockNum &&
1483  RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node) &&
1484  bufHdr->tag.forkNum == forkNum)
1485  return buffer;
1486  ResourceOwnerForgetBuffer(CurrentResourceOwner, buffer);
1487  LocalRefCount[-buffer - 1]--;
1488  }
1489  else
1490  {
1491  bufHdr = GetBufferDescriptor(buffer - 1);
1492  /* we have pin, so it's ok to examine tag without spinlock */
1493  if (bufHdr->tag.blockNum == blockNum &&
1494  RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node) &&
1495  bufHdr->tag.forkNum == forkNum)
1496  return buffer;
1497  UnpinBuffer(bufHdr, true);
1498  }
1499  }
1500 
1501  return ReadBuffer(relation, blockNum);
1502 }
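/*
 * A sketch of the calling pattern this function is meant for: visiting a
 * series of blocks while holding at most one pin at a time.  The relation,
 * block range, and per-page work are placeholders.
 */
static void
walk_blocks_sketch(Relation rel, BlockNumber nblocks)
{
	Buffer		buf = InvalidBuffer;
	BlockNumber blkno;

	for (blkno = 0; blkno < nblocks; blkno++)
	{
		/* drops the previous pin (if any) unless it already holds blkno */
		buf = ReleaseAndReadBuffer(buf, rel, blkno);

		/* ... lock and examine the page here ... */
	}

	if (BufferIsValid(buf))
		ReleaseBuffer(buf);
}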
1503 
1504 /*
1505  * PinBuffer -- make buffer unavailable for replacement.
1506  *
1507  * For the default access strategy, the buffer's usage_count is incremented
1508  * when we first pin it; for other strategies we just make sure the usage_count
1509  * isn't zero. (The idea of the latter is that we don't want synchronized
1510  * heap scans to inflate the count, but we need it to not be zero to discourage
1511  * other backends from stealing buffers from our ring. As long as we cycle
1512  * through the ring faster than the global clock-sweep cycles, buffers in
1513  * our ring won't be chosen as victims for replacement by other backends.)
1514  *
1515  * This should be applied only to shared buffers, never local ones.
1516  *
1517  * Since buffers are pinned/unpinned very frequently, pin buffers without
1518  * taking the buffer header lock; instead update the state variable in loop of
1519  * CAS operations. Hopefully it's just a single CAS.
1520  *
1521  * Note that ResourceOwnerEnlargeBuffers must have been done already.
1522  *
1523  * Returns true if buffer is BM_VALID, else false. This provision allows
1524  * some callers to avoid an extra spinlock cycle.
1525  */
1526 static bool
1527 PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
1528 {
1529  Buffer b = BufferDescriptorGetBuffer(buf);
1530  bool result;
1531  PrivateRefCountEntry *ref;
1532 
1533  ref = GetPrivateRefCountEntry(b, true);
1534 
1535  if (ref == NULL)
1536  {
1537  uint32 buf_state;
1538  uint32 old_buf_state;
1539 
1540  ReservePrivateRefCountEntry();
1541  ref = NewPrivateRefCountEntry(b);
1542 
1543  old_buf_state = pg_atomic_read_u32(&buf->state);
1544  for (;;)
1545  {
1546  if (old_buf_state & BM_LOCKED)
1547  old_buf_state = WaitBufHdrUnlocked(buf);
1548 
1549  buf_state = old_buf_state;
1550 
1551  /* increase refcount */
1552  buf_state += BUF_REFCOUNT_ONE;
1553 
1554  if (strategy == NULL)
1555  {
1556  /* Default case: increase usagecount unless already max. */
1557  if (BUF_STATE_GET_USAGECOUNT(buf_state) < BM_MAX_USAGE_COUNT)
1558  buf_state += BUF_USAGECOUNT_ONE;
1559  }
1560  else
1561  {
1562  /*
1563  * Ring buffers shouldn't evict others from pool. Thus we
1564  * don't make usagecount more than 1.
1565  */
1566  if (BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
1567  buf_state += BUF_USAGECOUNT_ONE;
1568  }
1569 
1570  if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
1571  buf_state))
1572  {
1573  result = (buf_state & BM_VALID) != 0;
1574  break;
1575  }
1576  }
1577  }
1578  else
1579  {
1580  /* If we previously pinned the buffer, it must surely be valid */
1581  result = true;
1582  }
1583 
1584  ref->refcount++;
1585  Assert(ref->refcount > 0);
1586  ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
1587  return result;
1588 }
1589 
1590 /*
1591  * PinBuffer_Locked -- as above, but caller already locked the buffer header.
1592  * The spinlock is released before return.
1593  *
1594  * As this function is called with the spinlock held, the caller has to
1595  * previously call ReservePrivateRefCountEntry().
1596  *
1597  * Currently, no callers of this function want to modify the buffer's
1598  * usage_count at all, so there's no need for a strategy parameter.
1599  * Also we don't bother with a BM_VALID test (the caller could check that for
1600  * itself).
1601  *
1602  * Also all callers only ever use this function when it's known that the
1603  * buffer can't have a preexisting pin by this backend. That allows us to skip
1604  * searching the private refcount array & hash, which is a boon, because the
1605  * spinlock is still held.
1606  *
1607  * Note: use of this routine is frequently mandatory, not just an optimization
1608  * to save a spin lock/unlock cycle, because we need to pin a buffer before
1609  * its state can change under us.
1610  */
1611 static void
1612 PinBuffer_Locked(BufferDesc *buf)
1613 {
1614  Buffer b;
1615  PrivateRefCountEntry *ref;
1616  uint32 buf_state;
1617 
1618  /*
1619  * As explained, we don't expect any preexisting pins. That allows us to
1620  * manipulate the PrivateRefCount after releasing the spinlock.
1621  */
1622  Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
1623 
1624  /*
1625  * Since we hold the buffer spinlock, we can update the buffer state and
1626  * release the lock in one operation.
1627  */
1628  buf_state = pg_atomic_read_u32(&buf->state);
1629  Assert(buf_state & BM_LOCKED);
1630  buf_state += BUF_REFCOUNT_ONE;
1631  UnlockBufHdr(buf, buf_state);
1632 
1633  b = BufferDescriptorGetBuffer(buf);
1634 
1635  ref = NewPrivateRefCountEntry(b);
1636  ref->refcount++;
1637  ref->refcount++;
1638  ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
1639 }
1640 
1641 /*
1642  * UnpinBuffer -- make buffer available for replacement.
1643  *
1644  * This should be applied only to shared buffers, never local ones.
1645  *
1646  * Most but not all callers want CurrentResourceOwner to be adjusted.
1647  * Those that don't should pass fixOwner = false.
1648  */
1649 static void
1650 UnpinBuffer(BufferDesc *buf, bool fixOwner)
1651 {
1652  PrivateRefCountEntry *ref;
1653  Buffer b = BufferDescriptorGetBuffer(buf);
1654 
1655  /* not moving as we're likely deleting it soon anyway */
1656  ref = GetPrivateRefCountEntry(b, false);
1657  Assert(ref != NULL);
1658 
1659  if (fixOwner)
1660  ResourceOwnerForgetBuffer(CurrentResourceOwner, b);
1661 
1662  Assert(ref->refcount > 0);
1663  ref->refcount--;
1664  if (ref->refcount == 0)
1665  {
1666  uint32 buf_state;
1667  uint32 old_buf_state;
1668 
1669  /* I'd better not still hold any locks on the buffer */
1670  Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf)));
1671  Assert(!LWLockHeldByMe(BufferDescriptorGetIOLock(buf)));
1672 
1673  /*
1674  * Decrement the shared reference count.
1675  *
1676  * Since buffer spinlock holder can update status using just write,
1677  * it's not safe to use atomic decrement here; thus use a CAS loop.
1678  */
1679  old_buf_state = pg_atomic_read_u32(&buf->state);
1680  for (;;)
1681  {
1682  if (old_buf_state & BM_LOCKED)
1683  old_buf_state = WaitBufHdrUnlocked(buf);
1684 
1685  buf_state = old_buf_state;
1686 
1687  buf_state -= BUF_REFCOUNT_ONE;
1688 
1689  if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
1690  buf_state))
1691  break;
1692  }
1693 
1694  /* Support LockBufferForCleanup() */
1695  if (buf_state & BM_PIN_COUNT_WAITER)
1696  {
1697  /*
1698  * Acquire the buffer header lock, re-check that there's a waiter.
1699  * Another backend could have unpinned this buffer, and already
1700  * woken up the waiter. There's no danger of the buffer being
1701  * replaced after we unpinned it above, as it's pinned by the
1702  * waiter.
1703  */
1704  buf_state = LockBufHdr(buf);
1705 
1706  if ((buf_state & BM_PIN_COUNT_WAITER) &&
1707  BUF_STATE_GET_REFCOUNT(buf_state) == 1)
1708  {
1709  /* we just released the last pin other than the waiter's */
1710  int wait_backend_pid = buf->wait_backend_pid;
1711 
1712  buf_state &= ~BM_PIN_COUNT_WAITER;
1713  UnlockBufHdr(buf, buf_state);
1714  ProcSendSignal(wait_backend_pid);
1715  }
1716  else
1717  UnlockBufHdr(buf, buf_state);
1718  }
1720  }
1721 }
1722 
1723 /*
1724  * BufferSync -- Write out all dirty buffers in the pool.
1725  *
1726  * This is called at checkpoint time to write out all dirty shared buffers.
1727  * The checkpoint request flags should be passed in. If CHECKPOINT_IMMEDIATE
1728  * is set, we disable delays between writes; if CHECKPOINT_IS_SHUTDOWN,
1729  * CHECKPOINT_END_OF_RECOVERY or CHECKPOINT_FLUSH_ALL is set, we write even
1730  * unlogged buffers, which are otherwise skipped. The remaining flags
1731  * currently have no effect here.
1732  */
1733 static void
1734 BufferSync(int flags)
1735 {
1736  uint32 buf_state;
1737  int buf_id;
1738  int num_to_scan;
1739  int num_spaces;
1740  int num_processed;
1741  int num_written;
1742  CkptTsStatus *per_ts_stat = NULL;
1743  Oid last_tsid;
1744  binaryheap *ts_heap;
1745  int i;
1746  int mask = BM_DIRTY;
1747  WritebackContext wb_context;
1748 
1749  /* Make sure we can handle the pin inside SyncOneBuffer */
1750  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
1751 
1752  /*
1753  * Unless this is a shutdown checkpoint or we have been explicitly told,
1754  * we write only permanent, dirty buffers. But at shutdown or end of
1755  * recovery, we write all dirty buffers.
1756  */
1759  mask |= BM_PERMANENT;
1760 
1761  /*
1762  * Loop over all buffers, and mark the ones that need to be written with
1763  * BM_CHECKPOINT_NEEDED. Count them as we go (num_to_scan), so that we
1764  * can estimate how much work needs to be done.
1765  *
1766  * This allows us to write only those pages that were dirty when the
1767  * checkpoint began, and not those that get dirtied while it proceeds.
1768  * Whenever a page with BM_CHECKPOINT_NEEDED is written out, either by us
1769  * later in this function, or by normal backends or the bgwriter cleaning
1770  * scan, the flag is cleared. Any buffer dirtied after this point won't
1771  * have the flag set.
1772  *
1773  * Note that if we fail to write some buffer, we may leave buffers with
1774  * BM_CHECKPOINT_NEEDED still set. This is OK since any such buffer would
1775  * certainly need to be written for the next checkpoint attempt, too.
1776  */
1777  num_to_scan = 0;
1778  for (buf_id = 0; buf_id < NBuffers; buf_id++)
1779  {
1780  BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
1781 
1782  /*
1783  * Header spinlock is enough to examine BM_DIRTY, see comment in
1784  * SyncOneBuffer.
1785  */
1786  buf_state = LockBufHdr(bufHdr);
1787 
1788  if ((buf_state & mask) == mask)
1789  {
1790  CkptSortItem *item;
1791 
1792  buf_state |= BM_CHECKPOINT_NEEDED;
1793 
1794  item = &CkptBufferIds[num_to_scan++];
1795  item->buf_id = buf_id;
1796  item->tsId = bufHdr->tag.rnode.spcNode;
1797  item->relNode = bufHdr->tag.rnode.relNode;
1798  item->forkNum = bufHdr->tag.forkNum;
1799  item->blockNum = bufHdr->tag.blockNum;
1800  }
1801 
1802  UnlockBufHdr(bufHdr, buf_state);
1803 
1804  /* Check for barrier events in case NBuffers is large. */
1805  if (ProcSignalBarrierPending)
1806  ProcessProcSignalBarrier();
1807  }
1808 
1809  if (num_to_scan == 0)
1810  return; /* nothing to do */
1811 
1812  WritebackContextInit(&wb_context, &checkpoint_flush_after);
1813 
1814  TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_scan);
1815 
1816  /*
1817  * Sort buffers that need to be written to reduce the likelihood of random
1818  * IO. The sorting is also important for the implementation of balancing
1819  * writes between tablespaces. Without balancing writes we'd potentially
1820  * end up writing to the tablespaces one-by-one; possibly overloading the
1821  * underlying system.
1822  */
1823  qsort(CkptBufferIds, num_to_scan, sizeof(CkptSortItem),
1824  ckpt_buforder_comparator);
1825 
1826  num_spaces = 0;
1827 
1828  /*
1829  * Allocate progress status for each tablespace with buffers that need to
1830  * be flushed. This requires the to-be-flushed array to be sorted.
1831  */
1832  last_tsid = InvalidOid;
1833  for (i = 0; i < num_to_scan; i++)
1834  {
1835  CkptTsStatus *s;
1836  Oid cur_tsid;
1837 
1838  cur_tsid = CkptBufferIds[i].tsId;
1839 
1840  /*
1841  * Grow array of per-tablespace status structs, every time a new
1842  * tablespace is found.
1843  */
1844  if (last_tsid == InvalidOid || last_tsid != cur_tsid)
1845  {
1846  Size sz;
1847 
1848  num_spaces++;
1849 
1850  /*
1851  * Not worth adding grow-by-power-of-2 logic here - even with a
1852  * few hundred tablespaces this should be fine.
1853  */
1854  sz = sizeof(CkptTsStatus) * num_spaces;
1855 
1856  if (per_ts_stat == NULL)
1857  per_ts_stat = (CkptTsStatus *) palloc(sz);
1858  else
1859  per_ts_stat = (CkptTsStatus *) repalloc(per_ts_stat, sz);
1860 
1861  s = &per_ts_stat[num_spaces - 1];
1862  memset(s, 0, sizeof(*s));
1863  s->tsId = cur_tsid;
1864 
1865  /*
1866  * The first buffer in this tablespace. As CkptBufferIds is sorted
1867  * by tablespace all (s->num_to_scan) buffers in this tablespace
1868  * will follow afterwards.
1869  */
1870  s->index = i;
1871 
1872  /*
1873  * progress_slice will be determined once we know how many buffers
1874  * are in each tablespace, i.e. after this loop.
1875  */
1876 
1877  last_tsid = cur_tsid;
1878  }
1879  else
1880  {
1881  s = &per_ts_stat[num_spaces - 1];
1882  }
1883 
1884  s->num_to_scan++;
1885 
1886  /* Check for barrier events. */
1887  if (ProcSignalBarrierPending)
1888  ProcessProcSignalBarrier();
1889  }
1890 
1891  Assert(num_spaces > 0);
1892 
1893  /*
1894  * Build a min-heap over the write-progress in the individual tablespaces,
1895  * and compute how large a portion of the total progress a single
1896  * processed buffer is.
1897  */
1898  ts_heap = binaryheap_allocate(num_spaces,
1899  ts_ckpt_progress_comparator,
1900  NULL);
1901 
1902  for (i = 0; i < num_spaces; i++)
1903  {
1904  CkptTsStatus *ts_stat = &per_ts_stat[i];
1905 
1906  ts_stat->progress_slice = (float8) num_to_scan / ts_stat->num_to_scan;
1907 
1908  binaryheap_add_unordered(ts_heap, PointerGetDatum(ts_stat));
1909  }
1910 
1911  binaryheap_build(ts_heap);
1912 
1913  /*
1914  * Iterate through to-be-checkpointed buffers and write the ones (still)
1915  * marked with BM_CHECKPOINT_NEEDED. The writes are balanced between
1916  * tablespaces; otherwise the sorting would lead to only one tablespace
1917  * receiving writes at a time, making inefficient use of the hardware.
1918  */
1919  num_processed = 0;
1920  num_written = 0;
1921  while (!binaryheap_empty(ts_heap))
1922  {
1923  BufferDesc *bufHdr = NULL;
1924  CkptTsStatus *ts_stat = (CkptTsStatus *)
1925  DatumGetPointer(binaryheap_first(ts_heap));
1926 
1927  buf_id = CkptBufferIds[ts_stat->index].buf_id;
1928  Assert(buf_id != -1);
1929 
1930  bufHdr = GetBufferDescriptor(buf_id);
1931 
1932  num_processed++;
1933 
1934  /*
1935  * We don't need to acquire the lock here, because we're only looking
1936  * at a single bit. It's possible that someone else writes the buffer
1937  * and clears the flag right after we check, but that doesn't matter
1938  * since SyncOneBuffer will then do nothing. However, there is a
1939  * further race condition: it's conceivable that between the time we
1940  * examine the bit here and the time SyncOneBuffer acquires the lock,
1941  * someone else not only wrote the buffer but replaced it with another
1942  * page and dirtied it. In that improbable case, SyncOneBuffer will
1943  * write the buffer though we didn't need to. It doesn't seem worth
1944  * guarding against this, though.
1945  */
1946  if (pg_atomic_read_u32(&bufHdr->state) & BM_CHECKPOINT_NEEDED)
1947  {
1948  if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
1949  {
1950  TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
1951  BgWriterStats.m_buf_written_checkpoints++;
1952  num_written++;
1953  }
1954  }
1955 
1956  /*
1957  * Measure progress independently of whether we actually had to flush
1958  * the buffer - otherwise writes would become unbalanced.
1959  */
1960  ts_stat->progress += ts_stat->progress_slice;
1961  ts_stat->num_scanned++;
1962  ts_stat->index++;
1963 
1964  /* Have all the buffers from the tablespace been processed? */
1965  if (ts_stat->num_scanned == ts_stat->num_to_scan)
1966  {
1967  binaryheap_remove_first(ts_heap);
1968  }
1969  else
1970  {
1971  /* update heap with the new progress */
1972  binaryheap_replace_first(ts_heap, PointerGetDatum(ts_stat));
1973  }
1974 
1975  /*
1976  * Sleep to throttle our I/O rate.
1977  *
1978  * (This will check for barrier events even if it doesn't sleep.)
1979  */
1980  CheckpointWriteDelay(flags, (double) num_processed / num_to_scan);
1981  }
1982 
1983  /* issue all pending flushes */
1984  IssuePendingWritebacks(&wb_context);
1985 
1986  pfree(per_ts_stat);
1987  per_ts_stat = NULL;
1988  binaryheap_free(ts_heap);
1989 
1990  /*
1991  * Update checkpoint statistics. As noted above, this doesn't include
1992  * buffers written by other backends or bgwriter scan.
1993  */
1994  CheckpointStats.ckpt_bufs_written += num_written;
1995 
1996  TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_scan);
1997 }
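/*
 * Editorial sketch (not part of bufmgr.c): a standalone illustration of how
 * the per-tablespace progress_slice arithmetic in BufferSync keeps checkpoint
 * writes balanced.  All values below are invented; only plain C is used, no
 * PostgreSQL headers or APIs.
 */
#ifdef NOT_USED
static void
progress_slice_example(void)
{
	double		num_to_scan = 1000.0;	/* total buffers to checkpoint */
	double		small_ts = 100.0;		/* buffers in a small tablespace */
	double		large_ts = 900.0;		/* buffers in a large tablespace */

	/* each tablespace's slice scales its progress to the same 0..1000 range */
	double		small_slice = num_to_scan / small_ts;	/* 10.0 per buffer */
	double		large_slice = num_to_scan / large_ts;	/* ~1.11 per buffer */

	/*
	 * After writing 50 buffers from the small tablespace and 450 from the
	 * large one, both report progress 500, so the min-heap keeps alternating
	 * between them instead of draining one tablespace first.
	 */
	double		small_progress = 50 * small_slice;		/* 500.0 */
	double		large_progress = 450 * large_slice;		/* 500.0 */

	(void) small_progress;
	(void) large_progress;
}
#endif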
1998 
1999 /*
2000  * BgBufferSync -- Write out some dirty buffers in the pool.
2001  *
2002  * This is called periodically by the background writer process.
2003  *
2004  * Returns true if it's appropriate for the bgwriter process to go into
2005  * low-power hibernation mode. (This happens if the strategy clock sweep
2006  * has been "lapped" and no buffer allocations have occurred recently,
2007  * or if the bgwriter has been effectively disabled by setting
2008  * bgwriter_lru_maxpages to 0.)
2009  */
2010 bool
2011 BgBufferSync(WritebackContext *wb_context)
2012 {
2013  /* info obtained from freelist.c */
2014  int strategy_buf_id;
2015  uint32 strategy_passes;
2016  uint32 recent_alloc;
2017 
2018  /*
2019  * Information saved between calls so we can determine the strategy
2020  * point's advance rate and avoid scanning already-cleaned buffers.
2021  */
2022  static bool saved_info_valid = false;
2023  static int prev_strategy_buf_id;
2024  static uint32 prev_strategy_passes;
2025  static int next_to_clean;
2026  static uint32 next_passes;
2027 
2028  /* Moving averages of allocation rate and clean-buffer density */
2029  static float smoothed_alloc = 0;
2030  static float smoothed_density = 10.0;
2031 
2032  /* Potentially these could be tunables, but for now, not */
2033  float smoothing_samples = 16;
2034  float scan_whole_pool_milliseconds = 120000.0;
2035 
2036  /* Used to compute how far we scan ahead */
2037  long strategy_delta;
2038  int bufs_to_lap;
2039  int bufs_ahead;
2040  float scans_per_alloc;
2041  int reusable_buffers_est;
2042  int upcoming_alloc_est;
2043  int min_scan_buffers;
2044 
2045  /* Variables for the scanning loop proper */
2046  int num_to_scan;
2047  int num_written;
2048  int reusable_buffers;
2049 
2050  /* Variables for final smoothed_density update */
2051  long new_strategy_delta;
2052  uint32 new_recent_alloc;
2053 
2054  /*
2055  * Find out where the freelist clock sweep currently is, and how many
2056  * buffer allocations have happened since our last call.
2057  */
2058  strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc);
2059 
2060  /* Report buffer alloc counts to pgstat */
2061  BgWriterStats.m_buf_alloc += recent_alloc;
2062 
2063  /*
2064  * If we're not running the LRU scan, just stop after doing the stats
2065  * stuff. We mark the saved state invalid so that we can recover sanely
2066  * if LRU scan is turned back on later.
2067  */
2068  if (bgwriter_lru_maxpages <= 0)
2069  {
2070  saved_info_valid = false;
2071  return true;
2072  }
2073 
2074  /*
2075  * Compute strategy_delta = how many buffers have been scanned by the
2076  * clock sweep since last time. If first time through, assume none. Then
2077  * see if we are still ahead of the clock sweep, and if so, how many
2078  * buffers we could scan before we'd catch up with it and "lap" it. Note:
2079  * weird-looking coding of xxx_passes comparisons are to avoid bogus
2080  * behavior when the passes counts wrap around.
2081  */
2082  if (saved_info_valid)
2083  {
2084  int32 passes_delta = strategy_passes - prev_strategy_passes;
2085 
2086  strategy_delta = strategy_buf_id - prev_strategy_buf_id;
2087  strategy_delta += (long) passes_delta * NBuffers;
2088 
2089  Assert(strategy_delta >= 0);
2090 
2091  if ((int32) (next_passes - strategy_passes) > 0)
2092  {
2093  /* we're one pass ahead of the strategy point */
2094  bufs_to_lap = strategy_buf_id - next_to_clean;
2095 #ifdef BGW_DEBUG
2096  elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
2097  next_passes, next_to_clean,
2098  strategy_passes, strategy_buf_id,
2099  strategy_delta, bufs_to_lap);
2100 #endif
2101  }
2102  else if (next_passes == strategy_passes &&
2103  next_to_clean >= strategy_buf_id)
2104  {
2105  /* on same pass, but ahead or at least not behind */
2106  bufs_to_lap = NBuffers - (next_to_clean - strategy_buf_id);
2107 #ifdef BGW_DEBUG
2108  elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
2109  next_passes, next_to_clean,
2110  strategy_passes, strategy_buf_id,
2111  strategy_delta, bufs_to_lap);
2112 #endif
2113  }
2114  else
2115  {
2116  /*
2117  * We're behind, so skip forward to the strategy point and start
2118  * cleaning from there.
2119  */
2120 #ifdef BGW_DEBUG
2121  elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
2122  next_passes, next_to_clean,
2123  strategy_passes, strategy_buf_id,
2124  strategy_delta);
2125 #endif
2126  next_to_clean = strategy_buf_id;
2127  next_passes = strategy_passes;
2128  bufs_to_lap = NBuffers;
2129  }
2130  }
2131  else
2132  {
2133  /*
2134  * Initializing at startup or after LRU scanning had been off. Always
2135  * start at the strategy point.
2136  */
2137 #ifdef BGW_DEBUG
2138  elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
2139  strategy_passes, strategy_buf_id);
2140 #endif
2141  strategy_delta = 0;
2142  next_to_clean = strategy_buf_id;
2143  next_passes = strategy_passes;
2144  bufs_to_lap = NBuffers;
2145  }
2146 
2147  /* Update saved info for next time */
2148  prev_strategy_buf_id = strategy_buf_id;
2149  prev_strategy_passes = strategy_passes;
2150  saved_info_valid = true;
2151 
2152  /*
2153  * Compute how many buffers had to be scanned for each new allocation, ie,
2154  * 1/density of reusable buffers, and track a moving average of that.
2155  *
2156  * If the strategy point didn't move, we don't update the density estimate
2157  */
2158  if (strategy_delta > 0 && recent_alloc > 0)
2159  {
2160  scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
2161  smoothed_density += (scans_per_alloc - smoothed_density) /
2162  smoothing_samples;
2163  }
2164 
2165  /*
2166  * Estimate how many reusable buffers there are between the current
2167  * strategy point and where we've scanned ahead to, based on the smoothed
2168  * density estimate.
2169  */
2170  bufs_ahead = NBuffers - bufs_to_lap;
2171  reusable_buffers_est = (float) bufs_ahead / smoothed_density;
2172 
2173  /*
2174  * Track a moving average of recent buffer allocations. Here, rather than
2175  * a true average we want a fast-attack, slow-decline behavior: we
2176  * immediately follow any increase.
2177  */
2178  if (smoothed_alloc <= (float) recent_alloc)
2179  smoothed_alloc = recent_alloc;
2180  else
2181  smoothed_alloc += ((float) recent_alloc - smoothed_alloc) /
2182  smoothing_samples;
2183 
2184  /* Scale the estimate by a GUC to allow more aggressive tuning. */
2185  upcoming_alloc_est = (int) (smoothed_alloc * bgwriter_lru_multiplier);
2186 
2187  /*
2188  * If recent_alloc remains at zero for many cycles, smoothed_alloc will
2189  * eventually underflow to zero, and the underflows produce annoying
2190  * kernel warnings on some platforms. Once upcoming_alloc_est has gone to
2191  * zero, there's no point in tracking smaller and smaller values of
2192  * smoothed_alloc, so just reset it to exactly zero to avoid this
2193  * syndrome. It will pop back up as soon as recent_alloc increases.
2194  */
2195  if (upcoming_alloc_est == 0)
2196  smoothed_alloc = 0;
2197 
2198  /*
2199  * Even in cases where there's been little or no buffer allocation
2200  * activity, we want to make a small amount of progress through the buffer
2201  * cache so that as many reusable buffers as possible are clean after an
2202  * idle period.
2203  *
2204  * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many times
2205  * the BGW will be called during the scan_whole_pool time; slice the
2206  * buffer pool into that many sections.
2207  */
2208  min_scan_buffers = (int) (NBuffers / (scan_whole_pool_milliseconds / BgWriterDelay));
2209 
2210  if (upcoming_alloc_est < (min_scan_buffers + reusable_buffers_est))
2211  {
2212 #ifdef BGW_DEBUG
2213  elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
2214  upcoming_alloc_est, min_scan_buffers, reusable_buffers_est);
2215 #endif
2216  upcoming_alloc_est = min_scan_buffers + reusable_buffers_est;
2217  }
2218 
2219  /*
2220  * Now write out dirty reusable buffers, working forward from the
2221  * next_to_clean point, until we have lapped the strategy scan, or cleaned
2222  * enough buffers to match our estimate of the next cycle's allocation
2223  * requirements, or hit the bgwriter_lru_maxpages limit.
2224  */
2225 
2226  /* Make sure we can handle the pin inside SyncOneBuffer */
2227  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
2228 
2229  num_to_scan = bufs_to_lap;
2230  num_written = 0;
2231  reusable_buffers = reusable_buffers_est;
2232 
2233  /* Execute the LRU scan */
2234  while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
2235  {
2236  int sync_state = SyncOneBuffer(next_to_clean, true,
2237  wb_context);
2238 
2239  if (++next_to_clean >= NBuffers)
2240  {
2241  next_to_clean = 0;
2242  next_passes++;
2243  }
2244  num_to_scan--;
2245 
2246  if (sync_state & BUF_WRITTEN)
2247  {
2248  reusable_buffers++;
2249  if (++num_written >= bgwriter_lru_maxpages)
2250  {
2251  BgWriterStats.m_maxwritten_clean++;
2252  break;
2253  }
2254  }
2255  else if (sync_state & BUF_REUSABLE)
2256  reusable_buffers++;
2257  }
2258 
2259  BgWriterStats.m_buf_written_clean += num_written;
2260 
2261 #ifdef BGW_DEBUG
2262  elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
2263  recent_alloc, smoothed_alloc, strategy_delta, bufs_ahead,
2264  smoothed_density, reusable_buffers_est, upcoming_alloc_est,
2265  bufs_to_lap - num_to_scan,
2266  num_written,
2267  reusable_buffers - reusable_buffers_est);
2268 #endif
2269 
2270  /*
2271  * Consider the above scan as being like a new allocation scan.
2272  * Characterize its density and update the smoothed one based on it. This
2273  * effectively halves the moving average period in cases where both the
2274  * strategy and the background writer are doing some useful scanning,
2275  * which is helpful because a long memory isn't as desirable on the
2276  * density estimates.
2277  */
2278  new_strategy_delta = bufs_to_lap - num_to_scan;
2279  new_recent_alloc = reusable_buffers - reusable_buffers_est;
2280  if (new_strategy_delta > 0 && new_recent_alloc > 0)
2281  {
2282  scans_per_alloc = (float) new_strategy_delta / (float) new_recent_alloc;
2283  smoothed_density += (scans_per_alloc - smoothed_density) /
2284  smoothing_samples;
2285 
2286 #ifdef BGW_DEBUG
2287  elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f",
2288  new_recent_alloc, new_strategy_delta,
2289  scans_per_alloc, smoothed_density);
2290 #endif
2291  }
2292 
2293  /* Return true if OK to hibernate */
2294  return (bufs_to_lap == 0 && recent_alloc == 0);
2295 }
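/*
 * Editorial sketch (not part of bufmgr.c): the "fast attack, slow decline"
 * smoothing used for smoothed_alloc above, shown with invented numbers in
 * plain C.  An allocation spike is adopted immediately; afterwards the value
 * decays by 1/smoothing_samples of the remaining gap per cycle.
 */
#ifdef NOT_USED
static void
smoothed_alloc_example(void)
{
	float		smoothed_alloc = 0.0f;
	float		smoothing_samples = 16.0f;
	int			recent_allocs[] = {200, 0, 0, 0};	/* one spike, then idle */
	int			i;

	for (i = 0; i < 4; i++)
	{
		float		recent_alloc = (float) recent_allocs[i];

		if (smoothed_alloc <= recent_alloc)
			smoothed_alloc = recent_alloc;	/* attack: jump straight up */
		else
			smoothed_alloc += (recent_alloc - smoothed_alloc) /
				smoothing_samples;			/* decline: ease back down */
		/* cycle 0: 200.0, cycle 1: 187.5, cycle 2: ~175.8, cycle 3: ~164.8 */
	}
	(void) smoothed_alloc;
}
#endif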
2296 
2297 /*
2298  * SyncOneBuffer -- process a single buffer during syncing.
2299  *
2300  * If skip_recently_used is true, we don't write currently-pinned buffers, nor
2301  * buffers marked recently used, as these are not replacement candidates.
2302  *
2303  * Returns a bitmask containing the following flag bits:
2304  * BUF_WRITTEN: we wrote the buffer.
2305  * BUF_REUSABLE: buffer is available for replacement, ie, it has
2306  * pin count 0 and usage count 0.
2307  *
2308  * (BUF_WRITTEN could be set in error if FlushBuffer finds the buffer clean
2309  * after locking it, but we don't care all that much.)
2310  *
2311  * Note: caller must have done ResourceOwnerEnlargeBuffers.
2312  */
2313 static int
2314 SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
2315 {
2316  BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
2317  int result = 0;
2318  uint32 buf_state;
2319  BufferTag tag;
2320 
2321  ReservePrivateRefCountEntry();
2322 
2323  /*
2324  * Check whether buffer needs writing.
2325  *
2326  * We can make this check without taking the buffer content lock so long
2327  * as we mark pages dirty in access methods *before* logging changes with
2328  * XLogInsert(): if someone marks the buffer dirty just after our check, we
2329  * don't worry, because our checkpoint.redo points before the log record for
2330  * the upcoming changes and so we are not required to write such a dirty buffer.
2331  */
2332  buf_state = LockBufHdr(bufHdr);
2333 
2334  if (BUF_STATE_GET_REFCOUNT(buf_state) == 0 &&
2335  BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
2336  {
2337  result |= BUF_REUSABLE;
2338  }
2339  else if (skip_recently_used)
2340  {
2341  /* Caller told us not to write recently-used buffers */
2342  UnlockBufHdr(bufHdr, buf_state);
2343  return result;
2344  }
2345 
2346  if (!(buf_state & BM_VALID) || !(buf_state & BM_DIRTY))
2347  {
2348  /* It's clean, so nothing to do */
2349  UnlockBufHdr(bufHdr, buf_state);
2350  return result;
2351  }
2352 
2353  /*
2354  * Pin it, share-lock it, write it. (FlushBuffer will do nothing if the
2355  * buffer is clean by the time we've locked it.)
2356  */
2357  PinBuffer_Locked(bufHdr);
2358  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
2359 
2360  FlushBuffer(bufHdr, NULL);
2361 
2362  LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
2363 
2364  tag = bufHdr->tag;
2365 
2366  UnpinBuffer(bufHdr, true);
2367 
2368  ScheduleBufferTagForWriteback(wb_context, &tag);
2369 
2370  return result | BUF_WRITTEN;
2371 }
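/*
 * Editorial sketch (not part of bufmgr.c): how a caller can interpret
 * SyncOneBuffer's bitmask result.  Plain C; BUF_WRITTEN and BUF_REUSABLE are
 * the flag bits defined near the top of this file, and sync_state is just an
 * example value supplied by the caller.
 */
#ifdef NOT_USED
static void
sync_state_example(int sync_state)
{
	/* e.g. sync_state = BUF_WRITTEN | BUF_REUSABLE */
	if (sync_state & BUF_WRITTEN)
	{
		/* the buffer was handed to the kernel; count it against write limits */
	}
	if (sync_state & BUF_REUSABLE)
	{
		/* pin count and usage count were zero: a replacement candidate */
	}
	if (sync_state == 0)
	{
		/* pinned/recently used and skipped, or simply not dirty */
	}
}
#endif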
2372 
2373 /*
2374  * AtEOXact_Buffers - clean up at end of transaction.
2375  *
2376  * As of PostgreSQL 8.0, buffer pins should get released by the
2377  * ResourceOwner mechanism. This routine is just a debugging
2378  * cross-check that no pins remain.
2379  */
2380 void
2381 AtEOXact_Buffers(bool isCommit)
2382 {
2383  CheckForBufferLeaks();
2384 
2385  AtEOXact_LocalBuffers(isCommit);
2386 
2387  Assert(PrivateRefCountOverflowed == 0);
2388 }
2389 
2390 /*
2391  * Initialize access to shared buffer pool
2392  *
2393  * This is called during backend startup (whether standalone or under the
2394  * postmaster). It sets up for this backend's access to the already-existing
2395  * buffer pool.
2396  *
2397  * NB: this is called before InitProcess(), so we do not have a PGPROC and
2398  * cannot do LWLockAcquire; hence we can't actually access stuff in
2399  * shared memory yet. We are only initializing local data here.
2400  * (See also InitBufferPoolBackend)
2401  */
2402 void
2403 InitBufferPoolAccess(void)
2404 {
2405  HASHCTL hash_ctl;
2406 
2407  memset(&PrivateRefCountArray, 0, sizeof(PrivateRefCountArray));
2408 
2409  MemSet(&hash_ctl, 0, sizeof(hash_ctl));
2410  hash_ctl.keysize = sizeof(int32);
2411  hash_ctl.entrysize = sizeof(PrivateRefCountEntry);
2412 
2413  PrivateRefCountHash = hash_create("PrivateRefCount", 100, &hash_ctl,
2414  HASH_ELEM | HASH_BLOBS);
2415 }
2416 
2417 /*
2418  * InitBufferPoolBackend --- second-stage initialization of a new backend
2419  *
2420  * This is called after we have acquired a PGPROC and so can safely get
2421  * LWLocks. We don't currently need to do anything at this stage ...
2422  * except register a shmem-exit callback. AtProcExit_Buffers needs LWLock
2423  * access, and thereby has to be called at the corresponding phase of
2424  * backend shutdown.
2425  */
2426 void
2427 InitBufferPoolBackend(void)
2428 {
2429  on_shmem_exit(AtProcExit_Buffers, 0);
2430 }
2431 
2432 /*
2433  * During backend exit, ensure that we released all shared-buffer locks and
2434  * assert that we have no remaining pins.
2435  */
2436 static void
2437 AtProcExit_Buffers(int code, Datum arg)
2438 {
2439  AbortBufferIO();
2440  UnlockBuffers();
2441 
2442  CheckForBufferLeaks();
2443 
2444  /* localbuf.c needs a chance too */
2445  AtEOXact_LocalBuffers(false);
2446 }
2447 
2448 /*
2449  * CheckForBufferLeaks - ensure this backend holds no buffer pins
2450  *
2451  * As of PostgreSQL 8.0, buffer pins should get released by the
2452  * ResourceOwner mechanism. This routine is just a debugging
2453  * cross-check that no pins remain.
2454  */
2455 static void
2456 CheckForBufferLeaks(void)
2457 {
2458 #ifdef USE_ASSERT_CHECKING
2459  int RefCountErrors = 0;
2460  PrivateRefCountEntry *res;
2461  int i;
2462 
2463  /* check the array */
2464  for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
2465  {
2466  res = &PrivateRefCountArray[i];
2467 
2468  if (res->buffer != InvalidBuffer)
2469  {
2470  PrintBufferLeakWarning(res->buffer);
2471  RefCountErrors++;
2472  }
2473  }
2474 
2475  /* if necessary search the hash */
2476  if (PrivateRefCountOverflowed)
2477  {
2478  HASH_SEQ_STATUS hstat;
2479 
2480  hash_seq_init(&hstat, PrivateRefCountHash);
2481  while ((res = (PrivateRefCountEntry *) hash_seq_search(&hstat)) != NULL)
2482  {
2483  PrintBufferLeakWarning(res->buffer);
2484  RefCountErrors++;
2485  }
2486 
2487  }
2488 
2489  Assert(RefCountErrors == 0);
2490 #endif
2491 }
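/*
 * Editorial sketch (not part of bufmgr.c): the two-level private refcount
 * tracking that CheckForBufferLeaks walks above - a small fixed array that
 * usually suffices, with a hash table consulted only on overflow.  The sketch
 * replaces the hash with a linear overflow list; all names and sizes here are
 * invented for illustration.
 */
#ifdef NOT_USED
#define TOY_ARRAY_ENTRIES 8

typedef struct ToyRefCount
{
	int			buffer;			/* 0 means "slot unused" */
	int			refcount;
} ToyRefCount;

static ToyRefCount toy_array[TOY_ARRAY_ENTRIES];
static ToyRefCount toy_overflow[1024];
static int	toy_overflow_used = 0;

static int
toy_get_refcount(int buffer)
{
	int			i;

	/* fast path: the small fixed array covers the common case */
	for (i = 0; i < TOY_ARRAY_ENTRIES; i++)
		if (toy_array[i].buffer == buffer)
			return toy_array[i].refcount;

	/* slow path: only consulted once the array has overflowed */
	for (i = 0; i < toy_overflow_used; i++)
		if (toy_overflow[i].buffer == buffer)
			return toy_overflow[i].refcount;
	return 0;
}
#endif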
2492 
2493 /*
2494  * Helper routine to issue warnings when a buffer is unexpectedly pinned
2495  */
2496 void
2497 PrintBufferLeakWarning(Buffer buffer)
2498 {
2499  BufferDesc *buf;
2500  int32 loccount;
2501  char *path;
2502  BackendId backend;
2503  uint32 buf_state;
2504 
2505  Assert(BufferIsValid(buffer));
2506  if (BufferIsLocal(buffer))
2507  {
2508  buf = GetLocalBufferDescriptor(-buffer - 1);
2509  loccount = LocalRefCount[-buffer - 1];
2510  backend = MyBackendId;
2511  }
2512  else
2513  {
2514  buf = GetBufferDescriptor(buffer - 1);
2515  loccount = GetPrivateRefCount(buffer);
2516  backend = InvalidBackendId;
2517  }
2518 
2519  /* theoretically we should lock the bufhdr here */
2520  path = relpathbackend(buf->tag.rnode, backend, buf->tag.forkNum);
2521  buf_state = pg_atomic_read_u32(&buf->state);
2522  elog(WARNING,
2523  "buffer refcount leak: [%03d] "
2524  "(rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
2525  buffer, path,
2526  buf->tag.blockNum, buf_state & BUF_FLAG_MASK,
2527  BUF_STATE_GET_REFCOUNT(buf_state), loccount);
2528  pfree(path);
2529 }
2530 
2531 /*
2532  * CheckPointBuffers
2533  *
2534  * Flush all dirty blocks in buffer pool to disk at checkpoint time.
2535  *
2536  * Note: temporary relations do not participate in checkpoints, so they don't
2537  * need to be flushed.
2538  */
2539 void
2540 CheckPointBuffers(int flags)
2541 {
2542  TRACE_POSTGRESQL_BUFFER_CHECKPOINT_START(flags);
2543  CheckpointStats.ckpt_write_t = GetCurrentTimestamp();
2544  BufferSync(flags);
2545  CheckpointStats.ckpt_sync_t = GetCurrentTimestamp();
2546  TRACE_POSTGRESQL_BUFFER_CHECKPOINT_SYNC_START();
2547  ProcessSyncRequests();
2548  CheckpointStats.ckpt_sync_end_t = GetCurrentTimestamp();
2549  TRACE_POSTGRESQL_BUFFER_CHECKPOINT_DONE();
2550 }
2551 
2552 
2553 /*
2554  * Do whatever is needed to prepare for commit at the bufmgr and smgr levels
2555  */
2556 void
2557 BufmgrCommit(void)
2558 {
2559  /* Nothing to do in bufmgr anymore... */
2560 }
2561 
2562 /*
2563  * BufferGetBlockNumber
2564  * Returns the block number associated with a buffer.
2565  *
2566  * Note:
2567  * Assumes that the buffer is valid and pinned, else the
2568  * value may be obsolete immediately...
2569  */
2570 BlockNumber
2571 BufferGetBlockNumber(Buffer buffer)
2572 {
2573  BufferDesc *bufHdr;
2574 
2575  Assert(BufferIsPinned(buffer));
2576 
2577  if (BufferIsLocal(buffer))
2578  bufHdr = GetLocalBufferDescriptor(-buffer - 1);
2579  else
2580  bufHdr = GetBufferDescriptor(buffer - 1);
2581 
2582  /* pinned, so OK to read tag without spinlock */
2583  return bufHdr->tag.blockNum;
2584 }
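/*
 * Editorial sketch (not part of bufmgr.c): the Buffer numbering convention
 * the lookups above rely on.  Shared buffers are 1..NBuffers and map to
 * descriptor index (buffer - 1); local buffers are negative and map to local
 * index (-buffer - 1); 0 is InvalidBuffer.  Plain C with ints standing in for
 * the real Buffer type.
 */
#ifdef NOT_USED
static int
buffer_to_array_index_example(int buffer)
{
	if (buffer > 0)
		return buffer - 1;		/* shared: slot in the shared descriptor array */
	else if (buffer < 0)
		return -buffer - 1;		/* local: slot in LocalRefCount / local descriptors */
	else
		return -1;				/* invalid buffer */
}
#endif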
2585 
2586 /*
2587  * BufferGetTag
2588  * Returns the relfilenode, fork number and block number associated with
2589  * a buffer.
2590  */
2591 void
2592 BufferGetTag(Buffer buffer, RelFileNode *rnode, ForkNumber *forknum,
2593  BlockNumber *blknum)
2594 {
2595  BufferDesc *bufHdr;
2596 
2597  /* Do the same checks as BufferGetBlockNumber. */
2598  Assert(BufferIsPinned(buffer));
2599 
2600  if (BufferIsLocal(buffer))
2601  bufHdr = GetLocalBufferDescriptor(-buffer - 1);
2602  else
2603  bufHdr = GetBufferDescriptor(buffer - 1);
2604 
2605  /* pinned, so OK to read tag without spinlock */
2606  *rnode = bufHdr->tag.rnode;
2607  *forknum = bufHdr->tag.forkNum;
2608  *blknum = bufHdr->tag.blockNum;
2609 }
2610 
2611 /*
2612  * FlushBuffer
2613  * Physically write out a shared buffer.
2614  *
2615  * NOTE: this actually just passes the buffer contents to the kernel; the
2616  * real write to disk won't happen until the kernel feels like it. This
2617  * is okay from our point of view since we can redo the changes from WAL.
2618  * However, we will need to force the changes to disk via fsync before
2619  * we can checkpoint WAL.
2620  *
2621  * The caller must hold a pin on the buffer and have share-locked the
2622  * buffer contents. (Note: a share-lock does not prevent updates of
2623  * hint bits in the buffer, so the page could change while the write
2624  * is in progress, but we assume that that will not invalidate the data
2625  * written.)
2626  *
2627  * If the caller has an smgr reference for the buffer's relation, pass it
2628  * as the second parameter. If not, pass NULL.
2629  */
2630 static void
2631 FlushBuffer(BufferDesc *buf, SMgrRelation reln)
2632 {
2633  XLogRecPtr recptr;
2634  ErrorContextCallback errcallback;
2635  instr_time io_start,
2636  io_time;
2637  Block bufBlock;
2638  char *bufToWrite;
2639  uint32 buf_state;
2640 
2641  /*
2642  * Acquire the buffer's io_in_progress lock. If StartBufferIO returns
2643  * false, then someone else flushed the buffer before we could, so we need
2644  * not do anything.
2645  */
2646  if (!StartBufferIO(buf, false))
2647  return;
2648 
2649  /* Setup error traceback support for ereport() */
2650  errcallback.callback = shared_buffer_write_error_callback;
2651  errcallback.arg = (void *) buf;
2652  errcallback.previous = error_context_stack;
2653  error_context_stack = &errcallback;
2654 
2655  /* Find smgr relation for buffer */
2656  if (reln == NULL)
2657  reln = smgropen(buf->tag.rnode, InvalidBackendId);
2658 
2659  TRACE_POSTGRESQL_BUFFER_FLUSH_START(buf->tag.forkNum,
2660  buf->tag.blockNum,
2661  reln->smgr_rnode.node.spcNode,
2662  reln->smgr_rnode.node.dbNode,
2663  reln->smgr_rnode.node.relNode);
2664 
2665  buf_state = LockBufHdr(buf);
2666 
2667  /*
2668  * Run PageGetLSN while holding header lock, since we don't have the
2669  * buffer locked exclusively in all cases.
2670  */
2671  recptr = BufferGetLSN(buf);
2672 
2673  /* To check if block content changes while flushing. - vadim 01/17/97 */
2674  buf_state &= ~BM_JUST_DIRTIED;
2675  UnlockBufHdr(buf, buf_state);
2676 
2677  /*
2678  * Force XLOG flush up to buffer's LSN. This implements the basic WAL
2679  * rule that log updates must hit disk before any of the data-file changes
2680  * they describe do.
2681  *
2682  * However, this rule does not apply to unlogged relations, which will be
2683  * lost after a crash anyway. Most unlogged relation pages do not bear
2684  * LSNs since we never emit WAL records for them, and therefore flushing
2685  * up through the buffer LSN would be useless, but harmless. However,
2686  * GiST indexes use LSNs internally to track page-splits, and therefore
2687  * unlogged GiST pages bear "fake" LSNs generated by
2688  * GetFakeLSNForUnloggedRel. It is unlikely but possible that the fake
2689  * LSN counter could advance past the WAL insertion point; and if it did
2690  * happen, attempting to flush WAL through that location would fail, with
2691  * disastrous system-wide consequences. To make sure that can't happen,
2692  * skip the flush if the buffer isn't permanent.
2693  */
2694  if (buf_state & BM_PERMANENT)
2695  XLogFlush(recptr);
2696 
2697  /*
2698  * Now it's safe to write buffer to disk. Note that no one else should
2699  * have been able to write it while we were busy with log flushing because
2700  * we have the io_in_progress lock.
2701  */
2702  bufBlock = BufHdrGetBlock(buf);
2703 
2704  /*
2705  * Update page checksum if desired. Since we have only shared lock on the
2706  * buffer, other processes might be updating hint bits in it, so we must
2707  * copy the page to private storage if we do checksumming.
2708  */
2709  bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
2710 
2711  if (track_io_timing)
2712  INSTR_TIME_SET_CURRENT(io_start);
2713 
2714  /*
2715  * bufToWrite is either the shared buffer or a copy, as appropriate.
2716  */
2717  smgrwrite(reln,
2718  buf->tag.forkNum,
2719  buf->tag.blockNum,
2720  bufToWrite,
2721  false);
2722 
2723  if (track_io_timing)
2724  {
2725  INSTR_TIME_SET_CURRENT(io_time);
2726  INSTR_TIME_SUBTRACT(io_time, io_start);
2727  pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time));
2728  INSTR_TIME_ADD(pgBufferUsage.blk_write_time, io_time);
2729  }
2730 
2731  pgBufferUsage.shared_blks_written++;
2732 
2733  /*
2734  * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
2735  * end the io_in_progress state.
2736  */
2737  TerminateBufferIO(buf, true, 0);
2738 
2739  TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(buf->tag.forkNum,
2740  buf->tag.blockNum,
2741  reln->smgr_rnode.node.spcNode,
2742  reln->smgr_rnode.node.dbNode,
2743  reln->smgr_rnode.node.relNode);
2744 
2745  /* Pop the error context stack */
2746  error_context_stack = errcallback.previous;
2747 }
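/*
 * Editorial sketch (not part of bufmgr.c): why FlushBuffer checksums a
 * private copy of the page.  Hint-bit setters may modify the shared page
 * while only a share lock is held, so the checksum must be computed over a
 * stable snapshot.  Standalone C with an invented page size and a trivial
 * stand-in checksum; PageSetChecksumCopy does the real work above.
 */
#ifdef NOT_USED
#define TOY_PAGE_SIZE 8192

static unsigned int
toy_checksum(const unsigned char *data, int len)
{
	unsigned int sum = 0;
	int			i;

	for (i = 0; i < len; i++)
		sum = sum * 31 + data[i];
	return sum;
}

static unsigned int
checksum_stable_copy(const unsigned char *shared_page)
{
	/* a static scratch buffer is fine here: each backend is its own process */
	static unsigned char copy[TOY_PAGE_SIZE];

	/* snapshot the page first; concurrent hint-bit flips no longer matter */
	memcpy(copy, shared_page, TOY_PAGE_SIZE);
	return toy_checksum(copy, TOY_PAGE_SIZE);
}
#endif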
2748 
2749 /*
2750  * RelationGetNumberOfBlocksInFork
2751  * Determines the current number of pages in the specified relation fork.
2752  *
2753  * Note that the accuracy of the result will depend on the details of the
2754  * relation's storage. For builtin AMs it'll be accurate, but for external AMs
2755  * it might not be.
2756  */
2757 BlockNumber
2758 RelationGetNumberOfBlocksInFork(Relation relation, ForkNumber forkNum)
2759 {
2760  switch (relation->rd_rel->relkind)
2761  {
2762  case RELKIND_SEQUENCE:
2763  case RELKIND_INDEX:
2764  case RELKIND_PARTITIONED_INDEX:
2765  /* Open it at the smgr level if not already done */
2766  RelationOpenSmgr(relation);
2767 
2768  return smgrnblocks(relation->rd_smgr, forkNum);
2769 
2770  case RELKIND_RELATION:
2771  case RELKIND_TOASTVALUE:
2772  case RELKIND_MATVIEW:
2773  {
2774  /*
2775  * Not every table AM uses BLCKSZ wide fixed size blocks.
2776  * Therefore tableam returns the size in bytes - but for the
2777  * purpose of this routine, we want the number of blocks.
2778  * Therefore divide, rounding up.
2779  */
2780  uint64 szbytes;
2781 
2782  szbytes = table_relation_size(relation, forkNum);
2783 
2784  return (szbytes + (BLCKSZ - 1)) / BLCKSZ;
2785  }
2786  case RELKIND_VIEW:
2787  case RELKIND_COMPOSITE_TYPE:
2788  case RELKIND_FOREIGN_TABLE:
2789  case RELKIND_PARTITIONED_TABLE:
2790  default:
2791  Assert(false);
2792  break;
2793  }
2794 
2795  return 0; /* keep compiler quiet */
2796 }
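/*
 * Editorial sketch (not part of bufmgr.c): the bytes-to-blocks round-up used
 * above.  For example, with a block size of 8192, a 16385-byte relation
 * occupies 3 blocks, not 2, while an exact multiple adds nothing extra.
 * Plain C with invented values.
 */
#ifdef NOT_USED
static unsigned long long
bytes_to_blocks_example(unsigned long long szbytes, unsigned int blcksz)
{
	/* (16385 + 8191) / 8192 == 3;  (16384 + 8191) / 8192 == 2 */
	return (szbytes + (blcksz - 1)) / blcksz;
}
#endif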
2797 
2798 /*
2799  * BufferIsPermanent
2800  * Determines whether a buffer will potentially still be around after
2801  * a crash. Caller must hold a buffer pin.
2802  */
2803 bool
2804 BufferIsPermanent(Buffer buffer)
2805 {
2806  BufferDesc *bufHdr;
2807 
2808  /* Local buffers are used only for temp relations. */
2809  if (BufferIsLocal(buffer))
2810  return false;
2811 
2812  /* Make sure we've got a real buffer, and that we hold a pin on it. */
2813  Assert(BufferIsValid(buffer));
2814  Assert(BufferIsPinned(buffer));
2815 
2816  /*
2817  * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
2818  * need not bother with the buffer header spinlock. Even if someone else
2819  * changes the buffer header state while we're doing this, the state is
2820  * changed atomically, so we'll read the old value or the new value, but
2821  * not random garbage.
2822  */
2823  bufHdr = GetBufferDescriptor(buffer - 1);
2824  return (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT) != 0;
2825 }
2826 
2827 /*
2828  * BufferGetLSNAtomic
2829  * Retrieves the LSN of the buffer atomically using a buffer header lock.
2830  * This is necessary for some callers who may not have an exclusive lock
2831  * on the buffer.
2832  */
2833 XLogRecPtr
2834 BufferGetLSNAtomic(Buffer buffer)
2835 {
2836  BufferDesc *bufHdr = GetBufferDescriptor(buffer - 1);
2837  char *page = BufferGetPage(buffer);
2838  XLogRecPtr lsn;
2839  uint32 buf_state;
2840 
2841  /*
2842  * If we don't need locking for correctness, fastpath out.
2843  */
2844  if (!XLogHintBitIsNeeded() || BufferIsLocal(buffer))
2845  return PageGetLSN(page);
2846 
2847  /* Make sure we've got a real buffer, and that we hold a pin on it. */
2848  Assert(BufferIsValid(buffer));
2849  Assert(BufferIsPinned(buffer));
2850 
2851  buf_state = LockBufHdr(bufHdr);
2852  lsn = PageGetLSN(page);
2853  UnlockBufHdr(bufHdr, buf_state);
2854 
2855  return lsn;
2856 }
2857 
2858 /* ---------------------------------------------------------------------
2859  * DropRelFileNodeBuffers
2860  *
2861  * This function removes from the buffer pool all the pages of the
2862  * specified relation forks that have block numbers >= firstDelBlock.
2863  * (In particular, with firstDelBlock = 0, all pages are removed.)
2864  * Dirty pages are simply dropped, without bothering to write them
2865  * out first. Therefore, this is NOT rollback-able, and so should be
2866  * used only with extreme caution!
2867  *
2868  * Currently, this is called only from smgr.c when the underlying file
2869  * is about to be deleted or truncated (firstDelBlock is needed for
2870  * the truncation case). The data in the affected pages would therefore
2871  * be deleted momentarily anyway, and there is no point in writing it.
2872  * It is the responsibility of higher-level code to ensure that the
2873  * deletion or truncation does not lose any data that could be needed
2874  * later. It is also the responsibility of higher-level code to ensure
2875  * that no other process could be trying to load more pages of the
2876  * relation into buffers.
2877  *
2878  * XXX currently it sequentially searches the buffer pool, should be
2879  * changed to more clever ways of searching. However, this routine
2880  * is used only in code paths that aren't very performance-critical,
2881  * and we shouldn't slow down the hot paths to make it faster ...
2882  * --------------------------------------------------------------------
2883  */
2884 void
2885 DropRelFileNodeBuffers(RelFileNodeBackend rnode, ForkNumber *forkNum,
2886  int nforks, BlockNumber *firstDelBlock)
2887 {
2888  int i;
2889  int j;
2890 
2891  /* If it's a local relation, it's localbuf.c's problem. */
2892  if (RelFileNodeBackendIsTemp(rnode))
2893  {
2894  if (rnode.backend == MyBackendId)
2895  {
2896  for (j = 0; j < nforks; j++)
2897  DropRelFileNodeLocalBuffers(rnode.node, forkNum[j],
2898  firstDelBlock[j]);
2899  }
2900  return;
2901  }
2902 
2903  for (i = 0; i < NBuffers; i++)
2904  {
2905  BufferDesc *bufHdr = GetBufferDescriptor(i);
2906  uint32 buf_state;
2907 
2908  /*
2909  * We can make this a tad faster by prechecking the buffer tag before
2910  * we attempt to lock the buffer; this saves a lot of lock
2911  * acquisitions in typical cases. It should be safe because the
2912  * caller must have AccessExclusiveLock on the relation, or some other
2913  * reason to be certain that no one is loading new pages of the rel
2914  * into the buffer pool. (Otherwise we might well miss such pages
2915  * entirely.) Therefore, while the tag might be changing while we
2916  * look at it, it can't be changing *to* a value we care about, only
2917  * *away* from such a value. So false negatives are impossible, and
2918  * false positives are safe because we'll recheck after getting the
2919  * buffer lock.
2920  *
2921  * We could check forkNum and blockNum as well as the rnode, but the
2922  * incremental win from doing so seems small.
2923  */
2924  if (!RelFileNodeEquals(bufHdr->tag.rnode, rnode.node))
2925  continue;
2926 
2927  buf_state = LockBufHdr(bufHdr);
2928 
2929  for (j = 0; j < nforks; j++)
2930  {
2931  if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node) &&
2932  bufHdr->tag.forkNum == forkNum[j] &&
2933  bufHdr->tag.blockNum >= firstDelBlock[j])
2934  {
2935  InvalidateBuffer(bufHdr); /* releases spinlock */
2936  break;
2937  }
2938  }
2939  if (j >= nforks)
2940  UnlockBufHdr(bufHdr, buf_state);
2941  }
2942 }
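/*
 * Editorial sketch (not part of bufmgr.c): the "unlocked precheck, then lock
 * and recheck" pattern used by the loop above.  A false positive on the cheap
 * check is harmless because the test is repeated under the lock; false
 * negatives cannot occur because the caller guarantees no one can be changing
 * a tag *to* the value we are looking for.  Toy C with hypothetical names.
 */
#ifdef NOT_USED
typedef struct ToyBuffer
{
	int			tag;			/* stand-in for the buffer tag */
	int			locked;			/* stand-in for the header spinlock */
} ToyBuffer;

static void
drop_matching_example(ToyBuffer *buffers, int nbuffers, int victim_tag)
{
	int			i;

	for (i = 0; i < nbuffers; i++)
	{
		ToyBuffer  *b = &buffers[i];

		/* cheap unlocked filter: skips almost every buffer */
		if (b->tag != victim_tag)
			continue;

		b->locked = 1;			/* LockBufHdr() in the real code */
		if (b->tag == victim_tag)	/* recheck under the lock */
		{
			/* InvalidateBuffer() would release the lock here */
		}
		b->locked = 0;			/* UnlockBufHdr() */
	}
}
#endif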
2943 
2944 /* ---------------------------------------------------------------------
2945  * DropRelFileNodesAllBuffers
2946  *
2947  * This function removes from the buffer pool all the pages of all
2948  * forks of the specified relations. It's equivalent to calling
2949  * DropRelFileNodeBuffers once per fork per relation with
2950  * firstDelBlock = 0.
2951  * --------------------------------------------------------------------
2952  */
2953 void
2954 DropRelFileNodesAllBuffers(RelFileNodeBackend *rnodes, int nnodes)
2955 {
2956  int i,
2957  n = 0;
2958  RelFileNode *nodes;
2959  bool use_bsearch;
2960 
2961  if (nnodes == 0)
2962  return;
2963 
2964  nodes = palloc(sizeof(RelFileNode) * nnodes); /* non-local relations */
2965 
2966  /* If it's a local relation, it's localbuf.c's problem. */
2967  for (i = 0; i < nnodes; i++)
2968  {
2969  if (RelFileNodeBackendIsTemp(rnodes[i]))
2970  {
2971  if (rnodes[i].backend == MyBackendId)
2972  DropRelFileNodeAllLocalBuffers(rnodes[i].node);
2973  }
2974  else
2975  nodes[n++] = rnodes[i].node;
2976  }
2977 
2978  /*
2979  * If there are no non-local relations, then we're done. Release the
2980  * memory and return.
2981  */
2982  if (n == 0)
2983  {
2984  pfree(nodes);
2985  return;
2986  }
2987 
2988  /*
2989  * For low number of relations to drop just use a simple walk through, to
2990  * save the bsearch overhead. The threshold to use is rather a guess than
2991  * an exactly determined value, as it depends on many factors (CPU and RAM
2992  * speeds, amount of shared buffers etc.).
2993  */
2994  use_bsearch = n > DROP_RELS_BSEARCH_THRESHOLD;
2995 
2996  /* sort the list of rnodes if necessary */
2997  if (use_bsearch)
2998  pg_qsort(nodes, n, sizeof(RelFileNode), rnode_comparator);
2999 
3000  for (i = 0; i < NBuffers; i++)
3001  {
3002  RelFileNode *rnode = NULL;
3003  BufferDesc *bufHdr = GetBufferDescriptor(i);
3004  uint32 buf_state;
3005 
3006  /*
3007  * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
3008  * and saves some cycles.
3009  */
3010 
3011  if (!use_bsearch)
3012  {
3013  int j;
3014 
3015  for (j = 0; j < n; j++)
3016  {
3017  if (RelFileNodeEquals(bufHdr->tag.rnode, nodes[j]))
3018  {
3019  rnode = &nodes[j];
3020  break;
3021  }
3022  }
3023  }
3024  else
3025  {
3026  rnode = bsearch((const void *) &(bufHdr->tag.rnode),
3027  nodes, n, sizeof(RelFileNode),
3028  rnode_comparator);
3029  }
3030 
3031  /* buffer doesn't belong to any of the given relfilenodes; skip it */
3032  if (rnode == NULL)
3033  continue;
3034 
3035  buf_state = LockBufHdr(bufHdr);
3036  if (RelFileNodeEquals(bufHdr->tag.rnode, (*rnode)))
3037  InvalidateBuffer(bufHdr); /* releases spinlock */
3038  else
3039  UnlockBufHdr(bufHdr, buf_state);
3040  }
3041 
3042  pfree(nodes);
3043 }
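/*
 * Editorial sketch (not part of bufmgr.c): the linear-scan-versus-bsearch
 * choice made above, demonstrated with plain ints and the C library's qsort
 * and bsearch.  Below roughly DROP_RELS_BSEARCH_THRESHOLD entries a linear
 * walk is cheaper than sorting plus binary searches; above it, the O(log n)
 * lookups win.  Toy code, no PostgreSQL APIs.
 */
#ifdef NOT_USED
static int
int_cmp(const void *a, const void *b)
{
	int			ia = *(const int *) a;
	int			ib = *(const int *) b;

	return (ia > ib) - (ia < ib);
}

static int
contains_example(int *keys, int nkeys, int probe)
{
	int			i;

	if (nkeys <= DROP_RELS_BSEARCH_THRESHOLD)
	{
		for (i = 0; i < nkeys; i++)
			if (keys[i] == probe)
				return 1;
		return 0;
	}

	/* sort once up front, as the real code does before its buffer loop */
	qsort(keys, nkeys, sizeof(int), int_cmp);
	return bsearch(&probe, keys, nkeys, sizeof(int), int_cmp) != NULL;
}
#endif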
3044 
3045 /* ---------------------------------------------------------------------
3046  * DropDatabaseBuffers
3047  *
3048  * This function removes all the buffers in the buffer cache for a
3049  * particular database. Dirty pages are simply dropped, without
3050  * bothering to write them out first. This is used when we destroy a
3051  * database, to avoid trying to flush data to disk when the directory
3052  * tree no longer exists. Implementation is pretty similar to
3053  * DropRelFileNodeBuffers() which is for destroying just one relation.
3054  * --------------------------------------------------------------------
3055  */
3056 void
3057 DropDatabaseBuffers(Oid dbid)
3058 {
3059  int i;
3060 
3061  /*
3062  * We needn't consider local buffers, since by assumption the target
3063  * database isn't our own.
3064  */
3065 
3066  for (i = 0; i < NBuffers; i++)
3067  {
3068  BufferDesc *bufHdr = GetBufferDescriptor(i);
3069  uint32 buf_state;
3070 
3071  /*
3072  * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
3073  * and saves some cycles.
3074  */
3075  if (bufHdr->tag.rnode.dbNode != dbid)
3076  continue;
3077 
3078  buf_state = LockBufHdr(bufHdr);
3079  if (bufHdr->tag.rnode.dbNode == dbid)
3080  InvalidateBuffer(bufHdr); /* releases spinlock */
3081  else
3082  UnlockBufHdr(bufHdr, buf_state);
3083  }
3084 }
3085 
3086 /* -----------------------------------------------------------------
3087  * PrintBufferDescs
3088  *
3089  * this function prints all the buffer descriptors, for debugging
3090  * use only.
3091  * -----------------------------------------------------------------
3092  */
3093 #ifdef NOT_USED
3094 void
3095 PrintBufferDescs(void)
3096 {
3097  int i;
3098 
3099  for (i = 0; i < NBuffers; ++i)
3100  {
3103 
3104  /* theoretically we should lock the bufhdr here */
3105  elog(LOG,
3106  "[%02d] (freeNext=%d, rel=%s, "
3107  "blockNum=%u, flags=0x%x, refcount=%u %d)",
3108  i, buf->freeNext,
3110  buf->tag.blockNum, buf->flags,
3111  buf->refcount, GetPrivateRefCount(b));
3112  }
3113 }
3114 #endif
3115 
3116 #ifdef NOT_USED
3117 void
3118 PrintPinnedBufs(void)
3119 {
3120  int i;
3121 
3122  for (i = 0; i < NBuffers; ++i)
3123  {
3126 
3127  if (GetPrivateRefCount(b) > 0)
3128  {
3129  /* theoretically we should lock the bufhdr here */
3130  elog(LOG,
3131  "[%02d] (freeNext=%d, rel=%s, "
3132  "blockNum=%u, flags=0x%x, refcount=%u %d)",
3133  i, buf->freeNext,
3134  relpathperm(buf->tag.rnode, buf->tag.forkNum),
3135  buf->tag.blockNum, buf->flags,
3136  buf->refcount, GetPrivateRefCount(b));
3137  }
3138  }
3139 }
3140 #endif
3141 
3142 /* ---------------------------------------------------------------------
3143  * FlushRelationBuffers
3144  *
3145  * This function writes all dirty pages of a relation out to disk
3146  * (or more accurately, out to kernel disk buffers), ensuring that the
3147  * kernel has an up-to-date view of the relation.
3148  *
3149  * Generally, the caller should be holding AccessExclusiveLock on the
3150  * target relation to ensure that no other backend is busy dirtying
3151  * more blocks of the relation; the effects can't be expected to last
3152  * after the lock is released.
3153  *
3154  * XXX currently it sequentially searches the buffer pool, should be
3155  * changed to more clever ways of searching. This routine is not
3156  * used in any performance-critical code paths, so it's not worth
3157  * adding additional overhead to normal paths to make it go faster;
3158  * but see also DropRelFileNodeBuffers.
3159  * --------------------------------------------------------------------
3160  */
3161 void
3162 FlushRelationBuffers(Relation rel)
3163 {
3164  int i;
3165  BufferDesc *bufHdr;
3166 
3167  /* Open rel at the smgr level if not already done */
3168  RelationOpenSmgr(rel);
3169 
3170  if (RelationUsesLocalBuffers(rel))
3171  {
3172  for (i = 0; i < NLocBuffer; i++)
3173  {
3174  uint32 buf_state;
3175 
3176  bufHdr = GetLocalBufferDescriptor(i);
3177  if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
3178  ((buf_state = pg_atomic_read_u32(&bufHdr->state)) &
3179  (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
3180  {
3181  ErrorContextCallback errcallback;
3182  Page localpage;
3183 
3184  localpage = (char *) LocalBufHdrGetBlock(bufHdr);
3185 
3186  /* Setup error traceback support for ereport() */
3187  errcallback.callback = local_buffer_write_error_callback;
3188  errcallback.arg = (void *) bufHdr;
3189  errcallback.previous = error_context_stack;
3190  error_context_stack = &errcallback;
3191 
3192  PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
3193 
3194  smgrwrite(rel->rd_smgr,
3195  bufHdr->tag.forkNum,
3196  bufHdr->tag.blockNum,
3197  localpage,
3198  false);
3199 
3200  buf_state &= ~(BM_DIRTY | BM_JUST_DIRTIED);
3201  pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
3202 
3203  /* Pop the error context stack */
3204  error_context_stack = errcallback.previous;
3205  }
3206  }
3207 
3208  return;
3209  }
3210 
3211  /* Make sure we can handle the pin inside the loop */
3212  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
3213 
3214  for (i = 0; i < NBuffers; i++)
3215  {
3216  uint32 buf_state;
3217 
3218  bufHdr = GetBufferDescriptor(i);
3219 
3220  /*
3221  * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
3222  * and saves some cycles.
3223  */
3224  if (!RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node))
3225  continue;
3226 
3227  ReservePrivateRefCountEntry();
3228 
3229  buf_state = LockBufHdr(bufHdr);
3230  if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
3231  (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
3232  {
3233  PinBuffer_Locked(bufHdr);
3234  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
3235  FlushBuffer(bufHdr, rel->rd_smgr);
3236  LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
3237  UnpinBuffer(bufHdr, true);
3238  }
3239  else
3240  UnlockBufHdr(bufHdr, buf_state);
3241  }
3242 }
3243 
3244 /* ---------------------------------------------------------------------
3245  * FlushDatabaseBuffers
3246  *
3247  * This function writes all dirty pages of a database out to disk
3248  * (or more accurately, out to kernel disk buffers), ensuring that the
3249  * kernel has an up-to-date view of the database.
3250  *
3251  * Generally, the caller should be holding an appropriate lock to ensure
3252  * no other backend is active in the target database; otherwise more
3253  * pages could get dirtied.
3254  *
3255  * Note we don't worry about flushing any pages of temporary relations.
3256  * It's assumed these wouldn't be interesting.
3257  * --------------------------------------------------------------------
3258  */
3259 void
3260 FlushDatabaseBuffers(Oid dbid)
3261 {
3262  int i;
3263  BufferDesc *bufHdr;
3264 
3265  /* Make sure we can handle the pin inside the loop */
3266  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
3267 
3268  for (i = 0; i < NBuffers; i++)
3269  {
3270  uint32 buf_state;
3271 
3272  bufHdr = GetBufferDescriptor(i);
3273 
3274  /*
3275  * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
3276  * and saves some cycles.
3277  */
3278  if (bufHdr->tag.rnode.dbNode != dbid)
3279  continue;
3280 
3281  ReservePrivateRefCountEntry();
3282 
3283  buf_state = LockBufHdr(bufHdr);
3284  if (bufHdr->tag.rnode.dbNode == dbid &&
3285  (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
3286  {
3287  PinBuffer_Locked(bufHdr);
3288  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
3289  FlushBuffer(bufHdr, NULL);
3290  LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
3291  UnpinBuffer(bufHdr, true);
3292  }
3293  else
3294  UnlockBufHdr(bufHdr, buf_state);
3295  }
3296 }
3297 
3298 /*
3299  * Flush a pinned buffer, previously locked in shared or exclusive mode, to
3300  * the OS.
3301  */
3302 void
3303 FlushOneBuffer(Buffer buffer)
3304 {
3305  BufferDesc *bufHdr;
3306 
3307  /* currently not needed, but no fundamental reason not to support */
3308  Assert(!BufferIsLocal(buffer));
3309 
3310  Assert(BufferIsPinned(buffer));
3311 
3312  bufHdr = GetBufferDescriptor(buffer - 1);
3313 
3314  Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
3315 
3316  FlushBuffer(bufHdr, NULL);
3317 }
3318 
3319 /*
3320  * ReleaseBuffer -- release the pin on a buffer
3321  */
3322 void
3323 ReleaseBuffer(Buffer buffer)
3324 {
3325  if (!BufferIsValid(buffer))
3326  elog(ERROR, "bad buffer ID: %d", buffer);
3327 
3328  if (BufferIsLocal(buffer))
3329  {
3330  ResourceOwnerForgetBuffer(CurrentResourceOwner, buffer);
3331 
3332  Assert(LocalRefCount[-buffer - 1] > 0);
3333  LocalRefCount[-buffer - 1]--;
3334  return;
3335  }
3336 
3337  UnpinBuffer(GetBufferDescriptor(buffer - 1), true);
3338 }
3339 
3340 /*
3341  * UnlockReleaseBuffer -- release the content lock and pin on a buffer
3342  *
3343  * This is just a shorthand for a common combination.
3344  */
3345 void
3346 UnlockReleaseBuffer(Buffer buffer)
3347 {
3348  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3349  ReleaseBuffer(buffer);
3350 }
3351 
3352 /*
3353  * IncrBufferRefCount
3354  * Increment the pin count on a buffer that we have *already* pinned
3355  * at least once.
3356  *
3357  * This function cannot be used on a buffer we do not have pinned,
3358  * because it doesn't change the shared buffer state.
3359  */
3360 void
3361 IncrBufferRefCount(Buffer buffer)
3362 {
3363  Assert(BufferIsPinned(buffer));
3364  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
3365  if (BufferIsLocal(buffer))
3366  LocalRefCount[-buffer - 1]++;
3367  else
3368  {
3369  PrivateRefCountEntry *ref;
3370 
3371  ref = GetPrivateRefCountEntry(buffer, true);
3372  Assert(ref != NULL);
3373  ref->refcount++;
3374  }
3375  ResourceOwnerRememberBuffer(CurrentResourceOwner, buffer);
3376 }
3377 
3378 /*
3379  * MarkBufferDirtyHint
3380  *
3381  * Mark a buffer dirty for non-critical changes.
3382  *
3383  * This is essentially the same as MarkBufferDirty, except:
3384  *
3385  * 1. The caller does not write WAL; so if checksums are enabled, we may need
3386  * to write an XLOG_FPI WAL record to protect against torn pages.
3387  * 2. The caller might have only share-lock instead of exclusive-lock on the
3388  * buffer's content lock.
3389  * 3. This function does not guarantee that the buffer is always marked dirty
3390  * (due to a race condition), so it cannot be used for important changes.
3391  */
3392 void
3393 MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
3394 {
3395  BufferDesc *bufHdr;
3396  Page page = BufferGetPage(buffer);
3397 
3398  if (!BufferIsValid(buffer))
3399  elog(ERROR, "bad buffer ID: %d", buffer);
3400 
3401  if (BufferIsLocal(buffer))
3402  {
3403  MarkLocalBufferDirty(buffer);
3404  return;
3405  }
3406 
3407  bufHdr = GetBufferDescriptor(buffer - 1);
3408 
3409  Assert(GetPrivateRefCount(buffer) > 0);
3410  /* here, either share or exclusive lock is OK */
3411  Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
3412 
3413  /*
3414  * This routine might get called many times on the same page, if we are
3415  * making the first scan after commit of an xact that added/deleted many
3416  * tuples. So, be as quick as we can if the buffer is already dirty. We
3417  * do this by not acquiring spinlock if it looks like the status bits are
3418  * already set. Since we make this test unlocked, there's a chance we
3419  * might fail to notice that the flags have just been cleared, and failed
3420  * to reset them, due to memory-ordering issues. But since this function
3421  * is only intended to be used in cases where failing to write out the
3422  * data would be harmless anyway, it doesn't really matter.
3423  */
3424  if ((pg_atomic_read_u32(&bufHdr->state) & (BM_DIRTY | BM_JUST_DIRTIED)) !=
3425  (BM_DIRTY | BM_JUST_DIRTIED))
3426  {
3427  XLogRecPtr lsn = InvalidXLogRecPtr;
3428  bool dirtied = false;
3429  bool delayChkpt = false;
3430  uint32 buf_state;
3431 
3432  /*
3433  * If we need to protect hint bit updates from torn writes, WAL-log a
3434  * full page image of the page. This full page image is only necessary
3435  * if the hint bit update is the first change to the page since the
3436  * last checkpoint.
3437  *
3438  * We don't check full_page_writes here because that logic is included
3439  * when we call XLogInsert() since the value changes dynamically.
3440  */
3441  if (XLogHintBitIsNeeded() &&
3442  (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT))
3443  {
3444  /*
3445  * If we're in recovery we cannot dirty a page because of a hint.
3446  * We can set the hint, just not dirty the page as a result, so the
3447  * hint is lost when we evict the page or shut down.
3448  *
3449  * See src/backend/storage/page/README for longer discussion.
3450  */
3451  if (RecoveryInProgress())
3452  return;
3453 
3454  /*
3455  * If the block is already dirty because we either made a change
3456  * or set a hint already, then we don't need to write a full page
3457  * image. Note that aggressive cleaning of blocks dirtied by hint
3458  * bit setting would increase the call rate. Bulk setting of hint
3459  * bits would reduce the call rate...
3460  *
3461  * We must issue the WAL record before we mark the buffer dirty.
3462  * Otherwise we might write the page before we write the WAL. That
3463  * causes a race condition, since a checkpoint might occur between
3464  * writing the WAL record and marking the buffer dirty. We solve
3465  * that with a kluge, but one that is already in use during
3466  * transaction commit to prevent race conditions. Basically, we
3467  * simply prevent the checkpoint WAL record from being written
3468  * until we have marked the buffer dirty. We don't start the
3469  * checkpoint flush until we have marked dirty, so our checkpoint
3470  * must flush the change to disk successfully or the checkpoint record
3471  * never gets written, in which case crash recovery will fix things up.
3472  *
3473  * It's possible we may enter here without an xid, so it is
3474  * essential that CreateCheckpoint waits for virtual transactions
3475  * rather than full transactionids.
3476  */
3477  MyPgXact->delayChkpt = delayChkpt = true;
3478  lsn = XLogSaveBufferForHint(buffer, buffer_std);
3479  }
3480 
3481  buf_state = LockBufHdr(bufHdr);
3482 
3483  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
3484 
3485  if (!(buf_state & BM_DIRTY))
3486  {
3487  dirtied = true; /* Means "will be dirtied by this action" */
3488 
3489  /*
3490  * Set the page LSN if we wrote a backup block. We aren't supposed
3491  * to set this when only holding a share lock but as long as we
3492  * serialise it somehow we're OK. We choose to set LSN while
3493  * holding the buffer header lock, which causes any reader of an
3494  * LSN who holds only a share lock to also obtain a buffer header
3495  * lock before using PageGetLSN(), which is enforced in
3496  * BufferGetLSNAtomic().
3497  *
3498  * If checksums are enabled, you might think we should reset the
3499  * checksum here. That will happen when the page is written
3500  * sometime later in this checkpoint cycle.
3501  */
3502  if (!XLogRecPtrIsInvalid(lsn))
3503  PageSetLSN(page, lsn);
3504  }
3505 
3506  buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
3507  UnlockBufHdr(bufHdr, buf_state);
3508 
3509  if (delayChkpt)
3510  MyPgXact->delayChkpt = false;
3511 
3512  if (dirtied)
3513  {
3514  VacuumPageDirty++;
3515  pgBufferUsage.shared_blks_dirtied++;
3516  if (VacuumCostActive)
3517  VacuumCostBalance += VacuumCostPageDirty;
3518  }
3519  }
3520 }
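/*
 * Editorial sketch (not part of bufmgr.c): the conditions under which the
 * function above WAL-logs a full page image for a hint-bit change, collapsed
 * into one toy decision function.  The real checks are spread across
 * XLogHintBitIsNeeded(), BM_PERMANENT, RecoveryInProgress() and the dirty
 * flags, as shown above; the parameter names here are invented.
 */
#ifdef NOT_USED
static int
hint_fpi_needed_example(int checksums_or_wal_hints_enabled,
						int buffer_is_permanent,
						int in_recovery,
						int already_dirty)
{
	if (!checksums_or_wal_hints_enabled || !buffer_is_permanent)
		return 0;				/* a torn hint write is harmless here */
	if (in_recovery)
		return 0;				/* can't dirty pages for hints in recovery */
	if (already_dirty)
		return 0;				/* an earlier change already covered the page */
	return 1;					/* first change since checkpoint: log an FPI */
}
#endif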
3521 
3522 /*
3523  * Release buffer content locks for shared buffers.
3524  *
3525  * Used to clean up after errors.
3526  *
3527  * Currently, we can expect that lwlock.c's LWLockReleaseAll() took care
3528  * of releasing buffer content locks per se; the only thing we need to deal
3529  * with here is clearing any PIN_COUNT request that was in progress.
3530  */
3531 void
3532 UnlockBuffers(void)
3533 {
3534  BufferDesc *buf = PinCountWaitBuf;
3535 
3536  if (buf)
3537  {
3538  uint32 buf_state;
3539 
3540  buf_state = LockBufHdr(buf);
3541 
3542  /*
3543  * Don't complain if flag bit not set; it could have been reset but we
3544  * got a cancel/die interrupt before getting the signal.
3545  */
3546  if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
3547  buf->wait_backend_pid == MyProcPid)
3548  buf_state &= ~BM_PIN_COUNT_WAITER;
3549 
3550  UnlockBufHdr(buf, buf_state);
3551 
3552  PinCountWaitBuf = NULL;
3553  }
3554 }
3555 
3556 /*
3557  * Acquire or release the content_lock for the buffer.
3558  */
3559 void
3560 LockBuffer(Buffer buffer, int mode)
3561 {
3562  BufferDesc *buf;
3563 
3564  Assert(BufferIsValid(buffer));
3565  if (BufferIsLocal(buffer))
3566  return; /* local buffers need no lock */
3567 
3568  buf = GetBufferDescriptor(buffer - 1);
3569 
3570  if (mode == BUFFER_LOCK_UNLOCK)
3571  LWLockRelease(BufferDescriptorGetContentLock(buf));
3572  else if (mode == BUFFER_LOCK_SHARE)
3573  LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_SHARED);
3574  else if (mode == BUFFER_LOCK_EXCLUSIVE)
3575  LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_EXCLUSIVE);
3576  else
3577  elog(ERROR, "unrecognized buffer lock mode: %d", mode);
3578 }
3579 
3580 /*
3581  * Acquire the content_lock for the buffer, but only if we don't have to wait.
3582  *
3583  * This assumes the caller wants BUFFER_LOCK_EXCLUSIVE mode.
3584  */
3585 bool
3586 ConditionalLockBuffer(Buffer buffer)
3587 {
3588  BufferDesc *buf;
3589 
3590  Assert(BufferIsValid(buffer));
3591  if (BufferIsLocal(buffer))
3592  return true; /* act as though we got it */
3593 
3594  buf = GetBufferDescriptor(buffer - 1);
3595 
3596  return LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
3597  LW_EXCLUSIVE);
3598 }
3599 
3600 /*
3601  * LockBufferForCleanup - lock a buffer in preparation for deleting items
3602  *
3603  * Items may be deleted from a disk page only when the caller (a) holds an
3604  * exclusive lock on the buffer and (b) has observed that no other backend
3605  * holds a pin on the buffer. If there is a pin, then the other backend
3606  * might have a pointer into the buffer (for example, a heapscan reference
3607  * to an item --- see README for more details). It's OK if a pin is added
3608  * after the cleanup starts, however; the newly-arrived backend will be
3609  * unable to look at the page until we release the exclusive lock.
3610  *
3611  * To implement this protocol, a would-be deleter must pin the buffer and
3612  * then call LockBufferForCleanup(). LockBufferForCleanup() is similar to
3613  * LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE), except that it loops until
3614  * it has successfully observed pin count = 1.
3615  */
3616 void
3617 LockBufferForCleanup(Buffer buffer)
3618 {
3619  BufferDesc *bufHdr;
3620  char *new_status = NULL;
3621 
3622  Assert(BufferIsValid(buffer));
3623  Assert(PinCountWaitBuf == NULL);
3624 
3625  if (BufferIsLocal(buffer))
3626  {
3627  /* There should be exactly one pin */
3628  if (LocalRefCount[-buffer - 1] != 1)
3629  elog(ERROR, "incorrect local pin count: %d",
3630  LocalRefCount[-buffer - 1]);
3631  /* Nobody else to wait for */
3632  return;
3633  }
3634 
3635  /* There should be exactly one local pin */
3636  if (GetPrivateRefCount(buffer) != 1)
3637  elog(ERROR, "incorrect local pin count: %d",
3638  GetPrivateRefCount(buffer));
3639 
3640  bufHdr = GetBufferDescriptor(buffer - 1);
3641 
3642  for (;;)
3643  {
3644  uint32 buf_state;
3645 
3646  /* Try to acquire lock */
3647  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3648  buf_state = LockBufHdr(bufHdr);
3649 
3650  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
3651  if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
3652  {
3653  /* Successfully acquired exclusive lock with pincount 1 */
3654  UnlockBufHdr(bufHdr, buf_state);
3655 
3656  /* Report change to non-waiting status */
3657  if (new_status)
3658  {
3659  set_ps_display(new_status);
3660  pfree(new_status);
3661  }
3662  return;
3663  }
3664  /* Failed, so mark myself as waiting for pincount 1 */
3665  if (buf_state & BM_PIN_COUNT_WAITER)
3666  {
3667  UnlockBufHdr(bufHdr, buf_state);
3668  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3669  elog(ERROR, "multiple backends attempting to wait for pincount 1");
3670  }
3671  bufHdr->wait_backend_pid = MyProcPid;
3672  PinCountWaitBuf = bufHdr;
3673  buf_state |= BM_PIN_COUNT_WAITER;
3674  UnlockBufHdr(bufHdr, buf_state);
3675  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3676 
3677  /* Wait to be signaled by UnpinBuffer() */
3678  if (InHotStandby)
3679  {
3680  /* Report change to waiting status */
3681  if (update_process_title && new_status == NULL)
3682  {
3683  const char *old_status;
3684  int len;
3685 
3686  old_status = get_ps_display(&len);
3687  new_status = (char *) palloc(len + 8 + 1);
3688  memcpy(new_status, old_status, len);
3689  strcpy(new_status + len, " waiting");
3690  set_ps_display(new_status);
3691  new_status[len] = '\0'; /* truncate off " waiting" */
3692  }
3693 
3694  /* Publish the bufid that Startup process waits on */
3695  SetStartupBufferPinWaitBufId(buffer - 1);
3696  /* Set alarm and then wait to be signaled by UnpinBuffer() */
3697  ResolveRecoveryConflictWithBufferPin();
3698  /* Reset the published bufid */
3699  SetStartupBufferPinWaitBufId(-1);
3700  }
3701  else
3702  ProcWaitForSignal(PG_WAIT_BUFFER_PIN);
3703 
3704  /*
3705  * Remove flag marking us as waiter. Normally this will not be set
3706  * anymore, but ProcWaitForSignal() can return for other signals as
3707  * well. We take care to only reset the flag if we're the waiter, as
3708  * theoretically another backend could have started waiting. That's
3709  * impossible with the current usages due to table level locking, but
3710  * better be safe.
3711  */
3712  buf_state = LockBufHdr(bufHdr);
3713  if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
3714  bufHdr->wait_backend_pid == MyProcPid)
3715  buf_state &= ~BM_PIN_COUNT_WAITER;
3716  UnlockBufHdr(bufHdr, buf_state);
3717 
3718  PinCountWaitBuf = NULL;
3719  /* Loop back and try again */
3720  }
3721 }
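/*
 * [Editor's note: illustrative sketch, not part of bufmgr.c.]  The deleter's
 * side of the protocol described above: pin the buffer, then wait for a
 * cleanup lock before removing items.  Relation/block arguments and the
 * function name are hypothetical.
 */
#include "postgres.h"
#include "storage/bufmgr.h"
#include "utils/rel.h"

static void
example_delete_items(Relation rel, BlockNumber blkno)
{
    Buffer      buf = ReadBuffer(rel, blkno);   /* take our own pin */

    LockBufferForCleanup(buf);  /* returns once we hold the only pin */
    /* ... now safe to remove line pointers / defragment the page ... */
    MarkBufferDirty(buf);
    UnlockReleaseBuffer(buf);   /* drops the exclusive lock and the pin */
}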
3722 
3723 /*
3724  * Check called from RecoveryConflictInterrupt handler when Startup
3725  * process requests cancellation of all pin holders that are blocking it.
3726  */
3727 bool
3728 HoldingBufferPinThatDelaysRecovery(void)
3729 {
3730  int bufid = GetStartupBufferPinWaitBufId();
3731 
3732  /*
3733  * If we get woken slowly then it's possible that the Startup process was
3734  * already woken by other backends before we got here. Also possible that
3735  * we get here by multiple interrupts or interrupts at inappropriate
3736  * times, so make sure we do nothing if the bufid is not set.
3737  */
3738  if (bufid < 0)
3739  return false;
3740 
3741  if (GetPrivateRefCount(bufid + 1) > 0)
3742  return true;
3743 
3744  return false;
3745 }
3746 
3747 /*
3748  * ConditionalLockBufferForCleanup - as above, but don't wait to get the lock
3749  *
3750  * We won't loop, but just check once to see if the pin count is OK. If
3751  * not, return false with no lock held.
3752  */
3753 bool
3754 ConditionalLockBufferForCleanup(Buffer buffer)
3755 {
3756  BufferDesc *bufHdr;
3757  uint32 buf_state,
3758  refcount;
3759 
3760  Assert(BufferIsValid(buffer));
3761 
3762  if (BufferIsLocal(buffer))
3763  {
3764  refcount = LocalRefCount[-buffer - 1];
3765  /* There should be exactly one pin */
3766  Assert(refcount > 0);
3767  if (refcount != 1)
3768  return false;
3769  /* Nobody else to wait for */
3770  return true;
3771  }
3772 
3773  /* There should be exactly one local pin */
3774  refcount = GetPrivateRefCount(buffer);
3775  Assert(refcount);
3776  if (refcount != 1)
3777  return false;
3778 
3779  /* Try to acquire lock */
3780  if (!ConditionalLockBuffer(buffer))
3781  return false;
3782 
3783  bufHdr = GetBufferDescriptor(buffer - 1);
3784  buf_state = LockBufHdr(bufHdr);
3785  refcount = BUF_STATE_GET_REFCOUNT(buf_state);
3786 
3787  Assert(refcount > 0);
3788  if (refcount == 1)
3789  {
3790  /* Successfully acquired exclusive lock with pincount 1 */
3791  UnlockBufHdr(bufHdr, buf_state);
3792  return true;
3793  }
3794 
3795  /* Failed, so release the lock */
3796  UnlockBufHdr(bufHdr, buf_state);
3797  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3798  return false;
3799 }
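/*
 * [Editor's note: illustrative sketch, not part of bufmgr.c.]  Opportunistic
 * cleanup in the style of lazy VACUUM: if the cleanup lock is not available
 * immediately, skip the page instead of waiting.  The function name is
 * hypothetical.
 */
#include "postgres.h"
#include "storage/bufmgr.h"

static bool
example_try_cleanup(Buffer buf)
{
    if (!ConditionalLockBufferForCleanup(buf))
        return false;           /* page is busy; caller may come back later */

    /* ... prune/defragment the page here ... */

    LockBuffer(buf, BUFFER_LOCK_UNLOCK);
    return true;
}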
3800 
3801 /*
3802  * IsBufferCleanupOK - as above, but we already have the lock
3803  *
3804  * Check whether it's OK to perform cleanup on a buffer we've already
3805  * locked. If we observe that the pin count is 1, our exclusive lock
3806  * happens to be a cleanup lock, and we can proceed with anything that
3807  * would have been allowable had we sought a cleanup lock originally.
3808  */
3809 bool
3810 IsBufferCleanupOK(Buffer buffer)
3811 {
3812  BufferDesc *bufHdr;
3813  uint32 buf_state;
3814 
3815  Assert(BufferIsValid(buffer));
3816 
3817  if (BufferIsLocal(buffer))
3818  {
3819  /* There should be exactly one pin */
3820  if (LocalRefCount[-buffer - 1] != 1)
3821  return false;
3822  /* Nobody else to wait for */
3823  return true;
3824  }
3825 
3826  /* There should be exactly one local pin */
3827  if (GetPrivateRefCount(buffer) != 1)
3828  return false;
3829 
3830  bufHdr = GetBufferDescriptor(buffer - 1);
3831 
3832  /* caller must hold exclusive lock on buffer */
3833  Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
3834  LW_EXCLUSIVE));
3835 
3836  buf_state = LockBufHdr(bufHdr);
3837 
3838  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
3839  if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
3840  {
3841  /* pincount is OK. */
3842  UnlockBufHdr(bufHdr, buf_state);
3843  return true;
3844  }
3845 
3846  UnlockBufHdr(bufHdr, buf_state);
3847  return false;
3848 }
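/*
 * [Editor's note: illustrative sketch, not part of bufmgr.c.]  Checking
 * whether an exclusive lock we already hold happens to be a cleanup lock,
 * as the comment above describes.  The function name and the "work" are
 * hypothetical.
 */
#include "postgres.h"
#include "storage/bufmgr.h"

static void
example_maybe_do_cleanup_work(Buffer buf)
{
    /* caller already holds BUFFER_LOCK_EXCLUSIVE and exactly one pin */
    if (IsBufferCleanupOK(buf))
    {
        /* ... cleanup-level work: delete items, compact the page ... */
    }
    else
    {
        /* ... restrict ourselves to work an exclusive lock permits ... */
    }
}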
3849 
3850 
3851 /*
3852  * Functions for buffer I/O handling
3853  *
3854  * Note: We assume that nested buffer I/O never occurs.
3855  * i.e., at most one io_in_progress lock is held per proc.
3856  *
3857  * Also note that these are used only for shared buffers, not local ones.
3858  */
3859 
3860 /*
3861  * WaitIO -- Block until the IO_IN_PROGRESS flag on 'buf' is cleared.
3862  */
3863 static void
3864 WaitIO(BufferDesc *buf)
3865 {
3866  /*
3867  * Changed to wait until there's no IO - Inoue 01/13/2000
3868  *
3869  * Note this is *necessary* because an error abort in the process doing
3870  * I/O could release the io_in_progress_lock prematurely. See
3871  * AbortBufferIO.
3872  */
3873  for (;;)
3874  {
3875  uint32 buf_state;
3876 
3877  /*
3878  * It may not be necessary to acquire the spinlock to check the flag
3879  * here, but since this test is essential for correctness, we'd better
3880  * play it safe.
3881  */
3882  buf_state = LockBufHdr(buf);
3883  UnlockBufHdr(buf, buf_state);
3884 
3885  if (!(buf_state & BM_IO_IN_PROGRESS))
3886  break;
3887  LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_SHARED);
3888  LWLockRelease(BufferDescriptorGetIOLock(buf));
3889  }
3890 }
3891 
3892 /*
3893  * StartBufferIO: begin I/O on this buffer
3894  * (Assumptions)
3895  * My process is executing no IO
3896  * The buffer is Pinned
3897  *
3898  * In some scenarios there are race conditions in which multiple backends
3899  * could attempt the same I/O operation concurrently. If someone else
3900  * has already started I/O on this buffer then we will block on the
3901  * io_in_progress lock until he's done.
3902  *
3903  * Input operations are only attempted on buffers that are not BM_VALID,
3904  * and output operations only on buffers that are BM_VALID and BM_DIRTY,
3905  * so we can always tell if the work is already done.
3906  *
3907  * Returns true if we successfully marked the buffer as I/O busy,
3908  * false if someone else already did the work.
3909  */
3910 static bool
3911 StartBufferIO(BufferDesc *buf, bool forInput)
3912 {
3913  uint32 buf_state;
3914 
3915  Assert(!InProgressBuf);
3916 
3917  for (;;)
3918  {
3919  /*
3920  * Grab the io_in_progress lock so that other processes can wait for
3921  * me to finish the I/O.
3922  */
3923  LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
3924 
3925  buf_state = LockBufHdr(buf);
3926 
3927  if (!(buf_state & BM_IO_IN_PROGRESS))
3928  break;
3929 
3930  /*
3931  * The only way BM_IO_IN_PROGRESS could be set when the io_in_progress
3932  * lock isn't held is if the process doing the I/O is recovering from
3933  * an error (see AbortBufferIO). If that's the case, we must wait for
3934  * him to get unwedged.
3935  */
3936  UnlockBufHdr(buf, buf_state);
3937  LWLockRelease(BufferDescriptorGetIOLock(buf));
3938  WaitIO(buf);
3939  }
3940 
3941  /* Once we get here, there is definitely no I/O active on this buffer */
3942 
3943  if (forInput ? (buf_state & BM_VALID) : !(buf_state & BM_DIRTY))
3944  {
3945  /* someone else already did the I/O */
3946  UnlockBufHdr(buf, buf_state);
3947  LWLockRelease(BufferDescriptorGetIOLock(buf));
3948  return false;
3949  }
3950 
3951  buf_state |= BM_IO_IN_PROGRESS;
3952  UnlockBufHdr(buf, buf_state);
3953 
3954  InProgressBuf = buf;
3955  IsForInput = forInput;
3956 
3957  return true;
3958 }
3959 
3960 /*
3961  * TerminateBufferIO: release a buffer we were doing I/O on
3962  * (Assumptions)
3963  * My process is executing IO for the buffer
3964  * BM_IO_IN_PROGRESS bit is set for the buffer
3965  * We hold the buffer's io_in_progress lock
3966  * The buffer is Pinned
3967  *
3968  * If clear_dirty is true and BM_JUST_DIRTIED is not set, we clear the
3969  * buffer's BM_DIRTY flag. This is appropriate when terminating a
3970  * successful write. The check on BM_JUST_DIRTIED is necessary to avoid
3971  * marking the buffer clean if it was re-dirtied while we were writing.
3972  *
3973  * set_flag_bits gets ORed into the buffer's flags. It must include
3974  * BM_IO_ERROR in a failure case. For successful completion it could
3975  * be 0, or BM_VALID if we just finished reading in the page.
3976  */
3977 static void
3978 TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits)
3979 {
3980  uint32 buf_state;
3981 
3982  Assert(buf == InProgressBuf);
3983 
3984  buf_state = LockBufHdr(buf);
3985 
3986  Assert(buf_state & BM_IO_IN_PROGRESS);
3987 
3988  buf_state &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR);
3989  if (clear_dirty && !(buf_state & BM_JUST_DIRTIED))
3990  buf_state &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
3991 
3992  buf_state |= set_flag_bits;
3993  UnlockBufHdr(buf, buf_state);
3994 
3995  InProgressBuf = NULL;
3996 
3997  LWLockRelease(BufferDescriptorGetIOLock(buf));
3998 }
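/*
 * [Editor's note: illustrative sketch, not part of bufmgr.c proper.]  Roughly
 * how StartBufferIO()/TerminateBufferIO() cooperate when reading a block in,
 * in the style of ReadBuffer_common(); since both routines are static, this
 * sketch only makes sense as if it lived inside this file.  The function name
 * is hypothetical.
 */
static void
example_read_into_buffer(BufferDesc *bufHdr, SMgrRelation smgr,
                         ForkNumber forkNum, BlockNumber blockNum)
{
    if (StartBufferIO(bufHdr, true))    /* true = input */
    {
        /* we won the right to perform the read */
        smgrread(smgr, forkNum, blockNum, (char *) BufHdrGetBlock(bufHdr));

        /* mark the page valid and wake anyone blocked in WaitIO() */
        TerminateBufferIO(bufHdr, false, BM_VALID);
    }
    /* else: another backend already completed the I/O */
}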
3999 
4000 /*
4001  * AbortBufferIO: Clean up any active buffer I/O after an error.
4002  *
4003  * All LWLocks we might have held have been released,
4004  * but we haven't yet released buffer pins, so the buffer is still pinned.
4005  *
4006  * If I/O was in progress, we always set BM_IO_ERROR, even though it's
4007  * possible the error condition wasn't related to the I/O.
4008  */
4009 void
4010 AbortBufferIO(void)
4011 {
4012  BufferDesc *buf = InProgressBuf;
4013 
4014  if (buf)
4015  {
4016  uint32 buf_state;
4017 
4018  /*
4019  * Since LWLockReleaseAll has already been called, we're not holding
4020  * the buffer's io_in_progress_lock. We have to re-acquire it so that
4021  * we can use TerminateBufferIO. Anyone who's executing WaitIO on the
4022  * buffer will be in a busy spin until we succeed in doing this.
4023  */
4024  LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
4025 
4026  buf_state = LockBufHdr(buf);
4027  Assert(buf_state & BM_IO_IN_PROGRESS);
4028  if (IsForInput)
4029  {
4030  Assert(!(buf_state & BM_DIRTY));
4031 
4032  /* We'd better not think buffer is valid yet */
4033  Assert(!(buf_state & BM_VALID));
4034  UnlockBufHdr(buf, buf_state);
4035  }
4036  else
4037  {
4038  Assert(buf_state & BM_DIRTY);
4039  UnlockBufHdr(buf, buf_state);
4040  /* Issue notice if this is not the first failure... */
4041  if (buf_state & BM_IO_ERROR)
4042  {
4043  /* Buffer is pinned, so we can read tag without spinlock */
4044  char *path;
4045 
4046  path = relpathperm(buf->tag.rnode, buf->tag.forkNum);
4047  ereport(WARNING,
4048  (errcode(ERRCODE_IO_ERROR),
4049  errmsg("could not write block %u of %s",
4050  buf->tag.blockNum, path),
4051  errdetail("Multiple failures --- write error might be permanent.")));
4052  pfree(path);
4053  }
4054  }
4055  TerminateBufferIO(buf, false, BM_IO_ERROR);
4056  }
4057 }
4058 
4059 /*
4060  * Error context callback for errors occurring during shared buffer writes.
4061  */
4062 static void
4063 shared_buffer_write_error_callback(void *arg)
4064 {
4065  BufferDesc *bufHdr = (BufferDesc *) arg;
4066 
4067  /* Buffer is pinned, so we can read the tag without locking the spinlock */
4068  if (bufHdr != NULL)
4069  {
4070  char *path = relpathperm(bufHdr->tag.rnode, bufHdr->tag.forkNum);
4071 
4072  errcontext("writing block %u of relation %s",
4073  bufHdr->tag.blockNum, path);
4074  pfree(path);
4075  }
4076 }
4077 
4078 /*
4079  * Error context callback for errors occurring during local buffer writes.
4080  */
4081 static void
4082 local_buffer_write_error_callback(void *arg)
4083 {
4084  BufferDesc *bufHdr = (BufferDesc *) arg;
4085 
4086  if (bufHdr != NULL)
4087  {
4088  char *path = relpathbackend(bufHdr->tag.rnode, MyBackendId,
4089  bufHdr->tag.forkNum);
4090 
4091  errcontext("writing block %u of relation %s",
4092  bufHdr->tag.blockNum, path);
4093  pfree(path);
4094  }
4095 }
4096 
4097 /*
4098  * RelFileNode qsort/bsearch comparator; see RelFileNodeEquals.
4099  */
4100 static int
4101 rnode_comparator(const void *p1, const void *p2)
4102 {
4103  RelFileNode n1 = *(const RelFileNode *) p1;
4104  RelFileNode n2 = *(const RelFileNode *) p2;
4105 
4106  if (n1.relNode < n2.relNode)
4107  return -1;
4108  else if (n1.relNode > n2.relNode)
4109  return 1;
4110 
4111  if (n1.dbNode < n2.dbNode)
4112  return -1;
4113  else if (n1.dbNode > n2.dbNode)
4114  return 1;
4115 
4116  if (n1.spcNode < n2.spcNode)
4117  return -1;
4118  else if (n1.spcNode > n2.spcNode)
4119  return 1;
4120  else
4121  return 0;
4122 }
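/*
 * [Editor's note: illustrative sketch, not part of bufmgr.c proper.]  Since
 * rnode_comparator() is static, this only makes sense inside this file; it
 * shows the intended pg_qsort()/bsearch() usage over a plain RelFileNode
 * array, similar to what DropRelFileNodesAllBuffers() does.  The function
 * name and array are hypothetical.
 */
static bool
example_contains_rnode(RelFileNode *nodes, int nnodes, RelFileNode key)
{
    /* sort once ... */
    pg_qsort(nodes, nnodes, sizeof(RelFileNode), rnode_comparator);

    /* ... then binary-search for the key */
    return bsearch(&key, nodes, nnodes, sizeof(RelFileNode),
                   rnode_comparator) != NULL;
}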
4123 
4124 /*
4125  * Lock buffer header - set BM_LOCKED in buffer state.
4126  */
4127 uint32
4128 LockBufHdr(BufferDesc *desc)
4129 {
4130  SpinDelayStatus delayStatus;
4131  uint32 old_buf_state;
4132 
4133  init_local_spin_delay(&delayStatus);
4134 
4135  while (true)
4136  {
4137  /* set BM_LOCKED flag */
4138  old_buf_state = pg_atomic_fetch_or_u32(&desc->state, BM_LOCKED);
4139  /* if it wasn't set before we're OK */
4140  if (!(old_buf_state & BM_LOCKED))
4141  break;
4142  perform_spin_delay(&delayStatus);
4143  }
4144  finish_spin_delay(&delayStatus);
4145  return old_buf_state | BM_LOCKED;
4146 }
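/*
 * [Editor's note: illustrative sketch, not part of bufmgr.c proper.]  The
 * canonical lock/inspect/unlock pattern built on LockBufHdr()/UnlockBufHdr(),
 * as used throughout this file.  The function name is hypothetical.
 */
static bool
example_buffer_is_dirty(BufferDesc *desc)
{
    uint32      buf_state;
    bool        dirty;

    buf_state = LockBufHdr(desc);       /* sets BM_LOCKED, returns state */
    dirty = (buf_state & BM_DIRTY) != 0;
    UnlockBufHdr(desc, buf_state);      /* writes state back, BM_LOCKED clear */

    return dirty;
}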
4147 
4148 /*
4149  * Wait until the BM_LOCKED flag isn't set anymore and return the buffer's
4150  * state at that point.
4151  *
4152  * Obviously the buffer could be locked by the time the value is returned, so
4153  * this is primarily useful in CAS style loops.
4154  */
4155 static uint32
4156 WaitBufHdrUnlocked(BufferDesc *buf)
4157 {
4158  SpinDelayStatus delayStatus;
4159  uint32 buf_state;
4160 
4161  init_local_spin_delay(&delayStatus);
4162 
4163  buf_state = pg_atomic_read_u32(&buf->state);
4164 
4165  while (buf_state & BM_LOCKED)
4166  {
4167  perform_spin_delay(&delayStatus);
4168  buf_state = pg_atomic_read_u32(&buf->state);
4169  }
4170 
4171  finish_spin_delay(&delayStatus);
4172 
4173  return buf_state;
4174 }
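/*
 * [Editor's note: illustrative sketch, not part of bufmgr.c proper.]  The
 * CAS-style loop the comment above refers to, in the shape PinBuffer() uses:
 * read the state word, wait out the header spinlock if necessary, then try
 * to install the new value with compare-and-exchange.  The function name is
 * hypothetical.
 */
static void
example_bump_usagecount(BufferDesc *buf)
{
    uint32      old_buf_state = pg_atomic_read_u32(&buf->state);
    uint32      buf_state;

    for (;;)
    {
        if (old_buf_state & BM_LOCKED)
            old_buf_state = WaitBufHdrUnlocked(buf);

        buf_state = old_buf_state;
        if (BUF_STATE_GET_USAGECOUNT(buf_state) < BM_MAX_USAGE_COUNT)
            buf_state += BUF_USAGECOUNT_ONE;

        if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
                                           buf_state))
            break;              /* success: old_buf_state was still current */
        /* on failure old_buf_state now holds the current value; retry */
    }
}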
4175 
4176 /*
4177  * BufferTag comparator.
4178  */
4179 static int
4180 buffertag_comparator(const void *a, const void *b)
4181 {
4182  const BufferTag *ba = (const BufferTag *) a;
4183  const BufferTag *bb = (const BufferTag *) b;
4184  int ret;
4185 
4186  ret = rnode_comparator(&ba->rnode, &bb->rnode);
4187 
4188  if (ret != 0)
4189  return ret;
4190 
4191  if (ba->forkNum < bb->forkNum)
4192  return -1;
4193  if (ba->forkNum > bb->forkNum)
4194  return 1;
4195 
4196  if (ba->blockNum < bb->blockNum)
4197  return -1;
4198  if (ba->blockNum > bb->blockNum)
4199  return 1;
4200 
4201  return 0;
4202 }
4203 
4204 /*
4205  * Comparator determining the writeout order in a checkpoint.
4206  *
4207  * It is important that tablespaces are compared first; the logic balancing
4208  * writes between tablespaces relies on it.
4209  */
4210 static int
4211 ckpt_buforder_comparator(const void *pa, const void *pb)
4212 {
4213  const CkptSortItem *a = (const CkptSortItem *) pa;
4214  const CkptSortItem *b = (const CkptSortItem *) pb;
4215 
4216  /* compare tablespace */
4217  if (a->tsId < b->tsId)
4218  return -1;
4219  else if (a->tsId > b->tsId)
4220  return 1;
4221  /* compare relation */
4222  if (a->relNode < b->relNode)
4223  return -1;
4224  else if (a->relNode > b->relNode)
4225  return 1;
4226  /* compare fork */
4227  else if (a->forkNum < b->forkNum)
4228  return -1;
4229  else if (a->forkNum > b->forkNum)
4230  return 1;
4231  /* compare block number */
4232  else if (a->blockNum < b->blockNum)
4233  return -1;
4234  else if (a->blockNum > b->blockNum)
4235  return 1;
4236  /* equal page IDs are unlikely, but not impossible */
4237  return 0;
4238 }
4239 
4240 /*
4241  * Comparator for a Min-Heap over the per-tablespace checkpoint completion
4242  * progress.
4243  */
4244 static int
4245 ts_ckpt_progress_comparator(Datum a, Datum b, void *arg)
4246 {
4247  CkptTsStatus *sa = (CkptTsStatus *) a;
4248  CkptTsStatus *sb = (CkptTsStatus *) b;
4249 
4250  /* we want a min-heap, so return 1 when a < b */
4251  if (sa->progress < sb->progress)
4252  return 1;
4253  else if (sa->progress == sb->progress)
4254  return 0;
4255  else
4256  return -1;
4257 }
4258 
4259 /*
4260  * Initialize a writeback context, discarding potential previous state.
4261  *
4262  * *max_pending is a pointer instead of an immediate value, so the coalesce
4263  * limits can easily be changed by the GUC mechanism, and so calling code does
4264  * not have to check the current configuration. A value of 0 means that no
4265  * writeback control will be performed.
4266  */
4267 void
4268 WritebackContextInit(WritebackContext *context, int *max_pending)
4269 {
4270  Assert(*max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
4271 
4272  context->max_pending = max_pending;
4273  context->nr_pending = 0;
4274 }
4275 
4276 /*
4277  * Add buffer to list of pending writeback requests.
4278  */
4279 void
4280 ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag)
4281 {
4282  PendingWriteback *pending;
4283 
4284  /*
4285  * Add buffer to the pending writeback array, unless writeback control is
4286  * disabled.
4287  */
4288  if (*context->max_pending > 0)
4289  {
4290  Assert(*context->max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
4291 
4292  pending = &context->pending_writebacks[context->nr_pending++];
4293 
4294  pending->tag = *tag;
4295  }
4296 
4297  /*
4298  * Perform pending flushes if the writeback limit is exceeded. This
4299  * includes the case where previously an item has been added, but control
4300  * is now disabled.
4301  */
4302  if (context->nr_pending >= *context->max_pending)
4303  IssuePendingWritebacks(context);
4304 }
4305 
4306 /*
4307  * Issue all pending writeback requests, previously scheduled with
4308  * ScheduleBufferTagForWriteback, to the OS.
4309  *
4310  * Because this is only used to improve the OS's I/O scheduling, we try never
4311  * to error out - it's just a hint.
4312  */
4313 void
4314 IssuePendingWritebacks(WritebackContext *context)
4315 {
4316  int i;
4317 
4318  if (context->nr_pending == 0)
4319  return;
4320 
4321  /*
4322  * Executing the writes in-order can make them a lot faster, and allows us
4323  * to merge writeback requests to consecutive blocks into larger writebacks.
4324  */
4325  qsort(&context->pending_writebacks, context->nr_pending,
4326  sizeof(PendingWriteback), buffertag_comparator);
4327 
4328  /*
4329  * Coalesce neighbouring writes, but nothing else. For that we iterate
4330  * through the, now sorted, array of pending flushes, and look forward to
4331  * find all neighbouring (or identical) writes.
4332  */
4333  for (i = 0; i < context->nr_pending; i++)
4334  {
4335  PendingWriteback *cur;
4336  PendingWriteback *next;
4337  SMgrRelation reln;
4338  int ahead;
4339  BufferTag tag;
4340  Size nblocks = 1;
4341 
4342  cur = &context->pending_writebacks[i];
4343  tag = cur->tag;
4344 
4345  /*
4346  * Peek ahead, into following writeback requests, to see if they can
4347  * be combined with the current one.
4348  */
4349  for (ahead = 0; i + ahead + 1 < context->nr_pending; ahead++)
4350  {
4351  next = &context->pending_writebacks[i + ahead + 1];
4352 
4353  /* different file, stop */
4354  if (!RelFileNodeEquals(cur->tag.rnode, next->tag.rnode) ||
4355  cur->tag.forkNum != next->tag.forkNum)
4356  break;
4357 
4358  /* ok, block queued twice, skip */
4359  if (cur->tag.blockNum == next->tag.blockNum)
4360  continue;
4361 
4362  /* only merge consecutive writes */
4363  if (cur->tag.blockNum + 1 != next->tag.blockNum)
4364  break;
4365 
4366  nblocks++;
4367  cur = next;
4368  }
4369 
4370  i += ahead;
4371 
4372  /* and finally tell the kernel to write the data to storage */
4373  reln = smgropen(tag.rnode, InvalidBackendId);
4374  smgrwriteback(reln, tag.forkNum, tag.blockNum, nblocks);
4375  }
4376 
4377  context->nr_pending = 0;
4378 }
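/*
 * [Editor's note: illustrative sketch, not part of bufmgr.c proper.]  How the
 * three writeback routines above are typically strung together by a flushing
 * loop (compare BufferSync()/SyncOneBuffer()).  The function name, the tag
 * array, and the flush_after_pages parameter are hypothetical.
 */
static void
example_flush_with_writeback_control(BufferTag *tags, int ntags,
                                     int *flush_after_pages)
{
    WritebackContext wb_context;
    int         i;

    WritebackContextInit(&wb_context, flush_after_pages);

    for (i = 0; i < ntags; i++)
    {
        /* ... write the corresponding buffer out (e.g. via FlushBuffer) ... */
        ScheduleBufferTagForWriteback(&wb_context, &tags[i]);
    }

    /* hint the kernel about anything still queued */
    IssuePendingWritebacks(&wb_context);
}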
4379 
4380 
4381 /*
4382  * Implement slower/larger portions of TestForOldSnapshot
4383  *
4384  * Smaller/faster portions are put inline, but the entire set of logic is too
4385  * big for that.
4386  */
4387 void
4388 TestForOldSnapshot_impl(Snapshot snapshot, Relation relation)
4389 {
4390  if (RelationAllowsEarlyPruning(relation)
4391  && (snapshot)->whenTaken < GetOldSnapshotThresholdTimestamp())
4392  ereport(ERROR,
4393  (errcode(ERRCODE_SNAPSHOT_TOO_OLD),
4394  errmsg("snapshot too old")));
4395 }