bufmgr.c
1 /*-------------------------------------------------------------------------
2  *
3  * bufmgr.c
4  * buffer manager interface routines
5  *
6  * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/storage/buffer/bufmgr.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 /*
16  * Principal entry points:
17  *
18  * ReadBuffer() -- find or create a buffer holding the requested page,
19  * and pin it so that no one can destroy it while this process
20  * is using it.
21  *
22  * ReleaseBuffer() -- unpin a buffer
23  *
24  * MarkBufferDirty() -- mark a pinned buffer's contents as "dirty".
25  * The disk write is delayed until buffer replacement or checkpoint.
26  *
27  * See also these files:
28  * freelist.c -- chooses victim for buffer replacement
29  * buf_table.c -- manages the buffer lookup table
30  */
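/*
 * Editor's illustrative sketch (not part of bufmgr.c): the typical calling
 * pattern for the entry points listed above, as seen from access-method
 * code.  It relies on the headers this file includes below; "rel" and
 * "blkno" are assumed to be a valid opened Relation and an existing block,
 * and the function name is hypothetical.
 */
static void
touch_page_example(Relation rel, BlockNumber blkno)
{
	Buffer		buf;
	Page		page;

	/* find or create a buffer holding the block, and pin it */
	buf = ReadBuffer(rel, blkno);

	/* take the content lock before modifying the page */
	LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
	page = BufferGetPage(buf);
	(void) page;				/* ... modify "page" here; real code must also emit WAL ... */

	/* the actual disk write is deferred to replacement or checkpoint */
	MarkBufferDirty(buf);

	/* drop the content lock and the pin */
	UnlockReleaseBuffer(buf);
}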
31 #include "postgres.h"
32 
33 #include <sys/file.h>
34 #include <unistd.h>
35 
36 #include "access/tableam.h"
37 #include "access/xlog.h"
38 #include "catalog/catalog.h"
39 #include "catalog/storage.h"
40 #include "executor/instrument.h"
41 #include "lib/binaryheap.h"
42 #include "miscadmin.h"
43 #include "pg_trace.h"
44 #include "pgstat.h"
45 #include "postmaster/bgwriter.h"
46 #include "storage/buf_internals.h"
47 #include "storage/bufmgr.h"
48 #include "storage/ipc.h"
49 #include "storage/proc.h"
50 #include "storage/smgr.h"
51 #include "storage/standby.h"
52 #include "utils/ps_status.h"
53 #include "utils/rel.h"
54 #include "utils/resowner_private.h"
55 #include "utils/timestamp.h"
56 
57 
58 /* Note: these two macros only work on shared buffers, not local ones! */
59 #define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
60 #define BufferGetLSN(bufHdr) (PageGetLSN(BufHdrGetBlock(bufHdr)))
61 
62 /* Note: this macro only works on local buffers, not shared ones! */
63 #define LocalBufHdrGetBlock(bufHdr) \
64  LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
65 
66 /* Bits in SyncOneBuffer's return value */
67 #define BUF_WRITTEN 0x01
68 #define BUF_REUSABLE 0x02
69 
70 #define RELS_BSEARCH_THRESHOLD 20
71 
72 typedef struct PrivateRefCountEntry
73 {
74  Buffer buffer;
75  int32 refcount;
76 } PrivateRefCountEntry;
77 
78 /* 64 bytes, about the size of a cache line on common systems */
79 #define REFCOUNT_ARRAY_ENTRIES 8
80 
81 /*
82  * Status of buffers to checkpoint for a particular tablespace, used
83  * internally in BufferSync.
84  */
85 typedef struct CkptTsStatus
86 {
87  /* oid of the tablespace */
88  Oid tsId;
89 
90  /*
91  * Checkpoint progress for this tablespace. To make progress comparable
92  * between tablespaces the progress is, for each tablespace, measured as a
93  * number between 0 and the total number of to-be-checkpointed pages. Each
94  * page checkpointed in this tablespace increments this space's progress
95  * by progress_slice.
96  */
97  float8 progress;
98  float8 progress_slice;
99 
100  /* number of to-be checkpointed pages in this tablespace */
101  int num_to_scan;
102  /* already processed pages in this tablespace */
103  int num_scanned;
104 
105  /* current offset in CkptBufferIds for this tablespace */
106  int index;
107 } CkptTsStatus;
108 
109 /*
110  * Type for array used to sort SMgrRelations
111  *
112  * FlushRelationsAllBuffers shares the same comparator function with
113  * DropRelFileNodesAllBuffers. Pointer to this struct and RelFileNode must be
114  * compatible.
115  */
116 typedef struct SMgrSortArray
117 {
118  RelFileNode rnode; /* This must be the first member */
119  SMgrRelation srel;
120 } SMgrSortArray;
121 
122 /* GUC variables */
123 bool zero_damaged_pages = false;
124 int bgwriter_lru_maxpages = 100;
125 double bgwriter_lru_multiplier = 2.0;
126 bool track_io_timing = false;
127 
128 /*
129  * How many buffers PrefetchBuffer callers should try to stay ahead of their
130  * ReadBuffer calls by. Zero means "never prefetch". This value is only used
131  * for buffers not belonging to tablespaces that have their
132  * effective_io_concurrency parameter set.
133  */
134 int effective_io_concurrency = 0;
135 
136 /*
137  * Like effective_io_concurrency, but used by maintenance code paths that might
138  * benefit from a higher setting because they work on behalf of many sessions.
139  * Overridden by the tablespace setting of the same name.
140  */
141 int maintenance_io_concurrency = 0;
142 
143 /*
144  * GUC variables about triggering kernel writeback for buffers written; OS
145  * dependent defaults are set via the GUC mechanism.
146  */
147 int checkpoint_flush_after = 0;
148 int bgwriter_flush_after = 0;
149 int backend_flush_after = 0;
150 
151 /* local state for StartBufferIO and related functions */
152 static BufferDesc *InProgressBuf = NULL;
153 static bool IsForInput;
154 
155 /* local state for LockBufferForCleanup */
156 static BufferDesc *PinCountWaitBuf = NULL;
157 
158 /*
159  * Backend-Private refcount management:
160  *
161  * Each buffer also has a private refcount that keeps track of the number of
162  * times the buffer is pinned in the current process. This is so that the
163  * shared refcount needs to be modified only once if a buffer is pinned more
164  * than once by an individual backend. It's also used to check that no buffers
165  * are still pinned at the end of transactions and when exiting.
166  *
167  *
168  * To avoid - as we used to - requiring an array with NBuffers entries to keep
169  * track of local buffers, we use a small sequentially searched array
170  * (PrivateRefCountArray) and an overflow hash table (PrivateRefCountHash) to
171  * keep track of backend local pins.
172  *
173  * Until no more than REFCOUNT_ARRAY_ENTRIES buffers are pinned at once, all
174  * refcounts are kept track of in the array; after that, new array entries
175  * displace old ones into the hash table. That way a frequently used entry
176  * can't get "stuck" in the hashtable while infrequent ones clog the array.
177  *
178  * Note that in most scenarios the number of pinned buffers will not exceed
179  * REFCOUNT_ARRAY_ENTRIES.
180  *
181  *
182  * To enter a buffer into the refcount tracking mechanism first reserve a free
183  * entry using ReservePrivateRefCountEntry() and then later, if necessary,
184  * fill it with NewPrivateRefCountEntry(). That split lets us avoid doing
185  * memory allocations in NewPrivateRefCountEntry() which can be important
186  * because in some scenarios it's called with a spinlock held...
187  */
188 static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES];
189 static HTAB *PrivateRefCountHash = NULL;
190 static int32 PrivateRefCountOverflowed = 0;
191 static uint32 PrivateRefCountClock = 0;
192 static PrivateRefCountEntry *ReservedRefCountEntry = NULL;
193 
194 static void ReservePrivateRefCountEntry(void);
195 static PrivateRefCountEntry *NewPrivateRefCountEntry(Buffer buffer);
196 static PrivateRefCountEntry *GetPrivateRefCountEntry(Buffer buffer, bool do_move);
197 static inline int32 GetPrivateRefCount(Buffer buffer);
198 static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref);
199 
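/*
 * Editor's illustrative sketch (not part of bufmgr.c): the intended call
 * order for the reservation scheme described in the comment above, roughly
 * as the pinning code later in this file uses it.  The bump of the shared
 * refcount is elided, and the function name is hypothetical.
 */
static void
pin_tracking_example(Buffer buffer)
{
	PrivateRefCountEntry *ref;

	/* reserve while no spinlock is held; this step may displace an entry */
	ReservePrivateRefCountEntry();

	/* ... take the buffer header spinlock and bump the shared refcount ... */

	/* safe even with a spinlock held: no memory allocation happens here */
	ref = NewPrivateRefCountEntry(buffer);
	ref->refcount++;
}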
200 /*
201  * Ensure that the PrivateRefCountArray has sufficient space to store one more
202  * entry. This has to be called before using NewPrivateRefCountEntry() to fill
203  * a new entry - but it's perfectly fine to not use a reserved entry.
204  */
205 static void
206 ReservePrivateRefCountEntry(void)
207 {
208  /* Already reserved (or freed), nothing to do */
209  if (ReservedRefCountEntry != NULL)
210  return;
211 
212  /*
213  * First search for a free entry in the array; that'll be sufficient in the
214  * majority of cases.
215  */
216  {
217  int i;
218 
219  for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
220  {
221  PrivateRefCountEntry *res;
222 
223  res = &PrivateRefCountArray[i];
224 
225  if (res->buffer == InvalidBuffer)
226  {
227  ReservedRefCountEntry = res;
228  return;
229  }
230  }
231  }
232 
233  /*
234  * No luck. All array entries are full. Move one array entry into the hash
235  * table.
236  */
237  {
238  /*
239  * Move entry from the current clock position in the array into the
240  * hashtable. Use that slot.
241  */
242  PrivateRefCountEntry *hashent;
243  bool found;
244 
245  /* select victim slot */
246  ReservedRefCountEntry =
247  &PrivateRefCountArray[PrivateRefCountClock++ % REFCOUNT_ARRAY_ENTRIES];
248 
249  /* Better be used, otherwise we shouldn't get here. */
250  Assert(ReservedRefCountEntry->buffer != InvalidBuffer);
251 
252  /* enter victim array entry into hashtable */
253  hashent = hash_search(PrivateRefCountHash,
254  (void *) &(ReservedRefCountEntry->buffer),
255  HASH_ENTER,
256  &found);
257  Assert(!found);
258  hashent->refcount = ReservedRefCountEntry->refcount;
259 
260  /* clear the now free array slot */
261  ReservedRefCountEntry->buffer = InvalidBuffer;
262  ReservedRefCountEntry->refcount = 0;
263 
264  PrivateRefCountOverflowed++;
265  }
266 }
267 
268 /*
269  * Fill a previously reserved refcount entry.
270  */
271 static PrivateRefCountEntry *
272 NewPrivateRefCountEntry(Buffer buffer)
273 {
274  PrivateRefCountEntry *res;
275 
276  /* only allowed to be called when a reservation has been made */
277  Assert(ReservedRefCountEntry != NULL);
278 
279  /* use up the reserved entry */
280  res = ReservedRefCountEntry;
281  ReservedRefCountEntry = NULL;
282 
283  /* and fill it */
284  res->buffer = buffer;
285  res->refcount = 0;
286 
287  return res;
288 }
289 
290 /*
291  * Return the PrivateRefCount entry for the passed buffer.
292  *
293  * Returns NULL if a buffer doesn't have a refcount entry. Otherwise, if
294  * do_move is true, and the entry resides in the hashtable the entry is
295  * optimized for frequent access by moving it to the array.
296  */
297 static PrivateRefCountEntry *
298 GetPrivateRefCountEntry(Buffer buffer, bool do_move)
299 {
300  PrivateRefCountEntry *res;
301  int i;
302 
303  Assert(BufferIsValid(buffer));
304  Assert(!BufferIsLocal(buffer));
305 
306  /*
307  * First search for references in the array; that'll be sufficient in the
308  * majority of cases.
309  */
310  for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
311  {
312  res = &PrivateRefCountArray[i];
313 
314  if (res->buffer == buffer)
315  return res;
316  }
317 
318  /*
319  * By here we know that the buffer, if already pinned, isn't residing in
320  * the array.
321  *
322  * Only look up the buffer in the hashtable if we've previously overflowed
323  * into it.
324  */
325  if (PrivateRefCountOverflowed == 0)
326  return NULL;
327 
328  res = hash_search(PrivateRefCountHash,
329  (void *) &buffer,
330  HASH_FIND,
331  NULL);
332 
333  if (res == NULL)
334  return NULL;
335  else if (!do_move)
336  {
337  /* caller doesn't want us to move the hash entry into the array */
338  return res;
339  }
340  else
341  {
342  /* move buffer from hashtable into the free array slot */
343  bool found;
344  PrivateRefCountEntry *free;
345 
346  /* Ensure there's a free array slot */
347  ReservePrivateRefCountEntry();
348 
349  /* Use up the reserved slot */
350  Assert(ReservedRefCountEntry != NULL);
351  free = ReservedRefCountEntry;
352  ReservedRefCountEntry = NULL;
353  Assert(free->buffer == InvalidBuffer);
354 
355  /* and fill it */
356  free->buffer = buffer;
357  free->refcount = res->refcount;
358 
359  /* delete from hashtable */
360  hash_search(PrivateRefCountHash,
361  (void *) &buffer,
362  HASH_REMOVE,
363  &found);
364  Assert(found);
365 
366  PrivateRefCountOverflowed--;
367 
368  return free;
369  }
370 }
371 
372 /*
373  * Returns how many times the passed buffer is pinned by this backend.
374  *
375  * Only works for shared memory buffers!
376  */
377 static inline int32
378 GetPrivateRefCount(Buffer buffer)
379 {
380  PrivateRefCountEntry *ref;
381 
382  Assert(BufferIsValid(buffer));
383  Assert(!BufferIsLocal(buffer));
384 
385  /*
386  * Not moving the entry - that's ok for the current users, but we might
387  * want to change this one day.
388  */
389  ref = GetPrivateRefCountEntry(buffer, false);
390 
391  if (ref == NULL)
392  return 0;
393  return ref->refcount;
394 }
395 
396 /*
397  * Release resources used to track the reference count of a buffer which we no
398  * longer have pinned and don't want to pin again immediately.
399  */
400 static void
401 ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref)
402 {
403  Assert(ref->refcount == 0);
404 
405  if (ref >= &PrivateRefCountArray[0] &&
406  ref < &PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES])
407  {
408  ref->buffer = InvalidBuffer;
409 
410  /*
411  * Mark the just used entry as reserved - in many scenarios that
412  * allows us to avoid ever having to search the array/hash for free
413  * entries.
414  */
415  ReservedRefCountEntry = ref;
416  }
417  else
418  {
419  bool found;
420  Buffer buffer = ref->buffer;
421 
422  hash_search(PrivateRefCountHash,
423  (void *) &buffer,
424  HASH_REMOVE,
425  &found);
426  Assert(found);
427 
428  PrivateRefCountOverflowed--;
429  }
430 }
431 
432 /*
433  * BufferIsPinned
434  * True iff the buffer is pinned (also checks for valid buffer number).
435  *
436  * NOTE: what we check here is that *this* backend holds a pin on
437  * the buffer. We do not care whether some other backend does.
438  */
439 #define BufferIsPinned(bufnum) \
440 ( \
441  !BufferIsValid(bufnum) ? \
442  false \
443  : \
444  BufferIsLocal(bufnum) ? \
445  (LocalRefCount[-(bufnum) - 1] > 0) \
446  : \
447  (GetPrivateRefCount(bufnum) > 0) \
448 )
449 
450 
451 static Buffer ReadBuffer_common(SMgrRelation reln, char relpersistence,
452  ForkNumber forkNum, BlockNumber blockNum,
453  ReadBufferMode mode, BufferAccessStrategy strategy,
454  bool *hit);
455 static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy);
456 static void PinBuffer_Locked(BufferDesc *buf);
457 static void UnpinBuffer(BufferDesc *buf, bool fixOwner);
458 static void BufferSync(int flags);
459 static uint32 WaitBufHdrUnlocked(BufferDesc *buf);
460 static int SyncOneBuffer(int buf_id, bool skip_recently_used,
461  WritebackContext *wb_context);
462 static void WaitIO(BufferDesc *buf);
463 static bool StartBufferIO(BufferDesc *buf, bool forInput);
464 static void TerminateBufferIO(BufferDesc *buf, bool clear_dirty,
465  uint32 set_flag_bits);
466 static void shared_buffer_write_error_callback(void *arg);
467 static void local_buffer_write_error_callback(void *arg);
468 static BufferDesc *BufferAlloc(SMgrRelation smgr,
469  char relpersistence,
470  ForkNumber forkNum,
471  BlockNumber blockNum,
472  BufferAccessStrategy strategy,
473  bool *foundPtr);
474 static void FlushBuffer(BufferDesc *buf, SMgrRelation reln);
475 static void AtProcExit_Buffers(int code, Datum arg);
476 static void CheckForBufferLeaks(void);
477 static int rnode_comparator(const void *p1, const void *p2);
478 static int buffertag_comparator(const void *p1, const void *p2);
479 static int ckpt_buforder_comparator(const void *pa, const void *pb);
480 static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg);
481 
482 
483 /*
484  * Implementation of PrefetchBuffer() for shared buffers.
485  */
486 PrefetchBufferResult
487 PrefetchSharedBuffer(SMgrRelation smgr_reln,
488  ForkNumber forkNum,
489  BlockNumber blockNum)
490 {
491  PrefetchBufferResult result = {InvalidBuffer, false};
492  BufferTag newTag; /* identity of requested block */
493  uint32 newHash; /* hash value for newTag */
494  LWLock *newPartitionLock; /* buffer partition lock for it */
495  int buf_id;
496 
497  Assert(BlockNumberIsValid(blockNum));
498 
499  /* create a tag so we can lookup the buffer */
500  INIT_BUFFERTAG(newTag, smgr_reln->smgr_rnode.node,
501  forkNum, blockNum);
502 
503  /* determine its hash code and partition lock ID */
504  newHash = BufTableHashCode(&newTag);
505  newPartitionLock = BufMappingPartitionLock(newHash);
506 
507  /* see if the block is in the buffer pool already */
508  LWLockAcquire(newPartitionLock, LW_SHARED);
509  buf_id = BufTableLookup(&newTag, newHash);
510  LWLockRelease(newPartitionLock);
511 
512  /* If not in buffers, initiate prefetch */
513  if (buf_id < 0)
514  {
515 #ifdef USE_PREFETCH
516  /*
517  * Try to initiate an asynchronous read. This returns false in
518  * recovery if the relation file doesn't exist.
519  */
520  if (smgrprefetch(smgr_reln, forkNum, blockNum))
521  result.initiated_io = true;
522 #endif /* USE_PREFETCH */
523  }
524  else
525  {
526  /*
527  * Report the buffer it was in at that time. The caller may be able
528  * to avoid a buffer table lookup, but it's not pinned and it must be
529  * rechecked!
530  */
531  result.recent_buffer = buf_id + 1;
532  }
533 
534  /*
535  * If the block *is* in buffers, we do nothing. This is not really ideal:
536  * the block might be just about to be evicted, which would be stupid
537  * since we know we are going to need it soon. But the only easy answer
538  * is to bump the usage_count, which does not seem like a great solution:
539  * when the caller does ultimately touch the block, usage_count would get
540  * bumped again, resulting in too much favoritism for blocks that are
541  * involved in a prefetch sequence. A real fix would involve some
542  * additional per-buffer state, and it's not clear that there's enough of
543  * a problem to justify that.
544  */
545 
546  return result;
547 }
548 
549 /*
550  * PrefetchBuffer -- initiate asynchronous read of a block of a relation
551  *
552  * This is named by analogy to ReadBuffer but doesn't actually allocate a
553  * buffer. Instead it tries to ensure that a future ReadBuffer for the given
554  * block will not be delayed by the I/O. Prefetching is optional.
555  *
556  * There are three possible outcomes:
557  *
558  * 1. If the block is already cached, the result includes a valid buffer that
559  * could be used by the caller to avoid the need for a later buffer lookup, but
560  * it's not pinned, so the caller must recheck it.
561  *
562  * 2. If the kernel has been asked to initiate I/O, the initiated_io member is
563  * true. Currently there is no way to know if the data was already cached by
564  * the kernel and therefore didn't really initiate I/O, and no way to know when
565  * the I/O completes other than using synchronous ReadBuffer().
566  *
567  * 3. Otherwise, the buffer wasn't already cached by PostgreSQL, and either
568  * USE_PREFETCH is not defined (this build doesn't support prefetching due to
569  * lack of a kernel facility), or the underlying relation file wasn't found and
570  * we are in recovery. (If the relation file wasn't found and we are not in
571  * recovery, an error is raised).
572  */
573 PrefetchBufferResult
574 PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
575 {
576  Assert(RelationIsValid(reln));
577  Assert(BlockNumberIsValid(blockNum));
578 
579  /* Open it at the smgr level if not already done */
580  RelationOpenSmgr(reln);
581 
582  if (RelationUsesLocalBuffers(reln))
583  {
584  /* see comments in ReadBufferExtended */
585  if (RELATION_IS_OTHER_TEMP(reln))
586  ereport(ERROR,
587  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
588  errmsg("cannot access temporary tables of other sessions")));
589 
590  /* pass it off to localbuf.c */
591  return PrefetchLocalBuffer(reln->rd_smgr, forkNum, blockNum);
592  }
593  else
594  {
595  /* pass it to the shared buffer version */
596  return PrefetchSharedBuffer(reln->rd_smgr, forkNum, blockNum);
597  }
598 }
599 
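/*
 * Editor's illustrative sketch (not part of bufmgr.c): a caller that knows
 * it will need a block soon can issue PrefetchBuffer() and later read the
 * block normally; the prefetch result is advisory only.  "rel" and "blkno"
 * are assumed valid, and the function name is hypothetical.
 */
static Buffer
prefetch_then_read_example(Relation rel, BlockNumber blkno)
{
	PrefetchBufferResult pf;

	pf = PrefetchBuffer(rel, MAIN_FORKNUM, blkno);

	/*
	 * pf.recent_buffer (if set) is only a hint: it is not pinned and may be
	 * evicted at any time, so the block must still be read the normal way.
	 */
	(void) pf;

	return ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, NULL);
}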
600 
601 /*
602  * ReadBuffer -- a shorthand for ReadBufferExtended, for reading from main
603  * fork with RBM_NORMAL mode and default strategy.
604  */
605 Buffer
606 ReadBuffer(Relation reln, BlockNumber blockNum)
607 {
608  return ReadBufferExtended(reln, MAIN_FORKNUM, blockNum, RBM_NORMAL, NULL);
609 }
610 
611 /*
612  * ReadBufferExtended -- returns a buffer containing the requested
613  * block of the requested relation. If the blknum
614  * requested is P_NEW, extend the relation file and
615  * allocate a new block. (Caller is responsible for
616  * ensuring that only one backend tries to extend a
617  * relation at the same time!)
618  *
619  * Returns: the buffer number for the buffer containing
620  * the block read. The returned buffer has been pinned.
621  * Does not return on error --- elog's instead.
622  *
623  * Assume when this function is called, that reln has been opened already.
624  *
625  * In RBM_NORMAL mode, the page is read from disk, and the page header is
626  * validated. An error is thrown if the page header is not valid. (But
627  * note that an all-zero page is considered "valid"; see PageIsVerified().)
628  *
629  * RBM_ZERO_ON_ERROR is like the normal mode, but if the page header is not
630  * valid, the page is zeroed instead of throwing an error. This is intended
631  * for non-critical data, where the caller is prepared to repair errors.
632  *
633  * In RBM_ZERO_AND_LOCK mode, if the page isn't in buffer cache already, it's
634  * filled with zeros instead of reading it from disk. Useful when the caller
635  * is going to fill the page from scratch, since this saves I/O and avoids
636  * unnecessary failure if the page-on-disk has corrupt page headers.
637  * The page is returned locked to ensure that the caller has a chance to
638  * initialize the page before it's made visible to others.
639  * Caution: do not use this mode to read a page that is beyond the relation's
640  * current physical EOF; that is likely to cause problems in md.c when
641  * the page is modified and written out. P_NEW is OK, though.
642  *
643  * RBM_ZERO_AND_CLEANUP_LOCK is the same as RBM_ZERO_AND_LOCK, but acquires
644  * a cleanup-strength lock on the page.
645  *
646  * RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here.
647  *
648  * If strategy is not NULL, a nondefault buffer access strategy is used.
649  * See buffer/README for details.
650  */
651 Buffer
652 ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum,
653  ReadBufferMode mode, BufferAccessStrategy strategy)
654 {
655  bool hit;
656  Buffer buf;
657 
658  /* Open it at the smgr level if not already done */
659  RelationOpenSmgr(reln);
660 
661  /*
662  * Reject attempts to read non-local temporary relations; we would be
663  * likely to get wrong data since we have no visibility into the owning
664  * session's local buffers.
665  */
666  if (RELATION_IS_OTHER_TEMP(reln))
667  ereport(ERROR,
668  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
669  errmsg("cannot access temporary tables of other sessions")));
670 
671  /*
672  * Read the buffer, and update pgstat counters to reflect a cache hit or
673  * miss.
674  */
675  pgstat_count_buffer_read(reln);
676  buf = ReadBuffer_common(reln->rd_smgr, reln->rd_rel->relpersistence,
677  forkNum, blockNum, mode, strategy, &hit);
678  if (hit)
679  pgstat_count_buffer_hit(reln);
680  return buf;
681 }
682 
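/*
 * Editor's illustrative sketch (not part of bufmgr.c): reading non-critical
 * data with RBM_ZERO_ON_ERROR, loosely modeled on how the free space map
 * reads its pages; a damaged page simply comes back zeroed instead of
 * raising an error.  The function name is hypothetical.
 */
static Buffer
read_noncritical_page_example(Relation rel, BlockNumber blkno)
{
	Buffer		buf;

	buf = ReadBufferExtended(rel, FSM_FORKNUM, blkno, RBM_ZERO_ON_ERROR, NULL);

	/* the buffer is pinned but not locked; lock it before inspecting it */
	return buf;
}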
683 
684 /*
685  * ReadBufferWithoutRelcache -- like ReadBufferExtended, but doesn't require
686  * a relcache entry for the relation.
687  *
688  * NB: At present, this function may only be used on permanent relations, which
689  * is OK, because we only use it during XLOG replay. If in the future we
690  * want to use it on temporary or unlogged relations, we could pass additional
691  * parameters.
692  */
693 Buffer
694 ReadBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum,
695  BlockNumber blockNum, ReadBufferMode mode,
696  BufferAccessStrategy strategy)
697 {
698  bool hit;
699 
700  SMgrRelation smgr = smgropen(rnode, InvalidBackendId);
701 
701 
702  Assert(InRecovery);
703 
704  return ReadBuffer_common(smgr, RELPERSISTENCE_PERMANENT, forkNum, blockNum,
705  mode, strategy, &hit);
706 }
707 
708 
709 /*
710  * ReadBuffer_common -- common logic for all ReadBuffer variants
711  *
712  * *hit is set to true if the request was satisfied from shared buffer cache.
713  */
714 static Buffer
715 ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
716  BlockNumber blockNum, ReadBufferMode mode,
717  BufferAccessStrategy strategy, bool *hit)
718 {
719  BufferDesc *bufHdr;
720  Block bufBlock;
721  bool found;
722  bool isExtend;
723  bool isLocalBuf = SmgrIsTemp(smgr);
724 
725  *hit = false;
726 
727  /* Make sure we will have room to remember the buffer pin */
728  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
729 
730  isExtend = (blockNum == P_NEW);
731 
732  TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
733  smgr->smgr_rnode.node.spcNode,
734  smgr->smgr_rnode.node.dbNode,
735  smgr->smgr_rnode.node.relNode,
736  smgr->smgr_rnode.backend,
737  isExtend);
738 
739  /* Substitute proper block number if caller asked for P_NEW */
740  if (isExtend)
741  blockNum = smgrnblocks(smgr, forkNum);
742 
743  if (isLocalBuf)
744  {
745  bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, &found);
746  if (found)
747  pgBufferUsage.local_blks_hit++;
748  else if (isExtend)
749  pgBufferUsage.local_blks_written++;
750  else if (mode == RBM_NORMAL || mode == RBM_NORMAL_NO_LOG ||
751  mode == RBM_ZERO_ON_ERROR)
752  pgBufferUsage.local_blks_read++;
753  }
754  else
755  {
756  /*
757  * lookup the buffer. IO_IN_PROGRESS is set if the requested block is
758  * not currently in memory.
759  */
760  bufHdr = BufferAlloc(smgr, relpersistence, forkNum, blockNum,
761  strategy, &found);
762  if (found)
763  pgBufferUsage.shared_blks_hit++;
764  else if (isExtend)
765  pgBufferUsage.shared_blks_written++;
766  else if (mode == RBM_NORMAL || mode == RBM_NORMAL_NO_LOG ||
767  mode == RBM_ZERO_ON_ERROR)
768  pgBufferUsage.shared_blks_read++;
769  }
770 
771  /* At this point we do NOT hold any locks. */
772 
773  /* if it was already in the buffer pool, we're done */
774  if (found)
775  {
776  if (!isExtend)
777  {
778  /* Just need to update stats before we exit */
779  *hit = true;
780  VacuumPageHit++;
781 
782  if (VacuumCostActive)
783  VacuumCostBalance += VacuumCostPageHit;
784 
785  TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
786  smgr->smgr_rnode.node.spcNode,
787  smgr->smgr_rnode.node.dbNode,
788  smgr->smgr_rnode.node.relNode,
789  smgr->smgr_rnode.backend,
790  isExtend,
791  found);
792 
793  /*
794  * In RBM_ZERO_AND_LOCK mode the caller expects the page to be
795  * locked on return.
796  */
797  if (!isLocalBuf)
798  {
799  if (mode == RBM_ZERO_AND_LOCK)
800  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr),
801  LW_EXCLUSIVE);
802  else if (mode == RBM_ZERO_AND_CLEANUP_LOCK)
803  LockBufferForCleanup(BufferDescriptorGetBuffer(bufHdr));
804  }
805 
806  return BufferDescriptorGetBuffer(bufHdr);
807  }
808 
809  /*
810  * We get here only in the corner case where we are trying to extend
811  * the relation but we found a pre-existing buffer marked BM_VALID.
812  * This can happen because mdread doesn't complain about reads beyond
813  * EOF (when zero_damaged_pages is ON) and so a previous attempt to
814  * read a block beyond EOF could have left a "valid" zero-filled
815  * buffer. Unfortunately, we have also seen this case occurring
816  * because of buggy Linux kernels that sometimes return an
817  * lseek(SEEK_END) result that doesn't account for a recent write. In
818  * that situation, the pre-existing buffer would contain valid data
819  * that we don't want to overwrite. Since the legitimate case should
820  * always have left a zero-filled buffer, complain if not PageIsNew.
821  */
822  bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
823  if (!PageIsNew((Page) bufBlock))
824  ereport(ERROR,
825  (errmsg("unexpected data beyond EOF in block %u of relation %s",
826  blockNum, relpath(smgr->smgr_rnode, forkNum)),
827  errhint("This has been seen to occur with buggy kernels; consider updating your system.")));
828 
829  /*
830  * We *must* do smgrextend before succeeding, else the page will not
831  * be reserved by the kernel, and the next P_NEW call will decide to
832  * return the same page. Clear the BM_VALID bit, do the StartBufferIO
833  * call that BufferAlloc didn't, and proceed.
834  */
835  if (isLocalBuf)
836  {
837  /* Only need to adjust flags */
838  uint32 buf_state = pg_atomic_read_u32(&bufHdr->state);
839 
840  Assert(buf_state & BM_VALID);
841  buf_state &= ~BM_VALID;
842  pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
843  }
844  else
845  {
846  /*
847  * Loop to handle the very small possibility that someone re-sets
848  * BM_VALID between our clearing it and StartBufferIO inspecting
849  * it.
850  */
851  do
852  {
853  uint32 buf_state = LockBufHdr(bufHdr);
854 
855  Assert(buf_state & BM_VALID);
856  buf_state &= ~BM_VALID;
857  UnlockBufHdr(bufHdr, buf_state);
858  } while (!StartBufferIO(bufHdr, true));
859  }
860  }
861 
862  /*
863  * if we have gotten to this point, we have allocated a buffer for the
864  * page but its contents are not yet valid. IO_IN_PROGRESS is set for it,
865  * if it's a shared buffer.
866  *
867  * Note: if smgrextend fails, we will end up with a buffer that is
868  * allocated but not marked BM_VALID. P_NEW will still select the same
869  * block number (because the relation didn't get any longer on disk) and
870  * so future attempts to extend the relation will find the same buffer (if
871  * it's not been recycled) but come right back here to try smgrextend
872  * again.
873  */
874  Assert(!(pg_atomic_read_u32(&bufHdr->state) & BM_VALID)); /* spinlock not needed */
875 
876  bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
877 
878  if (isExtend)
879  {
880  /* new buffers are zero-filled */
881  MemSet((char *) bufBlock, 0, BLCKSZ);
882  /* don't set checksum for all-zero page */
883  smgrextend(smgr, forkNum, blockNum, (char *) bufBlock, false);
884 
885  /*
886  * NB: we're *not* doing a ScheduleBufferTagForWriteback here,
887  * although we're essentially performing a write. At least on Linux
888  * doing so defeats the 'delayed allocation' mechanism, leading to
889  * increased file fragmentation.
890  */
891  }
892  else
893  {
894  /*
895  * Read in the page, unless the caller intends to overwrite it and
896  * just wants us to allocate a buffer.
897  */
898  if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
899  MemSet((char *) bufBlock, 0, BLCKSZ);
900  else
901  {
902  instr_time io_start,
903  io_time;
904 
905  if (track_io_timing)
906  INSTR_TIME_SET_CURRENT(io_start);
907 
908  smgrread(smgr, forkNum, blockNum, (char *) bufBlock);
909 
910  if (track_io_timing)
911  {
912  INSTR_TIME_SET_CURRENT(io_time);
913  INSTR_TIME_SUBTRACT(io_time, io_start);
914  pgstat_count_buffer_read_time(INSTR_TIME_GET_MICROSEC(io_time));
915  INSTR_TIME_ADD(pgBufferUsage.blk_read_time, io_time);
916  }
917 
918  /* check for garbage data */
919  if (!PageIsVerified((Page) bufBlock, blockNum))
920  {
921  if (mode == RBM_ZERO_ON_ERROR || zero_damaged_pages)
922  {
923  ereport(WARNING,
924  (errcode(ERRCODE_DATA_CORRUPTED),
925  errmsg("invalid page in block %u of relation %s; zeroing out page",
926  blockNum,
927  relpath(smgr->smgr_rnode, forkNum))));
928  MemSet((char *) bufBlock, 0, BLCKSZ);
929  }
930  else
931  ereport(ERROR,
932  (errcode(ERRCODE_DATA_CORRUPTED),
933  errmsg("invalid page in block %u of relation %s",
934  blockNum,
935  relpath(smgr->smgr_rnode, forkNum))));
936  }
937  }
938  }
939 
940  /*
941  * In RBM_ZERO_AND_LOCK mode, grab the buffer content lock before marking
942  * the page as valid, to make sure that no other backend sees the zeroed
943  * page before the caller has had a chance to initialize it.
944  *
945  * Since no-one else can be looking at the page contents yet, there is no
946  * difference between an exclusive lock and a cleanup-strength lock. (Note
947  * that we cannot use LockBuffer() or LockBufferForCleanup() here, because
948  * they assert that the buffer is already valid.)
949  */
950  if ((mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) &&
951  !isLocalBuf)
952  {
953  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_EXCLUSIVE);
954  }
955 
956  if (isLocalBuf)
957  {
958  /* Only need to adjust flags */
959  uint32 buf_state = pg_atomic_read_u32(&bufHdr->state);
960 
961  buf_state |= BM_VALID;
962  pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
963  }
964  else
965  {
966  /* Set BM_VALID, terminate IO, and wake up any waiters */
967  TerminateBufferIO(bufHdr, false, BM_VALID);
968  }
969 
970  VacuumPageMiss++;
971  if (VacuumCostActive)
972  VacuumCostBalance += VacuumCostPageMiss;
973 
974  TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
975  smgr->smgr_rnode.node.spcNode,
976  smgr->smgr_rnode.node.dbNode,
977  smgr->smgr_rnode.node.relNode,
978  smgr->smgr_rnode.backend,
979  isExtend,
980  found);
981 
982  return BufferDescriptorGetBuffer(bufHdr);
983 }
984 
985 /*
986  * BufferAlloc -- subroutine for ReadBuffer. Handles lookup of a shared
987  * buffer. If no buffer exists already, selects a replacement
988  * victim and evicts the old page, but does NOT read in new page.
989  *
990  * "strategy" can be a buffer replacement strategy object, or NULL for
991  * the default strategy. The selected buffer's usage_count is advanced when
992  * using the default strategy, but otherwise possibly not (see PinBuffer).
993  *
994  * The returned buffer is pinned and is already marked as holding the
995  * desired page. If it already did have the desired page, *foundPtr is
996  * set true. Otherwise, *foundPtr is set false and the buffer is marked
997  * as IO_IN_PROGRESS; ReadBuffer will now need to do I/O to fill it.
998  *
999  * *foundPtr is actually redundant with the buffer's BM_VALID flag, but
1000  * we keep it for simplicity in ReadBuffer.
1001  *
1002  * No locks are held either at entry or exit.
1003  */
1004 static BufferDesc *
1005 BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
1006  BlockNumber blockNum,
1007  BufferAccessStrategy strategy,
1008  bool *foundPtr)
1009 {
1010  BufferTag newTag; /* identity of requested block */
1011  uint32 newHash; /* hash value for newTag */
1012  LWLock *newPartitionLock; /* buffer partition lock for it */
1013  BufferTag oldTag; /* previous identity of selected buffer */
1014  uint32 oldHash; /* hash value for oldTag */
1015  LWLock *oldPartitionLock; /* buffer partition lock for it */
1016  uint32 oldFlags;
1017  int buf_id;
1018  BufferDesc *buf;
1019  bool valid;
1020  uint32 buf_state;
1021 
1022  /* create a tag so we can lookup the buffer */
1023  INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
1024 
1025  /* determine its hash code and partition lock ID */
1026  newHash = BufTableHashCode(&newTag);
1027  newPartitionLock = BufMappingPartitionLock(newHash);
1028 
1029  /* see if the block is in the buffer pool already */
1030  LWLockAcquire(newPartitionLock, LW_SHARED);
1031  buf_id = BufTableLookup(&newTag, newHash);
1032  if (buf_id >= 0)
1033  {
1034  /*
1035  * Found it. Now, pin the buffer so no one can steal it from the
1036  * buffer pool, and check to see if the correct data has been loaded
1037  * into the buffer.
1038  */
1039  buf = GetBufferDescriptor(buf_id);
1040 
1041  valid = PinBuffer(buf, strategy);
1042 
1043  /* Can release the mapping lock as soon as we've pinned it */
1044  LWLockRelease(newPartitionLock);
1045 
1046  *foundPtr = true;
1047 
1048  if (!valid)
1049  {
1050  /*
1051  * We can only get here if (a) someone else is still reading in
1052  * the page, or (b) a previous read attempt failed. We have to
1053  * wait for any active read attempt to finish, and then set up our
1054  * own read attempt if the page is still not BM_VALID.
1055  * StartBufferIO does it all.
1056  */
1057  if (StartBufferIO(buf, true))
1058  {
1059  /*
1060  * If we get here, previous attempts to read the buffer must
1061  * have failed ... but we shall bravely try again.
1062  */
1063  *foundPtr = false;
1064  }
1065  }
1066 
1067  return buf;
1068  }
1069 
1070  /*
1071  * Didn't find it in the buffer pool. We'll have to initialize a new
1072  * buffer. Remember to unlock the mapping lock while doing the work.
1073  */
1074  LWLockRelease(newPartitionLock);
1075 
1076  /* Loop here in case we have to try another victim buffer */
1077  for (;;)
1078  {
1079  /*
1080  * Ensure, while the spinlock's not yet held, that there's a free
1081  * refcount entry.
1082  */
1083  ReservePrivateRefCountEntry();
1084 
1085  /*
1086  * Select a victim buffer. The buffer is returned with its header
1087  * spinlock still held!
1088  */
1089  buf = StrategyGetBuffer(strategy, &buf_state);
1090 
1091  Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 0);
1092 
1093  /* Must copy buffer flags while we still hold the spinlock */
1094  oldFlags = buf_state & BUF_FLAG_MASK;
1095 
1096  /* Pin the buffer and then release the buffer spinlock */
1097  PinBuffer_Locked(buf);
1098 
1099  /*
1100  * If the buffer was dirty, try to write it out. There is a race
1101  * condition here, in that someone might dirty it after we released it
1102  * above, or even while we are writing it out (since our share-lock
1103  * won't prevent hint-bit updates). We will recheck the dirty bit
1104  * after re-locking the buffer header.
1105  */
1106  if (oldFlags & BM_DIRTY)
1107  {
1108  /*
1109  * We need a share-lock on the buffer contents to write it out
1110  * (else we might write invalid data, eg because someone else is
1111  * compacting the page contents while we write). We must use a
1112  * conditional lock acquisition here to avoid deadlock. Even
1113  * though the buffer was not pinned (and therefore surely not
1114  * locked) when StrategyGetBuffer returned it, someone else could
1115  * have pinned and exclusive-locked it by the time we get here. If
1116  * we try to get the lock unconditionally, we'd block waiting for
1117  * them; if they later block waiting for us, deadlock ensues.
1118  * (This has been observed to happen when two backends are both
1119  * trying to split btree index pages, and the second one just
1120  * happens to be trying to split the page the first one got from
1121  * StrategyGetBuffer.)
1122  */
1123  if (LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
1124  LW_SHARED))
1125  {
1126  /*
1127  * If using a nondefault strategy, and writing the buffer
1128  * would require a WAL flush, let the strategy decide whether
1129  * to go ahead and write/reuse the buffer or to choose another
1130  * victim. We need lock to inspect the page LSN, so this
1131  * can't be done inside StrategyGetBuffer.
1132  */
1133  if (strategy != NULL)
1134  {
1135  XLogRecPtr lsn;
1136 
1137  /* Read the LSN while holding buffer header lock */
1138  buf_state = LockBufHdr(buf);
1139  lsn = BufferGetLSN(buf);
1140  UnlockBufHdr(buf, buf_state);
1141 
1142  if (XLogNeedsFlush(lsn) &&
1143  StrategyRejectBuffer(strategy, buf))
1144  {
1145  /* Drop lock/pin and loop around for another buffer */
1146  LWLockRelease(BufferDescriptorGetContentLock(buf));
1147  UnpinBuffer(buf, true);
1148  continue;
1149  }
1150  }
1151 
1152  /* OK, do the I/O */
1153  TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_START(forkNum, blockNum,
1154  smgr->smgr_rnode.node.spcNode,
1155  smgr->smgr_rnode.node.dbNode,
1156  smgr->smgr_rnode.node.relNode);
1157 
1158  FlushBuffer(buf, NULL);
1159  LWLockRelease(BufferDescriptorGetContentLock(buf));
1160 
1161  ScheduleBufferTagForWriteback(&BackendWritebackContext,
1162  &buf->tag);
1163 
1164  TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_DONE(forkNum, blockNum,
1165  smgr->smgr_rnode.node.spcNode,
1166  smgr->smgr_rnode.node.dbNode,
1167  smgr->smgr_rnode.node.relNode);
1168  }
1169  else
1170  {
1171  /*
1172  * Someone else has locked the buffer, so give it up and loop
1173  * back to get another one.
1174  */
1175  UnpinBuffer(buf, true);
1176  continue;
1177  }
1178  }
1179 
1180  /*
1181  * To change the association of a valid buffer, we'll need to have
1182  * exclusive lock on both the old and new mapping partitions.
1183  */
1184  if (oldFlags & BM_TAG_VALID)
1185  {
1186  /*
1187  * Need to compute the old tag's hashcode and partition lock ID.
1188  * XXX is it worth storing the hashcode in BufferDesc so we need
1189  * not recompute it here? Probably not.
1190  */
1191  oldTag = buf->tag;
1192  oldHash = BufTableHashCode(&oldTag);
1193  oldPartitionLock = BufMappingPartitionLock(oldHash);
1194 
1195  /*
1196  * Must lock the lower-numbered partition first to avoid
1197  * deadlocks.
1198  */
1199  if (oldPartitionLock < newPartitionLock)
1200  {
1201  LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
1202  LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1203  }
1204  else if (oldPartitionLock > newPartitionLock)
1205  {
1206  LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1207  LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
1208  }
1209  else
1210  {
1211  /* only one partition, only one lock */
1212  LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1213  }
1214  }
1215  else
1216  {
1217  /* if it wasn't valid, we need only the new partition */
1218  LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1219  /* remember we have no old-partition lock or tag */
1220  oldPartitionLock = NULL;
1221  /* keep the compiler quiet about uninitialized variables */
1222  oldHash = 0;
1223  }
1224 
1225  /*
1226  * Try to make a hashtable entry for the buffer under its new tag.
1227  * This could fail because while we were writing someone else
1228  * allocated another buffer for the same block we want to read in.
1229  * Note that we have not yet removed the hashtable entry for the old
1230  * tag.
1231  */
1232  buf_id = BufTableInsert(&newTag, newHash, buf->buf_id);
1233 
1234  if (buf_id >= 0)
1235  {
1236  /*
1237  * Got a collision. Someone has already done what we were about to
1238  * do. We'll just handle this as if it were found in the buffer
1239  * pool in the first place. First, give up the buffer we were
1240  * planning to use.
1241  */
1242  UnpinBuffer(buf, true);
1243 
1244  /* Can give up that buffer's mapping partition lock now */
1245  if (oldPartitionLock != NULL &&
1246  oldPartitionLock != newPartitionLock)
1247  LWLockRelease(oldPartitionLock);
1248 
1249  /* remaining code should match code at top of routine */
1250 
1251  buf = GetBufferDescriptor(buf_id);
1252 
1253  valid = PinBuffer(buf, strategy);
1254 
1255  /* Can release the mapping lock as soon as we've pinned it */
1256  LWLockRelease(newPartitionLock);
1257 
1258  *foundPtr = true;
1259 
1260  if (!valid)
1261  {
1262  /*
1263  * We can only get here if (a) someone else is still reading
1264  * in the page, or (b) a previous read attempt failed. We
1265  * have to wait for any active read attempt to finish, and
1266  * then set up our own read attempt if the page is still not
1267  * BM_VALID. StartBufferIO does it all.
1268  */
1269  if (StartBufferIO(buf, true))
1270  {
1271  /*
1272  * If we get here, previous attempts to read the buffer
1273  * must have failed ... but we shall bravely try again.
1274  */
1275  *foundPtr = false;
1276  }
1277  }
1278 
1279  return buf;
1280  }
1281 
1282  /*
1283  * Need to lock the buffer header too in order to change its tag.
1284  */
1285  buf_state = LockBufHdr(buf);
1286 
1287  /*
1288  * Somebody could have pinned or re-dirtied the buffer while we were
1289  * doing the I/O and making the new hashtable entry. If so, we can't
1290  * recycle this buffer; we must undo everything we've done and start
1291  * over with a new victim buffer.
1292  */
1293  oldFlags = buf_state & BUF_FLAG_MASK;
1294  if (BUF_STATE_GET_REFCOUNT(buf_state) == 1 && !(oldFlags & BM_DIRTY))
1295  break;
1296 
1297  UnlockBufHdr(buf, buf_state);
1298  BufTableDelete(&newTag, newHash);
1299  if (oldPartitionLock != NULL &&
1300  oldPartitionLock != newPartitionLock)
1301  LWLockRelease(oldPartitionLock);
1302  LWLockRelease(newPartitionLock);
1303  UnpinBuffer(buf, true);
1304  }
1305 
1306  /*
1307  * Okay, it's finally safe to rename the buffer.
1308  *
1309  * Clearing BM_VALID here is necessary, clearing the dirtybits is just
1310  * paranoia. We also reset the usage_count since any recency of use of
1311  * the old content is no longer relevant. (The usage_count starts out at
1312  * 1 so that the buffer can survive one clock-sweep pass.)
1313  *
1314  * Make sure BM_PERMANENT is set for buffers that must be written at every
1315  * checkpoint. Unlogged buffers only need to be written at shutdown
1316  * checkpoints, except for their "init" forks, which need to be treated
1317  * just like permanent relations.
1318  */
1319  buf->tag = newTag;
1320  buf_state &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED |
1321  BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT |
1322  BUF_USAGECOUNT_MASK);
1323  if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM)
1324  buf_state |= BM_TAG_VALID | BM_PERMANENT | BUF_USAGECOUNT_ONE;
1325  else
1326  buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
1327 
1328  UnlockBufHdr(buf, buf_state);
1329 
1330  if (oldPartitionLock != NULL)
1331  {
1332  BufTableDelete(&oldTag, oldHash);
1333  if (oldPartitionLock != newPartitionLock)
1334  LWLockRelease(oldPartitionLock);
1335  }
1336 
1337  LWLockRelease(newPartitionLock);
1338 
1339  /*
1340  * Buffer contents are currently invalid. Try to get the io_in_progress
1341  * lock. If StartBufferIO returns false, then someone else managed to
1342  * read it before we did, so there's nothing left for BufferAlloc() to do.
1343  */
1344  if (StartBufferIO(buf, true))
1345  *foundPtr = false;
1346  else
1347  *foundPtr = true;
1348 
1349  return buf;
1350 }
1351 
1352 /*
1353  * InvalidateBuffer -- mark a shared buffer invalid and return it to the
1354  * freelist.
1355  *
1356  * The buffer header spinlock must be held at entry. We drop it before
1357  * returning. (This is sane because the caller must have locked the
1358  * buffer in order to be sure it should be dropped.)
1359  *
1360  * This is used only in contexts such as dropping a relation. We assume
1361  * that no other backend could possibly be interested in using the page,
1362  * so the only reason the buffer might be pinned is if someone else is
1363  * trying to write it out. We have to let them finish before we can
1364  * reclaim the buffer.
1365  *
1366  * The buffer could get reclaimed by someone else while we are waiting
1367  * to acquire the necessary locks; if so, don't mess it up.
1368  */
1369 static void
1370 InvalidateBuffer(BufferDesc *buf)
1371 {
1372  BufferTag oldTag;
1373  uint32 oldHash; /* hash value for oldTag */
1374  LWLock *oldPartitionLock; /* buffer partition lock for it */
1375  uint32 oldFlags;
1376  uint32 buf_state;
1377 
1378  /* Save the original buffer tag before dropping the spinlock */
1379  oldTag = buf->tag;
1380 
1381  buf_state = pg_atomic_read_u32(&buf->state);
1382  Assert(buf_state & BM_LOCKED);
1383  UnlockBufHdr(buf, buf_state);
1384 
1385  /*
1386  * Need to compute the old tag's hashcode and partition lock ID. XXX is it
1387  * worth storing the hashcode in BufferDesc so we need not recompute it
1388  * here? Probably not.
1389  */
1390  oldHash = BufTableHashCode(&oldTag);
1391  oldPartitionLock = BufMappingPartitionLock(oldHash);
1392 
1393 retry:
1394 
1395  /*
1396  * Acquire exclusive mapping lock in preparation for changing the buffer's
1397  * association.
1398  */
1399  LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
1400 
1401  /* Re-lock the buffer header */
1402  buf_state = LockBufHdr(buf);
1403 
1404  /* If it's changed while we were waiting for lock, do nothing */
1405  if (!BUFFERTAGS_EQUAL(buf->tag, oldTag))
1406  {
1407  UnlockBufHdr(buf, buf_state);
1408  LWLockRelease(oldPartitionLock);
1409  return;
1410  }
1411 
1412  /*
1413  * We assume the only reason for it to be pinned is that someone else is
1414  * flushing the page out. Wait for them to finish. (This could be an
1415  * infinite loop if the refcount is messed up... it would be nice to time
1416  * out after awhile, but there seems no way to be sure how many loops may
1417  * be needed. Note that if the other guy has pinned the buffer but not
1418  * yet done StartBufferIO, WaitIO will fall through and we'll effectively
1419  * be busy-looping here.)
1420  */
1421  if (BUF_STATE_GET_REFCOUNT(buf_state) != 0)
1422  {
1423  UnlockBufHdr(buf, buf_state);
1424  LWLockRelease(oldPartitionLock);
1425  /* safety check: should definitely not be our *own* pin */
1426  if (GetPrivateRefCount(BufferDescriptorGetBuffer(buf)) > 0)
1427  elog(ERROR, "buffer is pinned in InvalidateBuffer");
1428  WaitIO(buf);
1429  goto retry;
1430  }
1431 
1432  /*
1433  * Clear out the buffer's tag and flags. We must do this to ensure that
1434  * linear scans of the buffer array don't think the buffer is valid.
1435  */
1436  oldFlags = buf_state & BUF_FLAG_MASK;
1437  CLEAR_BUFFERTAG(buf->tag);
1438  buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
1439  UnlockBufHdr(buf, buf_state);
1440 
1441  /*
1442  * Remove the buffer from the lookup hashtable, if it was in there.
1443  */
1444  if (oldFlags & BM_TAG_VALID)
1445  BufTableDelete(&oldTag, oldHash);
1446 
1447  /*
1448  * Done with mapping lock.
1449  */
1450  LWLockRelease(oldPartitionLock);
1451 
1452  /*
1453  * Insert the buffer at the head of the list of free buffers.
1454  */
1455  StrategyFreeBuffer(buf);
1456 }
1457 
1458 /*
1459  * MarkBufferDirty
1460  *
1461  * Marks buffer contents as dirty (actual write happens later).
1462  *
1463  * Buffer must be pinned and exclusive-locked. (If caller does not hold
1464  * exclusive lock, then somebody could be in process of writing the buffer,
1465  * leading to risk of bad data written to disk.)
1466  */
1467 void
1468 MarkBufferDirty(Buffer buffer)
1469 {
1470  BufferDesc *bufHdr;
1471  uint32 buf_state;
1472  uint32 old_buf_state;
1473 
1474  if (!BufferIsValid(buffer))
1475  elog(ERROR, "bad buffer ID: %d", buffer);
1476 
1477  if (BufferIsLocal(buffer))
1478  {
1479  MarkLocalBufferDirty(buffer);
1480  return;
1481  }
1482 
1483  bufHdr = GetBufferDescriptor(buffer - 1);
1484 
1485  Assert(BufferIsPinned(buffer));
1486  Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
1487  LW_EXCLUSIVE));
1488 
1489  old_buf_state = pg_atomic_read_u32(&bufHdr->state);
1490  for (;;)
1491  {
1492  if (old_buf_state & BM_LOCKED)
1493  old_buf_state = WaitBufHdrUnlocked(bufHdr);
1494 
1495  buf_state = old_buf_state;
1496 
1497  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
1498  buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
1499 
1500  if (pg_atomic_compare_exchange_u32(&bufHdr->state, &old_buf_state,
1501  buf_state))
1502  break;
1503  }
1504 
1505  /*
1506  * If the buffer was not dirty already, do vacuum accounting.
1507  */
1508  if (!(old_buf_state & BM_DIRTY))
1509  {
1510  VacuumPageDirty++;
1511  pgBufferUsage.shared_blks_dirtied++;
1512  if (VacuumCostActive)
1513  VacuumCostBalance += VacuumCostPageDirty;
1514  }
1515 }
1516 
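/*
 * Editor's illustrative sketch (not part of bufmgr.c): MarkBufferDirty()
 * requires that this backend holds both a pin and the exclusive content
 * lock, as the comment above states.  Real modifications must also be
 * WAL-logged inside the critical section; that part is elided here, and the
 * function name is hypothetical.
 */
static void
dirty_page_example(Buffer buf)
{
	Assert(BufferIsValid(buf));		/* caller already holds a pin */

	LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

	START_CRIT_SECTION();
	/* ... change the page contents and emit WAL here ... */
	MarkBufferDirty(buf);
	END_CRIT_SECTION();

	LockBuffer(buf, BUFFER_LOCK_UNLOCK);
}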
1517 /*
1518  * ReleaseAndReadBuffer -- combine ReleaseBuffer() and ReadBuffer()
1519  *
1520  * Formerly, this saved one cycle of acquiring/releasing the BufMgrLock
1521  * compared to calling the two routines separately. Now it's mainly just
1522  * a convenience function. However, if the passed buffer is valid and
1523  * already contains the desired block, we just return it as-is; and that
1524  * does save considerable work compared to a full release and reacquire.
1525  *
1526  * Note: it is OK to pass buffer == InvalidBuffer, indicating that no old
1527  * buffer actually needs to be released. This case is the same as ReadBuffer,
1528  * but can save some tests in the caller.
1529  */
1530 Buffer
1531 ReleaseAndReadBuffer(Buffer buffer,
1532  Relation relation,
1533  BlockNumber blockNum)
1534 {
1535  ForkNumber forkNum = MAIN_FORKNUM;
1536  BufferDesc *bufHdr;
1537 
1538  if (BufferIsValid(buffer))
1539  {
1540  Assert(BufferIsPinned(buffer));
1541  if (BufferIsLocal(buffer))
1542  {
1543  bufHdr = GetLocalBufferDescriptor(-buffer - 1);
1544  if (bufHdr->tag.blockNum == blockNum &&
1545  RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node) &&
1546  bufHdr->tag.forkNum == forkNum)
1547  return buffer;
1548  ResourceOwnerForgetBuffer(CurrentResourceOwner, buffer);
1549  LocalRefCount[-buffer - 1]--;
1550  }
1551  else
1552  {
1553  bufHdr = GetBufferDescriptor(buffer - 1);
1554  /* we have pin, so it's ok to examine tag without spinlock */
1555  if (bufHdr->tag.blockNum == blockNum &&
1556  RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node) &&
1557  bufHdr->tag.forkNum == forkNum)
1558  return buffer;
1559  UnpinBuffer(bufHdr, true);
1560  }
1561  }
1562 
1563  return ReadBuffer(relation, blockNum);
1564 }
1565 
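/*
 * Editor's illustrative sketch (not part of bufmgr.c): walking a range of
 * blocks with ReleaseAndReadBuffer(); passing the previously returned buffer
 * (or InvalidBuffer on the first call) lets the routine skip the release and
 * re-read when the same block is requested again.  Names are hypothetical.
 */
static void
walk_blocks_example(Relation rel, BlockNumber nblocks)
{
	Buffer		buf = InvalidBuffer;
	BlockNumber blkno;

	for (blkno = 0; blkno < nblocks; blkno++)
	{
		buf = ReleaseAndReadBuffer(buf, rel, blkno);
		/* ... inspect the pinned page (take a content lock first) ... */
	}

	if (BufferIsValid(buf))
		ReleaseBuffer(buf);
}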
1566 /*
1567  * PinBuffer -- make buffer unavailable for replacement.
1568  *
1569  * For the default access strategy, the buffer's usage_count is incremented
1570  * when we first pin it; for other strategies we just make sure the usage_count
1571  * isn't zero. (The idea of the latter is that we don't want synchronized
1572  * heap scans to inflate the count, but we need it to not be zero to discourage
1573  * other backends from stealing buffers from our ring. As long as we cycle
1574  * through the ring faster than the global clock-sweep cycles, buffers in
1575  * our ring won't be chosen as victims for replacement by other backends.)
1576  *
1577  * This should be applied only to shared buffers, never local ones.
1578  *
1579  * Since buffers are pinned/unpinned very frequently, pin buffers without
1580  * taking the buffer header lock; instead update the state variable in loop of
1581  * CAS operations. Hopefully it's just a single CAS.
1582  *
1583  * Note that ResourceOwnerEnlargeBuffers must have been done already.
1584  *
1585  * Returns true if buffer is BM_VALID, else false. This provision allows
1586  * some callers to avoid an extra spinlock cycle.
1587  */
1588 static bool
1589 PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
1590 {
1591  Buffer b = BufferDescriptorGetBuffer(buf);
1592  bool result;
1593  PrivateRefCountEntry *ref;
1594 
1595  ref = GetPrivateRefCountEntry(b, true);
1596 
1597  if (ref == NULL)
1598  {
1599  uint32 buf_state;
1600  uint32 old_buf_state;
1601 
1602  ReservePrivateRefCountEntry();
1603  ref = NewPrivateRefCountEntry(b);
1604 
1605  old_buf_state = pg_atomic_read_u32(&buf->state);
1606  for (;;)
1607  {
1608  if (old_buf_state & BM_LOCKED)
1609  old_buf_state = WaitBufHdrUnlocked(buf);
1610 
1611  buf_state = old_buf_state;
1612 
1613  /* increase refcount */
1614  buf_state += BUF_REFCOUNT_ONE;
1615 
1616  if (strategy == NULL)
1617  {
1618  /* Default case: increase usagecount unless already max. */
1619  if (BUF_STATE_GET_USAGECOUNT(buf_state) != BM_MAX_USAGE_COUNT)
1620  buf_state += BUF_USAGECOUNT_ONE;
1621  }
1622  else
1623  {
1624  /*
1625  * Ring buffers shouldn't evict others from pool. Thus we
1626  * don't make usagecount more than 1.
1627  */
1628  if (BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
1629  buf_state += BUF_USAGECOUNT_ONE;
1630  }
1631 
1632  if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
1633  buf_state))
1634  {
1635  result = (buf_state & BM_VALID) != 0;
1636  break;
1637  }
1638  }
1639  }
1640  else
1641  {
1642  /* If we previously pinned the buffer, it must surely be valid */
1643  result = true;
1644  }
1645 
1646  ref->refcount++;
1647  Assert(ref->refcount > 0);
1648  ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
1649  return result;
1650 }
1651 
1652 /*
1653  * PinBuffer_Locked -- as above, but caller already locked the buffer header.
1654  * The spinlock is released before return.
1655  *
1656  * As this function is called with the spinlock held, the caller has to
1657  * previously call ReservePrivateRefCountEntry().
1658  *
1659  * Currently, no callers of this function want to modify the buffer's
1660  * usage_count at all, so there's no need for a strategy parameter.
1661  * Also we don't bother with a BM_VALID test (the caller could check that for
1662  * itself).
1663  *
1664  * Also all callers only ever use this function when it's known that the
1665  * buffer can't have a preexisting pin by this backend. That allows us to skip
1666  * searching the private refcount array & hash, which is a boon, because the
1667  * spinlock is still held.
1668  *
1669  * Note: use of this routine is frequently mandatory, not just an optimization
1670  * to save a spin lock/unlock cycle, because we need to pin a buffer before
1671  * its state can change under us.
1672  */
1673 static void
1674 PinBuffer_Locked(BufferDesc *buf)
1675 {
1676  Buffer b;
1677  PrivateRefCountEntry *ref;
1678  uint32 buf_state;
1679 
1680  /*
1681  * As explained, we don't expect any preexisting pins. That allows us to
1682  * manipulate the PrivateRefCount after releasing the spinlock.
1683  */
1684  Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
1685 
1686  /*
1687  * Since we hold the buffer spinlock, we can update the buffer state and
1688  * release the lock in one operation.
1689  */
1690  buf_state = pg_atomic_read_u32(&buf->state);
1691  Assert(buf_state & BM_LOCKED);
1692  buf_state += BUF_REFCOUNT_ONE;
1693  UnlockBufHdr(buf, buf_state);
1694 
1695  b = BufferDescriptorGetBuffer(buf);
1696 
1697  ref = NewPrivateRefCountEntry(b);
1698  ref->refcount++;
1699 
1700  ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
1701 }
1702 
1703 /*
1704  * UnpinBuffer -- make buffer available for replacement.
1705  *
1706  * This should be applied only to shared buffers, never local ones.
1707  *
1708  * Most but not all callers want CurrentResourceOwner to be adjusted.
1709  * Those that don't should pass fixOwner = false.
1710  */
1711 static void
1712 UnpinBuffer(BufferDesc *buf, bool fixOwner)
1713 {
1714  PrivateRefCountEntry *ref;
1715  Buffer b = BufferDescriptorGetBuffer(buf);
1716 
1717  /* not moving as we're likely deleting it soon anyway */
1718  ref = GetPrivateRefCountEntry(b, false);
1719  Assert(ref != NULL);
1720 
1721  if (fixOwner)
1722  ResourceOwnerForgetBuffer(CurrentResourceOwner, b);
1723 
1724  Assert(ref->refcount > 0);
1725  ref->refcount--;
1726  if (ref->refcount == 0)
1727  {
1728  uint32 buf_state;
1729  uint32 old_buf_state;
1730 
1731  /* I'd better not still hold any locks on the buffer */
1732  Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf)));
1733  Assert(!LWLockHeldByMe(BufferDescriptorGetIOLock(buf)));
1734 
1735  /*
1736  * Decrement the shared reference count.
1737  *
1738  * Since buffer spinlock holder can update status using just write,
1739  * it's not safe to use atomic decrement here; thus use a CAS loop.
1740  */
1741  old_buf_state = pg_atomic_read_u32(&buf->state);
1742  for (;;)
1743  {
1744  if (old_buf_state & BM_LOCKED)
1745  old_buf_state = WaitBufHdrUnlocked(buf);
1746 
1747  buf_state = old_buf_state;
1748 
1749  buf_state -= BUF_REFCOUNT_ONE;
1750 
1751  if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
1752  buf_state))
1753  break;
1754  }
1755 
1756  /* Support LockBufferForCleanup() */
1757  if (buf_state & BM_PIN_COUNT_WAITER)
1758  {
1759  /*
1760  * Acquire the buffer header lock, re-check that there's a waiter.
1761  * Another backend could have unpinned this buffer, and already
1762  * woken up the waiter. There's no danger of the buffer being
1763  * replaced after we unpinned it above, as it's pinned by the
1764  * waiter.
1765  */
1766  buf_state = LockBufHdr(buf);
1767 
1768  if ((buf_state & BM_PIN_COUNT_WAITER) &&
1769  BUF_STATE_GET_REFCOUNT(buf_state) == 1)
1770  {
1771  /* we just released the last pin other than the waiter's */
1772  int wait_backend_pid = buf->wait_backend_pid;
1773 
1774  buf_state &= ~BM_PIN_COUNT_WAITER;
1775  UnlockBufHdr(buf, buf_state);
1776  ProcSendSignal(wait_backend_pid);
1777  }
1778  else
1779  UnlockBufHdr(buf, buf_state);
1780  }
1781  ForgetPrivateRefCountEntry(ref);
1782  }
1783 }
1784 
1785 /*
1786  * BufferSync -- Write out all dirty buffers in the pool.
1787  *
1788  * This is called at checkpoint time to write out all dirty shared buffers.
1789  * The checkpoint request flags should be passed in. If CHECKPOINT_IMMEDIATE
1790  * is set, we disable delays between writes; if CHECKPOINT_IS_SHUTDOWN,
1791  * CHECKPOINT_END_OF_RECOVERY or CHECKPOINT_FLUSH_ALL is set, we write even
1792  * unlogged buffers, which are otherwise skipped. The remaining flags
1793  * currently have no effect here.
1794  */
1795 static void
1796 BufferSync(int flags)
1797 {
1798  uint32 buf_state;
1799  int buf_id;
1800  int num_to_scan;
1801  int num_spaces;
1802  int num_processed;
1803  int num_written;
1804  CkptTsStatus *per_ts_stat = NULL;
1805  Oid last_tsid;
1806  binaryheap *ts_heap;
1807  int i;
1808  int mask = BM_DIRTY;
1809  WritebackContext wb_context;
1810 
1811  /* Make sure we can handle the pin inside SyncOneBuffer */
1812  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
1813 
1814  /*
1815  * Unless this is a shutdown checkpoint or we have been explicitly told,
1816  * we write only permanent, dirty buffers. But at shutdown or end of
1817  * recovery, we write all dirty buffers.
1818  */
1819  if (!((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
1820  CHECKPOINT_FLUSH_ALL))))
1821  mask |= BM_PERMANENT;
1822 
1823  /*
1824  * Loop over all buffers, and mark the ones that need to be written with
1825  * BM_CHECKPOINT_NEEDED. Count them as we go (num_to_scan), so that we
1826  * can estimate how much work needs to be done.
1827  *
1828  * This allows us to write only those pages that were dirty when the
1829  * checkpoint began, and not those that get dirtied while it proceeds.
1830  * Whenever a page with BM_CHECKPOINT_NEEDED is written out, either by us
1831  * later in this function, or by normal backends or the bgwriter cleaning
1832  * scan, the flag is cleared. Any buffer dirtied after this point won't
1833  * have the flag set.
1834  *
1835  * Note that if we fail to write some buffer, we may leave buffers with
1836  * BM_CHECKPOINT_NEEDED still set. This is OK since any such buffer would
1837  * certainly need to be written for the next checkpoint attempt, too.
1838  */
1839  num_to_scan = 0;
1840  for (buf_id = 0; buf_id < NBuffers; buf_id++)
1841  {
1842  BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
1843 
1844  /*
1845  * Header spinlock is enough to examine BM_DIRTY, see comment in
1846  * SyncOneBuffer.
1847  */
1848  buf_state = LockBufHdr(bufHdr);
1849 
1850  if ((buf_state & mask) == mask)
1851  {
1852  CkptSortItem *item;
1853 
1854  buf_state |= BM_CHECKPOINT_NEEDED;
1855 
1856  item = &CkptBufferIds[num_to_scan++];
1857  item->buf_id = buf_id;
1858  item->tsId = bufHdr->tag.rnode.spcNode;
1859  item->relNode = bufHdr->tag.rnode.relNode;
1860  item->forkNum = bufHdr->tag.forkNum;
1861  item->blockNum = bufHdr->tag.blockNum;
1862  }
1863 
1864  UnlockBufHdr(bufHdr, buf_state);
1865 
1866  /* Check for barrier events in case NBuffers is large. */
1867  if (ProcSignalBarrierPending)
1868  ProcessProcSignalBarrier();
1869  }
1870 
1871  if (num_to_scan == 0)
1872  return; /* nothing to do */
1873 
1874  WritebackContextInit(&wb_context, &checkpoint_flush_after);
1875 
1876  TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_scan);
1877 
1878  /*
1879  * Sort buffers that need to be written to reduce the likelihood of random
1880  * IO. The sorting is also important for the implementation of balancing
1881  * writes between tablespaces. Without balancing writes we'd potentially
1882  * end up writing to the tablespaces one-by-one; possibly overloading the
1883  * underlying system.
1884  */
1885  qsort(CkptBufferIds, num_to_scan, sizeof(CkptSortItem),
1886  ckpt_buforder_comparator);
1887 
1888  num_spaces = 0;
1889 
1890  /*
1891  * Allocate progress status for each tablespace with buffers that need to
1892  * be flushed. This requires the to-be-flushed array to be sorted.
1893  */
1894  last_tsid = InvalidOid;
1895  for (i = 0; i < num_to_scan; i++)
1896  {
1897  CkptTsStatus *s;
1898  Oid cur_tsid;
1899 
1900  cur_tsid = CkptBufferIds[i].tsId;
1901 
1902  /*
1903  * Grow array of per-tablespace status structs, every time a new
1904  * tablespace is found.
1905  */
1906  if (last_tsid == InvalidOid || last_tsid != cur_tsid)
1907  {
1908  Size sz;
1909 
1910  num_spaces++;
1911 
1912  /*
1913  * Not worth adding grow-by-power-of-2 logic here - even with a
1914  * few hundred tablespaces this should be fine.
1915  */
1916  sz = sizeof(CkptTsStatus) * num_spaces;
1917 
1918  if (per_ts_stat == NULL)
1919  per_ts_stat = (CkptTsStatus *) palloc(sz);
1920  else
1921  per_ts_stat = (CkptTsStatus *) repalloc(per_ts_stat, sz);
1922 
1923  s = &per_ts_stat[num_spaces - 1];
1924  memset(s, 0, sizeof(*s));
1925  s->tsId = cur_tsid;
1926 
1927  /*
1928  * The first buffer in this tablespace. As CkptBufferIds is sorted
1929  * by tablespace all (s->num_to_scan) buffers in this tablespace
1930  * will follow afterwards.
1931  */
1932  s->index = i;
1933 
1934  /*
1935  * progress_slice will be determined once we know how many buffers
1936  * are in each tablespace, i.e. after this loop.
1937  */
1938 
1939  last_tsid = cur_tsid;
1940  }
1941  else
1942  {
1943  s = &per_ts_stat[num_spaces - 1];
1944  }
1945 
1946  s->num_to_scan++;
1947 
1948  /* Check for barrier events. */
1949  if (ProcSignalBarrierPending)
1950  ProcessProcSignalBarrier();
1951  }
1952 
1953  Assert(num_spaces > 0);
1954 
1955  /*
1956  * Build a min-heap over the write-progress in the individual tablespaces,
1957  * and compute how large a portion of the total progress a single
1958  * processed buffer is.
1959  */
1960  ts_heap = binaryheap_allocate(num_spaces,
1961  ts_ckpt_progress_comparator,
1962  NULL);
1963 
1964  for (i = 0; i < num_spaces; i++)
1965  {
1966  CkptTsStatus *ts_stat = &per_ts_stat[i];
1967 
1968  ts_stat->progress_slice = (float8) num_to_scan / ts_stat->num_to_scan;
1969 
1970  binaryheap_add_unordered(ts_heap, PointerGetDatum(ts_stat));
1971  }
1972 
1973  binaryheap_build(ts_heap);
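/*
 * Editor's worked example (not in the original source): suppose num_to_scan
 * is 1000 buffers in total, 900 of them in tablespace A and 100 in
 * tablespace B.  Then A's progress_slice is 1000/900 (about 1.11) and B's is
 * 1000/100 = 10.  After each has written 50 buffers, A's progress is about 55
 * while B's is 500, so the min-heap keeps handing out A until it catches up.
 * Both tablespaces reach the common target of 1000 at roughly the same time,
 * which is exactly the balancing described in the comment above.
 */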
1974 
1975  /*
1976  * Iterate through to-be-checkpointed buffers and write the ones (still)
1977  * marked with BM_CHECKPOINT_NEEDED. The writes are balanced between
1978  * tablespaces; otherwise the sorting would lead to only one tablespace
1979  * receiving writes at a time, making inefficient use of the hardware.
1980  */
1981  num_processed = 0;
1982  num_written = 0;
1983  while (!binaryheap_empty(ts_heap))
1984  {
1985  BufferDesc *bufHdr = NULL;
1986  CkptTsStatus *ts_stat = (CkptTsStatus *)
1987  DatumGetPointer(binaryheap_first(ts_heap));
1988 
1989  buf_id = CkptBufferIds[ts_stat->index].buf_id;
1990  Assert(buf_id != -1);
1991 
1992  bufHdr = GetBufferDescriptor(buf_id);
1993 
1994  num_processed++;
1995 
1996  /*
1997  * We don't need to acquire the lock here, because we're only looking
1998  * at a single bit. It's possible that someone else writes the buffer
1999  * and clears the flag right after we check, but that doesn't matter
2000  * since SyncOneBuffer will then do nothing. However, there is a
2001  * further race condition: it's conceivable that between the time we
2002  * examine the bit here and the time SyncOneBuffer acquires the lock,
2003  * someone else not only wrote the buffer but replaced it with another
2004  * page and dirtied it. In that improbable case, SyncOneBuffer will
2005  * write the buffer though we didn't need to. It doesn't seem worth
2006  * guarding against this, though.
2007  */
2008  if (pg_atomic_read_u32(&bufHdr->state) & BM_CHECKPOINT_NEEDED)
2009  {
2010  if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
2011  {
2012  TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
2013  BgWriterStats.m_buf_written_checkpoints++;
2014  num_written++;
2015  }
2016  }
2017 
2018  /*
2019  * Measure progress independent of actually having to flush the buffer
2020  * - otherwise writing becomes unbalanced.
2021  */
2022  ts_stat->progress += ts_stat->progress_slice;
2023  ts_stat->num_scanned++;
2024  ts_stat->index++;
2025 
2026  /* Have all the buffers from the tablespace been processed? */
2027  if (ts_stat->num_scanned == ts_stat->num_to_scan)
2028  {
2029  binaryheap_remove_first(ts_heap);
2030  }
2031  else
2032  {
2033  /* update heap with the new progress */
2034  binaryheap_replace_first(ts_heap, PointerGetDatum(ts_stat));
2035  }
2036 
2037  /*
2038  * Sleep to throttle our I/O rate.
2039  *
2040  * (This will check for barrier events even if it doesn't sleep.)
2041  */
2042  CheckpointWriteDelay(flags, (double) num_processed / num_to_scan);
2043  }
2044 
2045  /* issue all pending flushes */
2046  IssuePendingWritebacks(&wb_context);
2047 
2048  pfree(per_ts_stat);
2049  per_ts_stat = NULL;
2050  binaryheap_free(ts_heap);
2051 
2052  /*
2053  * Update checkpoint statistics. As noted above, this doesn't include
2054  * buffers written by other backends or bgwriter scan.
2055  */
2056  CheckpointStats.ckpt_bufs_written += num_written;
2057 
2058  TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_scan);
2059 }
2060 
2061 /*
2062  * BgBufferSync -- Write out some dirty buffers in the pool.
2063  *
2064  * This is called periodically by the background writer process.
2065  *
2066  * Returns true if it's appropriate for the bgwriter process to go into
2067  * low-power hibernation mode. (This happens if the strategy clock sweep
2068  * has been "lapped" and no buffer allocations have occurred recently,
2069  * or if the bgwriter has been effectively disabled by setting
2070  * bgwriter_lru_maxpages to 0.)
2071  */
2072 bool
2073 BgBufferSync(WritebackContext *wb_context)
2074 {
2075  /* info obtained from freelist.c */
2076  int strategy_buf_id;
2077  uint32 strategy_passes;
2078  uint32 recent_alloc;
2079 
2080  /*
2081  * Information saved between calls so we can determine the strategy
2082  * point's advance rate and avoid scanning already-cleaned buffers.
2083  */
2084  static bool saved_info_valid = false;
2085  static int prev_strategy_buf_id;
2086  static uint32 prev_strategy_passes;
2087  static int next_to_clean;
2088  static uint32 next_passes;
2089 
2090  /* Moving averages of allocation rate and clean-buffer density */
2091  static float smoothed_alloc = 0;
2092  static float smoothed_density = 10.0;
2093 
2094  /* Potentially these could be tunables, but for now, not */
2095  float smoothing_samples = 16;
2096  float scan_whole_pool_milliseconds = 120000.0;
2097 
2098  /* Used to compute how far we scan ahead */
2099  long strategy_delta;
2100  int bufs_to_lap;
2101  int bufs_ahead;
2102  float scans_per_alloc;
2103  int reusable_buffers_est;
2104  int upcoming_alloc_est;
2105  int min_scan_buffers;
2106 
2107  /* Variables for the scanning loop proper */
2108  int num_to_scan;
2109  int num_written;
2110  int reusable_buffers;
2111 
2112  /* Variables for final smoothed_density update */
2113  long new_strategy_delta;
2114  uint32 new_recent_alloc;
2115 
2116  /*
2117  * Find out where the freelist clock sweep currently is, and how many
2118  * buffer allocations have happened since our last call.
2119  */
2120  strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc);
2121 
2122  /* Report buffer alloc counts to pgstat */
2123  BgWriterStats.m_buf_alloc += recent_alloc;
2124 
2125  /*
2126  * If we're not running the LRU scan, just stop after doing the stats
2127  * stuff. We mark the saved state invalid so that we can recover sanely
2128  * if LRU scan is turned back on later.
2129  */
2130  if (bgwriter_lru_maxpages <= 0)
2131  {
2132  saved_info_valid = false;
2133  return true;
2134  }
2135 
2136  /*
2137  * Compute strategy_delta = how many buffers have been scanned by the
2138  * clock sweep since last time. If first time through, assume none. Then
2139  * see if we are still ahead of the clock sweep, and if so, how many
2140  * buffers we could scan before we'd catch up with it and "lap" it. Note:
2141  * weird-looking coding of xxx_passes comparisons are to avoid bogus
2142  * the weird-looking coding of xxx_passes comparisons is to avoid bogus
2143  */
2144  if (saved_info_valid)
2145  {
2146  int32 passes_delta = strategy_passes - prev_strategy_passes;
2147 
2148  strategy_delta = strategy_buf_id - prev_strategy_buf_id;
2149  strategy_delta += (long) passes_delta * NBuffers;
2150 
2151  Assert(strategy_delta >= 0);
2152 
2153  if ((int32) (next_passes - strategy_passes) > 0)
2154  {
2155  /* we're one pass ahead of the strategy point */
2156  bufs_to_lap = strategy_buf_id - next_to_clean;
2157 #ifdef BGW_DEBUG
2158  elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
2159  next_passes, next_to_clean,
2160  strategy_passes, strategy_buf_id,
2161  strategy_delta, bufs_to_lap);
2162 #endif
2163  }
2164  else if (next_passes == strategy_passes &&
2165  next_to_clean >= strategy_buf_id)
2166  {
2167  /* on same pass, but ahead or at least not behind */
2168  bufs_to_lap = NBuffers - (next_to_clean - strategy_buf_id);
2169 #ifdef BGW_DEBUG
2170  elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
2171  next_passes, next_to_clean,
2172  strategy_passes, strategy_buf_id,
2173  strategy_delta, bufs_to_lap);
2174 #endif
2175  }
2176  else
2177  {
2178  /*
2179  * We're behind, so skip forward to the strategy point and start
2180  * cleaning from there.
2181  */
2182 #ifdef BGW_DEBUG
2183  elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
2184  next_passes, next_to_clean,
2185  strategy_passes, strategy_buf_id,
2186  strategy_delta);
2187 #endif
2188  next_to_clean = strategy_buf_id;
2189  next_passes = strategy_passes;
2190  bufs_to_lap = NBuffers;
2191  }
2192  }
2193  else
2194  {
2195  /*
2196  * Initializing at startup or after LRU scanning had been off. Always
2197  * start at the strategy point.
2198  */
2199 #ifdef BGW_DEBUG
2200  elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
2201  strategy_passes, strategy_buf_id);
2202 #endif
2203  strategy_delta = 0;
2204  next_to_clean = strategy_buf_id;
2205  next_passes = strategy_passes;
2206  bufs_to_lap = NBuffers;
2207  }
2208 
2209  /* Update saved info for next time */
2210  prev_strategy_buf_id = strategy_buf_id;
2211  prev_strategy_passes = strategy_passes;
2212  saved_info_valid = true;
2213 
2214  /*
2215  * Compute how many buffers had to be scanned for each new allocation, ie,
2216  * 1/density of reusable buffers, and track a moving average of that.
2217  *
2218  * If the strategy point didn't move, we don't update the density estimate
2219  */
2220  if (strategy_delta > 0 && recent_alloc > 0)
2221  {
2222  scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
2223  smoothed_density += (scans_per_alloc - smoothed_density) /
2224  smoothing_samples;
2225  }
2226 
2227  /*
2228  * Estimate how many reusable buffers there are between the current
2229  * strategy point and where we've scanned ahead to, based on the smoothed
2230  * density estimate.
2231  */
2232  bufs_ahead = NBuffers - bufs_to_lap;
2233  reusable_buffers_est = (float) bufs_ahead / smoothed_density;
2234 
2235  /*
2236  * Track a moving average of recent buffer allocations. Here, rather than
2237  * a true average we want a fast-attack, slow-decline behavior: we
2238  * immediately follow any increase.
2239  */
2240  if (smoothed_alloc <= (float) recent_alloc)
2241  smoothed_alloc = recent_alloc;
2242  else
2243  smoothed_alloc += ((float) recent_alloc - smoothed_alloc) /
2244  smoothing_samples;
2245 
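 /*
  * Editor's worked example (not in the original source): with
  * smoothing_samples = 16, if smoothed_alloc is 100 and recent_alloc jumps
  * to 500, the estimate snaps straight to 500 (fast attack).  If recent_alloc
  * then drops back to 100, the next call yields 500 + (100 - 500)/16 = 475,
  * then about 451.6, and so on, decaying slowly toward 100 over many cycles
  * (slow decline).
  */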
2246  /* Scale the estimate by a GUC to allow more aggressive tuning. */
2247  upcoming_alloc_est = (int) (smoothed_alloc * bgwriter_lru_multiplier);
2248 
2249  /*
2250  * If recent_alloc remains at zero for many cycles, smoothed_alloc will
2251  * eventually underflow to zero, and the underflows produce annoying
2252  * kernel warnings on some platforms. Once upcoming_alloc_est has gone to
2253  * zero, there's no point in tracking smaller and smaller values of
2254  * smoothed_alloc, so just reset it to exactly zero to avoid this
2255  * syndrome. It will pop back up as soon as recent_alloc increases.
2256  */
2257  if (upcoming_alloc_est == 0)
2258  smoothed_alloc = 0;
2259 
2260  /*
2261  * Even in cases where there's been little or no buffer allocation
2262  * activity, we want to make a small amount of progress through the buffer
2263  * cache so that as many reusable buffers as possible are clean after an
2264  * idle period.
2265  *
2266  * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many times
2267  * the BGW will be called during the scan_whole_pool time; slice the
2268  * buffer pool into that many sections.
2269  */
2270  min_scan_buffers = (int) (NBuffers / (scan_whole_pool_milliseconds / BgWriterDelay));
2271 
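 /*
  * Editor's worked example (not in the original source): with the default
  * bgwriter_delay of 200 ms, scan_whole_pool_milliseconds / BgWriterDelay is
  * 120000 / 200 = 600 rounds, so for NBuffers = 16384 (shared_buffers =
  * 128MB) min_scan_buffers is 16384 / 600, i.e. about 27 buffers per round --
  * enough to cover the whole pool roughly every two minutes even when no
  * allocations are happening.
  */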
2272  if (upcoming_alloc_est < (min_scan_buffers + reusable_buffers_est))
2273  {
2274 #ifdef BGW_DEBUG
2275  elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
2276  upcoming_alloc_est, min_scan_buffers, reusable_buffers_est);
2277 #endif
2278  upcoming_alloc_est = min_scan_buffers + reusable_buffers_est;
2279  }
2280 
2281  /*
2282  * Now write out dirty reusable buffers, working forward from the
2283  * next_to_clean point, until we have lapped the strategy scan, or cleaned
2284  * enough buffers to match our estimate of the next cycle's allocation
2285  * requirements, or hit the bgwriter_lru_maxpages limit.
2286  */
2287 
2288  /* Make sure we can handle the pin inside SyncOneBuffer */
2289  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
2290 
2291  num_to_scan = bufs_to_lap;
2292  num_written = 0;
2293  reusable_buffers = reusable_buffers_est;
2294 
2295  /* Execute the LRU scan */
2296  while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
2297  {
2298  int sync_state = SyncOneBuffer(next_to_clean, true,
2299  wb_context);
2300 
2301  if (++next_to_clean >= NBuffers)
2302  {
2303  next_to_clean = 0;
2304  next_passes++;
2305  }
2306  num_to_scan--;
2307 
2308  if (sync_state & BUF_WRITTEN)
2309  {
2310  reusable_buffers++;
2311  if (++num_written >= bgwriter_lru_maxpages)
2312  {
2313  BgWriterStats.m_maxwritten_clean++;
2314  break;
2315  }
2316  }
2317  else if (sync_state & BUF_REUSABLE)
2318  reusable_buffers++;
2319  }
2320 
2321  BgWriterStats.m_buf_written_clean += num_written;
2322 
2323 #ifdef BGW_DEBUG
2324  elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
2325  recent_alloc, smoothed_alloc, strategy_delta, bufs_ahead,
2326  smoothed_density, reusable_buffers_est, upcoming_alloc_est,
2327  bufs_to_lap - num_to_scan,
2328  num_written,
2329  reusable_buffers - reusable_buffers_est);
2330 #endif
2331 
2332  /*
2333  * Consider the above scan as being like a new allocation scan.
2334  * Characterize its density and update the smoothed one based on it. This
2335  * effectively halves the moving average period in cases where both the
2336  * strategy and the background writer are doing some useful scanning,
2337  * which is helpful because a long memory isn't as desirable on the
2338  * density estimates.
2339  */
2340  new_strategy_delta = bufs_to_lap - num_to_scan;
2341  new_recent_alloc = reusable_buffers - reusable_buffers_est;
2342  if (new_strategy_delta > 0 && new_recent_alloc > 0)
2343  {
2344  scans_per_alloc = (float) new_strategy_delta / (float) new_recent_alloc;
2345  smoothed_density += (scans_per_alloc - smoothed_density) /
2346  smoothing_samples;
2347 
2348 #ifdef BGW_DEBUG
2349  elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f",
2350  new_recent_alloc, new_strategy_delta,
2351  scans_per_alloc, smoothed_density);
2352 #endif
2353  }
2354 
2355  /* Return true if OK to hibernate */
2356  return (bufs_to_lap == 0 && recent_alloc == 0);
2357 }
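/*
 * Editor's note (not in the original source): a rough sketch of how the
 * background writer's main loop is expected to consume the return value --
 * the real loop lives in postmaster/bgwriter.c, so treat this only as an
 * illustration:
 *
 *		bool	can_hibernate = BgBufferSync(&wb_context);
 *
 *		if (can_hibernate)
 *			... sleep for a much longer interval until explicitly woken ...
 *		else
 *			... sleep for bgwriter_delay milliseconds and repeat ...
 */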
2358 
2359 /*
2360  * SyncOneBuffer -- process a single buffer during syncing.
2361  *
2362  * If skip_recently_used is true, we don't write currently-pinned buffers, nor
2363  * buffers marked recently used, as these are not replacement candidates.
2364  *
2365  * Returns a bitmask containing the following flag bits:
2366  * BUF_WRITTEN: we wrote the buffer.
2367  * BUF_REUSABLE: buffer is available for replacement, ie, it has
2368  * pin count 0 and usage count 0.
2369  *
2370  * (BUF_WRITTEN could be set in error if FlushBuffer finds the buffer clean
2371  * after locking it, but we don't care all that much.)
2372  *
2373  * Note: caller must have done ResourceOwnerEnlargeBuffers.
2374  */
2375 static int
2376 SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
2377 {
2378  BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
2379  int result = 0;
2380  uint32 buf_state;
2381  BufferTag tag;
2382 
2383  ReservePrivateRefCountEntry();
2384 
2385  /*
2386  * Check whether buffer needs writing.
2387  *
2388  * We can make this check without taking the buffer content lock so long
2389  * as we mark pages dirty in access methods *before* logging changes with
2390  * XLogInsert(): if someone marks the buffer dirty just after our check we
2391  * don't worry, because our checkpoint.redo pointer precedes the log record
2392  * for the upcoming changes, so we are not required to write such a dirty buffer.
2393  */
2394  buf_state = LockBufHdr(bufHdr);
2395 
2396  if (BUF_STATE_GET_REFCOUNT(buf_state) == 0 &&
2397  BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
2398  {
2399  result |= BUF_REUSABLE;
2400  }
2401  else if (skip_recently_used)
2402  {
2403  /* Caller told us not to write recently-used buffers */
2404  UnlockBufHdr(bufHdr, buf_state);
2405  return result;
2406  }
2407 
2408  if (!(buf_state & BM_VALID) || !(buf_state & BM_DIRTY))
2409  {
2410  /* It's clean, so nothing to do */
2411  UnlockBufHdr(bufHdr, buf_state);
2412  return result;
2413  }
2414 
2415  /*
2416  * Pin it, share-lock it, write it. (FlushBuffer will do nothing if the
2417  * buffer is clean by the time we've locked it.)
2418  */
2419  PinBuffer_Locked(bufHdr);
2420  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
2421 
2422  FlushBuffer(bufHdr, NULL);
2423 
2424  LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
2425 
2426  tag = bufHdr->tag;
2427 
2428  UnpinBuffer(bufHdr, true);
2429 
2430  ScheduleBufferTagForWriteback(wb_context, &tag);
2431 
2432  return result | BUF_WRITTEN;
2433 }
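/*
 * Editor's note (not in the original source): a minimal sketch of how a
 * caller interprets the returned bitmask, mirroring the LRU scan in
 * BgBufferSync above (skip_recently_used = true):
 *
 *		int		sync_state = SyncOneBuffer(buf_id, true, wb_context);
 *
 *		if (sync_state & BUF_WRITTEN)
 *			... a dirty, reusable buffer was just flushed ...
 *		else if (sync_state & BUF_REUSABLE)
 *			... buffer was already clean and is a replacement candidate ...
 *		else
 *			... buffer is pinned or recently used, so it was skipped ...
 */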
2434 
2435 /*
2436  * AtEOXact_Buffers - clean up at end of transaction.
2437  *
2438  * As of PostgreSQL 8.0, buffer pins should get released by the
2439  * ResourceOwner mechanism. This routine is just a debugging
2440  * cross-check that no pins remain.
2441  */
2442 void
2443 AtEOXact_Buffers(bool isCommit)
2444 {
2445  CheckForBufferLeaks();
2446 
2447  AtEOXact_LocalBuffers(isCommit);
2448 
2449  Assert(PrivateRefCountOverflowed == 0);
2450 }
2451 
2452 /*
2453  * Initialize access to shared buffer pool
2454  *
2455  * This is called during backend startup (whether standalone or under the
2456  * postmaster). It sets up for this backend's access to the already-existing
2457  * buffer pool.
2458  *
2459  * NB: this is called before InitProcess(), so we do not have a PGPROC and
2460  * cannot do LWLockAcquire; hence we can't actually access stuff in
2461  * shared memory yet. We are only initializing local data here.
2462  * (See also InitBufferPoolBackend)
2463  */
2464 void
2465 InitBufferPoolAccess(void)
2466 {
2467  HASHCTL hash_ctl;
2468 
2469  memset(&PrivateRefCountArray, 0, sizeof(PrivateRefCountArray));
2470 
2471  MemSet(&hash_ctl, 0, sizeof(hash_ctl));
2472  hash_ctl.keysize = sizeof(int32);
2473  hash_ctl.entrysize = sizeof(PrivateRefCountEntry);
2474 
2475  PrivateRefCountHash = hash_create("PrivateRefCount", 100, &hash_ctl,
2476  HASH_ELEM | HASH_BLOBS);
2477 }
2478 
2479 /*
2480  * InitBufferPoolBackend --- second-stage initialization of a new backend
2481  *
2482  * This is called after we have acquired a PGPROC and so can safely get
2483  * LWLocks. We don't currently need to do anything at this stage ...
2484  * except register a shmem-exit callback. AtProcExit_Buffers needs LWLock
2485  * access, and thereby has to be called at the corresponding phase of
2486  * backend shutdown.
2487  */
2488 void
2489 InitBufferPoolBackend(void)
2490 {
2491  on_shmem_exit(AtProcExit_Buffers, 0);
2492 }
2493 
2494 /*
2495  * During backend exit, ensure that we released all shared-buffer locks and
2496  * assert that we have no remaining pins.
2497  */
2498 static void
2499 AtProcExit_Buffers(int code, Datum arg)
2500 {
2501  AbortBufferIO();
2502  UnlockBuffers();
2503 
2504  CheckForBufferLeaks();
2505 
2506  /* localbuf.c needs a chance too */
2507  AtProcExit_LocalBuffers();
2508 }
2509 
2510 /*
2511  * CheckForBufferLeaks - ensure this backend holds no buffer pins
2512  *
2513  * As of PostgreSQL 8.0, buffer pins should get released by the
2514  * ResourceOwner mechanism. This routine is just a debugging
2515  * cross-check that no pins remain.
2516  */
2517 static void
2518 CheckForBufferLeaks(void)
2519 {
2520 #ifdef USE_ASSERT_CHECKING
2521  int RefCountErrors = 0;
2522  PrivateRefCountEntry *res;
2523  int i;
2524 
2525  /* check the array */
2526  for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
2527  {
2528  res = &PrivateRefCountArray[i];
2529 
2530  if (res->buffer != InvalidBuffer)
2531  {
2532  PrintBufferLeakWarning(res->buffer);
2533  RefCountErrors++;
2534  }
2535  }
2536 
2537  /* if necessary search the hash */
2538  if (PrivateRefCountOverflowed)
2539  {
2540  HASH_SEQ_STATUS hstat;
2541 
2542  hash_seq_init(&hstat, PrivateRefCountHash);
2543  while ((res = (PrivateRefCountEntry *) hash_seq_search(&hstat)) != NULL)
2544  {
2545  PrintBufferLeakWarning(res->buffer);
2546  RefCountErrors++;
2547  }
2548 
2549  }
2550 
2551  Assert(RefCountErrors == 0);
2552 #endif
2553 }
2554 
2555 /*
2556  * Helper routine to issue warnings when a buffer is unexpectedly pinned
2557  */
2558 void
2559 PrintBufferLeakWarning(Buffer buffer)
2560 {
2561  BufferDesc *buf;
2562  int32 loccount;
2563  char *path;
2564  BackendId backend;
2565  uint32 buf_state;
2566 
2567  Assert(BufferIsValid(buffer));
2568  if (BufferIsLocal(buffer))
2569  {
2570  buf = GetLocalBufferDescriptor(-buffer - 1);
2571  loccount = LocalRefCount[-buffer - 1];
2572  backend = MyBackendId;
2573  }
2574  else
2575  {
2576  buf = GetBufferDescriptor(buffer - 1);
2577  loccount = GetPrivateRefCount(buffer);
2578  backend = InvalidBackendId;
2579  }
2580 
2581  /* theoretically we should lock the bufhdr here */
2582  path = relpathbackend(buf->tag.rnode, backend, buf->tag.forkNum);
2583  buf_state = pg_atomic_read_u32(&buf->state);
2584  elog(WARNING,
2585  "buffer refcount leak: [%03d] "
2586  "(rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
2587  buffer, path,
2588  buf->tag.blockNum, buf_state & BUF_FLAG_MASK,
2589  BUF_STATE_GET_REFCOUNT(buf_state), loccount);
2590  pfree(path);
2591 }
2592 
2593 /*
2594  * CheckPointBuffers
2595  *
2596  * Flush all dirty blocks in buffer pool to disk at checkpoint time.
2597  *
2598  * Note: temporary relations do not participate in checkpoints, so they don't
2599  * need to be flushed.
2600  */
2601 void
2602 CheckPointBuffers(int flags)
2603 {
2604  TRACE_POSTGRESQL_BUFFER_CHECKPOINT_START(flags);
2605  CheckpointStats.ckpt_write_t = GetCurrentTimestamp();
2606  BufferSync(flags);
2607  CheckpointStats.ckpt_sync_t = GetCurrentTimestamp();
2608  TRACE_POSTGRESQL_BUFFER_CHECKPOINT_SYNC_START();
2609  ProcessSyncRequests();
2610  CheckpointStats.ckpt_sync_end_t = GetCurrentTimestamp();
2611  TRACE_POSTGRESQL_BUFFER_CHECKPOINT_DONE();
2612 }
2613 
2614 
2615 /*
2616  * Do whatever is needed to prepare for commit at the bufmgr and smgr levels
2617  */
2618 void
2619 BufmgrCommit(void)
2620 {
2621  /* Nothing to do in bufmgr anymore... */
2622 }
2623 
2624 /*
2625  * BufferGetBlockNumber
2626  * Returns the block number associated with a buffer.
2627  *
2628  * Note:
2629  * Assumes that the buffer is valid and pinned, else the
2630  * value may be obsolete immediately...
2631  */
2632 BlockNumber
2633 BufferGetBlockNumber(Buffer buffer)
2634 {
2635  BufferDesc *bufHdr;
2636 
2637  Assert(BufferIsPinned(buffer));
2638 
2639  if (BufferIsLocal(buffer))
2640  bufHdr = GetLocalBufferDescriptor(-buffer - 1);
2641  else
2642  bufHdr = GetBufferDescriptor(buffer - 1);
2643 
2644  /* pinned, so OK to read tag without spinlock */
2645  return bufHdr->tag.blockNum;
2646 }
2647 
2648 /*
2649  * BufferGetTag
2650  * Returns the relfilenode, fork number and block number associated with
2651  * a buffer.
2652  */
2653 void
2654 BufferGetTag(Buffer buffer, RelFileNode *rnode, ForkNumber *forknum,
2655  BlockNumber *blknum)
2656 {
2657  BufferDesc *bufHdr;
2658 
2659  /* Do the same checks as BufferGetBlockNumber. */
2660  Assert(BufferIsPinned(buffer));
2661 
2662  if (BufferIsLocal(buffer))
2663  bufHdr = GetLocalBufferDescriptor(-buffer - 1);
2664  else
2665  bufHdr = GetBufferDescriptor(buffer - 1);
2666 
2667  /* pinned, so OK to read tag without spinlock */
2668  *rnode = bufHdr->tag.rnode;
2669  *forknum = bufHdr->tag.forkNum;
2670  *blknum = bufHdr->tag.blockNum;
2671 }
2672 
2673 /*
2674  * FlushBuffer
2675  * Physically write out a shared buffer.
2676  *
2677  * NOTE: this actually just passes the buffer contents to the kernel; the
2678  * real write to disk won't happen until the kernel feels like it. This
2679  * is okay from our point of view since we can redo the changes from WAL.
2680  * However, we will need to force the changes to disk via fsync before
2681  * we can checkpoint WAL.
2682  *
2683  * The caller must hold a pin on the buffer and have share-locked the
2684  * buffer contents. (Note: a share-lock does not prevent updates of
2685  * hint bits in the buffer, so the page could change while the write
2686  * is in progress, but we assume that that will not invalidate the data
2687  * written.)
2688  *
2689  * If the caller has an smgr reference for the buffer's relation, pass it
2690  * as the second parameter. If not, pass NULL.
2691  */
2692 static void
2693 FlushBuffer(BufferDesc *buf, SMgrRelation reln)
2694 {
2695  XLogRecPtr recptr;
2696  ErrorContextCallback errcallback;
2697  instr_time io_start,
2698  io_time;
2699  Block bufBlock;
2700  char *bufToWrite;
2701  uint32 buf_state;
2702 
2703  /*
2704  * Acquire the buffer's io_in_progress lock. If StartBufferIO returns
2705  * false, then someone else flushed the buffer before we could, so we need
2706  * not do anything.
2707  */
2708  if (!StartBufferIO(buf, false))
2709  return;
2710 
2711  /* Setup error traceback support for ereport() */
2712  errcallback.callback = shared_buffer_write_error_callback;
2713  errcallback.arg = (void *) buf;
2714  errcallback.previous = error_context_stack;
2715  error_context_stack = &errcallback;
2716 
2717  /* Find smgr relation for buffer */
2718  if (reln == NULL)
2719  reln = smgropen(buf->tag.rnode, InvalidBackendId);
2720 
2721  TRACE_POSTGRESQL_BUFFER_FLUSH_START(buf->tag.forkNum,
2722  buf->tag.blockNum,
2723  reln->smgr_rnode.node.spcNode,
2724  reln->smgr_rnode.node.dbNode,
2725  reln->smgr_rnode.node.relNode);
2726 
2727  buf_state = LockBufHdr(buf);
2728 
2729  /*
2730  * Run PageGetLSN while holding header lock, since we don't have the
2731  * buffer locked exclusively in all cases.
2732  */
2733  recptr = BufferGetLSN(buf);
2734 
2735  /* To check if block content changes while flushing. - vadim 01/17/97 */
2736  buf_state &= ~BM_JUST_DIRTIED;
2737  UnlockBufHdr(buf, buf_state);
2738 
2739  /*
2740  * Force XLOG flush up to buffer's LSN. This implements the basic WAL
2741  * rule that log updates must hit disk before any of the data-file changes
2742  * they describe do.
2743  *
2744  * However, this rule does not apply to unlogged relations, which will be
2745  * lost after a crash anyway. Most unlogged relation pages do not bear
2746  * LSNs since we never emit WAL records for them, and therefore flushing
2747  * up through the buffer LSN would be useless, but harmless. However,
2748  * GiST indexes use LSNs internally to track page-splits, and therefore
2749  * unlogged GiST pages bear "fake" LSNs generated by
2750  * GetFakeLSNForUnloggedRel. It is unlikely but possible that the fake
2751  * LSN counter could advance past the WAL insertion point; and if it did
2752  * happen, attempting to flush WAL through that location would fail, with
2753  * disastrous system-wide consequences. To make sure that can't happen,
2754  * skip the flush if the buffer isn't permanent.
2755  */
2756  if (buf_state & BM_PERMANENT)
2757  XLogFlush(recptr);
2758 
2759  /*
2760  * Now it's safe to write buffer to disk. Note that no one else should
2761  * have been able to write it while we were busy with log flushing because
2762  * we have the io_in_progress lock.
2763  */
2764  bufBlock = BufHdrGetBlock(buf);
2765 
2766  /*
2767  * Update page checksum if desired. Since we have only shared lock on the
2768  * buffer, other processes might be updating hint bits in it, so we must
2769  * copy the page to private storage if we do checksumming.
2770  */
2771  bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
2772 
2773  if (track_io_timing)
2774  INSTR_TIME_SET_CURRENT(io_start);
2775 
2776  /*
2777  * bufToWrite is either the shared buffer or a copy, as appropriate.
2778  */
2779  smgrwrite(reln,
2780  buf->tag.forkNum,
2781  buf->tag.blockNum,
2782  bufToWrite,
2783  false);
2784 
2785  if (track_io_timing)
2786  {
2787  INSTR_TIME_SET_CURRENT(io_time);
2788  INSTR_TIME_SUBTRACT(io_time, io_start);
2789  pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time));
2790  INSTR_TIME_ADD(pgBufferUsage.blk_write_time, io_time);
2791  }
2792 
2793  pgBufferUsage.shared_blks_written++;
2794 
2795  /*
2796  * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
2797  * end the io_in_progress state.
2798  */
2799  TerminateBufferIO(buf, true, 0);
2800 
2801  TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(buf->tag.forkNum,
2802  buf->tag.blockNum,
2803  reln->smgr_rnode.node.spcNode,
2804  reln->smgr_rnode.node.dbNode,
2805  reln->smgr_rnode.node.relNode);
2806 
2807  /* Pop the error context stack */
2808  error_context_stack = errcallback.previous;
2809 }
2810 
2811 /*
2812  * RelationGetNumberOfBlocksInFork
2813  * Determines the current number of pages in the specified relation fork.
2814  *
2815  * Note that the accuracy of the result will depend on the details of the
2816  * relation's storage. For builtin AMs it'll be accurate, but for external AMs
2817  * it might not be.
2818  */
2819 BlockNumber
2820 RelationGetNumberOfBlocksInFork(Relation relation, ForkNumber forkNum)
2821 {
2822  switch (relation->rd_rel->relkind)
2823  {
2824  case RELKIND_SEQUENCE:
2825  case RELKIND_INDEX:
2826  case RELKIND_PARTITIONED_INDEX:
2827  /* Open it at the smgr level if not already done */
2828  RelationOpenSmgr(relation);
2829 
2830  return smgrnblocks(relation->rd_smgr, forkNum);
2831 
2832  case RELKIND_RELATION:
2833  case RELKIND_TOASTVALUE:
2834  case RELKIND_MATVIEW:
2835  {
2836  /*
2837  * Not every table AM uses BLCKSZ wide fixed size blocks.
2838  * Therefore tableam returns the size in bytes - but for the
2839  * purpose of this routine, we want the number of blocks.
2840  * Therefore divide, rounding up.
2841  */
2842  uint64 szbytes;
2843 
2844  szbytes = table_relation_size(relation, forkNum);
2845 
2846  return (szbytes + (BLCKSZ - 1)) / BLCKSZ;
2847  }
2848  case RELKIND_VIEW:
2849  case RELKIND_COMPOSITE_TYPE:
2850  case RELKIND_FOREIGN_TABLE:
2851  case RELKIND_PARTITIONED_TABLE:
2852  default:
2853  Assert(false);
2854  break;
2855  }
2856 
2857  return 0; /* keep compiler quiet */
2858 }
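/*
 * Editor's worked example (not in the original source): the round-up
 * expression (szbytes + (BLCKSZ - 1)) / BLCKSZ maps, with BLCKSZ = 8192,
 * szbytes = 81920 to 10 blocks and szbytes = 81921 to 11 blocks, so a
 * partially filled trailing block is always counted as a whole block.
 */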
2859 
2860 /*
2861  * BufferIsPermanent
2862  * Determines whether a buffer will potentially still be around after
2863  * a crash. Caller must hold a buffer pin.
2864  */
2865 bool
2866 BufferIsPermanent(Buffer buffer)
2867 {
2868  BufferDesc *bufHdr;
2869 
2870  /* Local buffers are used only for temp relations. */
2871  if (BufferIsLocal(buffer))
2872  return false;
2873 
2874  /* Make sure we've got a real buffer, and that we hold a pin on it. */
2875  Assert(BufferIsValid(buffer));
2876  Assert(BufferIsPinned(buffer));
2877 
2878  /*
2879  * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
2880  * need not bother with the buffer header spinlock. Even if someone else
2881  * changes the buffer header state while we're doing this, the state is
2882  * changed atomically, so we'll read the old value or the new value, but
2883  * not random garbage.
2884  */
2885  bufHdr = GetBufferDescriptor(buffer - 1);
2886  return (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT) != 0;
2887 }
2888 
2889 /*
2890  * BufferGetLSNAtomic
2891  * Retrieves the LSN of the buffer atomically using a buffer header lock.
2892  * This is necessary for some callers who may not have an exclusive lock
2893  * on the buffer.
2894  */
2895 XLogRecPtr
2896 BufferGetLSNAtomic(Buffer buffer)
2897 {
2898  BufferDesc *bufHdr = GetBufferDescriptor(buffer - 1);
2899  char *page = BufferGetPage(buffer);
2900  XLogRecPtr lsn;
2901  uint32 buf_state;
2902 
2903  /*
2904  * If we don't need locking for correctness, fastpath out.
2905  */
2906  if (!XLogHintBitIsNeeded() || BufferIsLocal(buffer))
2907  return PageGetLSN(page);
2908 
2909  /* Make sure we've got a real buffer, and that we hold a pin on it. */
2910  Assert(BufferIsValid(buffer));
2911  Assert(BufferIsPinned(buffer));
2912 
2913  buf_state = LockBufHdr(bufHdr);
2914  lsn = PageGetLSN(page);
2915  UnlockBufHdr(bufHdr, buf_state);
2916 
2917  return lsn;
2918 }
2919 
2920 /* ---------------------------------------------------------------------
2921  * DropRelFileNodeBuffers
2922  *
2923  * This function removes from the buffer pool all the pages of the
2924  * specified relation forks that have block numbers >= firstDelBlock.
2925  * (In particular, with firstDelBlock = 0, all pages are removed.)
2926  * Dirty pages are simply dropped, without bothering to write them
2927  * out first. Therefore, this is NOT rollback-able, and so should be
2928  * used only with extreme caution!
2929  *
2930  * Currently, this is called only from smgr.c when the underlying file
2931  * is about to be deleted or truncated (firstDelBlock is needed for
2932  * the truncation case). The data in the affected pages would therefore
2933  * be deleted momentarily anyway, and there is no point in writing it.
2934  * It is the responsibility of higher-level code to ensure that the
2935  * deletion or truncation does not lose any data that could be needed
2936  * later. It is also the responsibility of higher-level code to ensure
2937  * that no other process could be trying to load more pages of the
2938  * relation into buffers.
2939  *
2940  * XXX currently it sequentially searches the buffer pool, should be
2941  * changed to more clever ways of searching. However, this routine
2942  * is used only in code paths that aren't very performance-critical,
2943  * and we shouldn't slow down the hot paths to make it faster ...
2944  * --------------------------------------------------------------------
2945  */
2946 void
2947 DropRelFileNodeBuffers(RelFileNodeBackend rnode, ForkNumber *forkNum,
2948  int nforks, BlockNumber *firstDelBlock)
2949 {
2950  int i;
2951  int j;
2952 
2953  /* If it's a local relation, it's localbuf.c's problem. */
2954  if (RelFileNodeBackendIsTemp(rnode))
2955  {
2956  if (rnode.backend == MyBackendId)
2957  {
2958  for (j = 0; j < nforks; j++)
2959  DropRelFileNodeLocalBuffers(rnode.node, forkNum[j],
2960  firstDelBlock[j]);
2961  }
2962  return;
2963  }
2964 
2965  for (i = 0; i < NBuffers; i++)
2966  {
2967  BufferDesc *bufHdr = GetBufferDescriptor(i);
2968  uint32 buf_state;
2969 
2970  /*
2971  * We can make this a tad faster by prechecking the buffer tag before
2972  * we attempt to lock the buffer; this saves a lot of lock
2973  * acquisitions in typical cases. It should be safe because the
2974  * caller must have AccessExclusiveLock on the relation, or some other
2975  * reason to be certain that no one is loading new pages of the rel
2976  * into the buffer pool. (Otherwise we might well miss such pages
2977  * entirely.) Therefore, while the tag might be changing while we
2978  * look at it, it can't be changing *to* a value we care about, only
2979  * *away* from such a value. So false negatives are impossible, and
2980  * false positives are safe because we'll recheck after getting the
2981  * buffer lock.
2982  *
2983  * We could check forkNum and blockNum as well as the rnode, but the
2984  * incremental win from doing so seems small.
2985  */
2986  if (!RelFileNodeEquals(bufHdr->tag.rnode, rnode.node))
2987  continue;
2988 
2989  buf_state = LockBufHdr(bufHdr);
2990 
2991  for (j = 0; j < nforks; j++)
2992  {
2993  if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node) &&
2994  bufHdr->tag.forkNum == forkNum[j] &&
2995  bufHdr->tag.blockNum >= firstDelBlock[j])
2996  {
2997  InvalidateBuffer(bufHdr); /* releases spinlock */
2998  break;
2999  }
3000  }
3001  if (j >= nforks)
3002  UnlockBufHdr(bufHdr, buf_state);
3003  }
3004 }
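/*
 * Editor's note (not in the original source): the loop above is an instance
 * of the "unlocked precheck, then lock and recheck" pattern this file uses
 * for full buffer-pool scans.  A minimal sketch of the shape, with MATCHES()
 * standing in for whatever tag test a caller needs:
 *
 *		if (!MATCHES(bufHdr->tag))			// cheap, unlocked filter
 *			continue;
 *		buf_state = LockBufHdr(bufHdr);
 *		if (MATCHES(bufHdr->tag))			// authoritative recheck
 *			InvalidateBuffer(bufHdr);		// releases the spinlock
 *		else
 *			UnlockBufHdr(bufHdr, buf_state);
 *
 * False positives in the unlocked test are harmless (the recheck catches
 * them); false negatives can't happen because the caller's lock prevents the
 * tag from changing *to* a value of interest, only away from one.
 */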
3005 
3006 /* ---------------------------------------------------------------------
3007  * DropRelFileNodesAllBuffers
3008  *
3009  * This function removes from the buffer pool all the pages of all
3010  * forks of the specified relations. It's equivalent to calling
3011  * DropRelFileNodeBuffers once per fork per relation with
3012  * firstDelBlock = 0.
3013  * --------------------------------------------------------------------
3014  */
3015 void
3016 DropRelFileNodesAllBuffers(RelFileNodeBackend *rnodes, int nnodes)
3017 {
3018  int i,
3019  n = 0;
3020  RelFileNode *nodes;
3021  bool use_bsearch;
3022 
3023  if (nnodes == 0)
3024  return;
3025 
3026  nodes = palloc(sizeof(RelFileNode) * nnodes); /* non-local relations */
3027 
3028  /* If it's a local relation, it's localbuf.c's problem. */
3029  for (i = 0; i < nnodes; i++)
3030  {
3031  if (RelFileNodeBackendIsTemp(rnodes[i]))
3032  {
3033  if (rnodes[i].backend == MyBackendId)
3034  DropRelFileNodeAllLocalBuffers(rnodes[i].node);
3035  }
3036  else
3037  nodes[n++] = rnodes[i].node;
3038  }
3039 
3040  /*
3041  * If there are no non-local relations, then we're done. Release the
3042  * memory and return.
3043  */
3044  if (n == 0)
3045  {
3046  pfree(nodes);
3047  return;
3048  }
3049 
3050  /*
3051  * For low number of relations to drop just use a simple walk through, to
3052  * save the bsearch overhead. The threshold to use is rather a guess than
3053  * an exactly determined value, as it depends on many factors (CPU and RAM
3054  * speeds, amount of shared buffers etc.).
3055  */
3056  use_bsearch = n > RELS_BSEARCH_THRESHOLD;
3057 
3058  /* sort the list of rnodes if necessary */
3059  if (use_bsearch)
3060  pg_qsort(nodes, n, sizeof(RelFileNode), rnode_comparator);
3061 
3062  for (i = 0; i < NBuffers; i++)
3063  {
3064  RelFileNode *rnode = NULL;
3065  BufferDesc *bufHdr = GetBufferDescriptor(i);
3066  uint32 buf_state;
3067 
3068  /*
3069  * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
3070  * and saves some cycles.
3071  */
3072 
3073  if (!use_bsearch)
3074  {
3075  int j;
3076 
3077  for (j = 0; j < n; j++)
3078  {
3079  if (RelFileNodeEquals(bufHdr->tag.rnode, nodes[j]))
3080  {
3081  rnode = &nodes[j];
3082  break;
3083  }
3084  }
3085  }
3086  else
3087  {
3088  rnode = bsearch((const void *) &(bufHdr->tag.rnode),
3089  nodes, n, sizeof(RelFileNode),
3090  rnode_comparator);
3091 
3092 
3093  /* buffer doesn't belong to any of the given relfilenodes; skip it */
3094  if (rnode == NULL)
3095  continue;
3096 
3097  buf_state = LockBufHdr(bufHdr);
3098  if (RelFileNodeEquals(bufHdr->tag.rnode, (*rnode)))
3099  InvalidateBuffer(bufHdr); /* releases spinlock */
3100  else
3101  UnlockBufHdr(bufHdr, buf_state);
3102  }
3103 
3104  pfree(nodes);
3105 }
3106 
3107 /* ---------------------------------------------------------------------
3108  * DropDatabaseBuffers
3109  *
3110  * This function removes all the buffers in the buffer cache for a
3111  * particular database. Dirty pages are simply dropped, without
3112  * bothering to write them out first. This is used when we destroy a
3113  * database, to avoid trying to flush data to disk when the directory
3114  * tree no longer exists. Implementation is pretty similar to
3115  * DropRelFileNodeBuffers() which is for destroying just one relation.
3116  * --------------------------------------------------------------------
3117  */
3118 void
3119 DropDatabaseBuffers(Oid dbid)
3120 {
3121  int i;
3122 
3123  /*
3124  * We needn't consider local buffers, since by assumption the target
3125  * database isn't our own.
3126  */
3127 
3128  for (i = 0; i < NBuffers; i++)
3129  {
3130  BufferDesc *bufHdr = GetBufferDescriptor(i);
3131  uint32 buf_state;
3132 
3133  /*
3134  * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
3135  * and saves some cycles.
3136  */
3137  if (bufHdr->tag.rnode.dbNode != dbid)
3138  continue;
3139 
3140  buf_state = LockBufHdr(bufHdr);
3141  if (bufHdr->tag.rnode.dbNode == dbid)
3142  InvalidateBuffer(bufHdr); /* releases spinlock */
3143  else
3144  UnlockBufHdr(bufHdr, buf_state);
3145  }
3146 }
3147 
3148 /* -----------------------------------------------------------------
3149  * PrintBufferDescs
3150  *
3151  * this function prints all the buffer descriptors, for debugging
3152  * use only.
3153  * -----------------------------------------------------------------
3154  */
3155 #ifdef NOT_USED
3156 void
3157 PrintBufferDescs(void)
3158 {
3159  int i;
3160 
3161  for (i = 0; i < NBuffers; ++i)
3162  {
3163  BufferDesc *buf = GetBufferDescriptor(i);
3164  Buffer b = BufferDescriptorGetBuffer(buf);
3165 
3166  /* theoretically we should lock the bufhdr here */
3167  elog(LOG,
3168  "[%02d] (freeNext=%d, rel=%s, "
3169  "blockNum=%u, flags=0x%x, refcount=%u %d)",
3170  i, buf->freeNext,
3171  relpathbackend(buf->tag.rnode, InvalidBackendId, buf->tag.forkNum),
3172  buf->tag.blockNum, buf->flags,
3173  buf->refcount, GetPrivateRefCount(b));
3174  }
3175 }
3176 #endif
3177 
3178 #ifdef NOT_USED
3179 void
3180 PrintPinnedBufs(void)
3181 {
3182  int i;
3183 
3184  for (i = 0; i < NBuffers; ++i)
3185  {
3186  BufferDesc *buf = GetBufferDescriptor(i);
3187  Buffer b = BufferDescriptorGetBuffer(buf);
3188 
3189  if (GetPrivateRefCount(b) > 0)
3190  {
3191  /* theoretically we should lock the bufhdr here */
3192  elog(LOG,
3193  "[%02d] (freeNext=%d, rel=%s, "
3194  "blockNum=%u, flags=0x%x, refcount=%u %d)",
3195  i, buf->freeNext,
3196  relpathperm(buf->tag.rnode, buf->tag.forkNum),
3197  buf->tag.blockNum, buf->flags,
3198  buf->refcount, GetPrivateRefCount(b));
3199  }
3200  }
3201 }
3202 #endif
3203 
3204 /* ---------------------------------------------------------------------
3205  * FlushRelationBuffers
3206  *
3207  * This function writes all dirty pages of a relation out to disk
3208  * (or more accurately, out to kernel disk buffers), ensuring that the
3209  * kernel has an up-to-date view of the relation.
3210  *
3211  * Generally, the caller should be holding AccessExclusiveLock on the
3212  * target relation to ensure that no other backend is busy dirtying
3213  * more blocks of the relation; the effects can't be expected to last
3214  * after the lock is released.
3215  *
3216  * XXX currently it sequentially searches the buffer pool, should be
3217  * changed to more clever ways of searching. This routine is not
3218  * used in any performance-critical code paths, so it's not worth
3219  * adding additional overhead to normal paths to make it go faster;
3220  * but see also DropRelFileNodeBuffers.
3221  * --------------------------------------------------------------------
3222  */
3223 void
3224 FlushRelationBuffers(Relation rel)
3225 {
3226  int i;
3227  BufferDesc *bufHdr;
3228 
3229  /* Open rel at the smgr level if not already done */
3230  RelationOpenSmgr(rel);
3231 
3232  if (RelationUsesLocalBuffers(rel))
3233  {
3234  for (i = 0; i < NLocBuffer; i++)
3235  {
3236  uint32 buf_state;
3237 
3238  bufHdr = GetLocalBufferDescriptor(i);
3239  if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
3240  ((buf_state = pg_atomic_read_u32(&bufHdr->state)) &
3241  (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
3242  {
3243  ErrorContextCallback errcallback;
3244  Page localpage;
3245 
3246  localpage = (char *) LocalBufHdrGetBlock(bufHdr);
3247 
3248  /* Setup error traceback support for ereport() */
3249  errcallback.callback = local_buffer_write_error_callback;
3250  errcallback.arg = (void *) bufHdr;
3251  errcallback.previous = error_context_stack;
3252  error_context_stack = &errcallback;
3253 
3254  PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
3255 
3256  smgrwrite(rel->rd_smgr,
3257  bufHdr->tag.forkNum,
3258  bufHdr->tag.blockNum,
3259  localpage,
3260  false);
3261 
3262  buf_state &= ~(BM_DIRTY | BM_JUST_DIRTIED);
3263  pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
3264 
3265  /* Pop the error context stack */
3266  error_context_stack = errcallback.previous;
3267  }
3268  }
3269 
3270  return;
3271  }
3272 
3273  /* Make sure we can handle the pin inside the loop */
3274  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
3275 
3276  for (i = 0; i < NBuffers; i++)
3277  {
3278  uint32 buf_state;
3279 
3280  bufHdr = GetBufferDescriptor(i);
3281 
3282  /*
3283  * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
3284  * and saves some cycles.
3285  */
3286  if (!RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node))
3287  continue;
3288 
3289  ReservePrivateRefCountEntry();
3290 
3291  buf_state = LockBufHdr(bufHdr);
3292  if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
3293  (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
3294  {
3295  PinBuffer_Locked(bufHdr);
3296  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
3297  FlushBuffer(bufHdr, rel->rd_smgr);
3298  LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
3299  UnpinBuffer(bufHdr, true);
3300  }
3301  else
3302  UnlockBufHdr(bufHdr, buf_state);
3303  }
3304 }
3305 
3306 /* ---------------------------------------------------------------------
3307  * FlushRelationsAllBuffers
3308  *
3309  * This function flushes out of the buffer pool all the pages of all
3310  * forks of the specified smgr relations. It's equivalent to calling
3311  * FlushRelationBuffers once per fork per relation. The relations are
3312  * assumed not to use local buffers.
3313  * --------------------------------------------------------------------
3314  */
3315 void
3316 FlushRelationsAllBuffers(SMgrRelation *smgrs, int nrels)
3317 {
3318  int i;
3319  SMgrSortArray *srels;
3320  bool use_bsearch;
3321 
3322  if (nrels == 0)
3323  return;
3324 
3325  /* fill-in array for qsort */
3326  srels = palloc(sizeof(SMgrSortArray) * nrels);
3327 
3328  for (i = 0; i < nrels; i++)
3329  {
3330  Assert(!RelFileNodeBackendIsTemp(smgrs[i]->smgr_rnode));
3331 
3332  srels[i].rnode = smgrs[i]->smgr_rnode.node;
3333  srels[i].srel = smgrs[i];
3334  }
3335 
3336  /*
3337  * Save the bsearch overhead for low number of relations to sync. See
3338  * DropRelFileNodesAllBuffers for details.
3339  */
3340  use_bsearch = nrels > RELS_BSEARCH_THRESHOLD;
3341 
3342  /* sort the list of SMgrRelations if necessary */
3343  if (use_bsearch)
3344  pg_qsort(srels, nrels, sizeof(SMgrSortArray), rnode_comparator);
3345 
3346  /* Make sure we can handle the pin inside the loop */
3347  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
3348 
3349  for (i = 0; i < NBuffers; i++)
3350  {
3351  SMgrSortArray *srelent = NULL;
3352  BufferDesc *bufHdr = GetBufferDescriptor(i);
3353  uint32 buf_state;
3354 
3355  /*
3356  * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
3357  * and saves some cycles.
3358  */
3359 
3360  if (!use_bsearch)
3361  {
3362  int j;
3363 
3364  for (j = 0; j < nrels; j++)
3365  {
3366  if (RelFileNodeEquals(bufHdr->tag.rnode, srels[j].rnode))
3367  {
3368  srelent = &srels[j];
3369  break;
3370  }
3371  }
3372 
3373  }
3374  else
3375  {
3376  srelent = bsearch((const void *) &(bufHdr->tag.rnode),
3377  srels, nrels, sizeof(SMgrSortArray),
3378  rnode_comparator);
3379 
3380 
3381  /* buffer doesn't belong to any of the given relfilenodes; skip it */
3382  if (srelent == NULL)
3383  continue;
3384 
3385  ReservePrivateRefCountEntry();
3386 
3387  buf_state = LockBufHdr(bufHdr);
3388  if (RelFileNodeEquals(bufHdr->tag.rnode, srelent->rnode) &&
3389  (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
3390  {
3391  PinBuffer_Locked(bufHdr);
3392  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
3393  FlushBuffer(bufHdr, srelent->srel);
3394  LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
3395  UnpinBuffer(bufHdr, true);
3396  }
3397  else
3398  UnlockBufHdr(bufHdr, buf_state);
3399  }
3400 
3401  pfree(srels);
3402 }
3403 
3404 /* ---------------------------------------------------------------------
3405  * FlushDatabaseBuffers
3406  *
3407  * This function writes all dirty pages of a database out to disk
3408  * (or more accurately, out to kernel disk buffers), ensuring that the
3409  * kernel has an up-to-date view of the database.
3410  *
3411  * Generally, the caller should be holding an appropriate lock to ensure
3412  * no other backend is active in the target database; otherwise more
3413  * pages could get dirtied.
3414  *
3415  * Note we don't worry about flushing any pages of temporary relations.
3416  * It's assumed these wouldn't be interesting.
3417  * --------------------------------------------------------------------
3418  */
3419 void
3420 FlushDatabaseBuffers(Oid dbid)
3421 {
3422  int i;
3423  BufferDesc *bufHdr;
3424 
3425  /* Make sure we can handle the pin inside the loop */
3426  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
3427 
3428  for (i = 0; i < NBuffers; i++)
3429  {
3430  uint32 buf_state;
3431 
3432  bufHdr = GetBufferDescriptor(i);
3433 
3434  /*
3435  * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
3436  * and saves some cycles.
3437  */
3438  if (bufHdr->tag.rnode.dbNode != dbid)
3439  continue;
3440 
3441  ReservePrivateRefCountEntry();
3442 
3443  buf_state = LockBufHdr(bufHdr);
3444  if (bufHdr->tag.rnode.dbNode == dbid &&
3445  (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
3446  {
3447  PinBuffer_Locked(bufHdr);
3448  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
3449  FlushBuffer(bufHdr, NULL);
3450  LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
3451  UnpinBuffer(bufHdr, true);
3452  }
3453  else
3454  UnlockBufHdr(bufHdr, buf_state);
3455  }
3456 }
3457 
3458 /*
3459  * Flush a previously, shared or exclusively, locked and pinned buffer to the
3460  * OS.
3461  */
3462 void
3463 FlushOneBuffer(Buffer buffer)
3464 {
3465  BufferDesc *bufHdr;
3466 
3467  /* currently not needed, but no fundamental reason not to support */
3468  Assert(!BufferIsLocal(buffer));
3469 
3470  Assert(BufferIsPinned(buffer));
3471 
3472  bufHdr = GetBufferDescriptor(buffer - 1);
3473 
3474  Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
3475 
3476  FlushBuffer(bufHdr, NULL);
3477 }
3478 
3479 /*
3480  * ReleaseBuffer -- release the pin on a buffer
3481  */
3482 void
3483 ReleaseBuffer(Buffer buffer)
3484 {
3485  if (!BufferIsValid(buffer))
3486  elog(ERROR, "bad buffer ID: %d", buffer);
3487 
3488  if (BufferIsLocal(buffer))
3489  {
3490  ResourceOwnerForgetBuffer(CurrentResourceOwner, buffer);
3491 
3492  Assert(LocalRefCount[-buffer - 1] > 0);
3493  LocalRefCount[-buffer - 1]--;
3494  return;
3495  }
3496 
3497  UnpinBuffer(GetBufferDescriptor(buffer - 1), true);
3498 }
3499 
3500 /*
3501  * UnlockReleaseBuffer -- release the content lock and pin on a buffer
3502  *
3503  * This is just a shorthand for a common combination.
3504  */
3505 void
3506 UnlockReleaseBuffer(Buffer buffer)
3507 {
3508  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3509  ReleaseBuffer(buffer);
3510 }
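/*
 * Editor's note (not in the original source): a minimal sketch of the usual
 * pin/lock/modify/release sequence these entry points support, assuming the
 * caller already holds an appropriate relation-level lock on "rel" and a
 * valid block number "blkno":
 *
 *		Buffer		buf = ReadBuffer(rel, blkno);
 *		Page		page;
 *
 *		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
 *		page = BufferGetPage(buf);
 *		... in a critical section: modify the page, MarkBufferDirty(buf),
 *		... then emit WAL and set the page LSN ...
 *		UnlockReleaseBuffer(buf);
 *
 * ReadBuffer pins the page, LockBuffer serializes access to its contents,
 * and UnlockReleaseBuffer drops both the content lock and the pin.
 */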
3511 
3512 /*
3513  * IncrBufferRefCount
3514  * Increment the pin count on a buffer that we have *already* pinned
3515  * at least once.
3516  *
3517  * This function cannot be used on a buffer we do not have pinned,
3518  * because it doesn't change the shared buffer state.
3519  */
3520 void
3521 IncrBufferRefCount(Buffer buffer)
3522 {
3523  Assert(BufferIsPinned(buffer));
3524  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
3525  if (BufferIsLocal(buffer))
3526  LocalRefCount[-buffer - 1]++;
3527  else
3528  {
3529  PrivateRefCountEntry *ref;
3530 
3531  ref = GetPrivateRefCountEntry(buffer, true);
3532  Assert(ref != NULL);
3533  ref->refcount++;
3534  }
3535  ResourceOwnerRememberBuffer(CurrentResourceOwner, buffer);
3536 }
3537 
3538 /*
3539  * MarkBufferDirtyHint
3540  *
3541  * Mark a buffer dirty for non-critical changes.
3542  *
3543  * This is essentially the same as MarkBufferDirty, except:
3544  *
3545  * 1. The caller does not write WAL; so if checksums are enabled, we may need
3546  * to write an XLOG_FPI WAL record to protect against torn pages.
3547  * 2. The caller might have only share-lock instead of exclusive-lock on the
3548  * buffer's content lock.
3549  * 3. This function does not guarantee that the buffer is always marked dirty
3550  * (due to a race condition), so it cannot be used for important changes.
3551  */
3552 void
3553 MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
3554 {
3555  BufferDesc *bufHdr;
3556  Page page = BufferGetPage(buffer);
3557 
3558  if (!BufferIsValid(buffer))
3559  elog(ERROR, "bad buffer ID: %d", buffer);
3560 
3561  if (BufferIsLocal(buffer))
3562  {
3563  MarkLocalBufferDirty(buffer);
3564  return;
3565  }
3566 
3567  bufHdr = GetBufferDescriptor(buffer - 1);
3568 
3569  Assert(GetPrivateRefCount(buffer) > 0);
3570  /* here, either share or exclusive lock is OK */
3571  Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
3572 
3573  /*
3574  * This routine might get called many times on the same page, if we are
3575  * making the first scan after commit of an xact that added/deleted many
3576  * tuples. So, be as quick as we can if the buffer is already dirty. We
3577  * do this by not acquiring spinlock if it looks like the status bits are
3578  * already set. Since we make this test unlocked, there's a chance we
3579  * might fail to notice that the flags have just been cleared, and failed
3580  * to reset them, due to memory-ordering issues. But since this function
3581  * is only intended to be used in cases where failing to write out the
3582  * data would be harmless anyway, it doesn't really matter.
3583  */
3584  if ((pg_atomic_read_u32(&bufHdr->state) & (BM_DIRTY | BM_JUST_DIRTIED)) !=
3585  (BM_DIRTY | BM_JUST_DIRTIED))
3586  {
3587  XLogRecPtr lsn = InvalidXLogRecPtr;
3588  bool dirtied = false;
3589  bool delayChkpt = false;
3590  uint32 buf_state;
3591 
3592  /*
3593  * If we need to protect hint bit updates from torn writes, WAL-log a
3594  * full page image of the page. This full page image is only necessary
3595  * if the hint bit update is the first change to the page since the
3596  * last checkpoint.
3597  *
3598  * We don't check full_page_writes here because that logic is included
3599  * when we call XLogInsert() since the value changes dynamically.
3600  */
3601  if (XLogHintBitIsNeeded() &&
3602  (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT))
3603  {
3604  /*
3605  * If we must not write WAL, due to a relfilenode-specific
3606  * condition or being in recovery, don't dirty the page. We can
3607  * set the hint, just not dirty the page as a result so the hint
3608  * is lost when we evict the page or shutdown.
3609  *
3610  * See src/backend/storage/page/README for longer discussion.
3611  */
3612  if (RecoveryInProgress() ||
3613  RelFileNodeSkippingWAL(bufHdr->tag.rnode))
3614  return;
3615 
3616  /*
3617  * If the block is already dirty because we either made a change
3618  * or set a hint already, then we don't need to write a full page
3619  * image. Note that aggressive cleaning of blocks dirtied by hint
3620  * bit setting would increase the call rate. Bulk setting of hint
3621  * bits would reduce the call rate...
3622  *
3623  * We must issue the WAL record before we mark the buffer dirty.
3624  * Otherwise we might write the page before we write the WAL. That
3625  * causes a race condition, since a checkpoint might occur between
3626  * writing the WAL record and marking the buffer dirty. We solve
3627  * that with a kluge, but one that is already in use during
3628  * transaction commit to prevent race conditions. Basically, we
3629  * simply prevent the checkpoint WAL record from being written
3630  * until we have marked the buffer dirty. We don't start the
3631  * checkpoint flush until we have marked dirty, so our checkpoint
3632  * must flush the change to disk successfully or the checkpoint
3633  * never gets written, in which case crash recovery will fix things up.
3634  *
3635  * It's possible we may enter here without an xid, so it is
3636  * essential that CreateCheckpoint waits for virtual transactions
3637  * rather than full transactionids.
3638  */
3639  MyProc->delayChkpt = delayChkpt = true;
3640  lsn = XLogSaveBufferForHint(buffer, buffer_std);
3641  }
3642 
3643  buf_state = LockBufHdr(bufHdr);
3644 
3645  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
3646 
3647  if (!(buf_state & BM_DIRTY))
3648  {
3649  dirtied = true; /* Means "will be dirtied by this action" */
3650 
3651  /*
3652  * Set the page LSN if we wrote a backup block. We aren't supposed
3653  * to set this when only holding a share lock but as long as we
3654  * serialise it somehow we're OK. We choose to set LSN while
3655  * holding the buffer header lock, which causes any reader of an
3656  * LSN who holds only a share lock to also obtain a buffer header
3657  * lock before using PageGetLSN(), which is enforced in
3658  * BufferGetLSNAtomic().
3659  *
3660  * If checksums are enabled, you might think we should reset the
3661  * checksum here. That will happen when the page is written
3662  * sometime later in this checkpoint cycle.
3663  */
3664  if (!XLogRecPtrIsInvalid(lsn))
3665  PageSetLSN(page, lsn);
3666  }
3667 
3668  buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
3669  UnlockBufHdr(bufHdr, buf_state);
3670 
3671  if (delayChkpt)
3672  MyProc->delayChkpt = false;
3673 
3674  if (dirtied)
3675  {
3676  VacuumPageDirty++;
3677  pgBufferUsage.shared_blks_dirtied++;
3678  if (VacuumCostActive)
3679  VacuumCostBalance += VacuumCostPageDirty;
3680  }
3681  }
3682 }
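/*
 * Illustrative sketch (not part of bufmgr.c): the usage pattern the comments
 * above describe.  A caller holding a pin and at least a share content lock
 * sets a hint-style bit and then reports it via MarkBufferDirtyHint(), letting
 * that function decide whether a full-page image must be WAL-logged first.
 * This is modeled loosely on how heap hint bits are set; the real
 * SetHintBits() also consults XLogNeedsFlush() for asynchronously committed
 * transactions, which is omitted here.  The function name is hypothetical and
 * access/htup_details.h is assumed to be included.
 */
static void
example_set_xmin_committed_hint(Buffer buffer, HeapTupleHeader tuple)
{
	/* caller holds a pin and a share (or stronger) content lock on 'buffer' */
	tuple->t_infomask |= HEAP_XMIN_COMMITTED;

	/* buffer_std = true: page follows the standard page layout */
	MarkBufferDirtyHint(buffer, true);
}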
3683 
3684 /*
3685  * Release buffer content locks for shared buffers.
3686  *
3687  * Used to clean up after errors.
3688  *
3689  * Currently, we can expect that lwlock.c's LWLockReleaseAll() took care
3690  * of releasing buffer content locks per se; the only thing we need to deal
3691  * with here is clearing any PIN_COUNT request that was in progress.
3692  */
3693 void
3694 UnlockBuffers(void)
3695 {
3696  BufferDesc *buf = PinCountWaitBuf;
3697 
3698  if (buf)
3699  {
3700  uint32 buf_state;
3701 
3702  buf_state = LockBufHdr(buf);
3703 
3704  /*
3705  * Don't complain if flag bit not set; it could have been reset but we
3706  * got a cancel/die interrupt before getting the signal.
3707  */
3708  if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
3709  buf->wait_backend_pid == MyProcPid)
3710  buf_state &= ~BM_PIN_COUNT_WAITER;
3711 
3712  UnlockBufHdr(buf, buf_state);
3713 
3714  PinCountWaitBuf = NULL;
3715  }
3716 }
3717 
3718 /*
3719  * Acquire or release the content_lock for the buffer.
3720  */
3721 void
3722 LockBuffer(Buffer buffer, int mode)
3723 {
3724  BufferDesc *buf;
3725 
3726  Assert(BufferIsValid(buffer));
3727  if (BufferIsLocal(buffer))
3728  return; /* local buffers need no lock */
3729 
3730  buf = GetBufferDescriptor(buffer - 1);
3731 
3732  if (mode == BUFFER_LOCK_UNLOCK)
3733  LWLockRelease(BufferDescriptorGetContentLock(buf));
3734  else if (mode == BUFFER_LOCK_SHARE)
3735  LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_SHARED);
3736  else if (mode == BUFFER_LOCK_EXCLUSIVE)
3737  LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_EXCLUSIVE);
3738  else
3739  elog(ERROR, "unrecognized buffer lock mode: %d", mode);
3740 }
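/*
 * Illustrative sketch (not part of bufmgr.c): the usual pin-then-lock pattern
 * for readers.  ReadBuffer() pins the page, LockBuffer() takes the content
 * lock in the requested mode, and UnlockReleaseBuffer() drops both.  The
 * function name and the use of PageGetMaxOffsetNumber() are only for
 * illustration.
 */
static OffsetNumber
example_count_line_pointers(Relation rel, BlockNumber blkno)
{
	Buffer		buf = ReadBuffer(rel, blkno);
	Page		page;
	OffsetNumber maxoff;

	LockBuffer(buf, BUFFER_LOCK_SHARE);
	page = BufferGetPage(buf);
	maxoff = PageGetMaxOffsetNumber(page);
	UnlockReleaseBuffer(buf);

	return maxoff;
}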
3741 
3742 /*
3743  * Acquire the content_lock for the buffer, but only if we don't have to wait.
3744  *
3745  * This assumes the caller wants BUFFER_LOCK_EXCLUSIVE mode.
3746  */
3747 bool
3748 ConditionalLockBuffer(Buffer buffer)
3749 {
3750  BufferDesc *buf;
3751 
3752  Assert(BufferIsValid(buffer));
3753  if (BufferIsLocal(buffer))
3754  return true; /* act as though we got it */
3755 
3756  buf = GetBufferDescriptor(buffer - 1);
3757 
3758  return LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
3759  LW_EXCLUSIVE);
3760 }
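/*
 * Illustrative sketch (not part of bufmgr.c): a caller that prefers to skip
 * work rather than wait for the content lock.  If the exclusive lock is not
 * immediately available we simply give up and let the caller retry later.
 * The function name is hypothetical.
 */
static bool
example_try_exclusive_work(Buffer buffer)
{
	if (!ConditionalLockBuffer(buffer))
		return false;			/* someone else holds the content lock */

	/* ... modify the page here, then MarkBufferDirty(buffer) ... */

	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
	return true;
}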
3761 
3762 /*
3763  * LockBufferForCleanup - lock a buffer in preparation for deleting items
3764  *
3765  * Items may be deleted from a disk page only when the caller (a) holds an
3766  * exclusive lock on the buffer and (b) has observed that no other backend
3767  * holds a pin on the buffer. If there is a pin, then the other backend
3768  * might have a pointer into the buffer (for example, a heapscan reference
3769  * to an item --- see README for more details). It's OK if a pin is added
3770  * after the cleanup starts, however; the newly-arrived backend will be
3771  * unable to look at the page until we release the exclusive lock.
3772  *
3773  * To implement this protocol, a would-be deleter must pin the buffer and
3774  * then call LockBufferForCleanup(). LockBufferForCleanup() is similar to
3775  * LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE), except that it loops until
3776  * it has successfully observed pin count = 1.
3777  */
3778 void
3779 LockBufferForCleanup(Buffer buffer)
3780 {
3781  BufferDesc *bufHdr;
3782  char *new_status = NULL;
3783 
3784  Assert(BufferIsValid(buffer));
3785  Assert(PinCountWaitBuf == NULL);
3786 
3787  if (BufferIsLocal(buffer))
3788  {
3789  /* There should be exactly one pin */
3790  if (LocalRefCount[-buffer - 1] != 1)
3791  elog(ERROR, "incorrect local pin count: %d",
3792  LocalRefCount[-buffer - 1]);
3793  /* Nobody else to wait for */
3794  return;
3795  }
3796 
3797  /* There should be exactly one local pin */
3798  if (GetPrivateRefCount(buffer) != 1)
3799  elog(ERROR, "incorrect local pin count: %d",
3800  GetPrivateRefCount(buffer));
3801 
3802  bufHdr = GetBufferDescriptor(buffer - 1);
3803 
3804  for (;;)
3805  {
3806  uint32 buf_state;
3807 
3808  /* Try to acquire lock */
3809  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3810  buf_state = LockBufHdr(bufHdr);
3811 
3812  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
3813  if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
3814  {
3815  /* Successfully acquired exclusive lock with pincount 1 */
3816  UnlockBufHdr(bufHdr, buf_state);
3817 
3818  /* Report change to non-waiting status */
3819  if (new_status)
3820  {
3821  set_ps_display(new_status);
3822  pfree(new_status);
3823  }
3824  return;
3825  }
3826  /* Failed, so mark myself as waiting for pincount 1 */
3827  if (buf_state & BM_PIN_COUNT_WAITER)
3828  {
3829  UnlockBufHdr(bufHdr, buf_state);
3830  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3831  elog(ERROR, "multiple backends attempting to wait for pincount 1");
3832  }
3833  bufHdr->wait_backend_pid = MyProcPid;
3834  PinCountWaitBuf = bufHdr;
3835  buf_state |= BM_PIN_COUNT_WAITER;
3836  UnlockBufHdr(bufHdr, buf_state);
3837  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3838 
3839  /* Wait to be signaled by UnpinBuffer() */
3840  if (InHotStandby)
3841  {
3842  /* Report change to waiting status */
3843  if (update_process_title && new_status == NULL)
3844  {
3845  const char *old_status;
3846  int len;
3847 
3848  old_status = get_ps_display(&len);
3849  new_status = (char *) palloc(len + 8 + 1);
3850  memcpy(new_status, old_status, len);
3851  strcpy(new_status + len, " waiting");
3852  set_ps_display(new_status);
3853  new_status[len] = '\0'; /* truncate off " waiting" */
3854  }
3855 
3856  /* Publish the bufid that Startup process waits on */
3857  SetStartupBufferPinWaitBufId(buffer - 1);
3858  /* Set alarm and then wait to be signaled by UnpinBuffer() */
3859  ResolveRecoveryConflictWithBufferPin();
3860  /* Reset the published bufid */
3861  SetStartupBufferPinWaitBufId(-1);
3862  }
3863  else
3864  ProcWaitForSignal(PG_WAIT_BUFFER_PIN);
3865 
3866  /*
3867  * Remove flag marking us as waiter. Normally this will not be set
3868  * anymore, but ProcWaitForSignal() can return for other signals as
3869  * well. We take care to only reset the flag if we're the waiter, as
3870  * theoretically another backend could have started waiting. That's
3871  * impossible with the current usages due to table level locking, but
3872  * better be safe.
3873  */
3874  buf_state = LockBufHdr(bufHdr);
3875  if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
3876  bufHdr->wait_backend_pid == MyProcPid)
3877  buf_state &= ~BM_PIN_COUNT_WAITER;
3878  UnlockBufHdr(bufHdr, buf_state);
3879 
3880  PinCountWaitBuf = NULL;
3881  /* Loop back and try again */
3882  }
3883 }
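/*
 * Illustrative sketch (not part of bufmgr.c): the protocol the comment above
 * describes.  A would-be deleter pins the page first, then calls
 * LockBufferForCleanup(), which returns only once it holds the exclusive
 * content lock and has observed a pin count of 1.  The function name and the
 * "delete items" placeholder are hypothetical; VACUUM's second heap pass
 * works along these lines.
 */
static void
example_cleanup_page(Relation rel, BlockNumber blkno)
{
	Buffer		buf = ReadBuffer(rel, blkno);

	LockBufferForCleanup(buf);	/* may block until no other backend holds a pin */

	/* ... safe to remove line pointers / defragment the page here ... */

	UnlockReleaseBuffer(buf);
}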
3884 
3885 /*
3886  * Check called from RecoveryConflictInterrupt handler when Startup
3887  * process requests cancellation of all pin holders that are blocking it.
3888  */
3889 bool
3890 HoldingBufferPinThatDelaysRecovery(void)
3891 {
3892  int bufid = GetStartupBufferPinWaitBufId();
3893 
3894  /*
3895  * If we get woken slowly then it's possible that the Startup process was
3896  * already woken by other backends before we got here. Also possible that
3897  * we get here by multiple interrupts or interrupts at inappropriate
3898  * times, so make sure we do nothing if the bufid is not set.
3899  */
3900  if (bufid < 0)
3901  return false;
3902 
3903  if (GetPrivateRefCount(bufid + 1) > 0)
3904  return true;
3905 
3906  return false;
3907 }
3908 
3909 /*
3910  * ConditionalLockBufferForCleanup - as above, but don't wait to get the lock
3911  *
3912  * We won't loop, but just check once to see if the pin count is OK. If
3913  * not, return false with no lock held.
3914  */
3915 bool
3916 ConditionalLockBufferForCleanup(Buffer buffer)
3917 {
3918  BufferDesc *bufHdr;
3919  uint32 buf_state,
3920  refcount;
3921 
3922  Assert(BufferIsValid(buffer));
3923 
3924  if (BufferIsLocal(buffer))
3925  {
3926  refcount = LocalRefCount[-buffer - 1];
3927  /* There should be exactly one pin */
3928  Assert(refcount > 0);
3929  if (refcount != 1)
3930  return false;
3931  /* Nobody else to wait for */
3932  return true;
3933  }
3934 
3935  /* There should be exactly one local pin */
3936  refcount = GetPrivateRefCount(buffer);
3937  Assert(refcount);
3938  if (refcount != 1)
3939  return false;
3940 
3941  /* Try to acquire lock */
3942  if (!ConditionalLockBuffer(buffer))
3943  return false;
3944 
3945  bufHdr = GetBufferDescriptor(buffer - 1);
3946  buf_state = LockBufHdr(bufHdr);
3947  refcount = BUF_STATE_GET_REFCOUNT(buf_state);
3948 
3949  Assert(refcount > 0);
3950  if (refcount == 1)
3951  {
3952  /* Successfully acquired exclusive lock with pincount 1 */
3953  UnlockBufHdr(bufHdr, buf_state);
3954  return true;
3955  }
3956 
3957  /* Failed, so release the lock */
3958  UnlockBufHdr(bufHdr, buf_state);
3959  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3960  return false;
3961 }
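/*
 * Illustrative sketch (not part of bufmgr.c): the opportunistic variant.
 * When a cleanup lock cannot be had immediately, the caller can simply skip
 * the page and come back later, which is roughly what lazy VACUUM does for
 * most heap pages.  The function name is hypothetical; the caller keeps its
 * pin and only the content lock is released here.
 */
static bool
example_try_cleanup_page(Buffer buf)
{
	if (!ConditionalLockBufferForCleanup(buf))
		return false;			/* page is busy; try again later */

	/* ... cleanup actions requiring pin count == 1 go here ... */

	LockBuffer(buf, BUFFER_LOCK_UNLOCK);
	return true;
}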
3962 
3963 /*
3964  * IsBufferCleanupOK - as above, but we already have the lock
3965  *
3966  * Check whether it's OK to perform cleanup on a buffer we've already
3967  * locked. If we observe that the pin count is 1, our exclusive lock
3968  * happens to be a cleanup lock, and we can proceed with anything that
3969  * would have been allowable had we sought a cleanup lock originally.
3970  */
3971 bool
3972 IsBufferCleanupOK(Buffer buffer)
3973 {
3974  BufferDesc *bufHdr;
3975  uint32 buf_state;
3976 
3977  Assert(BufferIsValid(buffer));
3978 
3979  if (BufferIsLocal(buffer))
3980  {
3981  /* There should be exactly one pin */
3982  if (LocalRefCount[-buffer - 1] != 1)
3983  return false;
3984  /* Nobody else to wait for */
3985  return true;
3986  }
3987 
3988  /* There should be exactly one local pin */
3989  if (GetPrivateRefCount(buffer) != 1)
3990  return false;
3991 
3992  bufHdr = GetBufferDescriptor(buffer - 1);
3993 
3994  /* caller must hold exclusive lock on buffer */
3995  Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
3996  LW_EXCLUSIVE));
3997 
3998  buf_state = LockBufHdr(bufHdr);
3999 
4000  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
4001  if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
4002  {
4003  /* pincount is OK. */
4004  UnlockBufHdr(bufHdr, buf_state);
4005  return true;
4006  }
4007 
4008  UnlockBufHdr(bufHdr, buf_state);
4009  return false;
4010 }
4011 
4012 
4013 /*
4014  * Functions for buffer I/O handling
4015  *
4016  * Note: We assume that nested buffer I/O never occurs.
4017  * i.e., at most one io_in_progress lock is held per proc.
4018  *
4019  * Also note that these are used only for shared buffers, not local ones.
4020  */
4021 
4022 /*
4023  * WaitIO -- Block until the IO_IN_PROGRESS flag on 'buf' is cleared.
4024  */
4025 static void
4026 WaitIO(BufferDesc *buf)
4027 {
4028  /*
4029  * Changed to wait until there's no IO - Inoue 01/13/2000
4030  *
4031  * Note this is *necessary* because an error abort in the process doing
4032  * I/O could release the io_in_progress_lock prematurely. See
4033  * AbortBufferIO.
4034  */
4035  for (;;)
4036  {
4037  uint32 buf_state;
4038 
4039  /*
4040  * It may not be necessary to acquire the spinlock to check the flag
4041  * here, but since this test is essential for correctness, we'd better
4042  * play it safe.
4043  */
4044  buf_state = LockBufHdr(buf);
4045  UnlockBufHdr(buf, buf_state);
4046 
4047  if (!(buf_state & BM_IO_IN_PROGRESS))
4048  break;
4049  LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_SHARED);
4050  LWLockRelease(BufferDescriptorGetIOLock(buf));
4051  }
4052 }
4053 
4054 /*
4055  * StartBufferIO: begin I/O on this buffer
4056  * (Assumptions)
4057  * My process is executing no IO
4058  * The buffer is Pinned
4059  *
4060  * In some scenarios there are race conditions in which multiple backends
4061  * could attempt the same I/O operation concurrently. If someone else
4062  * has already started I/O on this buffer then we will block on the
4063  * io_in_progress lock until he's done.
4064  *
4065  * Input operations are only attempted on buffers that are not BM_VALID,
4066  * and output operations only on buffers that are BM_VALID and BM_DIRTY,
4067  * so we can always tell if the work is already done.
4068  *
4069  * Returns true if we successfully marked the buffer as I/O busy,
4070  * false if someone else already did the work.
4071  */
4072 static bool
4073 StartBufferIO(BufferDesc *buf, bool forInput)
4074 {
4075  uint32 buf_state;
4076 
4077  Assert(!InProgressBuf);
4078 
4079  for (;;)
4080  {
4081  /*
4082  * Grab the io_in_progress lock so that other processes can wait for
4083  * me to finish the I/O.
4084  */
4085  LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
4086 
4087  buf_state = LockBufHdr(buf);
4088 
4089  if (!(buf_state & BM_IO_IN_PROGRESS))
4090  break;
4091 
4092  /*
4093  * The only way BM_IO_IN_PROGRESS could be set when the io_in_progress
4094  * lock isn't held is if the process doing the I/O is recovering from
4095  * an error (see AbortBufferIO). If that's the case, we must wait for
4096  * him to get unwedged.
4097  */
4098  UnlockBufHdr(buf, buf_state);
4099  LWLockRelease(BufferDescriptorGetIOLock(buf));
4100  WaitIO(buf);
4101  }
4102 
4103  /* Once we get here, there is definitely no I/O active on this buffer */
4104 
4105  if (forInput ? (buf_state & BM_VALID) : !(buf_state & BM_DIRTY))
4106  {
4107  /* someone else already did the I/O */
4108  UnlockBufHdr(buf, buf_state);
4109  LWLockRelease(BufferDescriptorGetIOLock(buf));
4110  return false;
4111  }
4112 
4113  buf_state |= BM_IO_IN_PROGRESS;
4114  UnlockBufHdr(buf, buf_state);
4115 
4116  InProgressBuf = buf;
4117  IsForInput = forInput;
4118 
4119  return true;
4120 }
4121 
4122 /*
4123  * TerminateBufferIO: release a buffer we were doing I/O on
4124  * (Assumptions)
4125  * My process is executing IO for the buffer
4126  * BM_IO_IN_PROGRESS bit is set for the buffer
4127  * We hold the buffer's io_in_progress lock
4128  * The buffer is Pinned
4129  *
4130  * If clear_dirty is true and BM_JUST_DIRTIED is not set, we clear the
4131  * buffer's BM_DIRTY flag. This is appropriate when terminating a
4132  * successful write. The check on BM_JUST_DIRTIED is necessary to avoid
4133  * marking the buffer clean if it was re-dirtied while we were writing.
4134  *
4135  * set_flag_bits gets ORed into the buffer's flags. It must include
4136  * BM_IO_ERROR in a failure case. For successful completion it could
4137  * be 0, or BM_VALID if we just finished reading in the page.
4138  */
4139 static void
4140 TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits)
4141 {
4142  uint32 buf_state;
4143 
4144  Assert(buf == InProgressBuf);
4145 
4146  buf_state = LockBufHdr(buf);
4147 
4148  Assert(buf_state & BM_IO_IN_PROGRESS);
4149 
4150  buf_state &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR);
4151  if (clear_dirty && !(buf_state & BM_JUST_DIRTIED))
4152  buf_state &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
4153 
4154  buf_state |= set_flag_bits;
4155  UnlockBufHdr(buf, buf_state);
4156 
4157  InProgressBuf = NULL;
4158 
4159  LWLockRelease(BufferDescriptorGetIOLock(buf));
4160 }
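/*
 * Illustrative sketch (not part of bufmgr.c): how StartBufferIO() and
 * TerminateBufferIO() are meant to bracket a physical read.  This condenses
 * what ReadBuffer_common() does; error handling, zero_damaged_pages handling
 * and statistics are omitted, and the function name is hypothetical.
 */
static void
example_read_into_buffer(BufferDesc *bufHdr, SMgrRelation smgr)
{
	Block		bufBlock = BufHdrGetBlock(bufHdr);

	if (!StartBufferIO(bufHdr, true))
		return;					/* somebody else already read the page in */

	smgrread(smgr, bufHdr->tag.forkNum, bufHdr->tag.blockNum,
			 (char *) bufBlock);

	/* mark the buffer valid and release the io_in_progress lock */
	TerminateBufferIO(bufHdr, false, BM_VALID);
}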
4161 
4162 /*
4163  * AbortBufferIO: Clean up any active buffer I/O after an error.
4164  *
4165  * All LWLocks we might have held have been released,
4166  * but we haven't yet released buffer pins, so the buffer is still pinned.
4167  *
4168  * If I/O was in progress, we always set BM_IO_ERROR, even though it's
4169  * possible the error condition wasn't related to the I/O.
4170  */
4171 void
4172 AbortBufferIO(void)
4173 {
4174  BufferDesc *buf = InProgressBuf;
4175 
4176  if (buf)
4177  {
4178  uint32 buf_state;
4179 
4180  /*
4181  * Since LWLockReleaseAll has already been called, we're not holding
4182  * the buffer's io_in_progress_lock. We have to re-acquire it so that
4183  * we can use TerminateBufferIO. Anyone who's executing WaitIO on the
4184  * buffer will be in a busy spin until we succeed in doing this.
4185  */
4186  LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
4187 
4188  buf_state = LockBufHdr(buf);
4189  Assert(buf_state & BM_IO_IN_PROGRESS);
4190  if (IsForInput)
4191  {
4192  Assert(!(buf_state & BM_DIRTY));
4193 
4194  /* We'd better not think buffer is valid yet */
4195  Assert(!(buf_state & BM_VALID));
4196  UnlockBufHdr(buf, buf_state);
4197  }
4198  else
4199  {
4200  Assert(buf_state & BM_DIRTY);
4201  UnlockBufHdr(buf, buf_state);
4202  /* Issue notice if this is not the first failure... */
4203  if (buf_state & BM_IO_ERROR)
4204  {
4205  /* Buffer is pinned, so we can read tag without spinlock */
4206  char *path;
4207 
4208  path = relpathperm(buf->tag.rnode, buf->tag.forkNum);
4209  ereport(WARNING,
4210  (errcode(ERRCODE_IO_ERROR),
4211  errmsg("could not write block %u of %s",
4212  buf->tag.blockNum, path),
4213  errdetail("Multiple failures --- write error might be permanent.")));
4214  pfree(path);
4215  }
4216  }
4217  TerminateBufferIO(buf, false, BM_IO_ERROR);
4218  }
4219 }
4220 
4221 /*
4222  * Error context callback for errors occurring during shared buffer writes.
4223  */
4224 static void
4225 shared_buffer_write_error_callback(void *arg)
4226 {
4227  BufferDesc *bufHdr = (BufferDesc *) arg;
4228 
4229  /* Buffer is pinned, so we can read the tag without locking the spinlock */
4230  if (bufHdr != NULL)
4231  {
4232  char *path = relpathperm(bufHdr->tag.rnode, bufHdr->tag.forkNum);
4233 
4234  errcontext("writing block %u of relation %s",
4235  bufHdr->tag.blockNum, path);
4236  pfree(path);
4237  }
4238 }
4239 
4240 /*
4241  * Error context callback for errors occurring during local buffer writes.
4242  */
4243 static void
4244 local_buffer_write_error_callback(void *arg)
4245 {
4246  BufferDesc *bufHdr = (BufferDesc *) arg;
4247 
4248  if (bufHdr != NULL)
4249  {
4250  char *path = relpathbackend(bufHdr->tag.rnode, MyBackendId,
4251  bufHdr->tag.forkNum);
4252 
4253  errcontext("writing block %u of relation %s",
4254  bufHdr->tag.blockNum, path);
4255  pfree(path);
4256  }
4257 }
4258 
4259 /*
4260  * RelFileNode qsort/bsearch comparator; see RelFileNodeEquals.
4261  */
4262 static int
4263 rnode_comparator(const void *p1, const void *p2)
4264 {
4265  RelFileNode n1 = *(const RelFileNode *) p1;
4266  RelFileNode n2 = *(const RelFileNode *) p2;
4267 
4268  if (n1.relNode < n2.relNode)
4269  return -1;
4270  else if (n1.relNode > n2.relNode)
4271  return 1;
4272 
4273  if (n1.dbNode < n2.dbNode)
4274  return -1;
4275  else if (n1.dbNode > n2.dbNode)
4276  return 1;
4277 
4278  if (n1.spcNode < n2.spcNode)
4279  return -1;
4280  else if (n1.spcNode > n2.spcNode)
4281  return 1;
4282  else
4283  return 0;
4284 }
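/*
 * Illustrative sketch (not part of bufmgr.c): how rnode_comparator() can be
 * used with pg_qsort() and bsearch(), along the lines of
 * DropRelFileNodesAllBuffers().  'nodes' and 'nnodes' are hypothetical
 * inputs, and a real caller would sort once up front rather than per lookup.
 */
static bool
example_rnode_is_listed(RelFileNode target, RelFileNode *nodes, int nnodes)
{
	/* sort with the same comparator ... */
	pg_qsort(nodes, nnodes, sizeof(RelFileNode), rnode_comparator);

	/* ... then probe with bsearch */
	return bsearch(&target, nodes, nnodes,
				   sizeof(RelFileNode), rnode_comparator) != NULL;
}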
4285 
4286 /*
4287  * Lock buffer header - set BM_LOCKED in buffer state.
4288  */
4289 uint32
4290 LockBufHdr(BufferDesc *desc)
4291 {
4292  SpinDelayStatus delayStatus;
4293  uint32 old_buf_state;
4294 
4295  init_local_spin_delay(&delayStatus);
4296 
4297  while (true)
4298  {
4299  /* set BM_LOCKED flag */
4300  old_buf_state = pg_atomic_fetch_or_u32(&desc->state, BM_LOCKED);
4301  /* if it wasn't set before we're OK */
4302  if (!(old_buf_state & BM_LOCKED))
4303  break;
4304  perform_spin_delay(&delayStatus);
4305  }
4306  finish_spin_delay(&delayStatus);
4307  return old_buf_state | BM_LOCKED;
4308 }
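/*
 * Illustrative sketch (not part of bufmgr.c): the standard pattern for the
 * buffer header spinlock.  LockBufHdr() returns the state word with BM_LOCKED
 * set; the caller adjusts its copy and publishes it with UnlockBufHdr(),
 * which also clears BM_LOCKED.  The flag chosen here and the function name
 * are hypothetical.
 */
static void
example_mark_checkpoint_needed(BufferDesc *bufHdr)
{
	uint32		buf_state;

	buf_state = LockBufHdr(bufHdr);
	if (buf_state & BM_DIRTY)
		buf_state |= BM_CHECKPOINT_NEEDED;
	UnlockBufHdr(bufHdr, buf_state);
}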
4309 
4310 /*
4311  * Wait until the BM_LOCKED flag isn't set anymore and return the buffer's
4312  * state at that point.
4313  *
4314  * Obviously the buffer could be locked by the time the value is returned, so
4315  * this is primarily useful in CAS style loops.
4316  */
4317 static uint32
4318 WaitBufHdrUnlocked(BufferDesc *buf)
4319 {
4320  SpinDelayStatus delayStatus;
4321  uint32 buf_state;
4322 
4323  init_local_spin_delay(&delayStatus);
4324 
4325  buf_state = pg_atomic_read_u32(&buf->state);
4326 
4327  while (buf_state & BM_LOCKED)
4328  {
4329  perform_spin_delay(&delayStatus);
4330  buf_state = pg_atomic_read_u32(&buf->state);
4331  }
4332 
4333  finish_spin_delay(&delayStatus);
4334 
4335  return buf_state;
4336 }
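/*
 * Illustrative sketch (not part of bufmgr.c): the CAS-style loop the comment
 * above alludes to, modeled on how PinBuffer() updates the state word without
 * taking the header spinlock.  The usage-count bump applied here is only an
 * example, and the function name is hypothetical.
 */
static void
example_bump_usagecount(BufferDesc *buf)
{
	uint32		old_buf_state = pg_atomic_read_u32(&buf->state);
	uint32		buf_state;

	for (;;)
	{
		/* wait out a concurrent header-lock holder, if any */
		if (old_buf_state & BM_LOCKED)
			old_buf_state = WaitBufHdrUnlocked(buf);

		buf_state = old_buf_state;
		if (BUF_STATE_GET_USAGECOUNT(buf_state) < BM_MAX_USAGE_COUNT)
			buf_state += BUF_USAGECOUNT_ONE;

		/* on failure, old_buf_state is refreshed and we retry */
		if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
										   buf_state))
			break;
	}
}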
4337 
4338 /*
4339  * BufferTag comparator.
4340  */
4341 static int
4342 buffertag_comparator(const void *a, const void *b)
4343 {
4344  const BufferTag *ba = (const BufferTag *) a;
4345  const BufferTag *bb = (const BufferTag *) b;
4346  int ret;
4347 
4348  ret = rnode_comparator(&ba->rnode, &bb->rnode);
4349 
4350  if (ret != 0)
4351  return ret;
4352 
4353  if (ba->forkNum < bb->forkNum)
4354  return -1;
4355  if (ba->forkNum > bb->forkNum)
4356  return 1;
4357 
4358  if (ba->blockNum < bb->blockNum)
4359  return -1;
4360  if (ba->blockNum > bb->blockNum)
4361  return 1;
4362 
4363  return 0;
4364 }
4365 
4366 /*
4367  * Comparator determining the writeout order in a checkpoint.
4368  *
4369  * It is important that tablespaces are compared first, the logic balancing
4370  * writes between tablespaces relies on it.
4371  */
4372 static int
4373 ckpt_buforder_comparator(const void *pa, const void *pb)
4374 {
4375  const CkptSortItem *a = (const CkptSortItem *) pa;
4376  const CkptSortItem *b = (const CkptSortItem *) pb;
4377 
4378  /* compare tablespace */
4379  if (a->tsId < b->tsId)
4380  return -1;
4381  else if (a->tsId > b->tsId)
4382  return 1;
4383  /* compare relation */
4384  if (a->relNode < b->relNode)
4385  return -1;
4386  else if (a->relNode > b->relNode)
4387  return 1;
4388  /* compare fork */
4389  else if (a->forkNum < b->forkNum)
4390  return -1;
4391  else if (a->forkNum > b->forkNum)
4392  return 1;
4393  /* compare block number */
4394  else if (a->blockNum < b->blockNum)
4395  return -1;
4396  else if (a->blockNum > b->blockNum)
4397  return 1;
4398  /* equal page IDs are unlikely, but not impossible */
4399  return 0;
4400 }
4401 
4402 /*
4403  * Comparator for a Min-Heap over the per-tablespace checkpoint completion
4404  * progress.
4405  */
4406 static int
4407 ts_ckpt_progress_comparator(Datum a, Datum b, void *arg)
4408 {
4409  CkptTsStatus *sa = (CkptTsStatus *) a;
4410  CkptTsStatus *sb = (CkptTsStatus *) b;
4411 
4412  /* we want a min-heap, so return 1 if a < b */
4413  if (sa->progress < sb->progress)
4414  return 1;
4415  else if (sa->progress == sb->progress)
4416  return 0;
4417  else
4418  return -1;
4419 }
4420 
4421 /*
4422  * Initialize a writeback context, discarding potential previous state.
4423  *
4424  * *max_pending is a pointer instead of an immediate value, so the coalesce
4425  * limits can easily be changed by the GUC mechanism, and so calling code does
4426  * not have to check the current configuration. A value of 0 means that no
4427  * writeback control will be performed.
4428  */
4429 void
4430 WritebackContextInit(WritebackContext *context, int *max_pending)
4431 {
4432  Assert(*max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
4433 
4434  context->max_pending = max_pending;
4435  context->nr_pending = 0;
4436 }
4437 
4438 /*
4439  * Add buffer to list of pending writeback requests.
4440  */
4441 void
4442 ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag)
4443 {
4444  PendingWriteback *pending;
4445 
4446  /*
4447  * Add buffer to the pending writeback array, unless writeback control is
4448  * disabled.
4449  */
4450  if (*context->max_pending > 0)
4451  {
4452  Assert(*context->max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
4453 
4454  pending = &context->pending_writebacks[context->nr_pending++];
4455 
4456  pending->tag = *tag;
4457  }
4458 
4459  /*
4460  * Perform pending flushes if the writeback limit is exceeded. This
4461  * includes the case where previously an item has been added, but control
4462  * is now disabled.
4463  */
4464  if (context->nr_pending >= *context->max_pending)
4465  IssuePendingWritebacks(context);
4466 }
4467 
4468 /*
4469  * Issue all pending writeback requests, previously scheduled with
4470  * ScheduleBufferTagForWriteback, to the OS.
4471  *
4472  * Because this is only used to improve the OS's I/O scheduling, we try to never
4473  * error out - it's just a hint.
4474  */
4475 void
4476 IssuePendingWritebacks(WritebackContext *context)
4477 {
4478  int i;
4479 
4480  if (context->nr_pending == 0)
4481  return;
4482 
4483  /*
4484  * Executing the writes in-order can make them a lot faster, and allows us to
4485  * merge writeback requests to consecutive blocks into larger writebacks.
4486  */
4487  qsort(&context->pending_writebacks, context->nr_pending,
4488  sizeof(PendingWriteback), buffertag_comparator);
4489 
4490  /*
4491  * Coalesce neighbouring writes, but nothing else. For that we iterate
4492  * through the, now sorted, array of pending flushes, and look forward to
4493  * find all neighbouring (or identical) writes.
4494  */
4495  for (i = 0; i < context->nr_pending; i++)
4496  {
4497  PendingWriteback *cur;
4498  PendingWriteback *next;
4499  SMgrRelation reln;
4500  int ahead;
4501  BufferTag tag;
4502  Size nblocks = 1;
4503 
4504  cur = &context->pending_writebacks[i];
4505  tag = cur->tag;
4506 
4507  /*
4508  * Peek ahead, into following writeback requests, to see if they can
4509  * be combined with the current one.
4510  */
4511  for (ahead = 0; i + ahead + 1 < context->nr_pending; ahead++)
4512  {
4513  next = &context->pending_writebacks[i + ahead + 1];
4514 
4515  /* different file, stop */
4516  if (!RelFileNodeEquals(cur->tag.rnode, next->tag.rnode) ||
4517  cur->tag.forkNum != next->tag.forkNum)
4518  break;
4519 
4520  /* ok, block queued twice, skip */
4521  if (cur->tag.blockNum == next->tag.blockNum)
4522  continue;
4523 
4524  /* only merge consecutive writes */
4525  if (cur->tag.blockNum + 1 != next->tag.blockNum)
4526  break;
4527 
4528  nblocks++;
4529  cur = next;
4530  }
4531 
4532  i += ahead;
4533 
4534  /* and finally tell the kernel to write the data to storage */
4535  reln = smgropen(tag.rnode, InvalidBackendId);
4536  smgrwriteback(reln, tag.forkNum, tag.blockNum, nblocks);
4537  }
4538 
4539  context->nr_pending = 0;
4540 }
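/*
 * Illustrative sketch (not part of bufmgr.c): the life cycle of a
 * WritebackContext, similar to how BufferSync() drives one with
 * checkpoint_flush_after.  The loop body is a placeholder; 'rnode',
 * 'forknum' and 'nblocks' are hypothetical inputs and the function name is
 * invented for this example.
 */
static void
example_writeback_usage(RelFileNode rnode, ForkNumber forknum,
						BlockNumber nblocks)
{
	WritebackContext wb_context;
	BlockNumber blkno;

	WritebackContextInit(&wb_context, &checkpoint_flush_after);

	for (blkno = 0; blkno < nblocks; blkno++)
	{
		BufferTag	tag;

		/* ... write the buffer out (e.g. via FlushBuffer) ... */

		INIT_BUFFERTAG(tag, rnode, forknum, blkno);
		ScheduleBufferTagForWriteback(&wb_context, &tag);
	}

	/* hand any remaining requests to the kernel */
	IssuePendingWritebacks(&wb_context);
}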
4541 
4542 
4543 /*
4544  * Implement slower/larger portions of TestForOldSnapshot
4545  *
4546  * Smaller/faster portions are put inline, but the entire set of logic is too
4547  * big for that.
4548  */
4549 void
4550 TestForOldSnapshot_impl(Snapshot snapshot, Relation relation)
4551 {
4552  if (RelationAllowsEarlyPruning(relation)
4553  && (snapshot)->whenTaken < GetOldSnapshotThresholdTimestamp())
4554  ereport(ERROR,
4555  (errcode(ERRCODE_SNAPSHOT_TOO_OLD),
4556  errmsg("snapshot too old")));
4557 }