bufmgr.c
1 /*-------------------------------------------------------------------------
2  *
3  * bufmgr.c
4  * buffer manager interface routines
5  *
6  * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/storage/buffer/bufmgr.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 /*
16  * Principal entry points:
17  *
18  * ReadBuffer() -- find or create a buffer holding the requested page,
19  * and pin it so that no one can destroy it while this process
20  * is using it.
21  *
22  * ReleaseBuffer() -- unpin a buffer
23  *
24  * MarkBufferDirty() -- mark a pinned buffer's contents as "dirty".
25  * The disk write is delayed until buffer replacement or checkpoint.
26  *
27  * See also these files:
28  * freelist.c -- chooses victim for buffer replacement
29  * buf_table.c -- manages the buffer lookup table
30  */
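/*
 * Illustrative sketch (editor's addition, not part of bufmgr.c): a typical
 * caller pins a page with ReadBuffer(), takes the content lock before
 * modifying it, marks it dirty, and then releases lock and pin.  Error
 * handling and WAL logging are omitted.
 *
 *     Buffer  buf;
 *     Page    page;
 *
 *     buf = ReadBuffer(rel, blkno);
 *     LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
 *     page = BufferGetPage(buf);
 *     ... modify the page contents ...
 *     MarkBufferDirty(buf);
 *     UnlockReleaseBuffer(buf);   (drops the lock, then the pin)
 */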
31 #include "postgres.h"
32 
33 #include <sys/file.h>
34 #include <unistd.h>
35 
36 #include "access/tableam.h"
37 #include "access/xlog.h"
38 #include "catalog/catalog.h"
39 #include "catalog/storage.h"
40 #include "executor/instrument.h"
41 #include "lib/binaryheap.h"
42 #include "miscadmin.h"
43 #include "pg_trace.h"
44 #include "pgstat.h"
45 #include "postmaster/bgwriter.h"
46 #include "storage/buf_internals.h"
47 #include "storage/bufmgr.h"
48 #include "storage/ipc.h"
49 #include "storage/proc.h"
50 #include "storage/smgr.h"
51 #include "storage/standby.h"
52 #include "utils/memdebug.h"
53 #include "utils/ps_status.h"
54 #include "utils/rel.h"
55 #include "utils/resowner_private.h"
56 #include "utils/timestamp.h"
57 
58 
59 /* Note: these two macros only work on shared buffers, not local ones! */
60 #define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
61 #define BufferGetLSN(bufHdr) (PageGetLSN(BufHdrGetBlock(bufHdr)))
62 
63 /* Note: this macro only works on local buffers, not shared ones! */
64 #define LocalBufHdrGetBlock(bufHdr) \
65  LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
66 
67 /* Bits in SyncOneBuffer's return value */
68 #define BUF_WRITTEN 0x01
69 #define BUF_REUSABLE 0x02
70 
71 #define RELS_BSEARCH_THRESHOLD 20
72 
73 typedef struct PrivateRefCountEntry
74 {
75  Buffer buffer;
76  int32 refcount;
77 } PrivateRefCountEntry;
78 
79 /* 64 bytes, about the size of a cache line on common systems */
80 #define REFCOUNT_ARRAY_ENTRIES 8
81 
82 /*
83  * Status of buffers to checkpoint for a particular tablespace, used
84  * internally in BufferSync.
85  */
86 typedef struct CkptTsStatus
87 {
88  /* oid of the tablespace */
89  Oid tsId;
90 
91  /*
92  * Checkpoint progress for this tablespace. To make progress comparable
93  * between tablespaces the progress is, for each tablespace, measured as a
94  * number between 0 and the total number of to-be-checkpointed pages. Each
95  * page checkpointed in this tablespace increments this space's progress
96  * by progress_slice.
97  */
98  float8 progress;
99  float8 progress_slice;
100 
101  /* number of to-be checkpointed pages in this tablespace */
102  int num_to_scan;
103  /* already processed pages in this tablespace */
104  int num_scanned;
105 
106  /* current offset in CkptBufferIds for this tablespace */
107  int index;
108 } CkptTsStatus;
109 
110 /*
111  * Type for array used to sort SMgrRelations
112  *
113  * FlushRelationsAllBuffers shares the same comparator function with
114  * DropRelFileNodesAllBuffers. Pointer to this struct and RelFileNode must be
115  * compatible.
116  */
117 typedef struct SMgrSortArray
118 {
119  RelFileNode rnode; /* This must be the first member */
120  SMgrRelation srel;
121 } SMgrSortArray;
122 
123 /* GUC variables */
124 bool zero_damaged_pages = false;
125 int bgwriter_lru_maxpages = 100;
126 double bgwriter_lru_multiplier = 2.0;
127 bool track_io_timing = false;
128 
129 /*
130  * How many buffers PrefetchBuffer callers should try to stay ahead of their
131  * ReadBuffer calls by. Zero means "never prefetch". This value is only used
132  * for buffers not belonging to tablespaces that have their
133  * effective_io_concurrency parameter set.
134  */
135 int effective_io_concurrency = 0;
136 
137 /*
138  * Like effective_io_concurrency, but used by maintenance code paths that might
139  * benefit from a higher setting because they work on behalf of many sessions.
140  * Overridden by the tablespace setting of the same name.
141  */
142 int maintenance_io_concurrency = 0;
143 
144 /*
145  * GUC variables about triggering kernel writeback for buffers written; OS
146  * dependent defaults are set via the GUC mechanism.
147  */
148 int checkpoint_flush_after = 0;
149 int bgwriter_flush_after = 0;
150 int backend_flush_after = 0;
151 
152 /* local state for StartBufferIO and related functions */
153 static BufferDesc *InProgressBuf = NULL;
154 static bool IsForInput;
155 
156 /* local state for LockBufferForCleanup */
157 static BufferDesc *PinCountWaitBuf = NULL;
158 
159 /*
160  * Backend-Private refcount management:
161  *
162  * Each buffer also has a private refcount that keeps track of the number of
163  * times the buffer is pinned in the current process. This is so that the
164  * shared refcount needs to be modified only once if a buffer is pinned more
165  * than once by an individual backend. It's also used to check that no buffers
166  * are still pinned at the end of transactions and when exiting.
167  *
168  *
169  * To avoid - as we used to - requiring an array with NBuffers entries to keep
170  * track of local buffers, we use a small sequentially searched array
171  * (PrivateRefCountArray) and an overflow hash table (PrivateRefCountHash) to
172  * keep track of backend local pins.
173  *
174  * Until no more than REFCOUNT_ARRAY_ENTRIES buffers are pinned at once, all
175  * refcounts are kept track of in the array; after that, new array entries
176  * displace old ones into the hash table. That way a frequently used entry
177  * can't get "stuck" in the hashtable while infrequent ones clog the array.
178  *
179  * Note that in most scenarios the number of pinned buffers will not exceed
180  * REFCOUNT_ARRAY_ENTRIES.
181  *
182  *
183  * To enter a buffer into the refcount tracking mechanism first reserve a free
184  * entry using ReservePrivateRefCountEntry() and then later, if necessary,
185  * fill it with NewPrivateRefCountEntry(). That split lets us avoid doing
186  * memory allocations in NewPrivateRefCountEntry() which can be important
187  * because in some scenarios it's called with a spinlock held...
188  */
189 static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES];
190 static HTAB *PrivateRefCountHash = NULL;
191 static int32 PrivateRefCountOverflowed = 0;
192 static uint32 PrivateRefCountClock = 0;
193 static PrivateRefCountEntry *ReservedRefCountEntry = NULL;
194 
195 static void ReservePrivateRefCountEntry(void);
196 static PrivateRefCountEntry *NewPrivateRefCountEntry(Buffer buffer);
197 static PrivateRefCountEntry *GetPrivateRefCountEntry(Buffer buffer, bool do_move);
198 static inline int32 GetPrivateRefCount(Buffer buffer);
199 static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref);
200 
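/*
 * Illustrative sketch (editor's addition): the reserve-then-fill protocol
 * described in the comment above, as used by PinBuffer().
 * ReservePrivateRefCountEntry() is called while no spinlock is held;
 * NewPrivateRefCountEntry() can then be called safely even with the buffer
 * header spinlock held, because it never allocates memory.
 *
 *     ReservePrivateRefCountEntry();
 *     ...
 *     ref = NewPrivateRefCountEntry(BufferDescriptorGetBuffer(buf));
 *     ref->refcount++;
 */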
201 /*
202  * Ensure that the PrivateRefCountArray has sufficient space to store one more
203  * entry. This has to be called before using NewPrivateRefCountEntry() to fill
204  * a new entry - but it's perfectly fine to not use a reserved entry.
205  */
206 static void
207 ReservePrivateRefCountEntry(void)
208 {
209  /* Already reserved (or freed), nothing to do */
210  if (ReservedRefCountEntry != NULL)
211  return;
212 
213  /*
214  * First search for a free entry in the array; that'll be sufficient in the
215  * majority of cases.
216  */
217  {
218  int i;
219 
220  for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
221  {
222  PrivateRefCountEntry *res;
223 
224  res = &PrivateRefCountArray[i];
225 
226  if (res->buffer == InvalidBuffer)
227  {
228  ReservedRefCountEntry = res;
229  return;
230  }
231  }
232  }
233 
234  /*
235  * No luck. All array entries are full. Move one array entry into the hash
236  * table.
237  */
238  {
239  /*
240  * Move entry from the current clock position in the array into the
241  * hashtable. Use that slot.
242  */
243  PrivateRefCountEntry *hashent;
244  bool found;
245 
246  /* select victim slot */
247  ReservedRefCountEntry =
248  &PrivateRefCountArray[PrivateRefCountClock++ % REFCOUNT_ARRAY_ENTRIES];
249 
250  /* Better be used, otherwise we shouldn't get here. */
251  Assert(ReservedRefCountEntry->buffer != InvalidBuffer);
252 
253  /* enter victim array entry into hashtable */
254  hashent = hash_search(PrivateRefCountHash,
255  (void *) &(ReservedRefCountEntry->buffer),
256  HASH_ENTER,
257  &found);
258  Assert(!found);
259  hashent->refcount = ReservedRefCountEntry->refcount;
260 
261  /* clear the now free array slot */
262  ReservedRefCountEntry->buffer = InvalidBuffer;
263  ReservedRefCountEntry->refcount = 0;
264 
265  PrivateRefCountOverflowed++;
266  }
267 }
268 
269 /*
270  * Fill a previously reserved refcount entry.
271  */
272 static PrivateRefCountEntry *
273 NewPrivateRefCountEntry(Buffer buffer)
274 {
275  PrivateRefCountEntry *res;
276 
277  /* only allowed to be called when a reservation has been made */
278  Assert(ReservedRefCountEntry != NULL);
279 
280  /* use up the reserved entry */
281  res = ReservedRefCountEntry;
282  ReservedRefCountEntry = NULL;
283 
284  /* and fill it */
285  res->buffer = buffer;
286  res->refcount = 0;
287 
288  return res;
289 }
290 
291 /*
292  * Return the PrivateRefCount entry for the passed buffer.
293  *
294  * Returns NULL if a buffer doesn't have a refcount entry. Otherwise, if
295  * do_move is true, and the entry resides in the hashtable the entry is
296  * optimized for frequent access by moving it to the array.
297  */
298 static PrivateRefCountEntry *
299 GetPrivateRefCountEntry(Buffer buffer, bool do_move)
300 {
301  PrivateRefCountEntry *res;
302  int i;
303 
304  Assert(BufferIsValid(buffer));
305  Assert(!BufferIsLocal(buffer));
306 
307  /*
308  * First search for references in the array, that'll be sufficient in the
309  * majority of cases.
310  */
311  for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
312  {
313  res = &PrivateRefCountArray[i];
314 
315  if (res->buffer == buffer)
316  return res;
317  }
318 
319  /*
320  * By here we know that the buffer, if already pinned, isn't residing in
321  * the array.
322  *
323  * Only look up the buffer in the hashtable if we've previously overflowed
324  * into it.
325  */
326  if (PrivateRefCountOverflowed == 0)
327  return NULL;
328 
329  res = hash_search(PrivateRefCountHash,
330  (void *) &buffer,
331  HASH_FIND,
332  NULL);
333 
334  if (res == NULL)
335  return NULL;
336  else if (!do_move)
337  {
338  /* caller doesn't want us to move the hash entry into the array */
339  return res;
340  }
341  else
342  {
343  /* move buffer from hashtable into the free array slot */
344  bool found;
345  PrivateRefCountEntry *free;
346 
347  /* Ensure there's a free array slot */
348  ReservePrivateRefCountEntry();
349 
350  /* Use up the reserved slot */
351  Assert(ReservedRefCountEntry != NULL);
352  free = ReservedRefCountEntry;
353  ReservedRefCountEntry = NULL;
354  Assert(free->buffer == InvalidBuffer);
355 
356  /* and fill it */
357  free->buffer = buffer;
358  free->refcount = res->refcount;
359 
360  /* delete from hashtable */
361  hash_search(PrivateRefCountHash,
362  (void *) &buffer,
363  HASH_REMOVE,
364  &found);
365  Assert(found);
366  Assert(PrivateRefCountOverflowed > 0);
367  PrivateRefCountOverflowed--;
368 
369  return free;
370  }
371 }
372 
373 /*
374  * Returns how many times the passed buffer is pinned by this backend.
375  *
376  * Only works for shared memory buffers!
377  */
378 static inline int32
379 GetPrivateRefCount(Buffer buffer)
380 {
381  PrivateRefCountEntry *ref;
382 
383  Assert(BufferIsValid(buffer));
384  Assert(!BufferIsLocal(buffer));
385 
386  /*
387  * Not moving the entry - that's ok for the current users, but we might
388  * want to change this one day.
389  */
390  ref = GetPrivateRefCountEntry(buffer, false);
391 
392  if (ref == NULL)
393  return 0;
394  return ref->refcount;
395 }
396 
397 /*
398  * Release resources used to track the reference count of a buffer which we no
399  * longer have pinned and don't want to pin again immediately.
400  */
401 static void
402 ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref)
403 {
404  Assert(ref->refcount == 0);
405 
406  if (ref >= &PrivateRefCountArray[0] &&
407  ref < &PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES])
408  {
409  ref->buffer = InvalidBuffer;
410 
411  /*
412  * Mark the just used entry as reserved - in many scenarios that
413  * allows us to avoid ever having to search the array/hash for free
414  * entries.
415  */
416  ReservedRefCountEntry = ref;
417  }
418  else
419  {
420  bool found;
421  Buffer buffer = ref->buffer;
422 
423  hash_search(PrivateRefCountHash,
424  (void *) &buffer,
425  HASH_REMOVE,
426  &found);
427  Assert(found);
428  Assert(PrivateRefCountOverflowed > 0);
429  PrivateRefCountOverflowed--;
430  }
431 }
432 
433 /*
434  * BufferIsPinned
435  * True iff the buffer is pinned (also checks for valid buffer number).
436  *
437  * NOTE: what we check here is that *this* backend holds a pin on
438  * the buffer. We do not care whether some other backend does.
439  */
440 #define BufferIsPinned(bufnum) \
441 ( \
442  !BufferIsValid(bufnum) ? \
443  false \
444  : \
445  BufferIsLocal(bufnum) ? \
446  (LocalRefCount[-(bufnum) - 1] > 0) \
447  : \
448  (GetPrivateRefCount(bufnum) > 0) \
449 )
450 
451 
452 static Buffer ReadBuffer_common(SMgrRelation reln, char relpersistence,
453  ForkNumber forkNum, BlockNumber blockNum,
454  ReadBufferMode mode, BufferAccessStrategy strategy,
455  bool *hit);
456 static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy);
457 static void PinBuffer_Locked(BufferDesc *buf);
458 static void UnpinBuffer(BufferDesc *buf, bool fixOwner);
459 static void BufferSync(int flags);
460 static uint32 WaitBufHdrUnlocked(BufferDesc *buf);
461 static int SyncOneBuffer(int buf_id, bool skip_recently_used,
462  WritebackContext *wb_context);
463 static void WaitIO(BufferDesc *buf);
464 static bool StartBufferIO(BufferDesc *buf, bool forInput);
465 static void TerminateBufferIO(BufferDesc *buf, bool clear_dirty,
466  uint32 set_flag_bits);
467 static void shared_buffer_write_error_callback(void *arg);
468 static void local_buffer_write_error_callback(void *arg);
469 static BufferDesc *BufferAlloc(SMgrRelation smgr,
470  char relpersistence,
471  ForkNumber forkNum,
472  BlockNumber blockNum,
473  BufferAccessStrategy strategy,
474  bool *foundPtr);
475 static void FlushBuffer(BufferDesc *buf, SMgrRelation reln);
476 static void AtProcExit_Buffers(int code, Datum arg);
477 static void CheckForBufferLeaks(void);
478 static int rnode_comparator(const void *p1, const void *p2);
479 static int buffertag_comparator(const void *p1, const void *p2);
480 static int ckpt_buforder_comparator(const void *pa, const void *pb);
481 static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg);
482 
483 
484 /*
485  * Implementation of PrefetchBuffer() for shared buffers.
486  */
487 PrefetchBufferResult
488 PrefetchSharedBuffer(SMgrRelation smgr_reln,
489  ForkNumber forkNum,
490  BlockNumber blockNum)
491 {
492  PrefetchBufferResult result = {InvalidBuffer, false};
493  BufferTag newTag; /* identity of requested block */
494  uint32 newHash; /* hash value for newTag */
495  LWLock *newPartitionLock; /* buffer partition lock for it */
496  int buf_id;
497 
498  Assert(BlockNumberIsValid(blockNum));
499 
500  /* create a tag so we can lookup the buffer */
501  INIT_BUFFERTAG(newTag, smgr_reln->smgr_rnode.node,
502  forkNum, blockNum);
503 
504  /* determine its hash code and partition lock ID */
505  newHash = BufTableHashCode(&newTag);
506  newPartitionLock = BufMappingPartitionLock(newHash);
507 
508  /* see if the block is in the buffer pool already */
509  LWLockAcquire(newPartitionLock, LW_SHARED);
510  buf_id = BufTableLookup(&newTag, newHash);
511  LWLockRelease(newPartitionLock);
512 
513  /* If not in buffers, initiate prefetch */
514  if (buf_id < 0)
515  {
516 #ifdef USE_PREFETCH
517  /*
518  * Try to initiate an asynchronous read. This returns false in
519  * recovery if the relation file doesn't exist.
520  */
521  if (smgrprefetch(smgr_reln, forkNum, blockNum))
522  result.initiated_io = true;
523 #endif /* USE_PREFETCH */
524  }
525  else
526  {
527  /*
528  * Report the buffer it was in at that time. The caller may be able
529  * to avoid a buffer table lookup, but it's not pinned and it must be
530  * rechecked!
531  */
532  result.recent_buffer = buf_id + 1;
533  }
534 
535  /*
536  * If the block *is* in buffers, we do nothing. This is not really ideal:
537  * the block might be just about to be evicted, which would be stupid
538  * since we know we are going to need it soon. But the only easy answer
539  * is to bump the usage_count, which does not seem like a great solution:
540  * when the caller does ultimately touch the block, usage_count would get
541  * bumped again, resulting in too much favoritism for blocks that are
542  * involved in a prefetch sequence. A real fix would involve some
543  * additional per-buffer state, and it's not clear that there's enough of
544  * a problem to justify that.
545  */
546 
547  return result;
548 }
549 
550 /*
551  * PrefetchBuffer -- initiate asynchronous read of a block of a relation
552  *
553  * This is named by analogy to ReadBuffer but doesn't actually allocate a
554  * buffer. Instead it tries to ensure that a future ReadBuffer for the given
555  * block will not be delayed by the I/O. Prefetching is optional.
556  *
557  * There are three possible outcomes:
558  *
559  * 1. If the block is already cached, the result includes a valid buffer that
560  * could be used by the caller to avoid the need for a later buffer lookup, but
561  * it's not pinned, so the caller must recheck it.
562  *
563  * 2. If the kernel has been asked to initiate I/O, the initiated_io member is
564  * true. Currently there is no way to know if the data was already cached by
565  * the kernel and therefore didn't really initiate I/O, and no way to know when
566  * the I/O completes other than using synchronous ReadBuffer().
567  *
568  * 3. Otherwise, the buffer wasn't already cached by PostgreSQL, and either
569  * USE_PREFETCH is not defined (this build doesn't support prefetching due to
570  * lack of a kernel facility), or the underlying relation file wasn't found and
571  * we are in recovery. (If the relation file wasn't found and we are not in
572  * recovery, an error is raised).
573  */
574 PrefetchBufferResult
575 PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
576 {
577  Assert(RelationIsValid(reln));
578  Assert(BlockNumberIsValid(blockNum));
579 
580  /* Open it at the smgr level if not already done */
581  RelationOpenSmgr(reln);
582 
583  if (RelationUsesLocalBuffers(reln))
584  {
585  /* see comments in ReadBufferExtended */
586  if (RELATION_IS_OTHER_TEMP(reln))
587  ereport(ERROR,
588  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
589  errmsg("cannot access temporary tables of other sessions")));
590 
591  /* pass it off to localbuf.c */
592  return PrefetchLocalBuffer(reln->rd_smgr, forkNum, blockNum);
593  }
594  else
595  {
596  /* pass it to the shared buffer version */
597  return PrefetchSharedBuffer(reln->rd_smgr, forkNum, blockNum);
598  }
599 }
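/*
 * Illustrative sketch (editor's addition): how a caller might consume the
 * PrefetchBufferResult described in the comment above.  recent_buffer is only
 * a hint; it is not pinned and must be re-verified before use, for example by
 * simply doing a normal ReadBuffer() afterwards.
 *
 *     PrefetchBufferResult pr = PrefetchBuffer(rel, MAIN_FORKNUM, blkno);
 *
 *     if (pr.initiated_io)
 *         ... the kernel was asked to read the block ahead of time ...
 *     else if (BufferIsValid(pr.recent_buffer))
 *         ... block was already cached; the buffer id may speed a later lookup ...
 */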
600 
601 
602 /*
603  * ReadBuffer -- a shorthand for ReadBufferExtended, for reading from main
604  * fork with RBM_NORMAL mode and default strategy.
605  */
606 Buffer
607 ReadBuffer(Relation reln, BlockNumber blockNum)
608 {
609  return ReadBufferExtended(reln, MAIN_FORKNUM, blockNum, RBM_NORMAL, NULL);
610 }
611 
612 /*
613  * ReadBufferExtended -- returns a buffer containing the requested
614  * block of the requested relation. If the blknum
615  * requested is P_NEW, extend the relation file and
616  * allocate a new block. (Caller is responsible for
617  * ensuring that only one backend tries to extend a
618  * relation at the same time!)
619  *
620  * Returns: the buffer number for the buffer containing
621  * the block read. The returned buffer has been pinned.
622  * Does not return on error --- elog's instead.
623  *
624  * Assume when this function is called, that reln has been opened already.
625  *
626  * In RBM_NORMAL mode, the page is read from disk, and the page header is
627  * validated. An error is thrown if the page header is not valid. (But
628  * note that an all-zero page is considered "valid"; see
629  * PageIsVerifiedExtended().)
630  *
631  * RBM_ZERO_ON_ERROR is like the normal mode, but if the page header is not
632  * valid, the page is zeroed instead of throwing an error. This is intended
633  * for non-critical data, where the caller is prepared to repair errors.
634  *
635  * In RBM_ZERO_AND_LOCK mode, if the page isn't in buffer cache already, it's
636  * filled with zeros instead of reading it from disk. Useful when the caller
637  * is going to fill the page from scratch, since this saves I/O and avoids
638  * unnecessary failure if the page-on-disk has corrupt page headers.
639  * The page is returned locked to ensure that the caller has a chance to
640  * initialize the page before it's made visible to others.
641  * Caution: do not use this mode to read a page that is beyond the relation's
642  * current physical EOF; that is likely to cause problems in md.c when
643  * the page is modified and written out. P_NEW is OK, though.
644  *
645  * RBM_ZERO_AND_CLEANUP_LOCK is the same as RBM_ZERO_AND_LOCK, but acquires
646  * a cleanup-strength lock on the page.
647  *
648  * RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here.
649  *
650  * If strategy is not NULL, a nondefault buffer access strategy is used.
651  * See buffer/README for details.
652  */
653 Buffer
654 ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum,
655  ReadBufferMode mode, BufferAccessStrategy strategy)
656 {
657  bool hit;
658  Buffer buf;
659 
660  /* Open it at the smgr level if not already done */
661  RelationOpenSmgr(reln);
662 
663  /*
664  * Reject attempts to read non-local temporary relations; we would be
665  * likely to get wrong data since we have no visibility into the owning
666  * session's local buffers.
667  */
668  if (RELATION_IS_OTHER_TEMP(reln))
669  ereport(ERROR,
670  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
671  errmsg("cannot access temporary tables of other sessions")));
672 
673  /*
674  * Read the buffer, and update pgstat counters to reflect a cache hit or
675  * miss.
676  */
677  pgstat_count_buffer_read(reln);
678  buf = ReadBuffer_common(reln->rd_smgr, reln->rd_rel->relpersistence,
679  forkNum, blockNum, mode, strategy, &hit);
680  if (hit)
681  pgstat_count_buffer_hit(reln);
682  return buf;
683 }
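/*
 * Illustrative sketch (editor's addition): reading a block of non-critical
 * data with a ring-buffer strategy, tolerating a damaged page header as
 * permitted by RBM_ZERO_ON_ERROR (see the mode descriptions above).
 *
 *     BufferAccessStrategy strategy = GetAccessStrategy(BAS_BULKREAD);
 *     Buffer  buf;
 *
 *     buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
 *                              RBM_ZERO_ON_ERROR, strategy);
 *     ... a corrupt page arrives zero-filled instead of raising an error ...
 *     ReleaseBuffer(buf);
 */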
684 
685 
686 /*
687  * ReadBufferWithoutRelcache -- like ReadBufferExtended, but doesn't require
688  * a relcache entry for the relation.
689  *
690  * NB: At present, this function may only be used on permanent relations, which
691  * is OK, because we only use it during XLOG replay. If in the future we
692  * want to use it on temporary or unlogged relations, we could pass additional
693  * parameters.
694  */
695 Buffer
696 ReadBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum,
697  BlockNumber blockNum, ReadBufferMode mode,
698  BufferAccessStrategy strategy)
699 {
700  bool hit;
701 
702  SMgrRelation smgr = smgropen(rnode, InvalidBackendId);
703 
703 
704  Assert(InRecovery);
705 
706  return ReadBuffer_common(smgr, RELPERSISTENCE_PERMANENT, forkNum, blockNum,
707  mode, strategy, &hit);
708 }
709 
710 
711 /*
712  * ReadBuffer_common -- common logic for all ReadBuffer variants
713  *
714  * *hit is set to true if the request was satisfied from shared buffer cache.
715  */
716 static Buffer
717 ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
718  BlockNumber blockNum, ReadBufferMode mode,
719  BufferAccessStrategy strategy, bool *hit)
720 {
721  BufferDesc *bufHdr;
722  Block bufBlock;
723  bool found;
724  bool isExtend;
725  bool isLocalBuf = SmgrIsTemp(smgr);
726 
727  *hit = false;
728 
729  /* Make sure we will have room to remember the buffer pin */
730  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
731 
732  isExtend = (blockNum == P_NEW);
733 
734  TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
735  smgr->smgr_rnode.node.spcNode,
736  smgr->smgr_rnode.node.dbNode,
737  smgr->smgr_rnode.node.relNode,
738  smgr->smgr_rnode.backend,
739  isExtend);
740 
741  /* Substitute proper block number if caller asked for P_NEW */
742  if (isExtend)
743  blockNum = smgrnblocks(smgr, forkNum);
744 
745  if (isLocalBuf)
746  {
747  bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, &found);
748  if (found)
749  pgBufferUsage.local_blks_hit++;
750  else if (isExtend)
751  pgBufferUsage.local_blks_written++;
752  else if (mode == RBM_NORMAL || mode == RBM_NORMAL_NO_LOG ||
753  mode == RBM_ZERO_ON_ERROR)
754  pgBufferUsage.local_blks_read++;
755  }
756  else
757  {
758  /*
759  * lookup the buffer. IO_IN_PROGRESS is set if the requested block is
760  * not currently in memory.
761  */
762  bufHdr = BufferAlloc(smgr, relpersistence, forkNum, blockNum,
763  strategy, &found);
764  if (found)
765  pgBufferUsage.shared_blks_hit++;
766  else if (isExtend)
767  pgBufferUsage.shared_blks_written++;
768  else if (mode == RBM_NORMAL || mode == RBM_NORMAL_NO_LOG ||
769  mode == RBM_ZERO_ON_ERROR)
770  pgBufferUsage.shared_blks_read++;
771  }
772 
773  /* At this point we do NOT hold any locks. */
774 
775  /* if it was already in the buffer pool, we're done */
776  if (found)
777  {
778  if (!isExtend)
779  {
780  /* Just need to update stats before we exit */
781  *hit = true;
782  VacuumPageHit++;
783 
784  if (VacuumCostActive)
785  VacuumCostBalance += VacuumCostPageHit;
786 
787  TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
788  smgr->smgr_rnode.node.spcNode,
789  smgr->smgr_rnode.node.dbNode,
790  smgr->smgr_rnode.node.relNode,
791  smgr->smgr_rnode.backend,
792  isExtend,
793  found);
794 
795  /*
796  * In RBM_ZERO_AND_LOCK mode the caller expects the page to be
797  * locked on return.
798  */
799  if (!isLocalBuf)
800  {
801  if (mode == RBM_ZERO_AND_LOCK)
802  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr),
803  LW_EXCLUSIVE);
804  else if (mode == RBM_ZERO_AND_CLEANUP_LOCK)
805  LockBufferForCleanup(BufferDescriptorGetBuffer(bufHdr));
806  }
807 
808  return BufferDescriptorGetBuffer(bufHdr);
809  }
810 
811  /*
812  * We get here only in the corner case where we are trying to extend
813  * the relation but we found a pre-existing buffer marked BM_VALID.
814  * This can happen because mdread doesn't complain about reads beyond
815  * EOF (when zero_damaged_pages is ON) and so a previous attempt to
816  * read a block beyond EOF could have left a "valid" zero-filled
817  * buffer. Unfortunately, we have also seen this case occurring
818  * because of buggy Linux kernels that sometimes return an
819  * lseek(SEEK_END) result that doesn't account for a recent write. In
820  * that situation, the pre-existing buffer would contain valid data
821  * that we don't want to overwrite. Since the legitimate case should
822  * always have left a zero-filled buffer, complain if not PageIsNew.
823  */
824  bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
825  if (!PageIsNew((Page) bufBlock))
826  ereport(ERROR,
827  (errmsg("unexpected data beyond EOF in block %u of relation %s",
828  blockNum, relpath(smgr->smgr_rnode, forkNum)),
829  errhint("This has been seen to occur with buggy kernels; consider updating your system.")));
830 
831  /*
832  * We *must* do smgrextend before succeeding, else the page will not
833  * be reserved by the kernel, and the next P_NEW call will decide to
834  * return the same page. Clear the BM_VALID bit, do the StartBufferIO
835  * call that BufferAlloc didn't, and proceed.
836  */
837  if (isLocalBuf)
838  {
839  /* Only need to adjust flags */
840  uint32 buf_state = pg_atomic_read_u32(&bufHdr->state);
841 
842  Assert(buf_state & BM_VALID);
843  buf_state &= ~BM_VALID;
844  pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
845  }
846  else
847  {
848  /*
849  * Loop to handle the very small possibility that someone re-sets
850  * BM_VALID between our clearing it and StartBufferIO inspecting
851  * it.
852  */
853  do
854  {
855  uint32 buf_state = LockBufHdr(bufHdr);
856 
857  Assert(buf_state & BM_VALID);
858  buf_state &= ~BM_VALID;
859  UnlockBufHdr(bufHdr, buf_state);
860  } while (!StartBufferIO(bufHdr, true));
861  }
862  }
863 
864  /*
865  * if we have gotten to this point, we have allocated a buffer for the
866  * page but its contents are not yet valid. IO_IN_PROGRESS is set for it,
867  * if it's a shared buffer.
868  *
869  * Note: if smgrextend fails, we will end up with a buffer that is
870  * allocated but not marked BM_VALID. P_NEW will still select the same
871  * block number (because the relation didn't get any longer on disk) and
872  * so future attempts to extend the relation will find the same buffer (if
873  * it's not been recycled) but come right back here to try smgrextend
874  * again.
875  */
876  Assert(!(pg_atomic_read_u32(&bufHdr->state) & BM_VALID)); /* spinlock not needed */
877 
878  bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
879 
880  if (isExtend)
881  {
882  /* new buffers are zero-filled */
883  MemSet((char *) bufBlock, 0, BLCKSZ);
884  /* don't set checksum for all-zero page */
885  smgrextend(smgr, forkNum, blockNum, (char *) bufBlock, false);
886 
887  /*
888  * NB: we're *not* doing a ScheduleBufferTagForWriteback here;
889  * although we're essentially performing a write. At least on linux
890  * doing so defeats the 'delayed allocation' mechanism, leading to
891  * increased file fragmentation.
892  */
893  }
894  else
895  {
896  /*
897  * Read in the page, unless the caller intends to overwrite it and
898  * just wants us to allocate a buffer.
899  */
900  if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
901  MemSet((char *) bufBlock, 0, BLCKSZ);
902  else
903  {
904  instr_time io_start,
905  io_time;
906 
907  if (track_io_timing)
908  INSTR_TIME_SET_CURRENT(io_start);
909 
910  smgrread(smgr, forkNum, blockNum, (char *) bufBlock);
911 
912  if (track_io_timing)
913  {
914  INSTR_TIME_SET_CURRENT(io_time);
915  INSTR_TIME_SUBTRACT(io_time, io_start);
916  pgstat_count_buffer_read_time(INSTR_TIME_GET_MICROSEC(io_time));
917  INSTR_TIME_ADD(pgBufferUsage.blk_read_time, io_time);
918  }
919 
920  /* check for garbage data */
921  if (!PageIsVerifiedExtended((Page) bufBlock, blockNum,
922  PIV_LOG_WARNING | PIV_REPORT_STAT))
923  {
924  if (mode == RBM_ZERO_ON_ERROR || zero_damaged_pages)
925  {
926  ereport(WARNING,
927  (errcode(ERRCODE_DATA_CORRUPTED),
928  errmsg("invalid page in block %u of relation %s; zeroing out page",
929  blockNum,
930  relpath(smgr->smgr_rnode, forkNum))));
931  MemSet((char *) bufBlock, 0, BLCKSZ);
932  }
933  else
934  ereport(ERROR,
935  (errcode(ERRCODE_DATA_CORRUPTED),
936  errmsg("invalid page in block %u of relation %s",
937  blockNum,
938  relpath(smgr->smgr_rnode, forkNum))));
939  }
940  }
941  }
942 
943  /*
944  * In RBM_ZERO_AND_LOCK mode, grab the buffer content lock before marking
945  * the page as valid, to make sure that no other backend sees the zeroed
946  * page before the caller has had a chance to initialize it.
947  *
948  * Since no-one else can be looking at the page contents yet, there is no
949  * difference between an exclusive lock and a cleanup-strength lock. (Note
950  * that we cannot use LockBuffer() or LockBufferForCleanup() here, because
951  * they assert that the buffer is already valid.)
952  */
953  if ((mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) &&
954  !isLocalBuf)
955  {
956  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_EXCLUSIVE);
957  }
958 
959  if (isLocalBuf)
960  {
961  /* Only need to adjust flags */
962  uint32 buf_state = pg_atomic_read_u32(&bufHdr->state);
963 
964  buf_state |= BM_VALID;
965  pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
966  }
967  else
968  {
969  /* Set BM_VALID, terminate IO, and wake up any waiters */
970  TerminateBufferIO(bufHdr, false, BM_VALID);
971  }
972 
973  VacuumPageMiss++;
974  if (VacuumCostActive)
975  VacuumCostBalance += VacuumCostPageMiss;
976 
977  TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
978  smgr->smgr_rnode.node.spcNode,
979  smgr->smgr_rnode.node.dbNode,
980  smgr->smgr_rnode.node.relNode,
981  smgr->smgr_rnode.backend,
982  isExtend,
983  found);
984 
985  return BufferDescriptorGetBuffer(bufHdr);
986 }
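/*
 * Illustrative sketch (editor's addition): extending a relation via the
 * P_NEW path handled above.  As the header comment of ReadBufferExtended
 * notes, only one backend may extend the relation at a time, so callers
 * normally take the relation extension lock first.
 *
 *     LockRelationForExtension(rel, ExclusiveLock);
 *     buf = ReadBufferExtended(rel, MAIN_FORKNUM, P_NEW, RBM_NORMAL, NULL);
 *     UnlockRelationForExtension(rel, ExclusiveLock);
 *     ... buf now holds a pinned, zero-filled page at the new block ...
 */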
987 
988 /*
989  * BufferAlloc -- subroutine for ReadBuffer. Handles lookup of a shared
990  * buffer. If no buffer exists already, selects a replacement
991  * victim and evicts the old page, but does NOT read in new page.
992  *
993  * "strategy" can be a buffer replacement strategy object, or NULL for
994  * the default strategy. The selected buffer's usage_count is advanced when
995  * using the default strategy, but otherwise possibly not (see PinBuffer).
996  *
997  * The returned buffer is pinned and is already marked as holding the
998  * desired page. If it already did have the desired page, *foundPtr is
999  * set true. Otherwise, *foundPtr is set false and the buffer is marked
1000  * as IO_IN_PROGRESS; ReadBuffer will now need to do I/O to fill it.
1001  *
1002  * *foundPtr is actually redundant with the buffer's BM_VALID flag, but
1003  * we keep it for simplicity in ReadBuffer.
1004  *
1005  * No locks are held either at entry or exit.
1006  */
1007 static BufferDesc *
1008 BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
1009  BlockNumber blockNum,
1010  BufferAccessStrategy strategy,
1011  bool *foundPtr)
1012 {
1013  BufferTag newTag; /* identity of requested block */
1014  uint32 newHash; /* hash value for newTag */
1015  LWLock *newPartitionLock; /* buffer partition lock for it */
1016  BufferTag oldTag; /* previous identity of selected buffer */
1017  uint32 oldHash; /* hash value for oldTag */
1018  LWLock *oldPartitionLock; /* buffer partition lock for it */
1019  uint32 oldFlags;
1020  int buf_id;
1021  BufferDesc *buf;
1022  bool valid;
1023  uint32 buf_state;
1024 
1025  /* create a tag so we can lookup the buffer */
1026  INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
1027 
1028  /* determine its hash code and partition lock ID */
1029  newHash = BufTableHashCode(&newTag);
1030  newPartitionLock = BufMappingPartitionLock(newHash);
1031 
1032  /* see if the block is in the buffer pool already */
1033  LWLockAcquire(newPartitionLock, LW_SHARED);
1034  buf_id = BufTableLookup(&newTag, newHash);
1035  if (buf_id >= 0)
1036  {
1037  /*
1038  * Found it. Now, pin the buffer so no one can steal it from the
1039  * buffer pool, and check to see if the correct data has been loaded
1040  * into the buffer.
1041  */
1042  buf = GetBufferDescriptor(buf_id);
1043 
1044  valid = PinBuffer(buf, strategy);
1045 
1046  /* Can release the mapping lock as soon as we've pinned it */
1047  LWLockRelease(newPartitionLock);
1048 
1049  *foundPtr = true;
1050 
1051  if (!valid)
1052  {
1053  /*
1054  * We can only get here if (a) someone else is still reading in
1055  * the page, or (b) a previous read attempt failed. We have to
1056  * wait for any active read attempt to finish, and then set up our
1057  * own read attempt if the page is still not BM_VALID.
1058  * StartBufferIO does it all.
1059  */
1060  if (StartBufferIO(buf, true))
1061  {
1062  /*
1063  * If we get here, previous attempts to read the buffer must
1064  * have failed ... but we shall bravely try again.
1065  */
1066  *foundPtr = false;
1067  }
1068  }
1069 
1070  return buf;
1071  }
1072 
1073  /*
1074  * Didn't find it in the buffer pool. We'll have to initialize a new
1075  * buffer. Remember to unlock the mapping lock while doing the work.
1076  */
1077  LWLockRelease(newPartitionLock);
1078 
1079  /* Loop here in case we have to try another victim buffer */
1080  for (;;)
1081  {
1082  /*
1083  * Ensure, while the spinlock's not yet held, that there's a free
1084  * refcount entry.
1085  */
1086  ReservePrivateRefCountEntry();
1087 
1088  /*
1089  * Select a victim buffer. The buffer is returned with its header
1090  * spinlock still held!
1091  */
1092  buf = StrategyGetBuffer(strategy, &buf_state);
1093 
1094  Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 0);
1095 
1096  /* Must copy buffer flags while we still hold the spinlock */
1097  oldFlags = buf_state & BUF_FLAG_MASK;
1098 
1099  /* Pin the buffer and then release the buffer spinlock */
1100  PinBuffer_Locked(buf);
1101 
1102  /*
1103  * If the buffer was dirty, try to write it out. There is a race
1104  * condition here, in that someone might dirty it after we released it
1105  * above, or even while we are writing it out (since our share-lock
1106  * won't prevent hint-bit updates). We will recheck the dirty bit
1107  * after re-locking the buffer header.
1108  */
1109  if (oldFlags & BM_DIRTY)
1110  {
1111  /*
1112  * We need a share-lock on the buffer contents to write it out
1113  * (else we might write invalid data, eg because someone else is
1114  * compacting the page contents while we write). We must use a
1115  * conditional lock acquisition here to avoid deadlock. Even
1116  * though the buffer was not pinned (and therefore surely not
1117  * locked) when StrategyGetBuffer returned it, someone else could
1118  * have pinned and exclusive-locked it by the time we get here. If
1119  * we try to get the lock unconditionally, we'd block waiting for
1120  * them; if they later block waiting for us, deadlock ensues.
1121  * (This has been observed to happen when two backends are both
1122  * trying to split btree index pages, and the second one just
1123  * happens to be trying to split the page the first one got from
1124  * StrategyGetBuffer.)
1125  */
1126  if (LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
1127  LW_SHARED))
1128  {
1129  /*
1130  * If using a nondefault strategy, and writing the buffer
1131  * would require a WAL flush, let the strategy decide whether
1132  * to go ahead and write/reuse the buffer or to choose another
1133  * victim. We need lock to inspect the page LSN, so this
1134  * can't be done inside StrategyGetBuffer.
1135  */
1136  if (strategy != NULL)
1137  {
1138  XLogRecPtr lsn;
1139 
1140  /* Read the LSN while holding buffer header lock */
1141  buf_state = LockBufHdr(buf);
1142  lsn = BufferGetLSN(buf);
1143  UnlockBufHdr(buf, buf_state);
1144 
1145  if (XLogNeedsFlush(lsn) &&
1146  StrategyRejectBuffer(strategy, buf))
1147  {
1148  /* Drop lock/pin and loop around for another buffer */
1149  LWLockRelease(BufferDescriptorGetContentLock(buf));
1150  UnpinBuffer(buf, true);
1151  continue;
1152  }
1153  }
1154 
1155  /* OK, do the I/O */
1156  TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_START(forkNum, blockNum,
1157  smgr->smgr_rnode.node.spcNode,
1158  smgr->smgr_rnode.node.dbNode,
1159  smgr->smgr_rnode.node.relNode);
1160 
1161  FlushBuffer(buf, NULL);
1162  LWLockRelease(BufferDescriptorGetContentLock(buf));
1163 
1164  ScheduleBufferTagForWriteback(&BackendWritebackContext,
1165  &buf->tag);
1166 
1167  TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_DONE(forkNum, blockNum,
1168  smgr->smgr_rnode.node.spcNode,
1169  smgr->smgr_rnode.node.dbNode,
1170  smgr->smgr_rnode.node.relNode);
1171  }
1172  else
1173  {
1174  /*
1175  * Someone else has locked the buffer, so give it up and loop
1176  * back to get another one.
1177  */
1178  UnpinBuffer(buf, true);
1179  continue;
1180  }
1181  }
1182 
1183  /*
1184  * To change the association of a valid buffer, we'll need to have
1185  * exclusive lock on both the old and new mapping partitions.
1186  */
1187  if (oldFlags & BM_TAG_VALID)
1188  {
1189  /*
1190  * Need to compute the old tag's hashcode and partition lock ID.
1191  * XXX is it worth storing the hashcode in BufferDesc so we need
1192  * not recompute it here? Probably not.
1193  */
1194  oldTag = buf->tag;
1195  oldHash = BufTableHashCode(&oldTag);
1196  oldPartitionLock = BufMappingPartitionLock(oldHash);
1197 
1198  /*
1199  * Must lock the lower-numbered partition first to avoid
1200  * deadlocks.
1201  */
1202  if (oldPartitionLock < newPartitionLock)
1203  {
1204  LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
1205  LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1206  }
1207  else if (oldPartitionLock > newPartitionLock)
1208  {
1209  LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1210  LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
1211  }
1212  else
1213  {
1214  /* only one partition, only one lock */
1215  LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1216  }
1217  }
1218  else
1219  {
1220  /* if it wasn't valid, we need only the new partition */
1221  LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1222  /* remember we have no old-partition lock or tag */
1223  oldPartitionLock = NULL;
1224  /* keep the compiler quiet about uninitialized variables */
1225  oldHash = 0;
1226  }
1227 
1228  /*
1229  * Try to make a hashtable entry for the buffer under its new tag.
1230  * This could fail because while we were writing someone else
1231  * allocated another buffer for the same block we want to read in.
1232  * Note that we have not yet removed the hashtable entry for the old
1233  * tag.
1234  */
1235  buf_id = BufTableInsert(&newTag, newHash, buf->buf_id);
1236 
1237  if (buf_id >= 0)
1238  {
1239  /*
1240  * Got a collision. Someone has already done what we were about to
1241  * do. We'll just handle this as if it were found in the buffer
1242  * pool in the first place. First, give up the buffer we were
1243  * planning to use.
1244  */
1245  UnpinBuffer(buf, true);
1246 
1247  /* Can give up that buffer's mapping partition lock now */
1248  if (oldPartitionLock != NULL &&
1249  oldPartitionLock != newPartitionLock)
1250  LWLockRelease(oldPartitionLock);
1251 
1252  /* remaining code should match code at top of routine */
1253 
1254  buf = GetBufferDescriptor(buf_id);
1255 
1256  valid = PinBuffer(buf, strategy);
1257 
1258  /* Can release the mapping lock as soon as we've pinned it */
1259  LWLockRelease(newPartitionLock);
1260 
1261  *foundPtr = true;
1262 
1263  if (!valid)
1264  {
1265  /*
1266  * We can only get here if (a) someone else is still reading
1267  * in the page, or (b) a previous read attempt failed. We
1268  * have to wait for any active read attempt to finish, and
1269  * then set up our own read attempt if the page is still not
1270  * BM_VALID. StartBufferIO does it all.
1271  */
1272  if (StartBufferIO(buf, true))
1273  {
1274  /*
1275  * If we get here, previous attempts to read the buffer
1276  * must have failed ... but we shall bravely try again.
1277  */
1278  *foundPtr = false;
1279  }
1280  }
1281 
1282  return buf;
1283  }
1284 
1285  /*
1286  * Need to lock the buffer header too in order to change its tag.
1287  */
1288  buf_state = LockBufHdr(buf);
1289 
1290  /*
1291  * Somebody could have pinned or re-dirtied the buffer while we were
1292  * doing the I/O and making the new hashtable entry. If so, we can't
1293  * recycle this buffer; we must undo everything we've done and start
1294  * over with a new victim buffer.
1295  */
1296  oldFlags = buf_state & BUF_FLAG_MASK;
1297  if (BUF_STATE_GET_REFCOUNT(buf_state) == 1 && !(oldFlags & BM_DIRTY))
1298  break;
1299 
1300  UnlockBufHdr(buf, buf_state);
1301  BufTableDelete(&newTag, newHash);
1302  if (oldPartitionLock != NULL &&
1303  oldPartitionLock != newPartitionLock)
1304  LWLockRelease(oldPartitionLock);
1305  LWLockRelease(newPartitionLock);
1306  UnpinBuffer(buf, true);
1307  }
1308 
1309  /*
1310  * Okay, it's finally safe to rename the buffer.
1311  *
1312  * Clearing BM_VALID here is necessary, clearing the dirtybits is just
1313  * paranoia. We also reset the usage_count since any recency of use of
1314  * the old content is no longer relevant. (The usage_count starts out at
1315  * 1 so that the buffer can survive one clock-sweep pass.)
1316  *
1317  * Make sure BM_PERMANENT is set for buffers that must be written at every
1318  * checkpoint. Unlogged buffers only need to be written at shutdown
1319  * checkpoints, except for their "init" forks, which need to be treated
1320  * just like permanent relations.
1321  */
1322  buf->tag = newTag;
1323  buf_state &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED |
1324  BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT |
1325  BUF_USAGECOUNT_MASK);
1326  if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM)
1327  buf_state |= BM_TAG_VALID | BM_PERMANENT | BUF_USAGECOUNT_ONE;
1328  else
1329  buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
1330 
1331  UnlockBufHdr(buf, buf_state);
1332 
1333  if (oldPartitionLock != NULL)
1334  {
1335  BufTableDelete(&oldTag, oldHash);
1336  if (oldPartitionLock != newPartitionLock)
1337  LWLockRelease(oldPartitionLock);
1338  }
1339 
1340  LWLockRelease(newPartitionLock);
1341 
1342  /*
1343  * Buffer contents are currently invalid. Try to get the io_in_progress
1344  * lock. If StartBufferIO returns false, then someone else managed to
1345  * read it before we did, so there's nothing left for BufferAlloc() to do.
1346  */
1347  if (StartBufferIO(buf, true))
1348  *foundPtr = false;
1349  else
1350  *foundPtr = true;
1351 
1352  return buf;
1353 }
1354 
1355 /*
1356  * InvalidateBuffer -- mark a shared buffer invalid and return it to the
1357  * freelist.
1358  *
1359  * The buffer header spinlock must be held at entry. We drop it before
1360  * returning. (This is sane because the caller must have locked the
1361  * buffer in order to be sure it should be dropped.)
1362  *
1363  * This is used only in contexts such as dropping a relation. We assume
1364  * that no other backend could possibly be interested in using the page,
1365  * so the only reason the buffer might be pinned is if someone else is
1366  * trying to write it out. We have to let them finish before we can
1367  * reclaim the buffer.
1368  *
1369  * The buffer could get reclaimed by someone else while we are waiting
1370  * to acquire the necessary locks; if so, don't mess it up.
1371  */
1372 static void
1373 InvalidateBuffer(BufferDesc *buf)
1374 {
1375  BufferTag oldTag;
1376  uint32 oldHash; /* hash value for oldTag */
1377  LWLock *oldPartitionLock; /* buffer partition lock for it */
1378  uint32 oldFlags;
1379  uint32 buf_state;
1380 
1381  /* Save the original buffer tag before dropping the spinlock */
1382  oldTag = buf->tag;
1383 
1384  buf_state = pg_atomic_read_u32(&buf->state);
1385  Assert(buf_state & BM_LOCKED);
1386  UnlockBufHdr(buf, buf_state);
1387 
1388  /*
1389  * Need to compute the old tag's hashcode and partition lock ID. XXX is it
1390  * worth storing the hashcode in BufferDesc so we need not recompute it
1391  * here? Probably not.
1392  */
1393  oldHash = BufTableHashCode(&oldTag);
1394  oldPartitionLock = BufMappingPartitionLock(oldHash);
1395 
1396 retry:
1397 
1398  /*
1399  * Acquire exclusive mapping lock in preparation for changing the buffer's
1400  * association.
1401  */
1402  LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
1403 
1404  /* Re-lock the buffer header */
1405  buf_state = LockBufHdr(buf);
1406 
1407  /* If it's changed while we were waiting for lock, do nothing */
1408  if (!BUFFERTAGS_EQUAL(buf->tag, oldTag))
1409  {
1410  UnlockBufHdr(buf, buf_state);
1411  LWLockRelease(oldPartitionLock);
1412  return;
1413  }
1414 
1415  /*
1416  * We assume the only reason for it to be pinned is that someone else is
1417  * flushing the page out. Wait for them to finish. (This could be an
1418  * infinite loop if the refcount is messed up... it would be nice to time
1419  * out after awhile, but there seems no way to be sure how many loops may
1420  * be needed. Note that if the other guy has pinned the buffer but not
1421  * yet done StartBufferIO, WaitIO will fall through and we'll effectively
1422  * be busy-looping here.)
1423  */
1424  if (BUF_STATE_GET_REFCOUNT(buf_state) != 0)
1425  {
1426  UnlockBufHdr(buf, buf_state);
1427  LWLockRelease(oldPartitionLock);
1428  /* safety check: should definitely not be our *own* pin */
1429  if (GetPrivateRefCount(BufferDescriptorGetBuffer(buf)) > 0)
1430  elog(ERROR, "buffer is pinned in InvalidateBuffer");
1431  WaitIO(buf);
1432  goto retry;
1433  }
1434 
1435  /*
1436  * Clear out the buffer's tag and flags. We must do this to ensure that
1437  * linear scans of the buffer array don't think the buffer is valid.
1438  */
1439  oldFlags = buf_state & BUF_FLAG_MASK;
1440  CLEAR_BUFFERTAG(buf->tag);
1441  buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
1442  UnlockBufHdr(buf, buf_state);
1443 
1444  /*
1445  * Remove the buffer from the lookup hashtable, if it was in there.
1446  */
1447  if (oldFlags & BM_TAG_VALID)
1448  BufTableDelete(&oldTag, oldHash);
1449 
1450  /*
1451  * Done with mapping lock.
1452  */
1453  LWLockRelease(oldPartitionLock);
1454 
1455  /*
1456  * Insert the buffer at the head of the list of free buffers.
1457  */
1458  StrategyFreeBuffer(buf);
1459 }
1460 
1461 /*
1462  * MarkBufferDirty
1463  *
1464  * Marks buffer contents as dirty (actual write happens later).
1465  *
1466  * Buffer must be pinned and exclusive-locked. (If caller does not hold
1467  * exclusive lock, then somebody could be in process of writing the buffer,
1468  * leading to risk of bad data written to disk.)
1469  */
1470 void
1471 MarkBufferDirty(Buffer buffer)
1472 {
1473  BufferDesc *bufHdr;
1474  uint32 buf_state;
1475  uint32 old_buf_state;
1476 
1477  if (!BufferIsValid(buffer))
1478  elog(ERROR, "bad buffer ID: %d", buffer);
1479 
1480  if (BufferIsLocal(buffer))
1481  {
1482  MarkLocalBufferDirty(buffer);
1483  return;
1484  }
1485 
1486  bufHdr = GetBufferDescriptor(buffer - 1);
1487 
1488  Assert(BufferIsPinned(buffer));
1489  Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
1490  LW_EXCLUSIVE));
1491 
1492  old_buf_state = pg_atomic_read_u32(&bufHdr->state);
1493  for (;;)
1494  {
1495  if (old_buf_state & BM_LOCKED)
1496  old_buf_state = WaitBufHdrUnlocked(bufHdr);
1497 
1498  buf_state = old_buf_state;
1499 
1500  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
1501  buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
1502 
1503  if (pg_atomic_compare_exchange_u32(&bufHdr->state, &old_buf_state,
1504  buf_state))
1505  break;
1506  }
1507 
1508  /*
1509  * If the buffer was not dirty already, do vacuum accounting.
1510  */
1511  if (!(old_buf_state & BM_DIRTY))
1512  {
1513  VacuumPageDirty++;
1514  pgBufferUsage.shared_blks_dirtied++;
1515  if (VacuumCostActive)
1516  VacuumCostBalance += VacuumCostPageDirty;
1517  }
1518 }
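/*
 * Illustrative sketch (editor's addition): the usual ordering when dirtying
 * a buffer together with WAL logging.  Per the comment above, the buffer
 * must be pinned and exclusively locked before MarkBufferDirty() is called;
 * the WAL record is emitted inside the same critical section and the page
 * LSN is updated from it.
 *
 *     START_CRIT_SECTION();
 *     ... apply the change to the page ...
 *     MarkBufferDirty(buf);
 *     if (RelationNeedsWAL(rel))
 *     {
 *         XLogRecPtr  recptr = ... XLogInsert(...) ...;
 *
 *         PageSetLSN(BufferGetPage(buf), recptr);
 *     }
 *     END_CRIT_SECTION();
 */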
1519 
1520 /*
1521  * ReleaseAndReadBuffer -- combine ReleaseBuffer() and ReadBuffer()
1522  *
1523  * Formerly, this saved one cycle of acquiring/releasing the BufMgrLock
1524  * compared to calling the two routines separately. Now it's mainly just
1525  * a convenience function. However, if the passed buffer is valid and
1526  * already contains the desired block, we just return it as-is; and that
1527  * does save considerable work compared to a full release and reacquire.
1528  *
1529  * Note: it is OK to pass buffer == InvalidBuffer, indicating that no old
1530  * buffer actually needs to be released. This case is the same as ReadBuffer,
1531  * but can save some tests in the caller.
1532  */
1533 Buffer
1534 ReleaseAndReadBuffer(Buffer buffer,
1535  Relation relation,
1536  BlockNumber blockNum)
1537 {
1538  ForkNumber forkNum = MAIN_FORKNUM;
1539  BufferDesc *bufHdr;
1540 
1541  if (BufferIsValid(buffer))
1542  {
1543  Assert(BufferIsPinned(buffer));
1544  if (BufferIsLocal(buffer))
1545  {
1546  bufHdr = GetLocalBufferDescriptor(-buffer - 1);
1547  if (bufHdr->tag.blockNum == blockNum &&
1548  RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node) &&
1549  bufHdr->tag.forkNum == forkNum)
1550  return buffer;
1551  ResourceOwnerForgetBuffer(CurrentResourceOwner, buffer);
1552  LocalRefCount[-buffer - 1]--;
1553  }
1554  else
1555  {
1556  bufHdr = GetBufferDescriptor(buffer - 1);
1557  /* we have pin, so it's ok to examine tag without spinlock */
1558  if (bufHdr->tag.blockNum == blockNum &&
1559  RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node) &&
1560  bufHdr->tag.forkNum == forkNum)
1561  return buffer;
1562  UnpinBuffer(bufHdr, true);
1563  }
1564  }
1565 
1566  return ReadBuffer(relation, blockNum);
1567 }
1568 
1569 /*
1570  * PinBuffer -- make buffer unavailable for replacement.
1571  *
1572  * For the default access strategy, the buffer's usage_count is incremented
1573  * when we first pin it; for other strategies we just make sure the usage_count
1574  * isn't zero. (The idea of the latter is that we don't want synchronized
1575  * heap scans to inflate the count, but we need it to not be zero to discourage
1576  * other backends from stealing buffers from our ring. As long as we cycle
1577  * through the ring faster than the global clock-sweep cycles, buffers in
1578  * our ring won't be chosen as victims for replacement by other backends.)
1579  *
1580  * This should be applied only to shared buffers, never local ones.
1581  *
1582  * Since buffers are pinned/unpinned very frequently, pin buffers without
1583  * taking the buffer header lock; instead update the state variable in loop of
1584  * CAS operations. Hopefully it's just a single CAS.
1585  *
1586  * Note that ResourceOwnerEnlargeBuffers must have been done already.
1587  *
1588  * Returns true if buffer is BM_VALID, else false. This provision allows
1589  * some callers to avoid an extra spinlock cycle.
1590  */
1591 static bool
1592 PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
1593 {
1594  Buffer b = BufferDescriptorGetBuffer(buf);
1595  bool result;
1596  PrivateRefCountEntry *ref;
1597 
1598  ref = GetPrivateRefCountEntry(b, true);
1599 
1600  if (ref == NULL)
1601  {
1602  uint32 buf_state;
1603  uint32 old_buf_state;
1604 
1605  ReservePrivateRefCountEntry();
1606  ref = NewPrivateRefCountEntry(b);
1607 
1608  old_buf_state = pg_atomic_read_u32(&buf->state);
1609  for (;;)
1610  {
1611  if (old_buf_state & BM_LOCKED)
1612  old_buf_state = WaitBufHdrUnlocked(buf);
1613 
1614  buf_state = old_buf_state;
1615 
1616  /* increase refcount */
1617  buf_state += BUF_REFCOUNT_ONE;
1618 
1619  if (strategy == NULL)
1620  {
1621  /* Default case: increase usagecount unless already max. */
1622  if (BUF_STATE_GET_USAGECOUNT(buf_state) != BM_MAX_USAGE_COUNT)
1623  buf_state += BUF_USAGECOUNT_ONE;
1624  }
1625  else
1626  {
1627  /*
1628  * Ring buffers shouldn't evict others from pool. Thus we
1629  * don't make usagecount more than 1.
1630  */
1631  if (BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
1632  buf_state += BUF_USAGECOUNT_ONE;
1633  }
1634 
1635  if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
1636  buf_state))
1637  {
1638  result = (buf_state & BM_VALID) != 0;
1639 
1640  /*
1641  * Assume that we acquired a buffer pin for the purposes of
1642  * Valgrind buffer client checks (even in !result case) to
1643  * keep things simple. Buffers that are unsafe to access are
1644  * not generally guaranteed to be marked undefined or
1645  * non-accessible in any case.
1646  */
1647  VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(buf), BLCKSZ);
1648  break;
1649  }
1650  }
1651  }
1652  else
1653  {
1654  /*
1655  * If we previously pinned the buffer, it must surely be valid.
1656  *
1657  * Note: We deliberately avoid a Valgrind client request here.
1658  * Individual access methods can optionally superimpose buffer page
1659  * client requests on top of our client requests to enforce that
1660  * buffers are only accessed while locked (and pinned). It's possible
1661  * that the buffer page is legitimately non-accessible here. We
1662  * cannot meddle with that.
1663  */
1664  result = true;
1665  }
1666 
1667  ref->refcount++;
1668  Assert(ref->refcount > 0);
1669  ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
1670  return result;
1671 }
1672 
1673 /*
1674  * PinBuffer_Locked -- as above, but caller already locked the buffer header.
1675  * The spinlock is released before return.
1676  *
1677  * As this function is called with the spinlock held, the caller has to
1678  * previously call ReservePrivateRefCountEntry().
1679  *
1680  * Currently, no callers of this function want to modify the buffer's
1681  * usage_count at all, so there's no need for a strategy parameter.
1682  * Also we don't bother with a BM_VALID test (the caller could check that for
1683  * itself).
1684  *
1685  * Also all callers only ever use this function when it's known that the
1686  * buffer can't have a preexisting pin by this backend. That allows us to skip
1687  * searching the private refcount array & hash, which is a boon, because the
1688  * spinlock is still held.
1689  *
1690  * Note: use of this routine is frequently mandatory, not just an optimization
1691  * to save a spin lock/unlock cycle, because we need to pin a buffer before
1692  * its state can change under us.
1693  */
1694 static void
1695 PinBuffer_Locked(BufferDesc *buf)
1696 {
1697  Buffer b;
1698  PrivateRefCountEntry *ref;
1699  uint32 buf_state;
1700 
1701  /*
1702  * As explained, we don't expect any preexisting pins. That allows us to
1703  * manipulate the PrivateRefCount after releasing the spinlock
1704  */
1705  Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
1706 
1707  /*
1708  * Buffer can't have a preexisting pin, so mark its page as defined to
1709  * Valgrind (this is similar to the PinBuffer() case where the backend
1710  * doesn't already have a buffer pin)
1711  */
1712  VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(buf), BLCKSZ);
1713 
1714  /*
1715  * Since we hold the buffer spinlock, we can update the buffer state and
1716  * release the lock in one operation.
1717  */
1718  buf_state = pg_atomic_read_u32(&buf->state);
1719  Assert(buf_state & BM_LOCKED);
1720  buf_state += BUF_REFCOUNT_ONE;
1721  UnlockBufHdr(buf, buf_state);
1722 
1723  b = BufferDescriptorGetBuffer(buf);
1724 
1725  ref = NewPrivateRefCountEntry(b);
1726  ref->refcount++;
1727 
1728  ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
1729 }
1730 
1731 /*
1732  * UnpinBuffer -- make buffer available for replacement.
1733  *
1734  * This should be applied only to shared buffers, never local ones.
1735  *
1736  * Most but not all callers want CurrentResourceOwner to be adjusted.
1737  * Those that don't should pass fixOwner = false.
1738  */
1739 static void
1740 UnpinBuffer(BufferDesc *buf, bool fixOwner)
1741 {
1742  PrivateRefCountEntry *ref;
1743  Buffer b = BufferDescriptorGetBuffer(buf);
1744 
1745  /* not moving as we're likely deleting it soon anyway */
1746  ref = GetPrivateRefCountEntry(b, false);
1747  Assert(ref != NULL);
1748 
1749  if (fixOwner)
1750  ResourceOwnerForgetBuffer(CurrentResourceOwner, b);
1751 
1752  Assert(ref->refcount > 0);
1753  ref->refcount--;
1754  if (ref->refcount == 0)
1755  {
1756  uint32 buf_state;
1757  uint32 old_buf_state;
1758 
1759  /*
1760  * Mark buffer non-accessible to Valgrind.
1761  *
1762  * Note that the buffer may have already been marked non-accessible
1763  * within access method code that enforces that buffers are only
1764  * accessed while a buffer lock is held.
1765  */
1766  VALGRIND_MAKE_MEM_NOACCESS(BufHdrGetBlock(buf), BLCKSZ);
1767 
1768  /* I'd better not still hold any locks on the buffer */
1769  Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf)));
1770  Assert(!LWLockHeldByMe(BufferDescriptorGetIOLock(buf)));
1771 
1772  /*
1773  * Decrement the shared reference count.
1774  *
1775  * Since buffer spinlock holder can update status using just write,
1776  * it's not safe to use atomic decrement here; thus use a CAS loop.
1777  */
1778  old_buf_state = pg_atomic_read_u32(&buf->state);
1779  for (;;)
1780  {
1781  if (old_buf_state & BM_LOCKED)
1782  old_buf_state = WaitBufHdrUnlocked(buf);
1783 
1784  buf_state = old_buf_state;
1785 
1786  buf_state -= BUF_REFCOUNT_ONE;
1787 
1788  if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
1789  buf_state))
1790  break;
1791  }
1792 
1793  /* Support LockBufferForCleanup() */
1794  if (buf_state & BM_PIN_COUNT_WAITER)
1795  {
1796  /*
1797  * Acquire the buffer header lock, re-check that there's a waiter.
1798  * Another backend could have unpinned this buffer, and already
1799  * woken up the waiter. There's no danger of the buffer being
1800  * replaced after we unpinned it above, as it's pinned by the
1801  * waiter.
1802  */
1803  buf_state = LockBufHdr(buf);
1804 
1805  if ((buf_state & BM_PIN_COUNT_WAITER) &&
1806  BUF_STATE_GET_REFCOUNT(buf_state) == 1)
1807  {
1808  /* we just released the last pin other than the waiter's */
1809  int wait_backend_pid = buf->wait_backend_pid;
1810 
1811  buf_state &= ~BM_PIN_COUNT_WAITER;
1812  UnlockBufHdr(buf, buf_state);
1813  ProcSendSignal(wait_backend_pid);
1814  }
1815  else
1816  UnlockBufHdr(buf, buf_state);
1817  }
1819  }
1820 }
1821 
1822 /*
1823  * BufferSync -- Write out all dirty buffers in the pool.
1824  *
1825  * This is called at checkpoint time to write out all dirty shared buffers.
1826  * The checkpoint request flags should be passed in. If CHECKPOINT_IMMEDIATE
1827  * is set, we disable delays between writes; if CHECKPOINT_IS_SHUTDOWN,
1828  * CHECKPOINT_END_OF_RECOVERY or CHECKPOINT_FLUSH_ALL is set, we write even
1829  * unlogged buffers, which are otherwise skipped. The remaining flags
1830  * currently have no effect here.
1831  */
1832 static void
1833 BufferSync(int flags)
1834 {
1835  uint32 buf_state;
1836  int buf_id;
1837  int num_to_scan;
1838  int num_spaces;
1839  int num_processed;
1840  int num_written;
1841  CkptTsStatus *per_ts_stat = NULL;
1842  Oid last_tsid;
1843  binaryheap *ts_heap;
1844  int i;
1845  int mask = BM_DIRTY;
1846  WritebackContext wb_context;
1847 
1848  /* Make sure we can handle the pin inside SyncOneBuffer */
1850 
1851  /*
1852  * Unless this is a shutdown checkpoint or we have been explicitly told,
1853  * we write only permanent, dirty buffers. But at shutdown or end of
1854  * recovery, we write all dirty buffers.
1855  */
1858  mask |= BM_PERMANENT;
1859 
1860  /*
1861  * Loop over all buffers, and mark the ones that need to be written with
1862  * BM_CHECKPOINT_NEEDED. Count them as we go (num_to_scan), so that we
1863  * can estimate how much work needs to be done.
1864  *
1865  * This allows us to write only those pages that were dirty when the
1866  * checkpoint began, and not those that get dirtied while it proceeds.
1867  * Whenever a page with BM_CHECKPOINT_NEEDED is written out, either by us
1868  * later in this function, or by normal backends or the bgwriter cleaning
1869  * scan, the flag is cleared. Any buffer dirtied after this point won't
1870  * have the flag set.
1871  *
1872  * Note that if we fail to write some buffer, we may leave buffers with
1873  * BM_CHECKPOINT_NEEDED still set. This is OK since any such buffer would
1874  * certainly need to be written for the next checkpoint attempt, too.
1875  */
1876  num_to_scan = 0;
1877  for (buf_id = 0; buf_id < NBuffers; buf_id++)
1878  {
1879  BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
1880 
1881  /*
1882  * Header spinlock is enough to examine BM_DIRTY, see comment in
1883  * SyncOneBuffer.
1884  */
1885  buf_state = LockBufHdr(bufHdr);
1886 
1887  if ((buf_state & mask) == mask)
1888  {
1889  CkptSortItem *item;
1890 
1891  buf_state |= BM_CHECKPOINT_NEEDED;
1892 
1893  item = &CkptBufferIds[num_to_scan++];
1894  item->buf_id = buf_id;
1895  item->tsId = bufHdr->tag.rnode.spcNode;
1896  item->relNode = bufHdr->tag.rnode.relNode;
1897  item->forkNum = bufHdr->tag.forkNum;
1898  item->blockNum = bufHdr->tag.blockNum;
1899  }
1900 
1901  UnlockBufHdr(bufHdr, buf_state);
1902 
1903  /* Check for barrier events in case NBuffers is large. */
1906  }
1907 
1908  if (num_to_scan == 0)
1909  return; /* nothing to do */
1910 
1912 
1913  TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_scan);
1914 
1915  /*
1916  * Sort buffers that need to be written to reduce the likelihood of random
1917  * IO. The sorting is also important for the implementation of balancing
1918  * writes between tablespaces. Without balancing writes we'd potentially
1919  * end up writing to the tablespaces one-by-one, possibly overloading the
1920  * underlying system.
1921  */
1922  qsort(CkptBufferIds, num_to_scan, sizeof(CkptSortItem),
1924 
1925  num_spaces = 0;
1926 
1927  /*
1928  * Allocate progress status for each tablespace with buffers that need to
1929  * be flushed. This requires the to-be-flushed array to be sorted.
1930  */
1931  last_tsid = InvalidOid;
1932  for (i = 0; i < num_to_scan; i++)
1933  {
1934  CkptTsStatus *s;
1935  Oid cur_tsid;
1936 
1937  cur_tsid = CkptBufferIds[i].tsId;
1938 
1939  /*
1940  * Grow array of per-tablespace status structs, every time a new
1941  * tablespace is found.
1942  */
1943  if (last_tsid == InvalidOid || last_tsid != cur_tsid)
1944  {
1945  Size sz;
1946 
1947  num_spaces++;
1948 
1949  /*
1950  * Not worth adding grow-by-power-of-2 logic here - even with a
1951  * few hundred tablespaces this should be fine.
1952  */
1953  sz = sizeof(CkptTsStatus) * num_spaces;
1954 
1955  if (per_ts_stat == NULL)
1956  per_ts_stat = (CkptTsStatus *) palloc(sz);
1957  else
1958  per_ts_stat = (CkptTsStatus *) repalloc(per_ts_stat, sz);
1959 
1960  s = &per_ts_stat[num_spaces - 1];
1961  memset(s, 0, sizeof(*s));
1962  s->tsId = cur_tsid;
1963 
1964  /*
1965  * The first buffer in this tablespace. As CkptBufferIds is sorted
1966  * by tablespace all (s->num_to_scan) buffers in this tablespace
1967  * will follow afterwards.
1968  */
1969  s->index = i;
1970 
1971  /*
1972  * progress_slice will be determined once we know how many buffers
1973  * are in each tablespace, i.e. after this loop.
1974  */
1975 
1976  last_tsid = cur_tsid;
1977  }
1978  else
1979  {
1980  s = &per_ts_stat[num_spaces - 1];
1981  }
1982 
1983  s->num_to_scan++;
1984 
1985  /* Check for barrier events. */
1988  }
1989 
1990  Assert(num_spaces > 0);
1991 
1992  /*
1993  * Build a min-heap over the write-progress in the individual tablespaces,
1994  * and compute how large a portion of the total progress a single
1995  * processed buffer is.
1996  */
1997  ts_heap = binaryheap_allocate(num_spaces,
1999  NULL);
2000 
2001  for (i = 0; i < num_spaces; i++)
2002  {
2003  CkptTsStatus *ts_stat = &per_ts_stat[i];
2004 
2005  ts_stat->progress_slice = (float8) num_to_scan / ts_stat->num_to_scan;
2006 
2007  binaryheap_add_unordered(ts_heap, PointerGetDatum(ts_stat));
2008  }
2009 
2010  binaryheap_build(ts_heap);
2011 
2012  /*
2013  * Iterate through to-be-checkpointed buffers and write the ones (still)
2014  * marked with BM_CHECKPOINT_NEEDED. The writes are balanced between
2015  * tablespaces; otherwise the sorting would lead to only one tablespace
2016  * receiving writes at a time, making inefficient use of the hardware.
2017  */
2018  num_processed = 0;
2019  num_written = 0;
2020  while (!binaryheap_empty(ts_heap))
2021  {
2022  BufferDesc *bufHdr = NULL;
2023  CkptTsStatus *ts_stat = (CkptTsStatus *)
2025 
2026  buf_id = CkptBufferIds[ts_stat->index].buf_id;
2027  Assert(buf_id != -1);
2028 
2029  bufHdr = GetBufferDescriptor(buf_id);
2030 
2031  num_processed++;
2032 
2033  /*
2034  * We don't need to acquire the lock here, because we're only looking
2035  * at a single bit. It's possible that someone else writes the buffer
2036  * and clears the flag right after we check, but that doesn't matter
2037  * since SyncOneBuffer will then do nothing. However, there is a
2038  * further race condition: it's conceivable that between the time we
2039  * examine the bit here and the time SyncOneBuffer acquires the lock,
2040  * someone else not only wrote the buffer but replaced it with another
2041  * page and dirtied it. In that improbable case, SyncOneBuffer will
2042  * write the buffer though we didn't need to. It doesn't seem worth
2043  * guarding against this, though.
2044  */
2046  {
2047  if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
2048  {
2049  TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
2051  num_written++;
2052  }
2053  }
2054 
2055  /*
2056  * Measure progress independently of whether we actually had to flush the
2057  * buffer - otherwise the writes would become unbalanced.
2058  */
2059  ts_stat->progress += ts_stat->progress_slice;
2060  ts_stat->num_scanned++;
2061  ts_stat->index++;
2062 
2063  /* Have all the buffers from the tablespace been processed? */
2064  if (ts_stat->num_scanned == ts_stat->num_to_scan)
2065  {
2066  binaryheap_remove_first(ts_heap);
2067  }
2068  else
2069  {
2070  /* update heap with the new progress */
2071  binaryheap_replace_first(ts_heap, PointerGetDatum(ts_stat));
2072  }
2073 
2074  /*
2075  * Sleep to throttle our I/O rate.
2076  *
2077  * (This will check for barrier events even if it doesn't sleep.)
2078  */
2079  CheckpointWriteDelay(flags, (double) num_processed / num_to_scan);
2080  }
2081 
2082  /* issue all pending flushes */
2083  IssuePendingWritebacks(&wb_context);
2084 
2085  pfree(per_ts_stat);
2086  per_ts_stat = NULL;
2087  binaryheap_free(ts_heap);
2088 
2089  /*
2090  * Update checkpoint statistics. As noted above, this doesn't include
2091  * buffers written by other backends or bgwriter scan.
2092  */
2093  CheckpointStats.ckpt_bufs_written += num_written;
2094 
2095  TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_scan);
2096 }
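
To make the balancing above concrete with purely illustrative numbers: if num_to_scan is 1000 and two tablespaces own 900 and 100 of those buffers, their progress_slice values are 1000/900 ≈ 1.11 and 1000/100 = 10. Every buffer written advances its tablespace's progress by its slice, so both tablespaces approach the common target of 1000 in proportion to their share of the work, and the min-heap always hands the next write to whichever tablespace is furthest behind. That is what keeps the sorted-by-tablespace write stream interleaved instead of draining one tablespace at a time.
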
2097 
2098 /*
2099  * BgBufferSync -- Write out some dirty buffers in the pool.
2100  *
2101  * This is called periodically by the background writer process.
2102  *
2103  * Returns true if it's appropriate for the bgwriter process to go into
2104  * low-power hibernation mode. (This happens if the strategy clock sweep
2105  * has been "lapped" and no buffer allocations have occurred recently,
2106  * or if the bgwriter has been effectively disabled by setting
2107  * bgwriter_lru_maxpages to 0.)
2108  */
2109 bool
2110 BgBufferSync(WritebackContext *wb_context)
2111 {
2112  /* info obtained from freelist.c */
2113  int strategy_buf_id;
2114  uint32 strategy_passes;
2115  uint32 recent_alloc;
2116 
2117  /*
2118  * Information saved between calls so we can determine the strategy
2119  * point's advance rate and avoid scanning already-cleaned buffers.
2120  */
2121  static bool saved_info_valid = false;
2122  static int prev_strategy_buf_id;
2123  static uint32 prev_strategy_passes;
2124  static int next_to_clean;
2125  static uint32 next_passes;
2126 
2127  /* Moving averages of allocation rate and clean-buffer density */
2128  static float smoothed_alloc = 0;
2129  static float smoothed_density = 10.0;
2130 
2131  /* Potentially these could be tunables, but for now, not */
2132  float smoothing_samples = 16;
2133  float scan_whole_pool_milliseconds = 120000.0;
2134 
2135  /* Used to compute how far we scan ahead */
2136  long strategy_delta;
2137  int bufs_to_lap;
2138  int bufs_ahead;
2139  float scans_per_alloc;
2140  int reusable_buffers_est;
2141  int upcoming_alloc_est;
2142  int min_scan_buffers;
2143 
2144  /* Variables for the scanning loop proper */
2145  int num_to_scan;
2146  int num_written;
2147  int reusable_buffers;
2148 
2149  /* Variables for final smoothed_density update */
2150  long new_strategy_delta;
2151  uint32 new_recent_alloc;
2152 
2153  /*
2154  * Find out where the freelist clock sweep currently is, and how many
2155  * buffer allocations have happened since our last call.
2156  */
2157  strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc);
2158 
2159  /* Report buffer alloc counts to pgstat */
2160  BgWriterStats.m_buf_alloc += recent_alloc;
2161 
2162  /*
2163  * If we're not running the LRU scan, just stop after doing the stats
2164  * stuff. We mark the saved state invalid so that we can recover sanely
2165  * if LRU scan is turned back on later.
2166  */
2167  if (bgwriter_lru_maxpages <= 0)
2168  {
2169  saved_info_valid = false;
2170  return true;
2171  }
2172 
2173  /*
2174  * Compute strategy_delta = how many buffers have been scanned by the
2175  * clock sweep since last time. If first time through, assume none. Then
2176  * see if we are still ahead of the clock sweep, and if so, how many
2177  * buffers we could scan before we'd catch up with it and "lap" it. Note:
2178  * the weird-looking coding of the xxx_passes comparisons is to avoid bogus
2179  * behavior when the passes counts wrap around.
2180  */
2181  if (saved_info_valid)
2182  {
2183  int32 passes_delta = strategy_passes - prev_strategy_passes;
2184 
2185  strategy_delta = strategy_buf_id - prev_strategy_buf_id;
2186  strategy_delta += (long) passes_delta * NBuffers;
2187 
2188  Assert(strategy_delta >= 0);
2189 
2190  if ((int32) (next_passes - strategy_passes) > 0)
2191  {
2192  /* we're one pass ahead of the strategy point */
2193  bufs_to_lap = strategy_buf_id - next_to_clean;
2194 #ifdef BGW_DEBUG
2195  elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
2196  next_passes, next_to_clean,
2197  strategy_passes, strategy_buf_id,
2198  strategy_delta, bufs_to_lap);
2199 #endif
2200  }
2201  else if (next_passes == strategy_passes &&
2202  next_to_clean >= strategy_buf_id)
2203  {
2204  /* on same pass, but ahead or at least not behind */
2205  bufs_to_lap = NBuffers - (next_to_clean - strategy_buf_id);
2206 #ifdef BGW_DEBUG
2207  elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
2208  next_passes, next_to_clean,
2209  strategy_passes, strategy_buf_id,
2210  strategy_delta, bufs_to_lap);
2211 #endif
2212  }
2213  else
2214  {
2215  /*
2216  * We're behind, so skip forward to the strategy point and start
2217  * cleaning from there.
2218  */
2219 #ifdef BGW_DEBUG
2220  elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
2221  next_passes, next_to_clean,
2222  strategy_passes, strategy_buf_id,
2223  strategy_delta);
2224 #endif
2225  next_to_clean = strategy_buf_id;
2226  next_passes = strategy_passes;
2227  bufs_to_lap = NBuffers;
2228  }
2229  }
2230  else
2231  {
2232  /*
2233  * Initializing at startup or after LRU scanning had been off. Always
2234  * start at the strategy point.
2235  */
2236 #ifdef BGW_DEBUG
2237  elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
2238  strategy_passes, strategy_buf_id);
2239 #endif
2240  strategy_delta = 0;
2241  next_to_clean = strategy_buf_id;
2242  next_passes = strategy_passes;
2243  bufs_to_lap = NBuffers;
2244  }
2245 
2246  /* Update saved info for next time */
2247  prev_strategy_buf_id = strategy_buf_id;
2248  prev_strategy_passes = strategy_passes;
2249  saved_info_valid = true;
2250 
2251  /*
2252  * Compute how many buffers had to be scanned for each new allocation, ie,
2253  * 1/density of reusable buffers, and track a moving average of that.
2254  *
2255  * If the strategy point didn't move, we don't update the density estimate.
2256  */
2257  if (strategy_delta > 0 && recent_alloc > 0)
2258  {
2259  scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
2260  smoothed_density += (scans_per_alloc - smoothed_density) /
2261  smoothing_samples;
2262  }
2263 
2264  /*
2265  * Estimate how many reusable buffers there are between the current
2266  * strategy point and where we've scanned ahead to, based on the smoothed
2267  * density estimate.
2268  */
2269  bufs_ahead = NBuffers - bufs_to_lap;
2270  reusable_buffers_est = (float) bufs_ahead / smoothed_density;
2271 
2272  /*
2273  * Track a moving average of recent buffer allocations. Here, rather than
2274  * a true average we want a fast-attack, slow-decline behavior: we
2275  * immediately follow any increase.
2276  */
2277  if (smoothed_alloc <= (float) recent_alloc)
2278  smoothed_alloc = recent_alloc;
2279  else
2280  smoothed_alloc += ((float) recent_alloc - smoothed_alloc) /
2281  smoothing_samples;
2282 
2283  /* Scale the estimate by a GUC to allow more aggressive tuning. */
2284  upcoming_alloc_est = (int) (smoothed_alloc * bgwriter_lru_multiplier);
2285 
2286  /*
2287  * If recent_alloc remains at zero for many cycles, smoothed_alloc will
2288  * eventually underflow to zero, and the underflows produce annoying
2289  * kernel warnings on some platforms. Once upcoming_alloc_est has gone to
2290  * zero, there's no point in tracking smaller and smaller values of
2291  * smoothed_alloc, so just reset it to exactly zero to avoid this
2292  * syndrome. It will pop back up as soon as recent_alloc increases.
2293  */
2294  if (upcoming_alloc_est == 0)
2295  smoothed_alloc = 0;
2296 
2297  /*
2298  * Even in cases where there's been little or no buffer allocation
2299  * activity, we want to make a small amount of progress through the buffer
2300  * cache so that as many reusable buffers as possible are clean after an
2301  * idle period.
2302  *
2303  * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many times
2304  * the BGW will be called during the scan_whole_pool time; slice the
2305  * buffer pool into that many sections.
2306  */
2307  min_scan_buffers = (int) (NBuffers / (scan_whole_pool_milliseconds / BgWriterDelay));
2308 
2309  if (upcoming_alloc_est < (min_scan_buffers + reusable_buffers_est))
2310  {
2311 #ifdef BGW_DEBUG
2312  elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
2313  upcoming_alloc_est, min_scan_buffers, reusable_buffers_est);
2314 #endif
2315  upcoming_alloc_est = min_scan_buffers + reusable_buffers_est;
2316  }
2317 
2318  /*
2319  * Now write out dirty reusable buffers, working forward from the
2320  * next_to_clean point, until we have lapped the strategy scan, or cleaned
2321  * enough buffers to match our estimate of the next cycle's allocation
2322  * requirements, or hit the bgwriter_lru_maxpages limit.
2323  */
2324 
2325  /* Make sure we can handle the pin inside SyncOneBuffer */
2327 
2328  num_to_scan = bufs_to_lap;
2329  num_written = 0;
2330  reusable_buffers = reusable_buffers_est;
2331 
2332  /* Execute the LRU scan */
2333  while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
2334  {
2335  int sync_state = SyncOneBuffer(next_to_clean, true,
2336  wb_context);
2337 
2338  if (++next_to_clean >= NBuffers)
2339  {
2340  next_to_clean = 0;
2341  next_passes++;
2342  }
2343  num_to_scan--;
2344 
2345  if (sync_state & BUF_WRITTEN)
2346  {
2347  reusable_buffers++;
2348  if (++num_written >= bgwriter_lru_maxpages)
2349  {
2351  break;
2352  }
2353  }
2354  else if (sync_state & BUF_REUSABLE)
2355  reusable_buffers++;
2356  }
2357 
2358  BgWriterStats.m_buf_written_clean += num_written;
2359 
2360 #ifdef BGW_DEBUG
2361  elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
2362  recent_alloc, smoothed_alloc, strategy_delta, bufs_ahead,
2363  smoothed_density, reusable_buffers_est, upcoming_alloc_est,
2364  bufs_to_lap - num_to_scan,
2365  num_written,
2366  reusable_buffers - reusable_buffers_est);
2367 #endif
2368 
2369  /*
2370  * Consider the above scan as being like a new allocation scan.
2371  * Characterize its density and update the smoothed one based on it. This
2372  * effectively halves the moving average period in cases where both the
2373  * strategy and the background writer are doing some useful scanning,
2374  * which is helpful because a long memory isn't as desirable on the
2375  * density estimates.
2376  */
2377  new_strategy_delta = bufs_to_lap - num_to_scan;
2378  new_recent_alloc = reusable_buffers - reusable_buffers_est;
2379  if (new_strategy_delta > 0 && new_recent_alloc > 0)
2380  {
2381  scans_per_alloc = (float) new_strategy_delta / (float) new_recent_alloc;
2382  smoothed_density += (scans_per_alloc - smoothed_density) /
2383  smoothing_samples;
2384 
2385 #ifdef BGW_DEBUG
2386  elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f",
2387  new_recent_alloc, new_strategy_delta,
2388  scans_per_alloc, smoothed_density);
2389 #endif
2390  }
2391 
2392  /* Return true if OK to hibernate */
2393  return (bufs_to_lap == 0 && recent_alloc == 0);
2394 }
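
For a sense of the arithmetic above, with purely illustrative values: NBuffers = 16384 and BgWriterDelay = 200 ms give min_scan_buffers = 16384 / (120000 / 200) ≈ 27, so even an idle system cleans roughly 27 buffers per round. If the smoothed allocation rate is 500 buffers per round and bgwriter_lru_multiplier is 2.0, upcoming_alloc_est is 1000, and the scan continues until about 1000 reusable buffers sit between the strategy point and next_to_clean, or the scan laps the clock sweep, or bgwriter_lru_maxpages writes have been issued.
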
2395 
2396 /*
2397  * SyncOneBuffer -- process a single buffer during syncing.
2398  *
2399  * If skip_recently_used is true, we don't write currently-pinned buffers, nor
2400  * buffers marked recently used, as these are not replacement candidates.
2401  *
2402  * Returns a bitmask containing the following flag bits:
2403  * BUF_WRITTEN: we wrote the buffer.
2404  * BUF_REUSABLE: buffer is available for replacement, ie, it has
2405  * pin count 0 and usage count 0.
2406  *
2407  * (BUF_WRITTEN could be set in error if FlushBuffer finds the buffer clean
2408  * after locking it, but we don't care all that much.)
2409  *
2410  * Note: caller must have done ResourceOwnerEnlargeBuffers.
2411  */
2412 static int
2413 SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
2414 {
2415  BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
2416  int result = 0;
2417  uint32 buf_state;
2418  BufferTag tag;
2419 
2421 
2422  /*
2423  * Check whether buffer needs writing.
2424  *
2425  * We can make this check without taking the buffer content lock so long
2426  * as we mark pages dirty in access methods *before* logging changes with
2427  * XLogInsert(): if someone marks the buffer dirty just after our check we
2428  * don't worry, because our checkpoint.redo points before the log record for
2429  * the upcoming changes and so we are not required to write such a dirty buffer.
2430  */
2431  buf_state = LockBufHdr(bufHdr);
2432 
2433  if (BUF_STATE_GET_REFCOUNT(buf_state) == 0 &&
2434  BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
2435  {
2436  result |= BUF_REUSABLE;
2437  }
2438  else if (skip_recently_used)
2439  {
2440  /* Caller told us not to write recently-used buffers */
2441  UnlockBufHdr(bufHdr, buf_state);
2442  return result;
2443  }
2444 
2445  if (!(buf_state & BM_VALID) || !(buf_state & BM_DIRTY))
2446  {
2447  /* It's clean, so nothing to do */
2448  UnlockBufHdr(bufHdr, buf_state);
2449  return result;
2450  }
2451 
2452  /*
2453  * Pin it, share-lock it, write it. (FlushBuffer will do nothing if the
2454  * buffer is clean by the time we've locked it.)
2455  */
2456  PinBuffer_Locked(bufHdr);
2458 
2459  FlushBuffer(bufHdr, NULL);
2460 
2462 
2463  tag = bufHdr->tag;
2464 
2465  UnpinBuffer(bufHdr, true);
2466 
2467  ScheduleBufferTagForWriteback(wb_context, &tag);
2468 
2469  return result | BUF_WRITTEN;
2470 }
2471 
2472 /*
2473  * AtEOXact_Buffers - clean up at end of transaction.
2474  *
2475  * As of PostgreSQL 8.0, buffer pins should get released by the
2476  * ResourceOwner mechanism. This routine is just a debugging
2477  * cross-check that no pins remain.
2478  */
2479 void
2480 AtEOXact_Buffers(bool isCommit)
2481 {
2483 
2484  AtEOXact_LocalBuffers(isCommit);
2485 
2487 }
2488 
2489 /*
2490  * Initialize access to shared buffer pool
2491  *
2492  * This is called during backend startup (whether standalone or under the
2493  * postmaster). It sets up for this backend's access to the already-existing
2494  * buffer pool.
2495  *
2496  * NB: this is called before InitProcess(), so we do not have a PGPROC and
2497  * cannot do LWLockAcquire; hence we can't actually access stuff in
2498  * shared memory yet. We are only initializing local data here.
2499  * (See also InitBufferPoolBackend)
2500  */
2501 void
2502 InitBufferPoolAccess(void)
2503 {
2504  HASHCTL hash_ctl;
2505 
2506  memset(&PrivateRefCountArray, 0, sizeof(PrivateRefCountArray));
2507 
2508  MemSet(&hash_ctl, 0, sizeof(hash_ctl));
2509  hash_ctl.keysize = sizeof(int32);
2510  hash_ctl.entrysize = sizeof(PrivateRefCountEntry);
2511 
2512  PrivateRefCountHash = hash_create("PrivateRefCount", 100, &hash_ctl,
2513  HASH_ELEM | HASH_BLOBS);
2514 }
2515 
2516 /*
2517  * InitBufferPoolBackend --- second-stage initialization of a new backend
2518  *
2519  * This is called after we have acquired a PGPROC and so can safely get
2520  * LWLocks. We don't currently need to do anything at this stage ...
2521  * except register a shmem-exit callback. AtProcExit_Buffers needs LWLock
2522  * access, and thereby has to be called at the corresponding phase of
2523  * backend shutdown.
2524  */
2525 void
2526 InitBufferPoolBackend(void)
2527 {
2529 }
2530 
2531 /*
2532  * During backend exit, ensure that we released all shared-buffer locks and
2533  * assert that we have no remaining pins.
2534  */
2535 static void
2536 AtProcExit_Buffers(int code, Datum arg)
2537 {
2538  AbortBufferIO();
2539  UnlockBuffers();
2540 
2542 
2543  /* localbuf.c needs a chance too */
2545 }
2546 
2547 /*
2548  * CheckForBufferLeaks - ensure this backend holds no buffer pins
2549  *
2550  * As of PostgreSQL 8.0, buffer pins should get released by the
2551  * ResourceOwner mechanism. This routine is just a debugging
2552  * cross-check that no pins remain.
2553  */
2554 static void
2555 CheckForBufferLeaks(void)
2556 {
2557 #ifdef USE_ASSERT_CHECKING
2558  int RefCountErrors = 0;
2559  PrivateRefCountEntry *res;
2560  int i;
2561 
2562  /* check the array */
2563  for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
2564  {
2565  res = &PrivateRefCountArray[i];
2566 
2567  if (res->buffer != InvalidBuffer)
2568  {
2570  RefCountErrors++;
2571  }
2572  }
2573 
2574  /* if necessary search the hash */
2576  {
2577  HASH_SEQ_STATUS hstat;
2578 
2579  hash_seq_init(&hstat, PrivateRefCountHash);
2580  while ((res = (PrivateRefCountEntry *) hash_seq_search(&hstat)) != NULL)
2581  {
2583  RefCountErrors++;
2584  }
2585 
2586  }
2587 
2588  Assert(RefCountErrors == 0);
2589 #endif
2590 }
2591 
2592 /*
2593  * Helper routine to issue warnings when a buffer is unexpectedly pinned
2594  */
2595 void
2596 PrintBufferLeakWarning(Buffer buffer)
2597 {
2598  BufferDesc *buf;
2599  int32 loccount;
2600  char *path;
2601  BackendId backend;
2602  uint32 buf_state;
2603 
2604  Assert(BufferIsValid(buffer));
2605  if (BufferIsLocal(buffer))
2606  {
2607  buf = GetLocalBufferDescriptor(-buffer - 1);
2608  loccount = LocalRefCount[-buffer - 1];
2609  backend = MyBackendId;
2610  }
2611  else
2612  {
2613  buf = GetBufferDescriptor(buffer - 1);
2614  loccount = GetPrivateRefCount(buffer);
2615  backend = InvalidBackendId;
2616  }
2617 
2618  /* theoretically we should lock the bufhdr here */
2619  path = relpathbackend(buf->tag.rnode, backend, buf->tag.forkNum);
2620  buf_state = pg_atomic_read_u32(&buf->state);
2621  elog(WARNING,
2622  "buffer refcount leak: [%03d] "
2623  "(rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
2624  buffer, path,
2625  buf->tag.blockNum, buf_state & BUF_FLAG_MASK,
2626  BUF_STATE_GET_REFCOUNT(buf_state), loccount);
2627  pfree(path);
2628 }
2629 
2630 /*
2631  * CheckPointBuffers
2632  *
2633  * Flush all dirty blocks in buffer pool to disk at checkpoint time.
2634  *
2635  * Note: temporary relations do not participate in checkpoints, so they don't
2636  * need to be flushed.
2637  */
2638 void
2639 CheckPointBuffers(int flags)
2640 {
2641  BufferSync(flags);
2642 }
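
A minimal sketch (not from bufmgr.c itself) of how checkpoint code might drive this entry point; the helper name and the flag combination are illustrative only, while the real caller, CheckPointGuts() in xlog.c, simply passes along whatever flags the checkpoint request carried.

#include "postgres.h"

#include "access/xlog.h"        /* CHECKPOINT_* flag bits */
#include "storage/bufmgr.h"

/* Write out every dirty shared buffer now, with no delay between writes. */
static void
example_immediate_checkpoint_flush(void)
{
    CheckPointBuffers(CHECKPOINT_IMMEDIATE | CHECKPOINT_FLUSH_ALL);
}
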
2643 
2644 
2645 /*
2646  * Do whatever is needed to prepare for commit at the bufmgr and smgr levels
2647  */
2648 void
2649 BufmgrCommit(void)
2650 {
2651  /* Nothing to do in bufmgr anymore... */
2652 }
2653 
2654 /*
2655  * BufferGetBlockNumber
2656  * Returns the block number associated with a buffer.
2657  *
2658  * Note:
2659  * Assumes that the buffer is valid and pinned, else the
2660  * value may be obsolete immediately...
2661  */
2662 BlockNumber
2663 BufferGetBlockNumber(Buffer buffer)
2664 {
2665  BufferDesc *bufHdr;
2666 
2667  Assert(BufferIsPinned(buffer));
2668 
2669  if (BufferIsLocal(buffer))
2670  bufHdr = GetLocalBufferDescriptor(-buffer - 1);
2671  else
2672  bufHdr = GetBufferDescriptor(buffer - 1);
2673 
2674  /* pinned, so OK to read tag without spinlock */
2675  return bufHdr->tag.blockNum;
2676 }
2677 
2678 /*
2679  * BufferGetTag
2680  * Returns the relfilenode, fork number and block number associated with
2681  * a buffer.
2682  */
2683 void
2684 BufferGetTag(Buffer buffer, RelFileNode *rnode, ForkNumber *forknum,
2685  BlockNumber *blknum)
2686 {
2687  BufferDesc *bufHdr;
2688 
2689  /* Do the same checks as BufferGetBlockNumber. */
2690  Assert(BufferIsPinned(buffer));
2691 
2692  if (BufferIsLocal(buffer))
2693  bufHdr = GetLocalBufferDescriptor(-buffer - 1);
2694  else
2695  bufHdr = GetBufferDescriptor(buffer - 1);
2696 
2697  /* pinned, so OK to read tag without spinlock */
2698  *rnode = bufHdr->tag.rnode;
2699  *forknum = bufHdr->tag.forkNum;
2700  *blknum = bufHdr->tag.blockNum;
2701 }
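
A minimal usage sketch (not from bufmgr.c itself), assuming the caller already holds a pin as the two functions above require; example_report_buffer_identity and its DEBUG1 message are hypothetical.

#include "postgres.h"

#include "storage/bufmgr.h"

static void
example_report_buffer_identity(Buffer buf)
{
    RelFileNode rnode;
    ForkNumber  forknum;
    BlockNumber blknum;

    /* Pinned, so the tag is stable and no spinlock is needed. */
    BufferGetTag(buf, &rnode, &forknum, &blknum);
    elog(DEBUG1, "buffer %d holds block %u of %u/%u/%u, fork %d",
         buf, blknum, rnode.spcNode, rnode.dbNode, rnode.relNode,
         (int) forknum);
}
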
2702 
2703 /*
2704  * FlushBuffer
2705  * Physically write out a shared buffer.
2706  *
2707  * NOTE: this actually just passes the buffer contents to the kernel; the
2708  * real write to disk won't happen until the kernel feels like it. This
2709  * is okay from our point of view since we can redo the changes from WAL.
2710  * However, we will need to force the changes to disk via fsync before
2711  * we can checkpoint WAL.
2712  *
2713  * The caller must hold a pin on the buffer and have share-locked the
2714  * buffer contents. (Note: a share-lock does not prevent updates of
2715  * hint bits in the buffer, so the page could change while the write
2716  * is in progress, but we assume that that will not invalidate the data
2717  * written.)
2718  *
2719  * If the caller has an smgr reference for the buffer's relation, pass it
2720  * as the second parameter. If not, pass NULL.
2721  */
2722 static void
2723 FlushBuffer(BufferDesc *buf, SMgrRelation reln)
2724 {
2725  XLogRecPtr recptr;
2726  ErrorContextCallback errcallback;
2727  instr_time io_start,
2728  io_time;
2729  Block bufBlock;
2730  char *bufToWrite;
2731  uint32 buf_state;
2732 
2733  /*
2734  * Acquire the buffer's io_in_progress lock. If StartBufferIO returns
2735  * false, then someone else flushed the buffer before we could, so we need
2736  * not do anything.
2737  */
2738  if (!StartBufferIO(buf, false))
2739  return;
2740 
2741  /* Setup error traceback support for ereport() */
2743  errcallback.arg = (void *) buf;
2744  errcallback.previous = error_context_stack;
2745  error_context_stack = &errcallback;
2746 
2747  /* Find smgr relation for buffer */
2748  if (reln == NULL)
2749  reln = smgropen(buf->tag.rnode, InvalidBackendId);
2750 
2751  TRACE_POSTGRESQL_BUFFER_FLUSH_START(buf->tag.forkNum,
2752  buf->tag.blockNum,
2753  reln->smgr_rnode.node.spcNode,
2754  reln->smgr_rnode.node.dbNode,
2755  reln->smgr_rnode.node.relNode);
2756 
2757  buf_state = LockBufHdr(buf);
2758 
2759  /*
2760  * Run PageGetLSN while holding header lock, since we don't have the
2761  * buffer locked exclusively in all cases.
2762  */
2763  recptr = BufferGetLSN(buf);
2764 
2765  /* To check if block content changes while flushing. - vadim 01/17/97 */
2766  buf_state &= ~BM_JUST_DIRTIED;
2767  UnlockBufHdr(buf, buf_state);
2768 
2769  /*
2770  * Force XLOG flush up to buffer's LSN. This implements the basic WAL
2771  * rule that log updates must hit disk before any of the data-file changes
2772  * they describe do.
2773  *
2774  * However, this rule does not apply to unlogged relations, which will be
2775  * lost after a crash anyway. Most unlogged relation pages do not bear
2776  * LSNs since we never emit WAL records for them, and therefore flushing
2777  * up through the buffer LSN would be useless, but harmless. However,
2778  * GiST indexes use LSNs internally to track page-splits, and therefore
2779  * unlogged GiST pages bear "fake" LSNs generated by
2780  * GetFakeLSNForUnloggedRel. It is unlikely but possible that the fake
2781  * LSN counter could advance past the WAL insertion point; and if it did
2782  * happen, attempting to flush WAL through that location would fail, with
2783  * disastrous system-wide consequences. To make sure that can't happen,
2784  * skip the flush if the buffer isn't permanent.
2785  */
2786  if (buf_state & BM_PERMANENT)
2787  XLogFlush(recptr);
2788 
2789  /*
2790  * Now it's safe to write buffer to disk. Note that no one else should
2791  * have been able to write it while we were busy with log flushing because
2792  * we have the io_in_progress lock.
2793  */
2794  bufBlock = BufHdrGetBlock(buf);
2795 
2796  /*
2797  * Update page checksum if desired. Since we have only shared lock on the
2798  * buffer, other processes might be updating hint bits in it, so we must
2799  * copy the page to private storage if we do checksumming.
2800  */
2801  bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
2802 
2803  if (track_io_timing)
2804  INSTR_TIME_SET_CURRENT(io_start);
2805 
2806  /*
2807  * bufToWrite is either the shared buffer or a copy, as appropriate.
2808  */
2809  smgrwrite(reln,
2810  buf->tag.forkNum,
2811  buf->tag.blockNum,
2812  bufToWrite,
2813  false);
2814 
2815  if (track_io_timing)
2816  {
2817  INSTR_TIME_SET_CURRENT(io_time);
2818  INSTR_TIME_SUBTRACT(io_time, io_start);
2821  }
2822 
2824 
2825  /*
2826  * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
2827  * end the io_in_progress state.
2828  */
2829  TerminateBufferIO(buf, true, 0);
2830 
2831  TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(buf->tag.forkNum,
2832  buf->tag.blockNum,
2833  reln->smgr_rnode.node.spcNode,
2834  reln->smgr_rnode.node.dbNode,
2835  reln->smgr_rnode.node.relNode);
2836 
2837  /* Pop the error context stack */
2838  error_context_stack = errcallback.previous;
2839 }
2840 
2841 /*
2842  * RelationGetNumberOfBlocksInFork
2843  * Determines the current number of pages in the specified relation fork.
2844  *
2845  * Note that the accuracy of the result will depend on the details of the
2846  * relation's storage. For builtin AMs it'll be accurate, but for external AMs
2847  * it might not be.
2848  */
2849 BlockNumber
2850 RelationGetNumberOfBlocksInFork(Relation relation, ForkNumber forkNum)
2851 {
2852  switch (relation->rd_rel->relkind)
2853  {
2854  case RELKIND_SEQUENCE:
2855  case RELKIND_INDEX:
2856  case RELKIND_PARTITIONED_INDEX:
2857  /* Open it at the smgr level if not already done */
2858  RelationOpenSmgr(relation);
2859 
2860  return smgrnblocks(relation->rd_smgr, forkNum);
2861 
2862  case RELKIND_RELATION:
2863  case RELKIND_TOASTVALUE:
2864  case RELKIND_MATVIEW:
2865  {
2866  /*
2867  * Not every table AM uses BLCKSZ-wide fixed-size blocks, so the
2868  * table AM API reports the size in bytes - but for the purpose of
2869  * this routine we want the number of blocks. Therefore divide,
2870  * rounding up.
2871  */
2872  uint64 szbytes;
2873 
2874  szbytes = table_relation_size(relation, forkNum);
2875 
2876  return (szbytes + (BLCKSZ - 1)) / BLCKSZ;
2877  }
2878  case RELKIND_VIEW:
2879  case RELKIND_COMPOSITE_TYPE:
2880  case RELKIND_FOREIGN_TABLE:
2881  case RELKIND_PARTITIONED_TABLE:
2882  default:
2883  Assert(false);
2884  break;
2885  }
2886 
2887  return 0; /* keep compiler quiet */
2888 }
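
A minimal sketch (not from bufmgr.c itself) of the rounding rule above from a hypothetical caller's side: with BLCKSZ = 8192, a table AM reporting 20000 bytes yields (20000 + 8191) / 8192 = 3 blocks. The helper name is illustrative; RelationGetNumberOfBlocks() in bufmgr.h is the usual shorthand for the main fork.

#include "postgres.h"

#include "common/relpath.h"     /* MAIN_FORKNUM */
#include "storage/bufmgr.h"
#include "utils/rel.h"

static BlockNumber
example_main_fork_blocks(Relation rel)
{
    return RelationGetNumberOfBlocksInFork(rel, MAIN_FORKNUM);
}
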
2889 
2890 /*
2891  * BufferIsPermanent
2892  * Determines whether a buffer will potentially still be around after
2893  * a crash. Caller must hold a buffer pin.
2894  */
2895 bool
2896 BufferIsPermanent(Buffer buffer)
2897 {
2898  BufferDesc *bufHdr;
2899 
2900  /* Local buffers are used only for temp relations. */
2901  if (BufferIsLocal(buffer))
2902  return false;
2903 
2904  /* Make sure we've got a real buffer, and that we hold a pin on it. */
2905  Assert(BufferIsValid(buffer));
2906  Assert(BufferIsPinned(buffer));
2907 
2908  /*
2909  * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
2910  * need not bother with the buffer header spinlock. Even if someone else
2911  * changes the buffer header state while we're doing this, the state is
2912  * changed atomically, so we'll read the old value or the new value, but
2913  * not random garbage.
2914  */
2915  bufHdr = GetBufferDescriptor(buffer - 1);
2916  return (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT) != 0;
2917 }
2918 
2919 /*
2920  * BufferGetLSNAtomic
2921  * Retrieves the LSN of the buffer atomically using a buffer header lock.
2922  * This is necessary for some callers who may not have an exclusive lock
2923  * on the buffer.
2924  */
2925 XLogRecPtr
2926 BufferGetLSNAtomic(Buffer buffer)
2927 {
2928  BufferDesc *bufHdr = GetBufferDescriptor(buffer - 1);
2929  char *page = BufferGetPage(buffer);
2930  XLogRecPtr lsn;
2931  uint32 buf_state;
2932 
2933  /*
2934  * If we don't need locking for correctness, fastpath out.
2935  */
2936  if (!XLogHintBitIsNeeded() || BufferIsLocal(buffer))
2937  return PageGetLSN(page);
2938 
2939  /* Make sure we've got a real buffer, and that we hold a pin on it. */
2940  Assert(BufferIsValid(buffer));
2941  Assert(BufferIsPinned(buffer));
2942 
2943  buf_state = LockBufHdr(bufHdr);
2944  lsn = PageGetLSN(page);
2945  UnlockBufHdr(bufHdr, buf_state);
2946 
2947  return lsn;
2948 }
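
A minimal sketch (not from bufmgr.c itself) of the kind of caller this exists for: code that holds only a pin and a share lock and wants to know whether the page has changed since it last looked, much as index AMs do before trusting deferred kill_prior_tuple work. The helper name is hypothetical.

#include "postgres.h"

#include "access/xlogdefs.h"
#include "storage/bufmgr.h"

static bool
example_page_unchanged_since(Buffer buf, XLogRecPtr remembered_lsn)
{
    /* Safe under a mere share lock because of the header-lock protocol above. */
    return BufferGetLSNAtomic(buf) == remembered_lsn;
}
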
2949 
2950 /* ---------------------------------------------------------------------
2951  * DropRelFileNodeBuffers
2952  *
2953  * This function removes from the buffer pool all the pages of the
2954  * specified relation forks that have block numbers >= firstDelBlock.
2955  * (In particular, with firstDelBlock = 0, all pages are removed.)
2956  * Dirty pages are simply dropped, without bothering to write them
2957  * out first. Therefore, this is NOT rollback-able, and so should be
2958  * used only with extreme caution!
2959  *
2960  * Currently, this is called only from smgr.c when the underlying file
2961  * is about to be deleted or truncated (firstDelBlock is needed for
2962  * the truncation case). The data in the affected pages would therefore
2963  * be deleted momentarily anyway, and there is no point in writing it.
2964  * It is the responsibility of higher-level code to ensure that the
2965  * deletion or truncation does not lose any data that could be needed
2966  * later. It is also the responsibility of higher-level code to ensure
2967  * that no other process could be trying to load more pages of the
2968  * relation into buffers.
2969  *
2970  * XXX currently it sequentially searches the buffer pool, should be
2971  * changed to more clever ways of searching. However, this routine
2972  * is used only in code paths that aren't very performance-critical,
2973  * and we shouldn't slow down the hot paths to make it faster ...
2974  * --------------------------------------------------------------------
2975  */
2976 void
2977 DropRelFileNodeBuffers(RelFileNodeBackend rnode, ForkNumber *forkNum,
2978  int nforks, BlockNumber *firstDelBlock)
2979 {
2980  int i;
2981  int j;
2982 
2983  /* If it's a local relation, it's localbuf.c's problem. */
2984  if (RelFileNodeBackendIsTemp(rnode))
2985  {
2986  if (rnode.backend == MyBackendId)
2987  {
2988  for (j = 0; j < nforks; j++)
2989  DropRelFileNodeLocalBuffers(rnode.node, forkNum[j],
2990  firstDelBlock[j]);
2991  }
2992  return;
2993  }
2994 
2995  for (i = 0; i < NBuffers; i++)
2996  {
2997  BufferDesc *bufHdr = GetBufferDescriptor(i);
2998  uint32 buf_state;
2999 
3000  /*
3001  * We can make this a tad faster by prechecking the buffer tag before
3002  * we attempt to lock the buffer; this saves a lot of lock
3003  * acquisitions in typical cases. It should be safe because the
3004  * caller must have AccessExclusiveLock on the relation, or some other
3005  * reason to be certain that no one is loading new pages of the rel
3006  * into the buffer pool. (Otherwise we might well miss such pages
3007  * entirely.) Therefore, while the tag might be changing while we
3008  * look at it, it can't be changing *to* a value we care about, only
3009  * *away* from such a value. So false negatives are impossible, and
3010  * false positives are safe because we'll recheck after getting the
3011  * buffer lock.
3012  *
3013  * We could check forkNum and blockNum as well as the rnode, but the
3014  * incremental win from doing so seems small.
3015  */
3016  if (!RelFileNodeEquals(bufHdr->tag.rnode, rnode.node))
3017  continue;
3018 
3019  buf_state = LockBufHdr(bufHdr);
3020 
3021  for (j = 0; j < nforks; j++)
3022  {
3023  if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node) &&
3024  bufHdr->tag.forkNum == forkNum[j] &&
3025  bufHdr->tag.blockNum >= firstDelBlock[j])
3026  {
3027  InvalidateBuffer(bufHdr); /* releases spinlock */
3028  break;
3029  }
3030  }
3031  if (j >= nforks)
3032  UnlockBufHdr(bufHdr, buf_state);
3033  }
3034 }
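
A minimal sketch (not from bufmgr.c itself) of the truncation-style call an smgr-level caller makes before shrinking the file on disk; the block number 100 and the helper name are illustrative.

#include "postgres.h"

#include "common/relpath.h"
#include "storage/bufmgr.h"
#include "storage/smgr.h"

static void
example_drop_truncated_tail(SMgrRelation reln)
{
    ForkNumber  fork = MAIN_FORKNUM;
    BlockNumber firstDelBlock = 100;    /* keep blocks 0..99, drop the rest */

    DropRelFileNodeBuffers(reln->smgr_rnode, &fork, 1, &firstDelBlock);
}
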
3035 
3036 /* ---------------------------------------------------------------------
3037  * DropRelFileNodesAllBuffers
3038  *
3039  * This function removes from the buffer pool all the pages of all
3040  * forks of the specified relations. It's equivalent to calling
3041  * DropRelFileNodeBuffers once per fork per relation with
3042  * firstDelBlock = 0.
3043  * --------------------------------------------------------------------
3044  */
3045 void
3046 DropRelFileNodesAllBuffers(RelFileNodeBackend *rnodes, int nnodes)
3047 {
3048  int i,
3049  n = 0;
3050  RelFileNode *nodes;
3051  bool use_bsearch;
3052 
3053  if (nnodes == 0)
3054  return;
3055 
3056  nodes = palloc(sizeof(RelFileNode) * nnodes); /* non-local relations */
3057 
3058  /* If it's a local relation, it's localbuf.c's problem. */
3059  for (i = 0; i < nnodes; i++)
3060  {
3061  if (RelFileNodeBackendIsTemp(rnodes[i]))
3062  {
3063  if (rnodes[i].backend == MyBackendId)
3064  DropRelFileNodeAllLocalBuffers(rnodes[i].node);
3065  }
3066  else
3067  nodes[n++] = rnodes[i].node;
3068  }
3069 
3070  /*
3071  * If there are no non-local relations, then we're done. Release the
3072  * memory and return.
3073  */
3074  if (n == 0)
3075  {
3076  pfree(nodes);
3077  return;
3078  }
3079 
3080  /*
3081  * For low number of relations to drop just use a simple walk through, to
3082  * save the bsearch overhead. The threshold to use is rather a guess than
3083  * an exactly determined value, as it depends on many factors (CPU and RAM
3084  * speeds, amount of shared buffers etc.).
3085  */
3086  use_bsearch = n > RELS_BSEARCH_THRESHOLD;
3087 
3088  /* sort the list of rnodes if necessary */
3089  if (use_bsearch)
3090  pg_qsort(nodes, n, sizeof(RelFileNode), rnode_comparator);
3091 
3092  for (i = 0; i < NBuffers; i++)
3093  {
3094  RelFileNode *rnode = NULL;
3095  BufferDesc *bufHdr = GetBufferDescriptor(i);
3096  uint32 buf_state;
3097 
3098  /*
3099  * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
3100  * and saves some cycles.
3101  */
3102 
3103  if (!use_bsearch)
3104  {
3105  int j;
3106 
3107  for (j = 0; j < n; j++)
3108  {
3109  if (RelFileNodeEquals(bufHdr->tag.rnode, nodes[j]))
3110  {
3111  rnode = &nodes[j];
3112  break;
3113  }
3114  }
3115  }
3116  else
3117  {
3118  rnode = bsearch((const void *) &(bufHdr->tag.rnode),
3119  nodes, n, sizeof(RelFileNode),
3121  }
3122 
3123  /* buffer doesn't belong to any of the given relfilenodes; skip it */
3124  if (rnode == NULL)
3125  continue;
3126 
3127  buf_state = LockBufHdr(bufHdr);
3128  if (RelFileNodeEquals(bufHdr->tag.rnode, (*rnode)))
3129  InvalidateBuffer(bufHdr); /* releases spinlock */
3130  else
3131  UnlockBufHdr(bufHdr, buf_state);
3132  }
3133 
3134  pfree(nodes);
3135 }
3136 
3137 /* ---------------------------------------------------------------------
3138  * DropDatabaseBuffers
3139  *
3140  * This function removes all the buffers in the buffer cache for a
3141  * particular database. Dirty pages are simply dropped, without
3142  * bothering to write them out first. This is used when we destroy a
3143  * database, to avoid trying to flush data to disk when the directory
3144  * tree no longer exists. Implementation is pretty similar to
3145  * DropRelFileNodeBuffers() which is for destroying just one relation.
3146  * --------------------------------------------------------------------
3147  */
3148 void
3149 DropDatabaseBuffers(Oid dbid)
3150 {
3151  int i;
3152 
3153  /*
3154  * We needn't consider local buffers, since by assumption the target
3155  * database isn't our own.
3156  */
3157 
3158  for (i = 0; i < NBuffers; i++)
3159  {
3160  BufferDesc *bufHdr = GetBufferDescriptor(i);
3161  uint32 buf_state;
3162 
3163  /*
3164  * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
3165  * and saves some cycles.
3166  */
3167  if (bufHdr->tag.rnode.dbNode != dbid)
3168  continue;
3169 
3170  buf_state = LockBufHdr(bufHdr);
3171  if (bufHdr->tag.rnode.dbNode == dbid)
3172  InvalidateBuffer(bufHdr); /* releases spinlock */
3173  else
3174  UnlockBufHdr(bufHdr, buf_state);
3175  }
3176 }
3177 
3178 /* -----------------------------------------------------------------
3179  * PrintBufferDescs
3180  *
3181  * this function prints all the buffer descriptors, for debugging
3182  * use only.
3183  * -----------------------------------------------------------------
3184  */
3185 #ifdef NOT_USED
3186 void
3187 PrintBufferDescs(void)
3188 {
3189  int i;
3190 
3191  for (i = 0; i < NBuffers; ++i)
3192  {
3195 
3196  /* theoretically we should lock the bufhdr here */
3197  elog(LOG,
3198  "[%02d] (freeNext=%d, rel=%s, "
3199  "blockNum=%u, flags=0x%x, refcount=%u %d)",
3200  i, buf->freeNext,
3202  buf->tag.blockNum, buf->flags,
3203  buf->refcount, GetPrivateRefCount(b));
3204  }
3205 }
3206 #endif
3207 
3208 #ifdef NOT_USED
3209 void
3210 PrintPinnedBufs(void)
3211 {
3212  int i;
3213 
3214  for (i = 0; i < NBuffers; ++i)
3215  {
3218 
3219  if (GetPrivateRefCount(b) > 0)
3220  {
3221  /* theoretically we should lock the bufhdr here */
3222  elog(LOG,
3223  "[%02d] (freeNext=%d, rel=%s, "
3224  "blockNum=%u, flags=0x%x, refcount=%u %d)",
3225  i, buf->freeNext,
3226  relpathperm(buf->tag.rnode, buf->tag.forkNum),
3227  buf->tag.blockNum, buf->flags,
3228  buf->refcount, GetPrivateRefCount(b));
3229  }
3230  }
3231 }
3232 #endif
3233 
3234 /* ---------------------------------------------------------------------
3235  * FlushRelationBuffers
3236  *
3237  * This function writes all dirty pages of a relation out to disk
3238  * (or more accurately, out to kernel disk buffers), ensuring that the
3239  * kernel has an up-to-date view of the relation.
3240  *
3241  * Generally, the caller should be holding AccessExclusiveLock on the
3242  * target relation to ensure that no other backend is busy dirtying
3243  * more blocks of the relation; the effects can't be expected to last
3244  * after the lock is released.
3245  *
3246  * XXX currently it sequentially searches the buffer pool, should be
3247  * changed to more clever ways of searching. This routine is not
3248  * used in any performance-critical code paths, so it's not worth
3249  * adding additional overhead to normal paths to make it go faster;
3250  * but see also DropRelFileNodeBuffers.
3251  * --------------------------------------------------------------------
3252  */
3253 void
3254 FlushRelationBuffers(Relation rel)
3255 {
3256  int i;
3257  BufferDesc *bufHdr;
3258 
3259  /* Open rel at the smgr level if not already done */
3260  RelationOpenSmgr(rel);
3261 
3262  if (RelationUsesLocalBuffers(rel))
3263  {
3264  for (i = 0; i < NLocBuffer; i++)
3265  {
3266  uint32 buf_state;
3267 
3268  bufHdr = GetLocalBufferDescriptor(i);
3269  if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
3270  ((buf_state = pg_atomic_read_u32(&bufHdr->state)) &
3271  (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
3272  {
3273  ErrorContextCallback errcallback;
3274  Page localpage;
3275 
3276  localpage = (char *) LocalBufHdrGetBlock(bufHdr);
3277 
3278  /* Setup error traceback support for ereport() */
3280  errcallback.arg = (void *) bufHdr;
3281  errcallback.previous = error_context_stack;
3282  error_context_stack = &errcallback;
3283 
3284  PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
3285 
3286  smgrwrite(rel->rd_smgr,
3287  bufHdr->tag.forkNum,
3288  bufHdr->tag.blockNum,
3289  localpage,
3290  false);
3291 
3292  buf_state &= ~(BM_DIRTY | BM_JUST_DIRTIED);
3293  pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
3294 
3295  /* Pop the error context stack */
3296  error_context_stack = errcallback.previous;
3297  }
3298  }
3299 
3300  return;
3301  }
3302 
3303  /* Make sure we can handle the pin inside the loop */
3305 
3306  for (i = 0; i < NBuffers; i++)
3307  {
3308  uint32 buf_state;
3309 
3310  bufHdr = GetBufferDescriptor(i);
3311 
3312  /*
3313  * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
3314  * and saves some cycles.
3315  */
3316  if (!RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node))
3317  continue;
3318 
3320 
3321  buf_state = LockBufHdr(bufHdr);
3322  if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
3323  (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
3324  {
3325  PinBuffer_Locked(bufHdr);
3327  FlushBuffer(bufHdr, rel->rd_smgr);
3329  UnpinBuffer(bufHdr, true);
3330  }
3331  else
3332  UnlockBufHdr(bufHdr, buf_state);
3333  }
3334 }
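
A minimal sketch (not from bufmgr.c itself) of the pattern historically used after writing relation data without WAL: push the dirty buffers to the kernel, then force them to stable storage. The helper name is illustrative; note that FlushRelationBuffers() opens the relation at the smgr level itself, so rd_smgr is set by the time smgrimmedsync() runs.

#include "postgres.h"

#include "common/relpath.h"
#include "storage/bufmgr.h"
#include "storage/smgr.h"
#include "utils/rel.h"

static void
example_sync_relation_to_disk(Relation rel)
{
    FlushRelationBuffers(rel);
    smgrimmedsync(rel->rd_smgr, MAIN_FORKNUM);
}
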
3335 
3336 /* ---------------------------------------------------------------------
3337  * FlushRelationsAllBuffers
3338  *
3339  * This function flushes out of the buffer pool all the pages of all
3340  * forks of the specified smgr relations. It's equivalent to calling
3341  * FlushRelationBuffers once per fork per relation. The relations are
3342  * assumed not to use local buffers.
3343  * --------------------------------------------------------------------
3344  */
3345 void
3346 FlushRelationsAllBuffers(SMgrRelation *smgrs, int nrels)
3347 {
3348  int i;
3349  SMgrSortArray *srels;
3350  bool use_bsearch;
3351 
3352  if (nrels == 0)
3353  return;
3354 
3355  /* fill-in array for qsort */
3356  srels = palloc(sizeof(SMgrSortArray) * nrels);
3357 
3358  for (i = 0; i < nrels; i++)
3359  {
3360  Assert(!RelFileNodeBackendIsTemp(smgrs[i]->smgr_rnode));
3361 
3362  srels[i].rnode = smgrs[i]->smgr_rnode.node;
3363  srels[i].srel = smgrs[i];
3364  }
3365 
3366  /*
3367  * Save the bsearch overhead for low number of relations to sync. See
3368  * DropRelFileNodesAllBuffers for details.
3369  */
3370  use_bsearch = nrels > RELS_BSEARCH_THRESHOLD;
3371 
3372  /* sort the list of SMgrRelations if necessary */
3373  if (use_bsearch)
3374  pg_qsort(srels, nrels, sizeof(SMgrSortArray), rnode_comparator);
3375 
3376  /* Make sure we can handle the pin inside the loop */
3378 
3379  for (i = 0; i < NBuffers; i++)
3380  {
3381  SMgrSortArray *srelent = NULL;
3382  BufferDesc *bufHdr = GetBufferDescriptor(i);
3383  uint32 buf_state;
3384 
3385  /*
3386  * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
3387  * and saves some cycles.
3388  */
3389 
3390  if (!use_bsearch)
3391  {
3392  int j;
3393 
3394  for (j = 0; j < nrels; j++)
3395  {
3396  if (RelFileNodeEquals(bufHdr->tag.rnode, srels[j].rnode))
3397  {
3398  srelent = &srels[j];
3399  break;
3400  }
3401  }
3402 
3403  }
3404  else
3405  {
3406  srelent = bsearch((const void *) &(bufHdr->tag.rnode),
3407  srels, nrels, sizeof(SMgrSortArray),
3409  }
3410 
3411  /* buffer doesn't belong to any of the given relfilenodes; skip it */
3412  if (srelent == NULL)
3413  continue;
3414 
3416 
3417  buf_state = LockBufHdr(bufHdr);
3418  if (RelFileNodeEquals(bufHdr->tag.rnode, srelent->rnode) &&
3419  (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
3420  {
3421  PinBuffer_Locked(bufHdr);
3423  FlushBuffer(bufHdr, srelent->srel);
3425  UnpinBuffer(bufHdr, true);
3426  }
3427  else
3428  UnlockBufHdr(bufHdr, buf_state);
3429  }
3430 
3431  pfree(srels);
3432 }
3433 
3434 /* ---------------------------------------------------------------------
3435  * FlushDatabaseBuffers
3436  *
3437  * This function writes all dirty pages of a database out to disk
3438  * (or more accurately, out to kernel disk buffers), ensuring that the
3439  * kernel has an up-to-date view of the database.
3440  *
3441  * Generally, the caller should be holding an appropriate lock to ensure
3442  * no other backend is active in the target database; otherwise more
3443  * pages could get dirtied.
3444  *
3445  * Note we don't worry about flushing any pages of temporary relations.
3446  * It's assumed these wouldn't be interesting.
3447  * --------------------------------------------------------------------
3448  */
3449 void
3450 FlushDatabaseBuffers(Oid dbid)
3451 {
3452  int i;
3453  BufferDesc *bufHdr;
3454 
3455  /* Make sure we can handle the pin inside the loop */
3457 
3458  for (i = 0; i < NBuffers; i++)
3459  {
3460  uint32 buf_state;
3461 
3462  bufHdr = GetBufferDescriptor(i);
3463 
3464  /*
3465  * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
3466  * and saves some cycles.
3467  */
3468  if (bufHdr->tag.rnode.dbNode != dbid)
3469  continue;
3470 
3472 
3473  buf_state = LockBufHdr(bufHdr);
3474  if (bufHdr->tag.rnode.dbNode == dbid &&
3475  (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
3476  {
3477  PinBuffer_Locked(bufHdr);
3479  FlushBuffer(bufHdr, NULL);
3481  UnpinBuffer(bufHdr, true);
3482  }
3483  else
3484  UnlockBufHdr(bufHdr, buf_state);
3485  }
3486 }
3487 
3488 /*
3489  * Flush a buffer that the caller has already pinned and locked (in either
3490  * share or exclusive mode) out to the OS.
3491  */
3492 void
3493 FlushOneBuffer(Buffer buffer)
3494 {
3495  BufferDesc *bufHdr;
3496 
3497  /* currently not needed, but no fundamental reason not to support */
3498  Assert(!BufferIsLocal(buffer));
3499 
3500  Assert(BufferIsPinned(buffer));
3501 
3502  bufHdr = GetBufferDescriptor(buffer - 1);
3503 
3505 
3506  FlushBuffer(bufHdr, NULL);
3507 }
3508 
3509 /*
3510  * ReleaseBuffer -- release the pin on a buffer
3511  */
3512 void
3513 ReleaseBuffer(Buffer buffer)
3514 {
3515  if (!BufferIsValid(buffer))
3516  elog(ERROR, "bad buffer ID: %d", buffer);
3517 
3518  if (BufferIsLocal(buffer))
3519  {
3521 
3522  Assert(LocalRefCount[-buffer - 1] > 0);
3523  LocalRefCount[-buffer - 1]--;
3524  return;
3525  }
3526 
3527  UnpinBuffer(GetBufferDescriptor(buffer - 1), true);
3528 }
3529 
3530 /*
3531  * UnlockReleaseBuffer -- release the content lock and pin on a buffer
3532  *
3533  * This is just a shorthand for a common combination.
3534  */
3535 void
3536 UnlockReleaseBuffer(Buffer buffer)
3537 {
3538  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3539  ReleaseBuffer(buffer);
3540 }
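
A minimal usage sketch (not from bufmgr.c itself) of the canonical pin/lock/inspect/unlock-and-unpin cycle these routines support; the helper name and the DEBUG1 message are hypothetical.

#include "postgres.h"

#include "storage/bufmgr.h"
#include "storage/bufpage.h"
#include "utils/rel.h"

static void
example_inspect_block(Relation rel, BlockNumber blkno)
{
    Buffer      buf = ReadBuffer(rel, blkno);
    Page        page;

    LockBuffer(buf, BUFFER_LOCK_SHARE);
    page = BufferGetPage(buf);
    elog(DEBUG1, "block %u of \"%s\" has %u line pointers",
         blkno, RelationGetRelationName(rel),
         (unsigned) PageGetMaxOffsetNumber(page));
    UnlockReleaseBuffer(buf);       /* drops the content lock, then the pin */
}
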
3541 
3542 /*
3543  * IncrBufferRefCount
3544  * Increment the pin count on a buffer that we have *already* pinned
3545  * at least once.
3546  *
3547  * This function cannot be used on a buffer we do not have pinned,
3548  * because it doesn't change the shared buffer state.
3549  */
3550 void
3551 IncrBufferRefCount(Buffer buffer)
3552 {
3553  Assert(BufferIsPinned(buffer));
3555  if (BufferIsLocal(buffer))
3556  LocalRefCount[-buffer - 1]++;
3557  else
3558  {
3559  PrivateRefCountEntry *ref;
3560 
3561  ref = GetPrivateRefCountEntry(buffer, true);
3562  Assert(ref != NULL);
3563  ref->refcount++;
3564  }
3566 }
3567 
3568 /*
3569  * MarkBufferDirtyHint
3570  *
3571  * Mark a buffer dirty for non-critical changes.
3572  *
3573  * This is essentially the same as MarkBufferDirty, except:
3574  *
3575  * 1. The caller does not write WAL; so if checksums are enabled, we may need
3576  * to write an XLOG_FPI_FOR_HINT WAL record to protect against torn pages.
3577  * 2. The caller might have only share-lock instead of exclusive-lock on the
3578  * buffer's content lock.
3579  * 3. This function does not guarantee that the buffer is always marked dirty
3580  * (due to a race condition), so it cannot be used for important changes.
3581  */
3582 void
3583 MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
3584 {
3585  BufferDesc *bufHdr;
3586  Page page = BufferGetPage(buffer);
3587 
3588  if (!BufferIsValid(buffer))
3589  elog(ERROR, "bad buffer ID: %d", buffer);
3590 
3591  if (BufferIsLocal(buffer))
3592  {
3593  MarkLocalBufferDirty(buffer);
3594  return;
3595  }
3596 
3597  bufHdr = GetBufferDescriptor(buffer - 1);
3598 
3599  Assert(GetPrivateRefCount(buffer) > 0);
3600  /* here, either share or exclusive lock is OK */
3602 
3603  /*
3604  * This routine might get called many times on the same page, if we are
3605  * making the first scan after commit of an xact that added/deleted many
3606  * tuples. So, be as quick as we can if the buffer is already dirty. We
3607  * do this by not acquiring spinlock if it looks like the status bits are
3608  * already set. Since we make this test unlocked, there's a chance we
3609  * might fail to notice that the flags have just been cleared, and failed
3610  * to reset them, due to memory-ordering issues. But since this function
3611  * is only intended to be used in cases where failing to write out the
3612  * data would be harmless anyway, it doesn't really matter.
3613  */
3614  if ((pg_atomic_read_u32(&bufHdr->state) & (BM_DIRTY | BM_JUST_DIRTIED)) !=
3616  {
3618  bool dirtied = false;
3619  bool delayChkpt = false;
3620  uint32 buf_state;
3621 
3622  /*
3623  * If we need to protect hint bit updates from torn writes, WAL-log a
3624  * full page image of the page. This full page image is only necessary
3625  * if the hint bit update is the first change to the page since the
3626  * last checkpoint.
3627  *
3628  * We don't check full_page_writes here because that logic is included
3629  * when we call XLogInsert() since the value changes dynamically.
3630  */
3631  if (XLogHintBitIsNeeded() &&
3632  (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT))
3633  {
3634  /*
3635  * If we must not write WAL, due to a relfilenode-specific
3636  * condition or because we are in recovery, don't dirty the page. We can
3637  * still set the hint, just not dirty the page as a result, so the hint
3638  * is lost when we evict the page or shut down.
3639  *
3640  * See src/backend/storage/page/README for longer discussion.
3641  */
3642  if (RecoveryInProgress() ||
3643  RelFileNodeSkippingWAL(bufHdr->tag.rnode))
3644  return;
3645 
3646  /*
3647  * If the block is already dirty because we either made a change
3648  * or set a hint already, then we don't need to write a full page
3649  * image. Note that aggressive cleaning of blocks dirtied by hint
3650  * bit setting would increase the call rate. Bulk setting of hint
3651  * bits would reduce the call rate...
3652  *
3653  * We must issue the WAL record before we mark the buffer dirty.
3654  * Otherwise we might write the page before we write the WAL. That
3655  * causes a race condition, since a checkpoint might occur between
3656  * writing the WAL record and marking the buffer dirty. We solve
3657  * that with a kluge, but one that is already in use during
3658  * transaction commit to prevent race conditions. Basically, we
3659  * simply prevent the checkpoint WAL record from being written
3660  * until we have marked the buffer dirty. We don't start the
3661  * checkpoint flush until we have marked the buffer dirty, so our
3662  * checkpoint must flush the change to disk successfully or the checkpoint
3663  * never gets written; in that case crash recovery will fix things up.
3664  *
3665  * It's possible we may enter here without an xid, so it is
3666  * essential that CreateCheckpoint waits for virtual transactions
3667  * rather than full transactionids.
3668  */
3669  MyProc->delayChkpt = delayChkpt = true;
3670  lsn = XLogSaveBufferForHint(buffer, buffer_std);
3671  }
3672 
3673  buf_state = LockBufHdr(bufHdr);
3674 
3675  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
3676 
3677  if (!(buf_state & BM_DIRTY))
3678  {
3679  dirtied = true; /* Means "will be dirtied by this action" */
3680 
3681  /*
3682  * Set the page LSN if we wrote a backup block. We aren't supposed
3683  * to set this when only holding a share lock but as long as we
3684  * serialise it somehow we're OK. We choose to set LSN while
3685  * holding the buffer header lock, which causes any reader of an
3686  * LSN who holds only a share lock to also obtain a buffer header
3687  * lock before using PageGetLSN(), which is enforced in
3688  * BufferGetLSNAtomic().
3689  *
3690  * If checksums are enabled, you might think we should reset the
3691  * checksum here. That will happen when the page is written
3692  * sometime later in this checkpoint cycle.
3693  */
3694  if (!XLogRecPtrIsInvalid(lsn))
3695  PageSetLSN(page, lsn);
3696  }
3697 
3698  buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
3699  UnlockBufHdr(bufHdr, buf_state);
3700 
3701  if (delayChkpt)
3702  MyProc->delayChkpt = false;
3703 
3704  if (dirtied)
3705  {
3706  VacuumPageDirty++;
3707  pgBufferUsage.shared_blks_dirtied++;
3708  if (VacuumCostActive)
3709  VacuumCostBalance += VacuumCostPageDirty;
3710  }
3711  }
3712 }
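/*
 * Illustrative sketch (not part of bufmgr.c): how a caller typically uses
 * MarkBufferDirtyHint().  The helper below is hypothetical; real callers
 * include SetHintBits() in heapam_visibility.c and the btree LP_DEAD
 * marking code.  The caller is assumed to hold a pin and at least a share
 * content lock, and losing the hint must be harmless.
 */
#include "postgres.h"

#include "access/htup_details.h"
#include "storage/bufmgr.h"

/* hypothetical helper: set an infomask hint bit on a tuple we just examined */
static void
example_set_tuple_hint(HeapTupleHeader tuple, uint16 infomask, Buffer buffer)
{
	tuple->t_infomask |= infomask;

	/*
	 * buffer_std = true: the page has a standard layout, so a torn-page-safe
	 * full-page image can be logged when checksums/wal_log_hints demand it.
	 */
	MarkBufferDirtyHint(buffer, true);
}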
3713 
3714 /*
3715  * Release buffer content locks for shared buffers.
3716  *
3717  * Used to clean up after errors.
3718  *
3719  * Currently, we can expect that lwlock.c's LWLockReleaseAll() took care
3720  * of releasing buffer content locks per se; the only thing we need to deal
3721  * with here is clearing any PIN_COUNT request that was in progress.
3722  */
3723 void
3724 UnlockBuffers(void)
3725 {
3726  BufferDesc *buf = PinCountWaitBuf;
3727 
3728  if (buf)
3729  {
3730  uint32 buf_state;
3731 
3732  buf_state = LockBufHdr(buf);
3733 
3734  /*
3735  * Don't complain if flag bit not set; it could have been reset but we
3736  * got a cancel/die interrupt before getting the signal.
3737  */
3738  if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
3739  buf->wait_backend_pid == MyProcPid)
3740  buf_state &= ~BM_PIN_COUNT_WAITER;
3741 
3742  UnlockBufHdr(buf, buf_state);
3743 
3744  PinCountWaitBuf = NULL;
3745  }
3746 }
3747 
3748 /*
3749  * Acquire or release the content_lock for the buffer.
3750  */
3751 void
3752 LockBuffer(Buffer buffer, int mode)
3753 {
3754  BufferDesc *buf;
3755 
3756  Assert(BufferIsPinned(buffer));
3757  if (BufferIsLocal(buffer))
3758  return; /* local buffers need no lock */
3759 
3760  buf = GetBufferDescriptor(buffer - 1);
3761 
3762  if (mode == BUFFER_LOCK_UNLOCK)
3763  LWLockRelease(BufferDescriptorGetContentLock(buf));
3764  else if (mode == BUFFER_LOCK_SHARE)
3765  LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_SHARED);
3766  else if (mode == BUFFER_LOCK_EXCLUSIVE)
3767  LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_EXCLUSIVE);
3768  else
3769  elog(ERROR, "unrecognized buffer lock mode: %d", mode);
3770 }
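/*
 * Illustrative sketch (not part of bufmgr.c): the usual pin-then-lock idiom
 * built from ReadBuffer(), LockBuffer() and UnlockReleaseBuffer().  The
 * relation and block number are assumed to come from the caller.
 */
#include "postgres.h"

#include "storage/bufmgr.h"
#include "storage/bufpage.h"
#include "utils/rel.h"

static void
example_read_page_shared(Relation rel, BlockNumber blkno)
{
	Buffer		buf;
	Page		page;

	buf = ReadBuffer(rel, blkno);		/* pins the buffer */
	LockBuffer(buf, BUFFER_LOCK_SHARE); /* content lock for reading */

	page = BufferGetPage(buf);
	/* ... inspect the page while pin + share lock are held ... */
	(void) page;

	UnlockReleaseBuffer(buf);			/* drops the lock and the pin */
}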
3771 
3772 /*
3773  * Acquire the content_lock for the buffer, but only if we don't have to wait.
3774  *
3775  * This assumes the caller wants BUFFER_LOCK_EXCLUSIVE mode.
3776  */
3777 bool
3778 ConditionalLockBuffer(Buffer buffer)
3779 {
3780  BufferDesc *buf;
3781 
3782  Assert(BufferIsPinned(buffer));
3783  if (BufferIsLocal(buffer))
3784  return true; /* act as though we got it */
3785 
3786  buf = GetBufferDescriptor(buffer - 1);
3787 
3788  return LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
3789  LW_EXCLUSIVE);
3790 }
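/*
 * Illustrative sketch (not part of bufmgr.c): a typical non-blocking caller.
 * The helper name is made up; the pattern (try the exclusive content lock,
 * skip the page if somebody else holds it) is what opportunistic callers do.
 * Assumes the same headers as the sketch above.
 */
static bool
example_try_exclusive(Buffer buf)
{
	if (!ConditionalLockBuffer(buf))
		return false;			/* lock not available; caller skips the page */

	/* ... modify the page under the exclusive content lock ... */

	LockBuffer(buf, BUFFER_LOCK_UNLOCK);
	return true;
}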
3791 
3792 /*
3793  * LockBufferForCleanup - lock a buffer in preparation for deleting items
3794  *
3795  * Items may be deleted from a disk page only when the caller (a) holds an
3796  * exclusive lock on the buffer and (b) has observed that no other backend
3797  * holds a pin on the buffer. If there is a pin, then the other backend
3798  * might have a pointer into the buffer (for example, a heapscan reference
3799  * to an item --- see README for more details). It's OK if a pin is added
3800  * after the cleanup starts, however; the newly-arrived backend will be
3801  * unable to look at the page until we release the exclusive lock.
3802  *
3803  * To implement this protocol, a would-be deleter must pin the buffer and
3804  * then call LockBufferForCleanup(). LockBufferForCleanup() is similar to
3805  * LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE), except that it loops until
3806  * it has successfully observed pin count = 1.
3807  */
3808 void
3809 LockBufferForCleanup(Buffer buffer)
3810 {
3811  BufferDesc *bufHdr;
3812  char *new_status = NULL;
3813 
3814  Assert(BufferIsPinned(buffer));
3815  Assert(PinCountWaitBuf == NULL);
3816 
3817  if (BufferIsLocal(buffer))
3818  {
3819  /* There should be exactly one pin */
3820  if (LocalRefCount[-buffer - 1] != 1)
3821  elog(ERROR, "incorrect local pin count: %d",
3822  LocalRefCount[-buffer - 1]);
3823  /* Nobody else to wait for */
3824  return;
3825  }
3826 
3827  /* There should be exactly one local pin */
3828  if (GetPrivateRefCount(buffer) != 1)
3829  elog(ERROR, "incorrect local pin count: %d",
3830  GetPrivateRefCount(buffer));
3831 
3832  bufHdr = GetBufferDescriptor(buffer - 1);
3833 
3834  for (;;)
3835  {
3836  uint32 buf_state;
3837 
3838  /* Try to acquire lock */
3839  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3840  buf_state = LockBufHdr(bufHdr);
3841 
3842  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
3843  if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
3844  {
3845  /* Successfully acquired exclusive lock with pincount 1 */
3846  UnlockBufHdr(bufHdr, buf_state);
3847 
3848  /* Report change to non-waiting status */
3849  if (new_status)
3850  {
3851  set_ps_display(new_status);
3852  pfree(new_status);
3853  }
3854  return;
3855  }
3856  /* Failed, so mark myself as waiting for pincount 1 */
3857  if (buf_state & BM_PIN_COUNT_WAITER)
3858  {
3859  UnlockBufHdr(bufHdr, buf_state);
3860  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3861  elog(ERROR, "multiple backends attempting to wait for pincount 1");
3862  }
3863  bufHdr->wait_backend_pid = MyProcPid;
3864  PinCountWaitBuf = bufHdr;
3865  buf_state |= BM_PIN_COUNT_WAITER;
3866  UnlockBufHdr(bufHdr, buf_state);
3867  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3868 
3869  /* Wait to be signaled by UnpinBuffer() */
3870  if (InHotStandby)
3871  {
3872  /* Report change to waiting status */
3873  if (update_process_title && new_status == NULL)
3874  {
3875  const char *old_status;
3876  int len;
3877 
3878  old_status = get_ps_display(&len);
3879  new_status = (char *) palloc(len + 8 + 1);
3880  memcpy(new_status, old_status, len);
3881  strcpy(new_status + len, " waiting");
3882  set_ps_display(new_status);
3883  new_status[len] = '\0'; /* truncate off " waiting" */
3884  }
3885 
3886  /* Publish the bufid that Startup process waits on */
3887  SetStartupBufferPinWaitBufId(buffer - 1);
3888  /* Set alarm and then wait to be signaled by UnpinBuffer() */
3889  ResolveRecoveryConflictWithBufferPin();
3890  /* Reset the published bufid */
3891  SetStartupBufferPinWaitBufId(-1);
3892  }
3893  else
3894  ProcWaitForSignal(PG_WAIT_BUFFER_PIN);
3895 
3896  /*
3897  * Remove flag marking us as waiter. Normally this will not be set
3898  * anymore, but ProcWaitForSignal() can return for other signals as
3899  * well. We take care to only reset the flag if we're the waiter, as
3900  * theoretically another backend could have started waiting. That's
3901  * impossible with the current usages due to table level locking, but
3902  * better be safe.
3903  */
3904  buf_state = LockBufHdr(bufHdr);
3905  if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
3906  bufHdr->wait_backend_pid == MyProcPid)
3907  buf_state &= ~BM_PIN_COUNT_WAITER;
3908  UnlockBufHdr(bufHdr, buf_state);
3909 
3910  PinCountWaitBuf = NULL;
3911  /* Loop back and try again */
3912  }
3913 }
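/*
 * Illustrative sketch (not part of bufmgr.c): the cleanup-lock protocol from
 * a would-be deleter's point of view.  Pin first, then LockBufferForCleanup()
 * blocks until our pin is the only one, at which point the exclusive lock is
 * also a cleanup lock.  Function name and call site are hypothetical.
 */
static void
example_cleanup_page(Relation rel, BlockNumber blkno)
{
	Buffer		buf;

	buf = ReadBuffer(rel, blkno);		/* acquire our own pin */
	LockBufferForCleanup(buf);			/* wait until pin count == 1 */

	/* ... safe to delete or move items on the page here ... */

	UnlockReleaseBuffer(buf);
}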
3914 
3915 /*
3916  * Check called from RecoveryConflictInterrupt handler when Startup
3917  * process requests cancellation of all pin holders that are blocking it.
3918  */
3919 bool
3920 HoldingBufferPinThatDelaysRecovery(void)
3921 {
3922  int bufid = GetStartupBufferPinWaitBufId();
3923 
3924  /*
3925  * If we get woken slowly then it's possible that the Startup process was
3926  * already woken by other backends before we got here. Also possible that
3927  * we get here by multiple interrupts or interrupts at inappropriate
3928  * times, so make sure we do nothing if the bufid is not set.
3929  */
3930  if (bufid < 0)
3931  return false;
3932 
3933  if (GetPrivateRefCount(bufid + 1) > 0)
3934  return true;
3935 
3936  return false;
3937 }
3938 
3939 /*
3940  * ConditionalLockBufferForCleanup - as above, but don't wait to get the lock
3941  *
3942  * We won't loop, but just check once to see if the pin count is OK. If
3943  * not, return false with no lock held.
3944  */
3945 bool
3946 ConditionalLockBufferForCleanup(Buffer buffer)
3947 {
3948  BufferDesc *bufHdr;
3949  uint32 buf_state,
3950  refcount;
3951 
3952  Assert(BufferIsValid(buffer));
3953 
3954  if (BufferIsLocal(buffer))
3955  {
3956  refcount = LocalRefCount[-buffer - 1];
3957  /* There should be exactly one pin */
3958  Assert(refcount > 0);
3959  if (refcount != 1)
3960  return false;
3961  /* Nobody else to wait for */
3962  return true;
3963  }
3964 
3965  /* There should be exactly one local pin */
3966  refcount = GetPrivateRefCount(buffer);
3967  Assert(refcount);
3968  if (refcount != 1)
3969  return false;
3970 
3971  /* Try to acquire lock */
3972  if (!ConditionalLockBuffer(buffer))
3973  return false;
3974 
3975  bufHdr = GetBufferDescriptor(buffer - 1);
3976  buf_state = LockBufHdr(bufHdr);
3977  refcount = BUF_STATE_GET_REFCOUNT(buf_state);
3978 
3979  Assert(refcount > 0);
3980  if (refcount == 1)
3981  {
3982  /* Successfully acquired exclusive lock with pincount 1 */
3983  UnlockBufHdr(bufHdr, buf_state);
3984  return true;
3985  }
3986 
3987  /* Failed, so release the lock */
3988  UnlockBufHdr(bufHdr, buf_state);
3989  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3990  return false;
3991 }
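/*
 * Illustrative sketch (not part of bufmgr.c): the non-blocking variant as a
 * vacuum-style caller might use it -- if the cleanup lock cannot be obtained
 * immediately, skip the block rather than waiting.  Helper name is made up.
 */
static bool
example_try_cleanup(Buffer buf)
{
	if (!ConditionalLockBufferForCleanup(buf))
		return false;			/* page pinned elsewhere; try again later */

	/* ... cleanup work that requires pin count == 1 ... */

	LockBuffer(buf, BUFFER_LOCK_UNLOCK);
	return true;
}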
3992 
3993 /*
3994  * IsBufferCleanupOK - as above, but we already have the lock
3995  *
3996  * Check whether it's OK to perform cleanup on a buffer we've already
3997  * locked. If we observe that the pin count is 1, our exclusive lock
3998  * happens to be a cleanup lock, and we can proceed with anything that
3999  * would have been allowable had we sought a cleanup lock originally.
4000  */
4001 bool
4002 IsBufferCleanupOK(Buffer buffer)
4003 {
4004  BufferDesc *bufHdr;
4005  uint32 buf_state;
4006 
4007  Assert(BufferIsValid(buffer));
4008 
4009  if (BufferIsLocal(buffer))
4010  {
4011  /* There should be exactly one pin */
4012  if (LocalRefCount[-buffer - 1] != 1)
4013  return false;
4014  /* Nobody else to wait for */
4015  return true;
4016  }
4017 
4018  /* There should be exactly one local pin */
4019  if (GetPrivateRefCount(buffer) != 1)
4020  return false;
4021 
4022  bufHdr = GetBufferDescriptor(buffer - 1);
4023 
4024  /* caller must hold exclusive lock on buffer */
4025  Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
4026  LW_EXCLUSIVE));
4027 
4028  buf_state = LockBufHdr(bufHdr);
4029 
4030  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
4031  if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
4032  {
4033  /* pincount is OK. */
4034  UnlockBufHdr(bufHdr, buf_state);
4035  return true;
4036  }
4037 
4038  UnlockBufHdr(bufHdr, buf_state);
4039  return false;
4040 }
4041 
4042 
4043 /*
4044  * Functions for buffer I/O handling
4045  *
4046  * Note: We assume that nested buffer I/O never occurs.
4047  * i.e., at most one io_in_progress lock is held per proc.
4048  *
4049  * Also note that these are used only for shared buffers, not local ones.
4050  */
4051 
4052 /*
4053  * WaitIO -- Block until the IO_IN_PROGRESS flag on 'buf' is cleared.
4054  */
4055 static void
4056 WaitIO(BufferDesc *buf)
4057 {
4058  /*
4059  * Changed to wait until there's no IO - Inoue 01/13/2000
4060  *
4061  * Note this is *necessary* because an error abort in the process doing
4062  * I/O could release the io_in_progress_lock prematurely. See
4063  * AbortBufferIO.
4064  */
4065  for (;;)
4066  {
4067  uint32 buf_state;
4068 
4069  /*
4070  * It may not be necessary to acquire the spinlock to check the flag
4071  * here, but since this test is essential for correctness, we'd better
4072  * play it safe.
4073  */
4074  buf_state = LockBufHdr(buf);
4075  UnlockBufHdr(buf, buf_state);
4076 
4077  if (!(buf_state & BM_IO_IN_PROGRESS))
4078  break;
4079  LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_SHARED);
4080  LWLockRelease(BufferDescriptorGetIOLock(buf));
4081  }
4082 }
4083 
4084 /*
4085  * StartBufferIO: begin I/O on this buffer
4086  * (Assumptions)
4087  * My process is executing no IO
4088  * The buffer is Pinned
4089  *
4090  * In some scenarios there are race conditions in which multiple backends
4091  * could attempt the same I/O operation concurrently. If someone else
4092  * has already started I/O on this buffer then we will block on the
4093  * io_in_progress lock until he's done.
4094  *
4095  * Input operations are only attempted on buffers that are not BM_VALID,
4096  * and output operations only on buffers that are BM_VALID and BM_DIRTY,
4097  * so we can always tell if the work is already done.
4098  *
4099  * Returns true if we successfully marked the buffer as I/O busy,
4100  * false if someone else already did the work.
4101  */
4102 static bool
4103 StartBufferIO(BufferDesc *buf, bool forInput)
4104 {
4105  uint32 buf_state;
4106 
4107  Assert(!InProgressBuf);
4108 
4109  for (;;)
4110  {
4111  /*
4112  * Grab the io_in_progress lock so that other processes can wait for
4113  * me to finish the I/O.
4114  */
4115  LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
4116 
4117  buf_state = LockBufHdr(buf);
4118 
4119  if (!(buf_state & BM_IO_IN_PROGRESS))
4120  break;
4121 
4122  /*
4123  * The only way BM_IO_IN_PROGRESS could be set when the io_in_progress
4124  * lock isn't held is if the process doing the I/O is recovering from
4125  * an error (see AbortBufferIO). If that's the case, we must wait for
4126  * him to get unwedged.
4127  */
4128  UnlockBufHdr(buf, buf_state);
4129  LWLockRelease(BufferDescriptorGetIOLock(buf));
4130  WaitIO(buf);
4131  }
4132 
4133  /* Once we get here, there is definitely no I/O active on this buffer */
4134 
4135  if (forInput ? (buf_state & BM_VALID) : !(buf_state & BM_DIRTY))
4136  {
4137  /* someone else already did the I/O */
4138  UnlockBufHdr(buf, buf_state);
4139  LWLockRelease(BufferDescriptorGetIOLock(buf));
4140  return false;
4141  }
4142 
4143  buf_state |= BM_IO_IN_PROGRESS;
4144  UnlockBufHdr(buf, buf_state);
4145 
4146  InProgressBuf = buf;
4147  IsForInput = forInput;
4148 
4149  return true;
4150 }
4151 
4152 /*
4153  * TerminateBufferIO: release a buffer we were doing I/O on
4154  * (Assumptions)
4155  * My process is executing IO for the buffer
4156  * BM_IO_IN_PROGRESS bit is set for the buffer
4157  * We hold the buffer's io_in_progress lock
4158  * The buffer is Pinned
4159  *
4160  * If clear_dirty is true and BM_JUST_DIRTIED is not set, we clear the
4161  * buffer's BM_DIRTY flag. This is appropriate when terminating a
4162  * successful write. The check on BM_JUST_DIRTIED is necessary to avoid
4163  * marking the buffer clean if it was re-dirtied while we were writing.
4164  *
4165  * set_flag_bits gets ORed into the buffer's flags. It must include
4166  * BM_IO_ERROR in a failure case. For successful completion it could
4167  * be 0, or BM_VALID if we just finished reading in the page.
4168  */
4169 static void
4170 TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits)
4171 {
4172  uint32 buf_state;
4173 
4174  Assert(buf == InProgressBuf);
4175 
4176  buf_state = LockBufHdr(buf);
4177 
4178  Assert(buf_state & BM_IO_IN_PROGRESS);
4179 
4180  buf_state &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR);
4181  if (clear_dirty && !(buf_state & BM_JUST_DIRTIED))
4182  buf_state &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
4183 
4184  buf_state |= set_flag_bits;
4185  UnlockBufHdr(buf, buf_state);
4186 
4187  InProgressBuf = NULL;
4188 
4189  LWLockRelease(BufferDescriptorGetIOLock(buf));
4190 }
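/*
 * Illustrative sketch (not part of bufmgr.c's real code): how the read path
 * pairs StartBufferIO() with TerminateBufferIO().  This is a stripped-down
 * version of what ReadBuffer_common() does; error handling, page validation
 * and local-buffer handling are omitted.  Assumes this file's own includes
 * and static declarations.
 */
static void
example_read_block(BufferDesc *bufHdr, SMgrRelation smgr,
				   ForkNumber forkNum, BlockNumber blockNum)
{
	if (StartBufferIO(bufHdr, true))	/* true = this is a read */
	{
		Block		bufBlock = BufHdrGetBlock(bufHdr);

		smgrread(smgr, forkNum, blockNum, (char *) bufBlock);

		/* mark the buffer valid and release the io_in_progress lock */
		TerminateBufferIO(bufHdr, false, BM_VALID);
	}
	/* else some other backend already completed the read for us */
}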
4191 
4192 /*
4193  * AbortBufferIO: Clean up any active buffer I/O after an error.
4194  *
4195  * All LWLocks we might have held have been released,
4196  * but we haven't yet released buffer pins, so the buffer is still pinned.
4197  *
4198  * If I/O was in progress, we always set BM_IO_ERROR, even though it's
4199  * possible the error condition wasn't related to the I/O.
4200  */
4201 void
4202 AbortBufferIO(void)
4203 {
4204  BufferDesc *buf = InProgressBuf;
4205 
4206  if (buf)
4207  {
4208  uint32 buf_state;
4209 
4210  /*
4211  * Since LWLockReleaseAll has already been called, we're not holding
4212  * the buffer's io_in_progress_lock. We have to re-acquire it so that
4213  * we can use TerminateBufferIO. Anyone who's executing WaitIO on the
4214  * buffer will be in a busy spin until we succeed in doing this.
4215  */
4216  LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
4217 
4218  buf_state = LockBufHdr(buf);
4219  Assert(buf_state & BM_IO_IN_PROGRESS);
4220  if (IsForInput)
4221  {
4222  Assert(!(buf_state & BM_DIRTY));
4223 
4224  /* We'd better not think buffer is valid yet */
4225  Assert(!(buf_state & BM_VALID));
4226  UnlockBufHdr(buf, buf_state);
4227  }
4228  else
4229  {
4230  Assert(buf_state & BM_DIRTY);
4231  UnlockBufHdr(buf, buf_state);
4232  /* Issue notice if this is not the first failure... */
4233  if (buf_state & BM_IO_ERROR)
4234  {
4235  /* Buffer is pinned, so we can read tag without spinlock */
4236  char *path;
4237 
4238  path = relpathperm(buf->tag.rnode, buf->tag.forkNum);
4239  ereport(WARNING,
4240  (errcode(ERRCODE_IO_ERROR),
4241  errmsg("could not write block %u of %s",
4242  buf->tag.blockNum, path),
4243  errdetail("Multiple failures --- write error might be permanent.")));
4244  pfree(path);
4245  }
4246  }
4247  TerminateBufferIO(buf, false, BM_IO_ERROR);
4248  }
4249 }
4250 
4251 /*
4252  * Error context callback for errors occurring during shared buffer writes.
4253  */
4254 static void
4255 shared_buffer_write_error_callback(void *arg)
4256 {
4257  BufferDesc *bufHdr = (BufferDesc *) arg;
4258 
4259  /* Buffer is pinned, so we can read the tag without locking the spinlock */
4260  if (bufHdr != NULL)
4261  {
4262  char *path = relpathperm(bufHdr->tag.rnode, bufHdr->tag.forkNum);
4263 
4264  errcontext("writing block %u of relation %s",
4265  bufHdr->tag.blockNum, path);
4266  pfree(path);
4267  }
4268 }
4269 
4270 /*
4271  * Error context callback for errors occurring during local buffer writes.
4272  */
4273 static void
4274 local_buffer_write_error_callback(void *arg)
4275 {
4276  BufferDesc *bufHdr = (BufferDesc *) arg;
4277 
4278  if (bufHdr != NULL)
4279  {
4280  char *path = relpathbackend(bufHdr->tag.rnode, MyBackendId,
4281  bufHdr->tag.forkNum);
4282 
4283  errcontext("writing block %u of relation %s",
4284  bufHdr->tag.blockNum, path);
4285  pfree(path);
4286  }
4287 }
4288 
4289 /*
4290  * RelFileNode qsort/bsearch comparator; see RelFileNodeEquals.
4291  */
4292 static int
4293 rnode_comparator(const void *p1, const void *p2)
4294 {
4295  RelFileNode n1 = *(const RelFileNode *) p1;
4296  RelFileNode n2 = *(const RelFileNode *) p2;
4297 
4298  if (n1.relNode < n2.relNode)
4299  return -1;
4300  else if (n1.relNode > n2.relNode)
4301  return 1;
4302 
4303  if (n1.dbNode < n2.dbNode)
4304  return -1;
4305  else if (n1.dbNode > n2.dbNode)
4306  return 1;
4307 
4308  if (n1.spcNode < n2.spcNode)
4309  return -1;
4310  else if (n1.spcNode > n2.spcNode)
4311  return 1;
4312  else
4313  return 0;
4314 }
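/*
 * Illustrative sketch (not part of bufmgr.c's real code): rnode_comparator()
 * is what lets DropRelFileNodesAllBuffers() switch to sorted-array bsearch()
 * lookups once the relation count exceeds RELS_BSEARCH_THRESHOLD.  The helper
 * below is hypothetical and assumes this file's includes.
 */
static bool
example_rnode_is_listed(RelFileNode *sorted_nodes, int nnodes, RelFileNode key)
{
	/*
	 * sorted_nodes must already be sorted with the same comparator, e.g.
	 * pg_qsort(nodes, nnodes, sizeof(RelFileNode), rnode_comparator).
	 */
	return bsearch(&key, sorted_nodes, nnodes, sizeof(RelFileNode),
				   rnode_comparator) != NULL;
}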
4315 
4316 /*
4317  * Lock buffer header - set BM_LOCKED in buffer state.
4318  */
4319 uint32
4320 LockBufHdr(BufferDesc *desc)
4321 {
4322  SpinDelayStatus delayStatus;
4323  uint32 old_buf_state;
4324 
4325  init_local_spin_delay(&delayStatus);
4326 
4327  while (true)
4328  {
4329  /* set BM_LOCKED flag */
4330  old_buf_state = pg_atomic_fetch_or_u32(&desc->state, BM_LOCKED);
4331  /* if it wasn't set before we're OK */
4332  if (!(old_buf_state & BM_LOCKED))
4333  break;
4334  perform_spin_delay(&delayStatus);
4335  }
4336  finish_spin_delay(&delayStatus);
4337  return old_buf_state | BM_LOCKED;
4338 }
4339 
4340 /*
4341  * Wait until the BM_LOCKED flag isn't set anymore and return the buffer's
4342  * state at that point.
4343  *
4344  * Obviously the buffer could be locked by the time the value is returned, so
4345  * this is primarily useful in CAS style loops.
4346  */
4347 static uint32
4348 WaitBufHdrUnlocked(BufferDesc *buf)
4349 {
4350  SpinDelayStatus delayStatus;
4351  uint32 buf_state;
4352 
4353  init_local_spin_delay(&delayStatus);
4354 
4355  buf_state = pg_atomic_read_u32(&buf->state);
4356 
4357  while (buf_state & BM_LOCKED)
4358  {
4359  perform_spin_delay(&delayStatus);
4360  buf_state = pg_atomic_read_u32(&buf->state);
4361  }
4362 
4363  finish_spin_delay(&delayStatus);
4364 
4365  return buf_state;
4366 }
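/*
 * Illustrative sketch (not part of bufmgr.c's real code): the CAS-style loop
 * WaitBufHdrUnlocked() is intended for, modeled loosely on PinBuffer().  The
 * state is re-read while the header spinlock is held and the compare-and-swap
 * is retried until it succeeds.  Assumes this file's includes and statics.
 */
static void
example_bump_refcount(BufferDesc *buf)
{
	uint32		old_buf_state = pg_atomic_read_u32(&buf->state);
	uint32		buf_state;

	for (;;)
	{
		if (old_buf_state & BM_LOCKED)
			old_buf_state = WaitBufHdrUnlocked(buf);

		buf_state = old_buf_state + BUF_REFCOUNT_ONE;

		/* on failure, old_buf_state is updated to the current value */
		if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
										   buf_state))
			break;
	}
}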
4367 
4368 /*
4369  * BufferTag comparator.
4370  */
4371 static int
4372 buffertag_comparator(const void *a, const void *b)
4373 {
4374  const BufferTag *ba = (const BufferTag *) a;
4375  const BufferTag *bb = (const BufferTag *) b;
4376  int ret;
4377 
4378  ret = rnode_comparator(&ba->rnode, &bb->rnode);
4379 
4380  if (ret != 0)
4381  return ret;
4382 
4383  if (ba->forkNum < bb->forkNum)
4384  return -1;
4385  if (ba->forkNum > bb->forkNum)
4386  return 1;
4387 
4388  if (ba->blockNum < bb->blockNum)
4389  return -1;
4390  if (ba->blockNum > bb->blockNum)
4391  return 1;
4392 
4393  return 0;
4394 }
4395 
4396 /*
4397  * Comparator determining the writeout order in a checkpoint.
4398  *
4399  * It is important that tablespaces are compared first, the logic balancing
4400  * writes between tablespaces relies on it.
4401  */
4402 static int
4403 ckpt_buforder_comparator(const void *pa, const void *pb)
4404 {
4405  const CkptSortItem *a = (const CkptSortItem *) pa;
4406  const CkptSortItem *b = (const CkptSortItem *) pb;
4407 
4408  /* compare tablespace */
4409  if (a->tsId < b->tsId)
4410  return -1;
4411  else if (a->tsId > b->tsId)
4412  return 1;
4413  /* compare relation */
4414  if (a->relNode < b->relNode)
4415  return -1;
4416  else if (a->relNode > b->relNode)
4417  return 1;
4418  /* compare fork */
4419  else if (a->forkNum < b->forkNum)
4420  return -1;
4421  else if (a->forkNum > b->forkNum)
4422  return 1;
4423  /* compare block number */
4424  else if (a->blockNum < b->blockNum)
4425  return -1;
4426  else if (a->blockNum > b->blockNum)
4427  return 1;
4428  /* equal page IDs are unlikely, but not impossible */
4429  return 0;
4430 }
4431 
4432 /*
4433  * Comparator for a Min-Heap over the per-tablespace checkpoint completion
4434  * progress.
4435  */
4436 static int
4437 ts_ckpt_progress_comparator(Datum a, Datum b, void *arg)
4438 {
4439  CkptTsStatus *sa = (CkptTsStatus *) a;
4440  CkptTsStatus *sb = (CkptTsStatus *) b;
4441 
4442  /* we want a min-heap, so return 1 if a < b */
4443  if (sa->progress < sb->progress)
4444  return 1;
4445  else if (sa->progress == sb->progress)
4446  return 0;
4447  else
4448  return -1;
4449 }
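/*
 * Illustrative sketch (not part of bufmgr.c's real code): how BufferSync()
 * drives this comparator through a binaryheap so the tablespace furthest
 * behind in checkpoint progress is always written next.  Simplified: the
 * real code re-heapifies with binaryheap_replace_first() instead of just
 * removing entries.  Assumes this file's includes and types.
 */
static void
example_pick_lagging_tablespace(CkptTsStatus *per_ts_stat, int num_spaces)
{
	binaryheap *ts_heap;
	int			i;

	ts_heap = binaryheap_allocate(num_spaces,
								  ts_ckpt_progress_comparator,
								  NULL);

	for (i = 0; i < num_spaces; i++)
		binaryheap_add_unordered(ts_heap, PointerGetDatum(&per_ts_stat[i]));

	binaryheap_build(ts_heap);

	while (!binaryheap_empty(ts_heap))
	{
		CkptTsStatus *ts_stat = (CkptTsStatus *)
		DatumGetPointer(binaryheap_first(ts_heap));

		/* ... write one buffer for ts_stat and update ts_stat->progress ... */

		(void) binaryheap_remove_first(ts_heap);
	}

	binaryheap_free(ts_heap);
}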
4450 
4451 /*
4452  * Initialize a writeback context, discarding potential previous state.
4453  *
4454  * *max_pending is a pointer instead of an immediate value, so the coalesce
4455  * limits can easily be changed by the GUC mechanism, and so calling code does
4456  * not have to check the current configuration. A value of 0 means that no
4457  * writeback control will be performed.
4458  */
4459 void
4460 WritebackContextInit(WritebackContext *context, int *max_pending)
4461 {
4462  Assert(*max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
4463 
4464  context->max_pending = max_pending;
4465  context->nr_pending = 0;
4466 }
4467 
4468 /*
4469  * Add buffer to list of pending writeback requests.
4470  */
4471 void
4472 ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag)
4473 {
4474  PendingWriteback *pending;
4475 
4476  /*
4477  * Add buffer to the pending writeback array, unless writeback control is
4478  * disabled.
4479  */
4480  if (*context->max_pending > 0)
4481  {
4482  Assert(*context->max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
4483 
4484  pending = &context->pending_writebacks[context->nr_pending++];
4485 
4486  pending->tag = *tag;
4487  }
4488 
4489  /*
4490  * Perform pending flushes if the writeback limit is exceeded. This
4491  * includes the case where previously an item has been added, but control
4492  * is now disabled.
4493  */
4494  if (context->nr_pending >= *context->max_pending)
4495  IssuePendingWritebacks(context);
4496 }
4497 
4498 /*
4499  * Issue all pending writeback requests, previously scheduled with
4500  * ScheduleBufferTagForWriteback, to the OS.
4501  *
4502  * Because this is only used to improve the OS's I/O scheduling, we try to never
4503  * error out - it's just a hint.
4504  */
4505 void
4506 IssuePendingWritebacks(WritebackContext *context)
4507 {
4508  int i;
4509 
4510  if (context->nr_pending == 0)
4511  return;
4512 
4513  /*
4514  * Executing the writes in-order can make them a lot faster, and allows us to
4515  * merge writeback requests for consecutive blocks into larger writebacks.
4516  */
4517  qsort(&context->pending_writebacks, context->nr_pending,
4518  sizeof(PendingWriteback), buffertag_comparator);
4519 
4520  /*
4521  * Coalesce neighbouring writes, but nothing else. For that we iterate
4522  * through the now-sorted array of pending flushes, and look forward to
4523  * find all neighbouring (or identical) writes.
4524  */
4525  for (i = 0; i < context->nr_pending; i++)
4526  {
4527  PendingWriteback *cur;
4528  PendingWriteback *next;
4529  SMgrRelation reln;
4530  int ahead;
4531  BufferTag tag;
4532  Size nblocks = 1;
4533 
4534  cur = &context->pending_writebacks[i];
4535  tag = cur->tag;
4536 
4537  /*
4538  * Peek ahead, into following writeback requests, to see if they can
4539  * be combined with the current one.
4540  */
4541  for (ahead = 0; i + ahead + 1 < context->nr_pending; ahead++)
4542  {
4543  next = &context->pending_writebacks[i + ahead + 1];
4544 
4545  /* different file, stop */
4546  if (!RelFileNodeEquals(cur->tag.rnode, next->tag.rnode) ||
4547  cur->tag.forkNum != next->tag.forkNum)
4548  break;
4549 
4550  /* ok, block queued twice, skip */
4551  if (cur->tag.blockNum == next->tag.blockNum)
4552  continue;
4553 
4554  /* only merge consecutive writes */
4555  if (cur->tag.blockNum + 1 != next->tag.blockNum)
4556  break;
4557 
4558  nblocks++;
4559  cur = next;
4560  }
4561 
4562  i += ahead;
4563 
4564  /* and finally tell the kernel to write the data to storage */
4565  reln = smgropen(tag.rnode, InvalidBackendId);
4566  smgrwriteback(reln, tag.forkNum, tag.blockNum, nblocks);
4567  }
4568 
4569  context->nr_pending = 0;
4570 }
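/*
 * Illustrative sketch (not part of bufmgr.c's real code): the life cycle of a
 * WritebackContext as the checkpointer uses it -- initialize against a
 * *_flush_after GUC, schedule a tag after every buffer written, and issue
 * whatever is still pending at the end.  The helper and its argument are
 * hypothetical; real callers pass the tag of the buffer they just wrote.
 */
static void
example_writeback_usage(BufferTag *just_written_tag)
{
	WritebackContext wb_context;

	WritebackContextInit(&wb_context, &checkpoint_flush_after);

	/* normally called once per buffer written during the checkpoint scan */
	ScheduleBufferTagForWriteback(&wb_context, just_written_tag);

	/* at the end of the scan, flush anything still queued to the kernel */
	IssuePendingWritebacks(&wb_context);
}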
4571 
4572 
4573 /*
4574  * Implement slower/larger portions of TestForOldSnapshot
4575  *
4576  * Smaller/faster portions are put inline, but the entire set of logic is too
4577  * big for that.
4578  */
4579 void
4580 TestForOldSnapshot_impl(Snapshot snapshot, Relation relation)
4581 {
4582  if (RelationAllowsEarlyPruning(relation)
4583  && (snapshot)->whenTaken < GetOldSnapshotThresholdTimestamp())
4584  ereport(ERROR,
4585  (errcode(ERRCODE_SNAPSHOT_TOO_OLD),
4586  errmsg("snapshot too old")));
4587 }
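/*
 * Illustrative sketch (not part of bufmgr.c's real code): callers normally go
 * through the inline TestForOldSnapshot() wrapper in bufmgr.h right after
 * reading a page; only the slow path above lives in this file.  The helper
 * name below is hypothetical, and the usual bufmgr.h/snapmgr.h includes are
 * assumed.
 */
static void
example_check_snapshot(Relation rel, Snapshot snapshot, Buffer buf)
{
	Page		page = BufferGetPage(buf);

	/* errors out with "snapshot too old" when early pruning has overtaken us */
	TestForOldSnapshot(snapshot, rel, page);
}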