storage.c
1/*-------------------------------------------------------------------------
2 *
3 * storage.c
4 * code to create and destroy physical storage for relations
5 *
6 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 *
10 * IDENTIFICATION
11 * src/backend/catalog/storage.c
12 *
13 * NOTES
14 * Some of this code used to be in storage/smgr/smgr.c, and the
15 * function names still reflect that.
16 *
17 *-------------------------------------------------------------------------
18 */
19
20#include "postgres.h"
21
22#include "access/visibilitymap.h"
23#include "access/xact.h"
24#include "access/xlog.h"
25#include "access/xloginsert.h"
26#include "access/xlogutils.h"
27#include "catalog/storage.h"
28#include "catalog/storage_xlog.h"
29#include "miscadmin.h"
30#include "pgstat.h"
31#include "storage/bulk_write.h"
32#include "storage/freespace.h"
33#include "storage/proc.h"
34#include "storage/smgr.h"
35#include "utils/hsearch.h"
36#include "utils/memutils.h"
37#include "utils/rel.h"
38
39/* GUC variables */
40int wal_skip_threshold = 2048; /* in kilobytes */
41
42/*
43 * We keep a list of all relations (represented as RelFileLocator values)
44 * that have been created or deleted in the current transaction. When
45 * a relation is created, we create the physical file immediately, but
46 * remember it so that we can delete the file again if the current
47 * transaction is aborted. Conversely, a deletion request is NOT
48 * executed immediately, but is just entered in the list. When and if
49 * the transaction commits, we can delete the physical file.
50 *
51 * To handle subtransactions, every entry is marked with its transaction
52 * nesting level. At subtransaction commit, we reassign the subtransaction's
53 * entries to the parent nesting level. At subtransaction abort, we can
54 * immediately execute the abort-time actions for all entries of the current
55 * nesting level.
56 *
57 * NOTE: the list is kept in TopMemoryContext to be sure it won't disappear
58 * unbetimes. It'd probably be OK to keep it in TopTransactionContext,
59 * but I'm being paranoid.
60 */
61
62typedef struct PendingRelDelete
63{
64 RelFileLocator rlocator; /* relation that may need to be deleted */
65 ProcNumber procNumber; /* INVALID_PROC_NUMBER if not a temp rel */
66 bool atCommit; /* T=delete at commit; F=delete at abort */
67 int nestLevel; /* xact nesting level of request */
68 struct PendingRelDelete *next; /* linked-list link */
69} PendingRelDelete;
70
71typedef struct PendingRelSync
72{
73 RelFileLocator rlocator;
74 bool is_truncated; /* Has the file experienced truncation? */
75} PendingRelSync;
76
77static PendingRelDelete *pendingDeletes = NULL; /* head of linked list */
78static HTAB *pendingSyncHash = NULL;
79
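The push onto this list is open-coded in RelationCreateStorage() and RelationDropStorage() below. As a minimal sketch of that common step (the helper name pending_delete_push is hypothetical and not part of this file):

static void
pending_delete_push(RelFileLocator rlocator, ProcNumber procNumber, bool atCommit)
{
    /* Entries live in TopMemoryContext so they survive until end of xact. */
    PendingRelDelete *pending = (PendingRelDelete *)
        MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete));

    pending->rlocator = rlocator;
    pending->procNumber = procNumber;
    pending->atCommit = atCommit;   /* true: delete at commit; false: at abort */
    pending->nestLevel = GetCurrentTransactionNestLevel();
    pending->next = pendingDeletes;
    pendingDeletes = pending;
}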
80
81/*
82 * AddPendingSync
83 * Queue an at-commit fsync.
84 */
85static void
86AddPendingSync(const RelFileLocator *rlocator)
87{
88 PendingRelSync *pending;
89 bool found;
90
91 /* create the hash if not yet */
92 if (!pendingSyncHash)
93 {
94 HASHCTL ctl;
95
96 ctl.keysize = sizeof(RelFileLocator);
97 ctl.entrysize = sizeof(PendingRelSync);
98 ctl.hcxt = TopTransactionContext;
99 pendingSyncHash = hash_create("pending sync hash", 16, &ctl,
100 HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
101 }
102
103 pending = hash_search(pendingSyncHash, rlocator, HASH_ENTER, &found);
104 Assert(!found);
105 pending->is_truncated = false;
106}
107
108/*
109 * RelationCreateStorage
110 * Create physical storage for a relation.
111 *
112 * Create the underlying disk file storage for the relation. This only
113 * creates the main fork; additional forks are created lazily by the
114 * modules that need them.
115 *
116 * This function is transactional. The creation is WAL-logged, and if the
117 * transaction aborts later on, the storage will be destroyed. A caller
118 * that does not want the storage to be destroyed in case of an abort may
119 * pass register_delete = false.
120 */
121SMgrRelation
122RelationCreateStorage(RelFileLocator rlocator, char relpersistence,
123 bool register_delete)
124{
125 SMgrRelation srel;
126 ProcNumber procNumber;
127 bool needs_wal;
128
129 Assert(!IsInParallelMode()); /* couldn't update pendingSyncHash */
130
131 switch (relpersistence)
132 {
133 case RELPERSISTENCE_TEMP:
134 procNumber = ProcNumberForTempRelations();
135 needs_wal = false;
136 break;
137 case RELPERSISTENCE_UNLOGGED:
138 procNumber = INVALID_PROC_NUMBER;
139 needs_wal = false;
140 break;
141 case RELPERSISTENCE_PERMANENT:
142 procNumber = INVALID_PROC_NUMBER;
143 needs_wal = true;
144 break;
145 default:
146 elog(ERROR, "invalid relpersistence: %c", relpersistence);
147 return NULL; /* placate compiler */
148 }
149
150 srel = smgropen(rlocator, procNumber);
151 smgrcreate(srel, MAIN_FORKNUM, false);
152
153 if (needs_wal)
154 log_smgrcreate(&srel->smgr_rlocator.locator, MAIN_FORKNUM);
155
156 /*
157 * Add the relation to the list of stuff to delete at abort, if we are
158 * asked to do so.
159 */
160 if (register_delete)
161 {
162 PendingRelDelete *pending;
163
164 pending = (PendingRelDelete *)
165 MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete));
166 pending->rlocator = rlocator;
167 pending->procNumber = procNumber;
168 pending->atCommit = false; /* delete if abort */
169 pending->nestLevel = GetCurrentTransactionNestLevel();
170 pending->next = pendingDeletes;
171 pendingDeletes = pending;
172 }
173
174 if (relpersistence == RELPERSISTENCE_PERMANENT && !XLogIsNeeded())
175 {
176 Assert(procNumber == INVALID_PROC_NUMBER);
177 AddPendingSync(&rlocator);
178 }
179
180 return srel;
181}
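An illustrative caller sketch, not part of this file: make_new_storage() is a hypothetical name for a caller that creates storage for a new permanent relfilenumber and relies on the pending-delete machinery for cleanup if the transaction later aborts.

static SMgrRelation
make_new_storage(RelFileLocator rlocator)
{
    /* Creates the main fork, WAL-logs it, and registers delete-at-abort. */
    SMgrRelation srel = RelationCreateStorage(rlocator,
                                              RELPERSISTENCE_PERMANENT,
                                              true);

    /* If the transaction aborts from here on, smgrDoPendingDeletes() unlinks the file. */
    return srel;
}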
182
183/*
184 * Perform XLogInsert of an XLOG_SMGR_CREATE record to WAL.
185 */
186void
187log_smgrcreate(const RelFileLocator *rlocator, ForkNumber forkNum)
188{
189 xl_smgr_create xlrec;
190
191 /*
192 * Make an XLOG entry reporting the file creation.
193 */
194 xlrec.rlocator = *rlocator;
195 xlrec.forkNum = forkNum;
196
197 XLogBeginInsert();
198 XLogRegisterData(&xlrec, sizeof(xlrec));
199 XLogInsert(RM_SMGR_ID, XLOG_SMGR_CREATE | XLR_SPECIAL_REL_UPDATE);
200}
201
202/*
203 * RelationDropStorage
204 * Schedule unlinking of physical storage at transaction commit.
205 */
206void
207RelationDropStorage(Relation rel)
208{
209 PendingRelDelete *pending;
210
211 /* Add the relation to the list of stuff to delete at commit */
212 pending = (PendingRelDelete *)
213 MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete));
214 pending->rlocator = rel->rd_locator;
215 pending->procNumber = rel->rd_backend;
216 pending->atCommit = true; /* delete if commit */
217 pending->nestLevel = GetCurrentTransactionNestLevel();
218 pending->next = pendingDeletes;
219 pendingDeletes = pending;
220
221 /*
222 * NOTE: if the relation was created in this transaction, it will now be
223 * present in the pending-delete list twice, once with atCommit true and
224 * once with atCommit false. Hence, it will be physically deleted at end
225 * of xact in either case (and the other entry will be ignored by
226 * smgrDoPendingDeletes, so no error will occur). We could instead remove
227 * the existing list entry and delete the physical file immediately, but
228 * for now I'll keep the logic simple.
229 */
230
231 RelationCloseSmgr(rel);
232}
233
234/*
235 * RelationPreserveStorage
236 * Mark a relation as not to be deleted after all.
237 *
238 * We need this function because relation mapping changes are committed
239 * separately from commit of the whole transaction, so it's still possible
240 * for the transaction to abort after the mapping update is done.
241 * When a new physical relation is installed in the map, it would be
242 * scheduled for delete-on-abort, so we'd delete it, and be in trouble.
243 * The relation mapper fixes this by telling us to not delete such relations
244 * after all as part of its commit.
245 *
246 * We also use this to reuse an old build of an index during ALTER TABLE, this
247 * time removing the delete-at-commit entry.
248 *
249 * No-op if the relation is not among those scheduled for deletion.
250 */
251void
252RelationPreserveStorage(RelFileLocator rlocator, bool atCommit)
253{
254 PendingRelDelete *pending;
255 PendingRelDelete *prev;
256 PendingRelDelete *next;
257
258 prev = NULL;
259 for (pending = pendingDeletes; pending != NULL; pending = next)
260 {
261 next = pending->next;
262 if (RelFileLocatorEquals(rlocator, pending->rlocator)
263 && pending->atCommit == atCommit)
264 {
265 /* unlink and delete list entry */
266 if (prev)
267 prev->next = next;
268 else
269 pendingDeletes = next;
270 pfree(pending);
271 /* prev does not change */
272 }
273 else
274 {
275 /* unrelated entry, don't touch it */
276 prev = pending;
277 }
278 }
279}
280
281/*
282 * RelationTruncate
283 * Physically truncate a relation to the specified number of blocks.
284 *
285 * This includes getting rid of any buffers for the blocks that are to be
286 * dropped.
287 */
288void
289RelationTruncate(Relation rel, BlockNumber nblocks)
290{
291 bool fsm;
292 bool vm;
293 bool need_fsm_vacuum = false;
294 ForkNumber forks[MAX_FORKNUM];
295 BlockNumber old_blocks[MAX_FORKNUM];
296 BlockNumber blocks[MAX_FORKNUM];
297 int nforks = 0;
298 SMgrRelation reln;
299
300 /*
301 * Make sure smgr_targblock etc aren't pointing somewhere past new end.
302 * (Note: don't rely on this reln pointer below this loop.)
303 */
304 reln = RelationGetSmgr(rel);
305 reln->smgr_targblock = InvalidBlockNumber;
306 for (int i = 0; i <= MAX_FORKNUM; ++i)
307 reln->smgr_cached_nblocks[i] = InvalidBlockNumber;
308
309 /* Prepare for truncation of MAIN fork of the relation */
310 forks[nforks] = MAIN_FORKNUM;
311 old_blocks[nforks] = smgrnblocks(reln, MAIN_FORKNUM);
312 blocks[nforks] = nblocks;
313 nforks++;
314
315 /* Prepare for truncation of the FSM if it exists */
316 fsm = smgrexists(RelationGetSmgr(rel), FSM_FORKNUM);
317 if (fsm)
318 {
319 blocks[nforks] = FreeSpaceMapPrepareTruncateRel(rel, nblocks);
320 if (BlockNumberIsValid(blocks[nforks]))
321 {
322 forks[nforks] = FSM_FORKNUM;
323 old_blocks[nforks] = smgrnblocks(reln, FSM_FORKNUM);
324 nforks++;
325 need_fsm_vacuum = true;
326 }
327 }
328
329 /* Prepare for truncation of the visibility map too if it exists */
330 vm = smgrexists(RelationGetSmgr(rel), VISIBILITYMAP_FORKNUM);
331 if (vm)
332 {
333 blocks[nforks] = visibilitymap_prepare_truncate(rel, nblocks);
334 if (BlockNumberIsValid(blocks[nforks]))
335 {
336 forks[nforks] = VISIBILITYMAP_FORKNUM;
337 old_blocks[nforks] = smgrnblocks(reln, VISIBILITYMAP_FORKNUM);
338 nforks++;
339 }
340 }
341
342 RelationPreTruncate(rel);
343
344 /*
345 * The code which follows can interact with concurrent checkpoints in two
346 * separate ways.
347 *
348 * First, the truncation operation might drop buffers that the checkpoint
349 * otherwise would have flushed. If it does, then it's essential that the
350 * files actually get truncated on disk before the checkpoint record is
351 * written. Otherwise, if replay begins from that checkpoint, the
352 * to-be-truncated blocks might still exist on disk but have older
353 * contents than expected, which can cause replay to fail. It's OK for the
354 * blocks to not exist on disk at all, but not for them to have the wrong
355 * contents. For this reason, we need to set DELAY_CHKPT_COMPLETE while
356 * this code executes.
357 *
358 * Second, the call to smgrtruncate() below will in turn call
359 * RegisterSyncRequest(). We need the sync request created by that call to
360 * be processed before the checkpoint completes. CheckPointGuts() will
361 * call ProcessSyncRequests(), but if we register our sync request after
362 * that happens, then the WAL record for the truncation could end up
363 * preceding the checkpoint record, while the actual sync doesn't happen
364 * until the next checkpoint. To prevent that, we need to set
365 * DELAY_CHKPT_START here. That way, if the XLOG_SMGR_TRUNCATE precedes
366 * the redo pointer of a concurrent checkpoint, we're guaranteed that the
367 * corresponding sync request will be processed before the checkpoint
368 * completes.
369 */
370 Assert((MyProc->delayChkptFlags & (DELAY_CHKPT_START | DELAY_CHKPT_COMPLETE)) == 0);
371 MyProc->delayChkptFlags |= DELAY_CHKPT_START | DELAY_CHKPT_COMPLETE;
372
373 /*
374 * We WAL-log the truncation first and then truncate in a critical
375 * section. Truncation drops buffers, even if dirty, and then truncates
376 * disk files. All of that work needs to complete before the lock is
377 * released, or else old versions of pages on disk that are missing recent
378 * changes would become accessible again. We'll try the whole operation
379 * again in crash recovery if we panic, but even then we can't give up
380 * because we don't want standbys' relation sizes to diverge and break
381 * replay or visibility invariants downstream. The critical section also
382 * suppresses interrupts.
383 *
384 * (See also visibilitymap.c if changing this code.)
385 */
386 START_CRIT_SECTION();
387
388 if (RelationNeedsWAL(rel))
389 {
390 /*
391 * Make an XLOG entry reporting the file truncation.
392 */
393 XLogRecPtr lsn;
394 xl_smgr_truncate xlrec;
395
396 xlrec.blkno = nblocks;
397 xlrec.rlocator = rel->rd_locator;
398 xlrec.flags = SMGR_TRUNCATE_ALL;
399
400 XLogBeginInsert();
401 XLogRegisterData(&xlrec, sizeof(xlrec));
402
403 lsn = XLogInsert(RM_SMGR_ID,
404 XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE);
405
406 /*
407 * Flush, because otherwise the truncation of the main relation might
408 * hit the disk before the WAL record, and the truncation of the FSM
409 * or visibility map. If we crashed during that window, we'd be left
410 * with a truncated heap, but the FSM or visibility map would still
411 * contain entries for the non-existent heap pages, and standbys would
412 * also never replay the truncation.
413 */
414 XLogFlush(lsn);
415 }
416
417 /*
418 * This will first remove any buffers from the buffer pool that should no
419 * longer exist after truncation is complete, and then truncate the
420 * corresponding files on disk.
421 */
422 smgrtruncate(RelationGetSmgr(rel), forks, nforks, old_blocks, blocks);
423
424 END_CRIT_SECTION();
425
426 /* We've done all the critical work, so checkpoints are OK now. */
427 MyProc->delayChkptFlags &= ~(DELAY_CHKPT_START | DELAY_CHKPT_COMPLETE);
428
429 /*
430 * Update upper-level FSM pages to account for the truncation. This is
431 * important because the just-truncated pages were likely marked as
432 * all-free, and would be preferentially selected.
433 *
434 * NB: There's no point in delaying checkpoints until this is done.
435 * Because the FSM is not WAL-logged, we have to be prepared for the
436 * possibility of corruption after a crash anyway.
437 */
438 if (need_fsm_vacuum)
439 FreeSpaceMapVacuumRange(rel, nblocks, InvalidBlockNumber);
440}
441
442/*
443 * RelationPreTruncate
444 * Perform AM-independent work before a physical truncation.
445 *
446 * If an access method's relation_nontransactional_truncate does not call
447 * RelationTruncate(), it must call this before decreasing the table size.
448 */
449void
450RelationPreTruncate(Relation rel)
451{
452 PendingRelSync *pending;
453
454 if (!pendingSyncHash)
455 return;
456
457 pending = hash_search(pendingSyncHash,
458 &(RelationGetSmgr(rel)->smgr_rlocator.locator),
459 HASH_FIND, NULL);
460 if (pending)
461 pending->is_truncated = true;
462}
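A minimal sketch of the calling convention described above, assuming a hypothetical access method (my_am_shrink_main_fork is not a real function); it deliberately omits the WAL logging, buffer invalidation, and checkpoint-delay handling that RelationTruncate() performs.

static void
my_am_shrink_main_fork(Relation rel, BlockNumber new_nblocks)
{
    ForkNumber  fork = MAIN_FORKNUM;
    BlockNumber old_nblocks = smgrnblocks(RelationGetSmgr(rel), MAIN_FORKNUM);

    /* Let the WAL-skip machinery know this file has been truncated. */
    RelationPreTruncate(rel);

    smgrtruncate(RelationGetSmgr(rel), &fork, 1, &old_nblocks, &new_nblocks);
}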
463
464/*
465 * Copy a fork's data, block by block.
466 *
467 * Note that this requires that there is no dirty data in shared buffers. If
468 * it's possible that there are, callers need to flush those using
469 * e.g. FlushRelationBuffers(rel).
470 *
471 * Also note that this is frequently called via locutions such as
472 * RelationCopyStorage(RelationGetSmgr(rel), ...);
473 * That's safe only because we perform only smgr and WAL operations here.
474 * If we invoked anything else, a relcache flush could cause our SMgrRelation
475 * argument to become a dangling pointer.
476 */
477void
478RelationCopyStorage(SMgrRelation src, SMgrRelation dst,
479 ForkNumber forkNum, char relpersistence)
480{
481 bool use_wal;
482 bool copying_initfork;
483 BlockNumber nblocks;
484 BlockNumber blkno;
485 BulkWriteState *bulkstate;
486
487 /*
488 * The init fork for an unlogged relation in many respects has to be
489 * treated the same as a normal relation: changes need to be WAL-logged and
490 * it needs to be synced to disk.
491 */
492 copying_initfork = relpersistence == RELPERSISTENCE_UNLOGGED &&
493 forkNum == INIT_FORKNUM;
494
495 /*
496 * We need to log the copied data in WAL iff WAL archiving/streaming is
497 * enabled AND it's a permanent relation. This gives the same answer as
498 * "RelationNeedsWAL(rel) || copying_initfork", because we know the
499 * current operation created new relation storage.
500 */
501 use_wal = XLogIsNeeded() &&
502 (relpersistence == RELPERSISTENCE_PERMANENT || copying_initfork);
503
504 bulkstate = smgr_bulk_start_smgr(dst, forkNum, use_wal);
505
506 nblocks = smgrnblocks(src, forkNum);
507
508 for (blkno = 0; blkno < nblocks; blkno++)
509 {
510 BulkWriteBuffer buf;
511 int piv_flags;
512 bool checksum_failure;
513 bool verified;
514
515 /* If we got a cancel signal during the copy of the data, quit */
516 CHECK_FOR_INTERRUPTS();
517
518 buf = smgr_bulk_get_buf(bulkstate);
519 smgrread(src, forkNum, blkno, (Page) buf);
520
521 piv_flags = PIV_LOG_WARNING;
522 if (ignore_checksum_failure)
523 piv_flags |= PIV_IGNORE_CHECKSUM_FAILURE;
524 verified = PageIsVerified((Page) buf, blkno, piv_flags,
525 &checksum_failure);
526 if (checksum_failure)
527 {
528 Oid dboid = src->smgr_rlocator.locator.dbOid;
529
530 pgstat_prepare_report_checksum_failure(dboid);
531 pgstat_report_checksum_failures_in_db(dboid, 1);
532 }
533
534 if (!verified)
535 {
536 /*
537 * For paranoia's sake, capture the file path before invoking the
538 * ereport machinery. This guards against the possibility of a
539 * relcache flush caused by, e.g., an errcontext callback.
540 * (errcontext callbacks shouldn't be risking any such thing, but
541 * people have been known to forget that rule.)
542 */
543 RelPathStr relpath = relpathbackend(src->smgr_rlocator.locator,
544 src->smgr_rlocator.backend,
545 forkNum);
546
547 ereport(ERROR,
548 (errcode(ERRCODE_DATA_CORRUPTED),
549 errmsg("invalid page in block %u of relation %s",
550 blkno, relpath.str)));
551 }
552
553 /*
554 * Queue the page for WAL-logging and writing out. Unfortunately we
555 * don't know what kind of a page this is, so we have to log the full
556 * page including any unused space.
557 */
558 smgr_bulk_write(bulkstate, blkno, buf, false);
559 }
560 smgr_bulk_finish(bulkstate);
561}
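An illustrative caller sketch, loosely modeled on table-rewrite code elsewhere in the tree; copy_main_fork() and dst_rlocator are hypothetical, and FlushRelationBuffers() would additionally require storage/bufmgr.h.

static void
copy_main_fork(Relation rel, RelFileLocator dst_rlocator, char relpersistence)
{
    SMgrRelation dst_srel;

    /* The source must have no dirty pages left in shared buffers. */
    FlushRelationBuffers(rel);

    dst_srel = smgropen(dst_rlocator, INVALID_PROC_NUMBER);
    smgrcreate(dst_srel, MAIN_FORKNUM, false);
    log_smgrcreate(&dst_rlocator, MAIN_FORKNUM);

    RelationCopyStorage(RelationGetSmgr(rel), dst_srel, MAIN_FORKNUM,
                        relpersistence);
}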
562
563/*
564 * RelFileLocatorSkippingWAL
565 * Check if a BM_PERMANENT relfilelocator is using WAL.
566 *
567 * Changes to certain relations must not write WAL; see "Skipping WAL for
568 * New RelFileLocator" in src/backend/access/transam/README. Though this
569 * can be determined efficiently from a Relation, this function is intended
570 * for code paths that do not have access to a Relation.
571 */
572bool
573RelFileLocatorSkippingWAL(RelFileLocator rlocator)
574{
575 if (!pendingSyncHash ||
576 hash_search(pendingSyncHash, &rlocator, HASH_FIND, NULL) == NULL)
577 return false;
578
579 return true;
580}
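A sketch of the kind of check this function enables in code that only has a RelFileLocator; change_needs_wal() is a hypothetical helper, not part of this file.

static bool
change_needs_wal(RelFileLocator rlocator)
{
    /*
     * WAL can be skipped only for relfilenumbers created in the current
     * transaction under wal_level = minimal; everything else is WAL-logged.
     */
    return !RelFileLocatorSkippingWAL(rlocator);
}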
581
582/*
583 * EstimatePendingSyncsSpace
584 * Estimate space needed to pass syncs to parallel workers.
585 */
586Size
587EstimatePendingSyncsSpace(void)
588{
589 long entries;
590
591 entries = pendingSyncHash ? hash_get_num_entries(pendingSyncHash) : 0;
592 return mul_size(1 + entries, sizeof(RelFileLocator));
593}
594
595/*
596 * SerializePendingSyncs
597 * Serialize syncs for parallel workers.
598 */
599void
600SerializePendingSyncs(Size maxSize, char *startAddress)
601{
602 HTAB *tmphash;
603 HASHCTL ctl;
604 HASH_SEQ_STATUS scan;
605 PendingRelSync *sync;
606 PendingRelDelete *delete;
607 RelFileLocator *src;
608 RelFileLocator *dest = (RelFileLocator *) startAddress;
609
610 if (!pendingSyncHash)
611 goto terminate;
612
613 /* Create temporary hash to collect active relfilelocators */
614 ctl.keysize = sizeof(RelFileLocator);
615 ctl.entrysize = sizeof(RelFileLocator);
616 ctl.hcxt = CurrentMemoryContext;
617 tmphash = hash_create("tmp relfilelocators",
618 hash_get_num_entries(pendingSyncHash), &ctl,
619 HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
620
621 /* collect all rlocator from pending syncs */
622 hash_seq_init(&scan, pendingSyncHash);
623 while ((sync = (PendingRelSync *) hash_seq_search(&scan)))
624 (void) hash_search(tmphash, &sync->rlocator, HASH_ENTER, NULL);
625
626 /* remove deleted rnodes */
627 for (delete = pendingDeletes; delete != NULL; delete = delete->next)
628 if (delete->atCommit)
629 (void) hash_search(tmphash, &delete->rlocator,
630 HASH_REMOVE, NULL);
631
632 hash_seq_init(&scan, tmphash);
633 while ((src = (RelFileLocator *) hash_seq_search(&scan)))
634 *dest++ = *src;
635
636 hash_destroy(tmphash);
637
638terminate:
639 MemSet(dest, 0, sizeof(RelFileLocator));
640}
641
642/*
643 * RestorePendingSyncs
644 * Restore syncs within a parallel worker.
645 *
646 * RelationNeedsWAL() and RelFileLocatorSkippingWAL() must offer the correct
647 * answer to parallel workers. Only smgrDoPendingSyncs() reads the
648 * is_truncated field, at end of transaction. Hence, don't restore it.
649 */
650void
651RestorePendingSyncs(char *startAddress)
652{
653 RelFileLocator *rlocator;
654
655 Assert(pendingSyncHash == NULL);
656 for (rlocator = (RelFileLocator *) startAddress; rlocator->relNumber != 0;
657 rlocator++)
658 AddPendingSync(rlocator);
659}
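A simplified sketch of how the three functions above fit together; the real leader/worker code in access/transam/parallel.c passes the serialized array through the parallel DSM segment rather than a local palloc'd buffer.

    /* In the leader, while setting up a parallel context: */
    Size        len = EstimatePendingSyncsSpace();
    char       *space = palloc(len);

    SerializePendingSyncs(len, space);
    /* ... place "space" where the workers can read it ... */

    /* In each worker, during startup: */
    RestorePendingSyncs(space);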
660
661/*
662 * smgrDoPendingDeletes() -- Take care of relation deletes at end of xact.
663 *
664 * This also runs when aborting a subxact; we want to clean up a failed
665 * subxact immediately.
666 *
667 * Note: It's possible that we're being asked to remove a relation that has
668 * no physical storage in any fork. In particular, it's possible that we're
669 * cleaning up an old temporary relation for which RemovePgTempFiles has
670 * already recovered the physical storage.
671 */
672void
673smgrDoPendingDeletes(bool isCommit)
674{
675 int nestLevel = GetCurrentTransactionNestLevel();
676 PendingRelDelete *pending;
677 PendingRelDelete *prev;
678 PendingRelDelete *next;
679 int nrels = 0,
680 maxrels = 0;
681 SMgrRelation *srels = NULL;
682
683 prev = NULL;
684 for (pending = pendingDeletes; pending != NULL; pending = next)
685 {
686 next = pending->next;
687 if (pending->nestLevel < nestLevel)
688 {
689 /* outer-level entries should not be processed yet */
690 prev = pending;
691 }
692 else
693 {
694 /* unlink list entry first, so we don't retry on failure */
695 if (prev)
696 prev->next = next;
697 else
698 pendingDeletes = next;
699 /* do deletion if called for */
700 if (pending->atCommit == isCommit)
701 {
702 SMgrRelation srel;
703
704 srel = smgropen(pending->rlocator, pending->procNumber);
705
706 /* allocate the initial array, or extend it, if needed */
707 if (maxrels == 0)
708 {
709 maxrels = 8;
710 srels = palloc(sizeof(SMgrRelation) * maxrels);
711 }
712 else if (maxrels <= nrels)
713 {
714 maxrels *= 2;
715 srels = repalloc(srels, sizeof(SMgrRelation) * maxrels);
716 }
717
718 srels[nrels++] = srel;
719 }
720 /* must explicitly free the list entry */
721 pfree(pending);
722 /* prev does not change */
723 }
724 }
725
726 if (nrels > 0)
727 {
728 smgrdounlinkall(srels, nrels, false);
729
730 for (int i = 0; i < nrels; i++)
731 smgrclose(srels[i]);
732
733 pfree(srels);
734 }
735}
736
737/*
738 * smgrDoPendingSyncs() -- Take care of relation syncs at end of xact.
739 */
740void
741smgrDoPendingSyncs(bool isCommit, bool isParallelWorker)
742{
743 PendingRelDelete *pending;
744 int nrels = 0,
745 maxrels = 0;
746 SMgrRelation *srels = NULL;
747 HASH_SEQ_STATUS scan;
748 PendingRelSync *pendingsync;
749
750 Assert(GetCurrentTransactionNestLevel() == 1);
751
752 if (!pendingSyncHash)
753 return; /* no relation needs sync */
754
755 /* Abort -- just throw away all pending syncs */
756 if (!isCommit)
757 {
758 pendingSyncHash = NULL;
759 return;
760 }
761
762 AssertPendingSyncs_RelationCache();
763
764 /* Parallel worker -- just throw away all pending syncs */
765 if (isParallelWorker)
766 {
767 pendingSyncHash = NULL;
768 return;
769 }
770
771 /* Skip syncing nodes that smgrDoPendingDeletes() will delete. */
772 for (pending = pendingDeletes; pending != NULL; pending = pending->next)
773 if (pending->atCommit)
774 (void) hash_search(pendingSyncHash, &pending->rlocator,
775 HASH_REMOVE, NULL);
776
777 hash_seq_init(&scan, pendingSyncHash);
778 while ((pendingsync = (PendingRelSync *) hash_seq_search(&scan)))
779 {
780 ForkNumber fork;
781 BlockNumber nblocks[MAX_FORKNUM + 1];
782 uint64 total_blocks = 0;
783 SMgrRelation srel;
784
785 srel = smgropen(pendingsync->rlocator, INVALID_PROC_NUMBER);
786
787 /*
788 * We emit newpage WAL records for smaller relations.
789 *
790 * Small WAL records have a chance to be flushed along with other
791 * backends' WAL records. We emit WAL records instead of syncing for
792 * files that are smaller than a certain threshold, expecting faster
793 * commit. The threshold is defined by the GUC wal_skip_threshold.
794 */
795 if (!pendingsync->is_truncated)
796 {
797 for (fork = 0; fork <= MAX_FORKNUM; fork++)
798 {
799 if (smgrexists(srel, fork))
800 {
801 BlockNumber n = smgrnblocks(srel, fork);
802
803 /* we shouldn't come here for unlogged relations */
804 Assert(fork != INIT_FORKNUM);
805 nblocks[fork] = n;
806 total_blocks += n;
807 }
808 else
809 nblocks[fork] = InvalidBlockNumber;
810 }
811 }
812
813 /*
814 * Sync file or emit WAL records for its contents.
815 *
816 * Although we emit a WAL record if the file is small enough, sync the
817 * file regardless of its size if it has experienced a truncation. If a
818 * longer version of the file had already been flushed out and we then
819 * omitted the sync and emitted WAL instead, crash recovery could leave
820 * trailing garbage blocks past the recovered length. You might think that
821 * we could choose WAL if the current main fork is longer than ever, but
822 * there's a case where the main fork is longer than ever while the FSM
823 * fork gets shorter.
824 */
825 if (pendingsync->is_truncated ||
826 total_blocks >= wal_skip_threshold * (uint64) 1024 / BLCKSZ)
827 {
828 /* allocate the initial array, or extend it, if needed */
829 if (maxrels == 0)
830 {
831 maxrels = 8;
832 srels = palloc(sizeof(SMgrRelation) * maxrels);
833 }
834 else if (maxrels <= nrels)
835 {
836 maxrels *= 2;
837 srels = repalloc(srels, sizeof(SMgrRelation) * maxrels);
838 }
839
840 srels[nrels++] = srel;
841 }
842 else
843 {
844 /* Emit WAL records for all blocks. The file is small enough. */
845 for (fork = 0; fork <= MAX_FORKNUM; fork++)
846 {
847 int n = nblocks[fork];
848 Relation rel;
849
850 if (!BlockNumberIsValid(n))
851 continue;
852
853 /*
854 * Emit WAL for the whole file. Unfortunately we don't know
855 * what kind of a page this is, so we have to log the full
856 * page including any unused space. ReadBufferExtended()
857 * counts some pgstat events; unfortunately, we discard them.
858 */
859 rel = CreateFakeRelcacheEntry(srel->smgr_rlocator.locator);
860 log_newpage_range(rel, fork, 0, n, false);
861 FreeFakeRelcacheEntry(rel);
862 }
863 }
864 }
865
866 pendingSyncHash = NULL;
867
868 if (nrels > 0)
869 {
870 smgrdosyncall(srels, nrels);
871 pfree(srels);
872 }
873}
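The size test above, restated as a stand-alone predicate (would_sync_instead_of_wal is a hypothetical name): with the default wal_skip_threshold of 2048 kB and 8 kB blocks, files of 256 blocks or more are synced at commit, while smaller files have their contents WAL-logged.

static bool
would_sync_instead_of_wal(uint64 total_blocks, bool is_truncated)
{
    /* Truncated files are always synced; see the comment in smgrDoPendingSyncs(). */
    return is_truncated ||
        total_blocks >= wal_skip_threshold * (uint64) 1024 / BLCKSZ;
}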
874
875/*
876 * smgrGetPendingDeletes() -- Get a list of non-temp relations to be deleted.
877 *
878 * The return value is the number of relations scheduled for termination.
879 * *ptr is set to point to a freshly-palloc'd array of RelFileLocators.
880 * If there are no relations to be deleted, *ptr is set to NULL.
881 *
882 * Only non-temporary relations are included in the returned list. This is OK
883 * because the list is used only in contexts where temporary relations don't
884 * matter: we're either writing to the two-phase state file (and transactions
885 * that have touched temp tables can't be prepared) or we're writing to xlog
886 * (and all temporary files will be zapped if we restart anyway, so no need
887 * for redo to do it also).
888 *
889 * Note that the list does not include anything scheduled for termination
890 * by upper-level transactions.
891 */
892int
893smgrGetPendingDeletes(bool forCommit, RelFileLocator **ptr)
894{
895 int nestLevel = GetCurrentTransactionNestLevel();
896 int nrels;
897 RelFileLocator *rptr;
898 PendingRelDelete *pending;
899
900 nrels = 0;
901 for (pending = pendingDeletes; pending != NULL; pending = pending->next)
902 {
903 if (pending->nestLevel >= nestLevel && pending->atCommit == forCommit
904 && pending->procNumber == INVALID_PROC_NUMBER)
905 nrels++;
906 }
907 if (nrels == 0)
908 {
909 *ptr = NULL;
910 return 0;
911 }
912 rptr = (RelFileLocator *) palloc(nrels * sizeof(RelFileLocator));
913 *ptr = rptr;
914 for (pending = pendingDeletes; pending != NULL; pending = pending->next)
915 {
916 if (pending->nestLevel >= nestLevel && pending->atCommit == forCommit
917 && pending->procNumber == INVALID_PROC_NUMBER)
918 {
919 *rptr = pending->rlocator;
920 rptr++;
921 }
922 }
923 return nrels;
924}
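An illustrative sketch of the commit-time call pattern; the real caller is RecordTransactionCommit() in access/transam/xact.c, and the record-building around it is omitted here.

    RelFileLocator *rels;
    int         nrels = smgrGetPendingDeletes(true, &rels);

    if (nrels > 0)
    {
        /* ... include rels[0 .. nrels-1] in the commit WAL record ... */
        pfree(rels);
    }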
925
926/*
927 * PostPrepare_smgr -- Clean up after a successful PREPARE
928 *
929 * What we have to do here is throw away the in-memory state about pending
930 * relation deletes. It's all been recorded in the 2PC state file and
931 * it's no longer smgr's job to worry about it.
932 */
933void
934PostPrepare_smgr(void)
935{
936 PendingRelDelete *pending;
937 PendingRelDelete *next;
938
939 for (pending = pendingDeletes; pending != NULL; pending = next)
940 {
941 next = pending->next;
942 pendingDeletes = next;
943 /* must explicitly free the list entry */
944 pfree(pending);
945 }
946}
947
948
949/*
950 * AtSubCommit_smgr() --- Take care of subtransaction commit.
951 *
952 * Reassign all items in the pending-deletes list to the parent transaction.
953 */
954void
955AtSubCommit_smgr(void)
956{
957 int nestLevel = GetCurrentTransactionNestLevel();
958 PendingRelDelete *pending;
959
960 for (pending = pendingDeletes; pending != NULL; pending = pending->next)
961 {
962 if (pending->nestLevel >= nestLevel)
963 pending->nestLevel = nestLevel - 1;
964 }
965}
966
967/*
968 * AtSubAbort_smgr() --- Take care of subtransaction abort.
969 *
970 * Delete created relations and forget about deleted relations.
971 * We can execute these operations immediately because we know this
972 * subtransaction will not commit.
973 */
974void
975AtSubAbort_smgr(void)
976{
977 smgrDoPendingDeletes(false);
978}
979
980void
981smgr_redo(XLogReaderState *record)
982{
983 XLogRecPtr lsn = record->EndRecPtr;
984 uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
985
986 /* Backup blocks are not used in smgr records */
987 Assert(!XLogRecHasAnyBlockRefs(record));
988
989 if (info == XLOG_SMGR_CREATE)
990 {
991 xl_smgr_create *xlrec = (xl_smgr_create *) XLogRecGetData(record);
992 SMgrRelation reln;
993
994 reln = smgropen(xlrec->rlocator, INVALID_PROC_NUMBER);
995 smgrcreate(reln, xlrec->forkNum, true);
996 }
997 else if (info == XLOG_SMGR_TRUNCATE)
998 {
999 xl_smgr_truncate *xlrec = (xl_smgr_truncate *) XLogRecGetData(record);
1000 SMgrRelation reln;
1001 Relation rel;
1002 ForkNumber forks[MAX_FORKNUM];
1003 BlockNumber blocks[MAX_FORKNUM];
1004 BlockNumber old_blocks[MAX_FORKNUM];
1005 int nforks = 0;
1006 bool need_fsm_vacuum = false;
1007
1008 reln = smgropen(xlrec->rlocator, INVALID_PROC_NUMBER);
1009
1010 /*
1011 * Forcibly create relation if it doesn't exist (which suggests that
1012 * it was dropped somewhere later in the WAL sequence). As in
1013 * XLogReadBufferForRedo, we prefer to recreate the rel and replay the
1014 * log as best we can until the drop is seen.
1015 */
1016 smgrcreate(reln, MAIN_FORKNUM, true);
1017
1018 /*
1019 * Before we perform the truncation, update minimum recovery point to
1020 * cover this WAL record. Once the relation is truncated, there's no
1021 * going back. The buffer manager enforces the WAL-first rule for
1022 * normal updates to relation files, so that the minimum recovery
1023 * point is always updated before the corresponding change in the data
1024 * file is flushed to disk. We have to do the same manually here.
1025 *
1026 * Doing this before the truncation means that if the truncation fails
1027 * for some reason, you cannot start up the system even after restart,
1028 * until you fix the underlying situation so that the truncation will
1029 * succeed. Alternatively, we could update the minimum recovery point
1030 * after truncation, but that would leave a small window where the
1031 * WAL-first rule could be violated.
1032 */
1033 XLogFlush(lsn);
1034
1035 /* Prepare for truncation of MAIN fork */
1036 if ((xlrec->flags & SMGR_TRUNCATE_HEAP) != 0)
1037 {
1038 forks[nforks] = MAIN_FORKNUM;
1039 old_blocks[nforks] = smgrnblocks(reln, MAIN_FORKNUM);
1040 blocks[nforks] = xlrec->blkno;
1041 nforks++;
1042
1043 /* Also tell xlogutils.c about it */
1044 XLogTruncateRelation(xlrec->rlocator, MAIN_FORKNUM, xlrec->blkno);
1045 }
1046
1047 /* Prepare for truncation of FSM and VM too */
1048 rel = CreateFakeRelcacheEntry(xlrec->rlocator);
1049
1050 if ((xlrec->flags & SMGR_TRUNCATE_FSM) != 0 &&
1051 smgrexists(reln, FSM_FORKNUM))
1052 {
1053 blocks[nforks] = FreeSpaceMapPrepareTruncateRel(rel, xlrec->blkno);
1054 if (BlockNumberIsValid(blocks[nforks]))
1055 {
1056 forks[nforks] = FSM_FORKNUM;
1057 old_blocks[nforks] = smgrnblocks(reln, FSM_FORKNUM);
1058 nforks++;
1059 need_fsm_vacuum = true;
1060 }
1061 }
1062 if ((xlrec->flags & SMGR_TRUNCATE_VM) != 0 &&
1063 smgrexists(reln, VISIBILITYMAP_FORKNUM))
1064 {
1065 blocks[nforks] = visibilitymap_prepare_truncate(rel, xlrec->blkno);
1066 if (BlockNumberIsValid(blocks[nforks]))
1067 {
1068 forks[nforks] = VISIBILITYMAP_FORKNUM;
1069 old_blocks[nforks] = smgrnblocks(reln, VISIBILITYMAP_FORKNUM);
1070 nforks++;
1071 }
1072 }
1073
1074 /* Do the real work to truncate relation forks */
1075 if (nforks > 0)
1076 {
1077 START_CRIT_SECTION();
1078 smgrtruncate(reln, forks, nforks, old_blocks, blocks);
1079 END_CRIT_SECTION();
1080 }
1081
1082 /*
1083 * Update upper-level FSM pages to account for the truncation. This is
1084 * important because the just-truncated pages were likely marked as
1085 * all-free, and would be preferentially selected.
1086 */
1087 if (need_fsm_vacuum)
1088 FreeSpaceMapVacuumRange(rel, xlrec->blkno,
1089 InvalidBlockNumber);
1090
1091 FreeFakeRelcacheEntry(rel);
1092 }
1093 else
1094 elog(PANIC, "smgr_redo: unknown op code %u", info);
1095}