PostgreSQL Source Code git master
All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
storage.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * storage.c
4 * code to create and destroy physical storage for relations
5 *
6 * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 *
10 * IDENTIFICATION
11 * src/backend/catalog/storage.c
12 *
13 * NOTES
14 * Some of this code used to be in storage/smgr/smgr.c, and the
15 * function names still reflect that.
16 *
17 *-------------------------------------------------------------------------
18 */
19
20#include "postgres.h"
21
23#include "access/xact.h"
24#include "access/xlog.h"
25#include "access/xloginsert.h"
26#include "access/xlogutils.h"
27#include "catalog/storage.h"
29#include "miscadmin.h"
30#include "storage/bulk_write.h"
31#include "storage/freespace.h"
32#include "storage/proc.h"
33#include "storage/smgr.h"
34#include "utils/hsearch.h"
35#include "utils/memutils.h"
36#include "utils/rel.h"
37
38/* GUC variables */
39int wal_skip_threshold = 2048; /* in kilobytes */
40
41/*
42 * We keep a list of all relations (represented as RelFileLocator values)
43 * that have been created or deleted in the current transaction. When
44 * a relation is created, we create the physical file immediately, but
45 * remember it so that we can delete the file again if the current
46 * transaction is aborted. Conversely, a deletion request is NOT
47 * executed immediately, but is just entered in the list. When and if
48 * the transaction commits, we can delete the physical file.
49 *
50 * To handle subtransactions, every entry is marked with its transaction
51 * nesting level. At subtransaction commit, we reassign the subtransaction's
52 * entries to the parent nesting level. At subtransaction abort, we can
53 * immediately execute the abort-time actions for all entries of the current
54 * nesting level.
55 *
56 * NOTE: the list is kept in TopMemoryContext to be sure it won't disappear
57 * unbetimes. It'd probably be OK to keep it in TopTransactionContext,
58 * but I'm being paranoid.
59 */
60
61typedef struct PendingRelDelete
62{
63 RelFileLocator rlocator; /* relation that may need to be deleted */
64 ProcNumber procNumber; /* INVALID_PROC_NUMBER if not a temp rel */
65 bool atCommit; /* T=delete at commit; F=delete at abort */
66 int nestLevel; /* xact nesting level of request */
67 struct PendingRelDelete *next; /* linked-list link */
69
70typedef struct PendingRelSync
71{
73 bool is_truncated; /* Has the file experienced truncation? */
75
76static PendingRelDelete *pendingDeletes = NULL; /* head of linked list */
77static HTAB *pendingSyncHash = NULL;
78
79
80/*
81 * AddPendingSync
82 * Queue an at-commit fsync.
83 */
84static void
86{
87 PendingRelSync *pending;
88 bool found;
89
90 /* create the hash if not yet */
91 if (!pendingSyncHash)
92 {
94
95 ctl.keysize = sizeof(RelFileLocator);
96 ctl.entrysize = sizeof(PendingRelSync);
98 pendingSyncHash = hash_create("pending sync hash", 16, &ctl,
100 }
101
102 pending = hash_search(pendingSyncHash, rlocator, HASH_ENTER, &found);
103 Assert(!found);
104 pending->is_truncated = false;
105}
106
107/*
108 * RelationCreateStorage
109 * Create physical storage for a relation.
110 *
111 * Create the underlying disk file storage for the relation. This only
112 * creates the main fork; additional forks are created lazily by the
113 * modules that need them.
114 *
115 * This function is transactional. The creation is WAL-logged, and if the
116 * transaction aborts later on, the storage will be destroyed. A caller
117 * that does not want the storage to be destroyed in case of an abort may
118 * pass register_delete = false.
119 */
121RelationCreateStorage(RelFileLocator rlocator, char relpersistence,
122 bool register_delete)
123{
124 SMgrRelation srel;
125 ProcNumber procNumber;
126 bool needs_wal;
127
128 Assert(!IsInParallelMode()); /* couldn't update pendingSyncHash */
129
130 switch (relpersistence)
131 {
132 case RELPERSISTENCE_TEMP:
133 procNumber = ProcNumberForTempRelations();
134 needs_wal = false;
135 break;
136 case RELPERSISTENCE_UNLOGGED:
137 procNumber = INVALID_PROC_NUMBER;
138 needs_wal = false;
139 break;
140 case RELPERSISTENCE_PERMANENT:
141 procNumber = INVALID_PROC_NUMBER;
142 needs_wal = true;
143 break;
144 default:
145 elog(ERROR, "invalid relpersistence: %c", relpersistence);
146 return NULL; /* placate compiler */
147 }
148
149 srel = smgropen(rlocator, procNumber);
150 smgrcreate(srel, MAIN_FORKNUM, false);
151
152 if (needs_wal)
154
155 /*
156 * Add the relation to the list of stuff to delete at abort, if we are
157 * asked to do so.
158 */
159 if (register_delete)
160 {
161 PendingRelDelete *pending;
162
163 pending = (PendingRelDelete *)
165 pending->rlocator = rlocator;
166 pending->procNumber = procNumber;
167 pending->atCommit = false; /* delete if abort */
169 pending->next = pendingDeletes;
170 pendingDeletes = pending;
171 }
172
173 if (relpersistence == RELPERSISTENCE_PERMANENT && !XLogIsNeeded())
174 {
175 Assert(procNumber == INVALID_PROC_NUMBER);
176 AddPendingSync(&rlocator);
177 }
178
179 return srel;
180}
181
182/*
183 * Perform XLogInsert of an XLOG_SMGR_CREATE record to WAL.
184 */
185void
186log_smgrcreate(const RelFileLocator *rlocator, ForkNumber forkNum)
187{
188 xl_smgr_create xlrec;
189
 /*
  * rlocator: relation whose file was created; its value is copied into
  *           the record, so the caller's struct need not outlive this call.
  * forkNum:  the fork that was created.
  */
190 /*
191 * Make an XLOG entry reporting the file creation.
192 */
193 xlrec.rlocator = *rlocator;
194 xlrec.forkNum = forkNum;
195
 /* The whole xl_smgr_create struct is registered as the record's data. */
197 XLogRegisterData((char *) &xlrec, sizeof(xlrec));
199}
200
201/*
202 * RelationDropStorage
203 * Schedule unlinking of physical storage at transaction commit.
204 */
205void
207{
208 PendingRelDelete *pending;
209
210 /* Add the relation to the list of stuff to delete at commit */
211 pending = (PendingRelDelete *)
213 pending->rlocator = rel->rd_locator;
214 pending->procNumber = rel->rd_backend;
215 pending->atCommit = true; /* delete if commit */
217 pending->next = pendingDeletes;
218 pendingDeletes = pending;
219
220 /*
221 * NOTE: if the relation was created in this transaction, it will now be
222 * present in the pending-delete list twice, once with atCommit true and
223 * once with atCommit false. Hence, it will be physically deleted at end
224 * of xact in either case (and the other entry will be ignored by
225 * smgrDoPendingDeletes, so no error will occur). We could instead remove
226 * the existing list entry and delete the physical file immediately, but
227 * for now I'll keep the logic simple.
228 */
229
231}
232
233/*
234 * RelationPreserveStorage
235 * Mark a relation as not to be deleted after all.
236 *
237 * We need this function because relation mapping changes are committed
238 * separately from commit of the whole transaction, so it's still possible
239 * for the transaction to abort after the mapping update is done.
240 * When a new physical relation is installed in the map, it would be
241 * scheduled for delete-on-abort, so we'd delete it, and be in trouble.
242 * The relation mapper fixes this by telling us to not delete such relations
243 * after all as part of its commit.
244 *
245 * We also use this to reuse an old build of an index during ALTER TABLE, this
246 * time removing the delete-at-commit entry.
247 *
248 * No-op if the relation is not among those scheduled for deletion.
249 */
250void
252{
253 PendingRelDelete *pending;
254 PendingRelDelete *prev;
256
257 prev = NULL;
258 for (pending = pendingDeletes; pending != NULL; pending = next)
259 {
260 next = pending->next;
261 if (RelFileLocatorEquals(rlocator, pending->rlocator)
262 && pending->atCommit == atCommit)
263 {
264 /* unlink and delete list entry */
265 if (prev)
266 prev->next = next;
267 else
269 pfree(pending);
270 /* prev does not change */
271 }
272 else
273 {
274 /* unrelated entry, don't touch it */
275 prev = pending;
276 }
277 }
278}
279
280/*
281 * RelationTruncate
282 * Physically truncate a relation to the specified number of blocks.
283 *
284 * This includes getting rid of any buffers for the blocks that are to be
285 * dropped.
286 */
287void
289{
290 bool fsm;
291 bool vm;
292 bool need_fsm_vacuum = false;
293 ForkNumber forks[MAX_FORKNUM];
294 BlockNumber old_blocks[MAX_FORKNUM];
295 BlockNumber blocks[MAX_FORKNUM];
296 int nforks = 0;
297 SMgrRelation reln;
298
299 /*
300 * Make sure smgr_targblock etc aren't pointing somewhere past new end.
301 * (Note: don't rely on this reln pointer below this loop.)
302 */
303 reln = RelationGetSmgr(rel);
305 for (int i = 0; i <= MAX_FORKNUM; ++i)
307
308 /* Prepare for truncation of MAIN fork of the relation */
309 forks[nforks] = MAIN_FORKNUM;
310 old_blocks[nforks] = smgrnblocks(reln, MAIN_FORKNUM);
311 blocks[nforks] = nblocks;
312 nforks++;
313
314 /* Prepare for truncation of the FSM if it exists */
316 if (fsm)
317 {
318 blocks[nforks] = FreeSpaceMapPrepareTruncateRel(rel, nblocks);
319 if (BlockNumberIsValid(blocks[nforks]))
320 {
321 forks[nforks] = FSM_FORKNUM;
322 old_blocks[nforks] = smgrnblocks(reln, FSM_FORKNUM);
323 nforks++;
324 need_fsm_vacuum = true;
325 }
326 }
327
328 /* Prepare for truncation of the visibility map too if it exists */
330 if (vm)
331 {
332 blocks[nforks] = visibilitymap_prepare_truncate(rel, nblocks);
333 if (BlockNumberIsValid(blocks[nforks]))
334 {
335 forks[nforks] = VISIBILITYMAP_FORKNUM;
336 old_blocks[nforks] = smgrnblocks(reln, VISIBILITYMAP_FORKNUM);
337 nforks++;
338 }
339 }
340
342
343 /*
344 * The code which follows can interact with concurrent checkpoints in two
345 * separate ways.
346 *
347 * First, the truncation operation might drop buffers that the checkpoint
348 * otherwise would have flushed. If it does, then it's essential that the
349 * files actually get truncated on disk before the checkpoint record is
350 * written. Otherwise, if replay begins from that checkpoint, the
351 * to-be-truncated blocks might still exist on disk but have older
352 * contents than expected, which can cause replay to fail. It's OK for the
353 * blocks to not exist on disk at all, but not for them to have the wrong
354 * contents. For this reason, we need to set DELAY_CHKPT_COMPLETE while
355 * this code executes.
356 *
357 * Second, the call to smgrtruncate() below will in turn call
358 * RegisterSyncRequest(). We need the sync request created by that call to
359 * be processed before the checkpoint completes. CheckPointGuts() will
360 * call ProcessSyncRequests(), but if we register our sync request after
361 * that happens, then the WAL record for the truncation could end up
362 * preceding the checkpoint record, while the actual sync doesn't happen
363 * until the next checkpoint. To prevent that, we need to set
364 * DELAY_CHKPT_START here. That way, if the XLOG_SMGR_TRUNCATE precedes
365 * the redo pointer of a concurrent checkpoint, we're guaranteed that the
366 * corresponding sync request will be processed before the checkpoint
367 * completes.
368 */
371
372 /*
373 * We WAL-log the truncation first and then truncate in a critical
374 * section. Truncation drops buffers, even if dirty, and then truncates
375 * disk files. All of that work needs to complete before the lock is
376 * released, or else old versions of pages on disk that are missing recent
377 * changes would become accessible again. We'll try the whole operation
378 * again in crash recovery if we panic, but even then we can't give up
379 * because we don't want standbys' relation sizes to diverge and break
380 * replay or visibility invariants downstream. The critical section also
381 * suppresses interrupts.
382 *
383 * (See also pg_visibilitymap.c if changing this code.)
384 */
386
387 if (RelationNeedsWAL(rel))
388 {
389 /*
390 * Make an XLOG entry reporting the file truncation.
391 */
392 XLogRecPtr lsn;
393 xl_smgr_truncate xlrec;
394
395 xlrec.blkno = nblocks;
396 xlrec.rlocator = rel->rd_locator;
397 xlrec.flags = SMGR_TRUNCATE_ALL;
398
400 XLogRegisterData((char *) &xlrec, sizeof(xlrec));
401
402 lsn = XLogInsert(RM_SMGR_ID,
404
405 /*
406 * Flush, because otherwise the truncation of the main relation might
407 * hit the disk before the WAL record, and the truncation of the FSM
408 * or visibility map. If we crashed during that window, we'd be left
409 * with a truncated heap, but the FSM or visibility map would still
410 * contain entries for the non-existent heap pages, and standbys would
411 * also never replay the truncation.
412 */
413 XLogFlush(lsn);
414 }
415
416 /*
417 * This will first remove any buffers from the buffer pool that should no
418 * longer exist after truncation is complete, and then truncate the
419 * corresponding files on disk.
420 */
421 smgrtruncate(RelationGetSmgr(rel), forks, nforks, old_blocks, blocks);
422
424
425 /* We've done all the critical work, so checkpoints are OK now. */
427
428 /*
429 * Update upper-level FSM pages to account for the truncation. This is
430 * important because the just-truncated pages were likely marked as
431 * all-free, and would be preferentially selected.
432 *
433 * NB: There's no point in delaying checkpoints until this is done.
434 * Because the FSM is not WAL-logged, we have to be prepared for the
435 * possibility of corruption after a crash anyway.
436 */
437 if (need_fsm_vacuum)
439}
440
441/*
442 * RelationPreTruncate
443 * Perform AM-independent work before a physical truncation.
444 *
445 * If an access method's relation_nontransactional_truncate does not call
446 * RelationTruncate(), it must call this before decreasing the table size.
447 */
448void
450{
451 PendingRelSync *pending;
452
453 if (!pendingSyncHash)
454 return;
455
457 &(RelationGetSmgr(rel)->smgr_rlocator.locator),
458 HASH_FIND, NULL);
459 if (pending)
460 pending->is_truncated = true;
461}
462
463/*
464 * Copy a fork's data, block by block.
465 *
466 * Note that this requires that there is no dirty data in shared buffers. If
467 * it's possible that there are, callers need to flush those using
468 * e.g. FlushRelationBuffers(rel).
469 *
470 * Also note that this is frequently called via locutions such as
471 * RelationCopyStorage(RelationGetSmgr(rel), ...);
472 * That's safe only because we perform only smgr and WAL operations here.
473 * If we invoked anything else, a relcache flush could cause our SMgrRelation
474 * argument to become a dangling pointer.
475 */
476void
478 ForkNumber forkNum, char relpersistence)
479{
480 bool use_wal;
481 bool copying_initfork;
482 BlockNumber nblocks;
483 BlockNumber blkno;
484 BulkWriteState *bulkstate;
485
486 /*
487 * The init fork for an unlogged relation in many respects has to be
488 * treated the same as normal relation, changes need to be WAL logged and
489 * it needs to be synced to disk.
490 */
491 copying_initfork = relpersistence == RELPERSISTENCE_UNLOGGED &&
492 forkNum == INIT_FORKNUM;
493
494 /*
495 * We need to log the copied data in WAL iff WAL archiving/streaming is
496 * enabled AND it's a permanent relation. This gives the same answer as
497 * "RelationNeedsWAL(rel) || copying_initfork", because we know the
498 * current operation created new relation storage.
499 */
500 use_wal = XLogIsNeeded() &&
501 (relpersistence == RELPERSISTENCE_PERMANENT || copying_initfork);
502
503 bulkstate = smgr_bulk_start_smgr(dst, forkNum, use_wal);
504
505 nblocks = smgrnblocks(src, forkNum);
506
507 for (blkno = 0; blkno < nblocks; blkno++)
508 {
510
511 /* If we got a cancel signal during the copy of the data, quit */
513
514 buf = smgr_bulk_get_buf(bulkstate);
515 smgrread(src, forkNum, blkno, (Page) buf);
516
517 if (!PageIsVerifiedExtended((Page) buf, blkno,
519 {
520 /*
521 * For paranoia's sake, capture the file path before invoking the
522 * ereport machinery. This guards against the possibility of a
523 * relcache flush caused by, e.g., an errcontext callback.
524 * (errcontext callbacks shouldn't be risking any such thing, but
525 * people have been known to forget that rule.)
526 */
529 forkNum);
530
533 errmsg("invalid page in block %u of relation %s",
534 blkno, relpath)));
535 }
536
537 /*
538 * Queue the page for WAL-logging and writing out. Unfortunately we
539 * don't know what kind of a page this is, so we have to log the full
540 * page including any unused space.
541 */
542 smgr_bulk_write(bulkstate, blkno, buf, false);
543 }
544 smgr_bulk_finish(bulkstate);
545}
546
547/*
548 * RelFileLocatorSkippingWAL
549 * Check if a BM_PERMANENT relfilelocator is skipping WAL (true if so).
550 *
551 * Changes to certain relations must not write WAL; see "Skipping WAL for
552 * New RelFileLocator" in src/backend/access/transam/README. Though it is
553 * known from Relation efficiently, this function is intended for the code
554 * paths not having access to Relation.
555 */
556bool
558{
559 if (!pendingSyncHash ||
560 hash_search(pendingSyncHash, &rlocator, HASH_FIND, NULL) == NULL)
561 return false;
562
563 return true;
564}
565
566/*
567 * EstimatePendingSyncsSpace
568 * Estimate space needed to pass syncs to parallel workers.
569 */
570Size
572{
573 long entries;
574
576 return mul_size(1 + entries, sizeof(RelFileLocator));
577}
578
579/*
580 * SerializePendingSyncs
581 * Serialize syncs for parallel workers.
 *
 * Writes an array of RelFileLocator values at startAddress: one entry for
 * each relation in pendingSyncHash that is not also scheduled for deletion
 * at commit, followed by one all-zero entry that terminates the array.
 *
 * NOTE(review): maxSize is not referenced in the visible body; presumably
 * the caller sized the buffer with EstimatePendingSyncsSpace() -- confirm.
582 */
583void
584SerializePendingSyncs(Size maxSize, char *startAddress)
585{
586 HTAB *tmphash;
587 HASHCTL ctl;
588 HASH_SEQ_STATUS scan;
589 PendingRelSync *sync;
590 PendingRelDelete *delete;
591 RelFileLocator *src;
592 RelFileLocator *dest = (RelFileLocator *) startAddress;
593
 /* No pending syncs at all: emit only the terminator entry. */
594 if (!pendingSyncHash)
595 goto terminate;
596
597 /* Create temporary hash to collect active relfilelocators */
598 ctl.keysize = sizeof(RelFileLocator);
599 ctl.entrysize = sizeof(RelFileLocator);
601 tmphash = hash_create("tmp relfilelocators",
604
605 /* collect all rlocator from pending syncs */
607 while ((sync = (PendingRelSync *) hash_seq_search(&scan)))
608 (void) hash_search(tmphash, &sync->rlocator, HASH_ENTER, NULL);
609
610 /* remove relfilelocators that are scheduled for deletion at commit */
611 for (delete = pendingDeletes; delete != NULL; delete = delete->next)
612 if (delete->atCommit)
613 (void) hash_search(tmphash, &delete->rlocator,
614 HASH_REMOVE, NULL);
615
 /* Copy the surviving relfilelocators into the output array. */
616 hash_seq_init(&scan, tmphash);
617 while ((src = (RelFileLocator *) hash_seq_search(&scan)))
618 *dest++ = *src;
619
620 hash_destroy(tmphash);
621
622terminate:
 /*
  * A zero-filled entry marks the end of the array; RestorePendingSyncs()
  * stops reading at the first entry whose relNumber is 0.
  */
623 MemSet(dest, 0, sizeof(RelFileLocator));
624}
625
626/*
627 * RestorePendingSyncs
628 * Restore syncs within a parallel worker.
629 *
630 * RelationNeedsWAL() and RelFileLocatorSkippingWAL() must offer the correct
631 * answer to parallel workers. Only smgrDoPendingSyncs() reads the
632 * is_truncated field, at end of transaction. Hence, don't restore it.
633 */
634void
635RestorePendingSyncs(char *startAddress)
636{
637 RelFileLocator *rlocator;
638
639 Assert(pendingSyncHash == NULL);
640 for (rlocator = (RelFileLocator *) startAddress; rlocator->relNumber != 0;
641 rlocator++)
642 AddPendingSync(rlocator);
643}
644
645/*
646 * smgrDoPendingDeletes() -- Take care of relation deletes at end of xact.
647 *
648 * This also runs when aborting a subxact; we want to clean up a failed
649 * subxact immediately.
650 *
651 * Note: It's possible that we're being asked to remove a relation that has
652 * no physical storage in any fork. In particular, it's possible that we're
653 * cleaning up an old temporary relation for which RemovePgTempFiles has
654 * already recovered the physical storage.
655 */
656void
658{
659 int nestLevel = GetCurrentTransactionNestLevel();
660 PendingRelDelete *pending;
661 PendingRelDelete *prev;
663 int nrels = 0,
664 maxrels = 0;
665 SMgrRelation *srels = NULL;
666
667 prev = NULL;
668 for (pending = pendingDeletes; pending != NULL; pending = next)
669 {
670 next = pending->next;
671 if (pending->nestLevel < nestLevel)
672 {
673 /* outer-level entries should not be processed yet */
674 prev = pending;
675 }
676 else
677 {
678 /* unlink list entry first, so we don't retry on failure */
679 if (prev)
680 prev->next = next;
681 else
683 /* do deletion if called for */
684 if (pending->atCommit == isCommit)
685 {
686 SMgrRelation srel;
687
688 srel = smgropen(pending->rlocator, pending->procNumber);
689
690 /* allocate the initial array, or extend it, if needed */
691 if (maxrels == 0)
692 {
693 maxrels = 8;
694 srels = palloc(sizeof(SMgrRelation) * maxrels);
695 }
696 else if (maxrels <= nrels)
697 {
698 maxrels *= 2;
699 srels = repalloc(srels, sizeof(SMgrRelation) * maxrels);
700 }
701
702 srels[nrels++] = srel;
703 }
704 /* must explicitly free the list entry */
705 pfree(pending);
706 /* prev does not change */
707 }
708 }
709
710 if (nrels > 0)
711 {
712 smgrdounlinkall(srels, nrels, false);
713
714 for (int i = 0; i < nrels; i++)
715 smgrclose(srels[i]);
716
717 pfree(srels);
718 }
719}
720
721/*
722 * smgrDoPendingSyncs() -- Take care of relation syncs at end of xact.
 *
 * isCommit:         false on abort; the pending-sync table is then simply
 *                   forgotten (pointer reset to NULL) and nothing is synced.
 * isParallelWorker: true in a parallel worker; the table is likewise
 *                   forgotten without syncing or emitting WAL.
 *
 * Otherwise, every relation still in pendingSyncHash (after dropping those
 * scheduled for deletion at commit) is either queued for smgrdosyncall()
 * or has its existing forks WAL-logged with log_newpage_range(), depending
 * on is_truncated and on its size relative to wal_skip_threshold.
723 */
724void
725smgrDoPendingSyncs(bool isCommit, bool isParallelWorker)
726{
727 PendingRelDelete *pending;
728 int nrels = 0,
729 maxrels = 0;
730 SMgrRelation *srels = NULL;
731 HASH_SEQ_STATUS scan;
732 PendingRelSync *pendingsync;
733
735
736 if (!pendingSyncHash)
737 return; /* no relation needs sync */
738
739 /* Abort -- just throw away all pending syncs */
740 if (!isCommit)
741 {
742 pendingSyncHash = NULL;
743 return;
744 }
745
747
748 /* Parallel worker -- just throw away all pending syncs */
749 if (isParallelWorker)
750 {
751 pendingSyncHash = NULL;
752 return;
753 }
754
755 /* Skip syncing nodes that smgrDoPendingDeletes() will delete. */
756 for (pending = pendingDeletes; pending != NULL; pending = pending->next)
757 if (pending->atCommit)
758 (void) hash_search(pendingSyncHash, &pending->rlocator,
759 HASH_REMOVE, NULL);
760
762 while ((pendingsync = (PendingRelSync *) hash_seq_search(&scan)))
763 {
764 ForkNumber fork;
765 BlockNumber nblocks[MAX_FORKNUM + 1];
766 BlockNumber total_blocks = 0;
767 SMgrRelation srel;
768
769 srel = smgropen(pendingsync->rlocator, INVALID_PROC_NUMBER);
770
771 /*
772 * We emit newpage WAL records for smaller relations.
773 *
774 * Small WAL records have a chance to be flushed along with other
775 * backends' WAL records. We emit WAL records instead of syncing for
776 * files that are smaller than a certain threshold, expecting faster
777 * commit. The threshold is defined by the GUC wal_skip_threshold.
778 */
 /*
  * Fork sizes are gathered only for non-truncated relations; a truncated
  * relation always takes the sync branch below, so nblocks[] is never
  * read for it.
  */
779 if (!pendingsync->is_truncated)
780 {
781 for (fork = 0; fork <= MAX_FORKNUM; fork++)
782 {
783 if (smgrexists(srel, fork))
784 {
785 BlockNumber n = smgrnblocks(srel, fork);
786
787 /* we shouldn't come here for unlogged relations */
788 Assert(fork != INIT_FORKNUM);
789 nblocks[fork] = n;
790 total_blocks += n;
791 }
792 else
793 nblocks[fork] = InvalidBlockNumber;
794 }
795 }
796
797 /*
798 * Sync file or emit WAL records for its contents.
799 *
800 * Although we emit WAL record if the file is small enough, do file
801 * sync regardless of the size if the file has experienced a
802 * truncation. It is because the file would be followed by trailing
803 * garbage blocks after a crash recovery if, while a past longer file
804 * had been flushed out, we omitted syncing-out of the file and
805 * emitted WAL instead. You might think that we could choose WAL if
806 * the current main fork is longer than ever, but there's a case where
807 * main fork is longer than ever but FSM fork gets shorter.
808 */
 /*
  * NOTE(review): total_blocks is a BlockNumber (a uint32 typedef) and the
  * multiplication happens before the division. Unless BLCKSZ has a wider
  * type, the byte count wraps once the forks total 4GB, and a very large
  * relation could be treated as below the threshold -- confirm, and
  * consider doing this comparison in 64-bit arithmetic.
  */
809 if (pendingsync->is_truncated ||
810 total_blocks * BLCKSZ / 1024 >= wal_skip_threshold)
811 {
812 /* allocate the initial array, or extend it, if needed */
813 if (maxrels == 0)
814 {
815 maxrels = 8;
816 srels = palloc(sizeof(SMgrRelation) * maxrels);
817 }
818 else if (maxrels <= nrels)
819 {
820 maxrels *= 2;
821 srels = repalloc(srels, sizeof(SMgrRelation) * maxrels);
822 }
823
824 srels[nrels++] = srel;
825 }
826 else
827 {
828 /* Emit WAL records for all blocks. The file is small enough. */
829 for (fork = 0; fork <= MAX_FORKNUM; fork++)
830 {
 /*
  * NOTE(review): n is declared int but holds a BlockNumber taken from
  * nblocks[]; declaring it BlockNumber would avoid the signed round-trip.
  */
831 int n = nblocks[fork];
832 Relation rel;
833
 /* Forks that do not exist were marked InvalidBlockNumber above. */
834 if (!BlockNumberIsValid(n))
835 continue;
836
837 /*
838 * Emit WAL for the whole file. Unfortunately we don't know
839 * what kind of a page this is, so we have to log the full
840 * page including any unused space. ReadBufferExtended()
841 * counts some pgstat events; unfortunately, we discard them.
842 */
844 log_newpage_range(rel, fork, 0, n, false);
846 }
847 }
848 }
849
 /* Forget the table before issuing the collected syncs. */
850 pendingSyncHash = NULL;
851
852 if (nrels > 0)
853 {
854 smgrdosyncall(srels, nrels);
855 pfree(srels);
856 }
857}
858
859/*
860 * smgrGetPendingDeletes() -- Get a list of non-temp relations to be deleted.
861 *
862 * The return value is the number of relations scheduled for deletion.
863 * *ptr is set to point to a freshly-palloc'd array of RelFileLocators.
864 * If there are no relations to be deleted, *ptr is set to NULL.
865 *
866 * Only non-temporary relations are included in the returned list. This is OK
867 * because the list is used only in contexts where temporary relations don't
868 * matter: we're either writing to the two-phase state file (and transactions
869 * that have touched temp tables can't be prepared) or we're writing to xlog
870 * (and all temporary files will be zapped if we restart anyway, so no need
871 * for redo to do it also).
872 *
873 * Note that the list does not include anything scheduled for deletion
874 * by upper-level transactions.
875 */
876int
878{
879 int nestLevel = GetCurrentTransactionNestLevel();
880 int nrels;
881 RelFileLocator *rptr;
882 PendingRelDelete *pending;
883
884 nrels = 0;
885 for (pending = pendingDeletes; pending != NULL; pending = pending->next)
886 {
887 if (pending->nestLevel >= nestLevel && pending->atCommit == forCommit
888 && pending->procNumber == INVALID_PROC_NUMBER)
889 nrels++;
890 }
891 if (nrels == 0)
892 {
893 *ptr = NULL;
894 return 0;
895 }
896 rptr = (RelFileLocator *) palloc(nrels * sizeof(RelFileLocator));
897 *ptr = rptr;
898 for (pending = pendingDeletes; pending != NULL; pending = pending->next)
899 {
900 if (pending->nestLevel >= nestLevel && pending->atCommit == forCommit
901 && pending->procNumber == INVALID_PROC_NUMBER)
902 {
903 *rptr = pending->rlocator;
904 rptr++;
905 }
906 }
907 return nrels;
908}
909
910/*
911 * PostPrepare_smgr -- Clean up after a successful PREPARE
912 *
913 * What we have to do here is throw away the in-memory state about pending
914 * relation deletes. It's all been recorded in the 2PC state file and
915 * it's no longer smgr's job to worry about it.
916 */
917void
919{
920 PendingRelDelete *pending;
922
923 for (pending = pendingDeletes; pending != NULL; pending = next)
924 {
925 next = pending->next;
927 /* must explicitly free the list entry */
928 pfree(pending);
929 }
930}
931
932
933/*
934 * AtSubCommit_smgr() --- Take care of subtransaction commit.
935 *
936 * Reassign all items in the pending-deletes list to the parent transaction.
937 */
938void
940{
941 int nestLevel = GetCurrentTransactionNestLevel();
942 PendingRelDelete *pending;
943
944 for (pending = pendingDeletes; pending != NULL; pending = pending->next)
945 {
946 if (pending->nestLevel >= nestLevel)
947 pending->nestLevel = nestLevel - 1;
948 }
949}
950
951/*
952 * AtSubAbort_smgr() --- Take care of subtransaction abort.
953 *
954 * Delete created relations and forget about deleted relations.
955 * We can execute these operations immediately because we know this
956 * subtransaction will not commit.
957 */
958void
960{
962}
963
964void
966{
967 XLogRecPtr lsn = record->EndRecPtr;
968 uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
969
970 /* Backup blocks are not used in smgr records */
972
973 if (info == XLOG_SMGR_CREATE)
974 {
975 xl_smgr_create *xlrec = (xl_smgr_create *) XLogRecGetData(record);
976 SMgrRelation reln;
977
978 reln = smgropen(xlrec->rlocator, INVALID_PROC_NUMBER);
979 smgrcreate(reln, xlrec->forkNum, true);
980 }
981 else if (info == XLOG_SMGR_TRUNCATE)
982 {
984 SMgrRelation reln;
985 Relation rel;
986 ForkNumber forks[MAX_FORKNUM];
987 BlockNumber blocks[MAX_FORKNUM];
988 BlockNumber old_blocks[MAX_FORKNUM];
989 int nforks = 0;
990 bool need_fsm_vacuum = false;
991
992 reln = smgropen(xlrec->rlocator, INVALID_PROC_NUMBER);
993
994 /*
995 * Forcibly create relation if it doesn't exist (which suggests that
996 * it was dropped somewhere later in the WAL sequence). As in
997 * XLogReadBufferForRedo, we prefer to recreate the rel and replay the
998 * log as best we can until the drop is seen.
999 */
1000 smgrcreate(reln, MAIN_FORKNUM, true);
1001
1002 /*
1003 * Before we perform the truncation, update minimum recovery point to
1004 * cover this WAL record. Once the relation is truncated, there's no
1005 * going back. The buffer manager enforces the WAL-first rule for
1006 * normal updates to relation files, so that the minimum recovery
1007 * point is always updated before the corresponding change in the data
1008 * file is flushed to disk. We have to do the same manually here.
1009 *
1010 * Doing this before the truncation means that if the truncation fails
1011 * for some reason, you cannot start up the system even after restart,
1012 * until you fix the underlying situation so that the truncation will
1013 * succeed. Alternatively, we could update the minimum recovery point
1014 * after truncation, but that would leave a small window where the
1015 * WAL-first rule could be violated.
1016 */
1017 XLogFlush(lsn);
1018
1019 /* Prepare for truncation of MAIN fork */
1020 if ((xlrec->flags & SMGR_TRUNCATE_HEAP) != 0)
1021 {
1022 forks[nforks] = MAIN_FORKNUM;
1023 old_blocks[nforks] = smgrnblocks(reln, MAIN_FORKNUM);
1024 blocks[nforks] = xlrec->blkno;
1025 nforks++;
1026
1027 /* Also tell xlogutils.c about it */
1029 }
1030
1031 /* Prepare for truncation of FSM and VM too */
1032 rel = CreateFakeRelcacheEntry(xlrec->rlocator);
1033
1034 if ((xlrec->flags & SMGR_TRUNCATE_FSM) != 0 &&
1035 smgrexists(reln, FSM_FORKNUM))
1036 {
1037 blocks[nforks] = FreeSpaceMapPrepareTruncateRel(rel, xlrec->blkno);
1038 if (BlockNumberIsValid(blocks[nforks]))
1039 {
1040 forks[nforks] = FSM_FORKNUM;
1041 old_blocks[nforks] = smgrnblocks(reln, FSM_FORKNUM);
1042 nforks++;
1043 need_fsm_vacuum = true;
1044 }
1045 }
1046 if ((xlrec->flags & SMGR_TRUNCATE_VM) != 0 &&
1048 {
1049 blocks[nforks] = visibilitymap_prepare_truncate(rel, xlrec->blkno);
1050 if (BlockNumberIsValid(blocks[nforks]))
1051 {
1052 forks[nforks] = VISIBILITYMAP_FORKNUM;
1053 old_blocks[nforks] = smgrnblocks(reln, VISIBILITYMAP_FORKNUM);
1054 nforks++;
1055 }
1056 }
1057
1058 /* Do the real work to truncate relation forks */
1059 if (nforks > 0)
1060 {
1062 smgrtruncate(reln, forks, nforks, old_blocks, blocks);
1064 }
1065
1066 /*
1067 * Update upper-level FSM pages to account for the truncation. This is
1068 * important because the just-truncated pages were likely marked as
1069 * all-free, and would be preferentially selected.
1070 */
1071 if (need_fsm_vacuum)
1072 FreeSpaceMapVacuumRange(rel, xlrec->blkno,
1074
1076 }
1077 else
1078 elog(PANIC, "smgr_redo: unknown op code %u", info);
1079}
uint32 BlockNumber
Definition: block.h:31
#define InvalidBlockNumber
Definition: block.h:33
static bool BlockNumberIsValid(BlockNumber blockNumber)
Definition: block.h:71
static int32 next
Definition: blutils.c:219
bool PageIsVerifiedExtended(Page page, BlockNumber blkno, int flags)
Definition: bufpage.c:88
Pointer Page
Definition: bufpage.h:81
#define PIV_LOG_WARNING
Definition: bufpage.h:468
#define PIV_REPORT_STAT
Definition: bufpage.h:469
void smgr_bulk_write(BulkWriteState *bulkstate, BlockNumber blocknum, BulkWriteBuffer buf, bool page_std)
Definition: bulk_write.c:323
BulkWriteBuffer smgr_bulk_get_buf(BulkWriteState *bulkstate)
Definition: bulk_write.c:347
BulkWriteState * smgr_bulk_start_smgr(SMgrRelation smgr, ForkNumber forknum, bool use_wal)
Definition: bulk_write.c:100
void smgr_bulk_finish(BulkWriteState *bulkstate)
Definition: bulk_write.c:130
uint8_t uint8
Definition: c.h:483
#define Assert(condition)
Definition: c.h:812
#define MemSet(start, val, len)
Definition: c.h:974
size_t Size
Definition: c.h:559
void * hash_search(HTAB *hashp, const void *keyPtr, HASHACTION action, bool *foundPtr)
Definition: dynahash.c:955
void hash_destroy(HTAB *hashp)
Definition: dynahash.c:865
void * hash_seq_search(HASH_SEQ_STATUS *status)
Definition: dynahash.c:1420
long hash_get_num_entries(HTAB *hashp)
Definition: dynahash.c:1341
HTAB * hash_create(const char *tabname, long nelem, const HASHCTL *info, int flags)
Definition: dynahash.c:352
void hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
Definition: dynahash.c:1385
int errcode(int sqlerrcode)
Definition: elog.c:853
int errmsg(const char *fmt,...)
Definition: elog.c:1070
#define PANIC
Definition: elog.h:42
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:225
#define ereport(elevel,...)
Definition: elog.h:149
void FreeSpaceMapVacuumRange(Relation rel, BlockNumber start, BlockNumber end)
Definition: freespace.c:377
BlockNumber FreeSpaceMapPrepareTruncateRel(Relation rel, BlockNumber nblocks)
Definition: freespace.c:275
@ HASH_FIND
Definition: hsearch.h:113
@ HASH_REMOVE
Definition: hsearch.h:115
@ HASH_ENTER
Definition: hsearch.h:114
#define HASH_CONTEXT
Definition: hsearch.h:102
#define HASH_ELEM
Definition: hsearch.h:95
#define HASH_BLOBS
Definition: hsearch.h:97
int i
Definition: isn.c:72
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition: mcxt.c:1181
MemoryContext TopTransactionContext
Definition: mcxt.c:154
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1541
void pfree(void *pointer)
Definition: mcxt.c:1521
MemoryContext TopMemoryContext
Definition: mcxt.c:149
void * palloc(Size size)
Definition: mcxt.c:1317
MemoryContext CurrentMemoryContext
Definition: mcxt.c:143
#define START_CRIT_SECTION()
Definition: miscadmin.h:149
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:122
#define END_CRIT_SECTION()
Definition: miscadmin.h:151
#define ERRCODE_DATA_CORRUPTED
Definition: pg_basebackup.c:41
static char * buf
Definition: pg_test_fsync.c:72
#define DELAY_CHKPT_START
Definition: proc.h:119
#define ProcNumberForTempRelations()
Definition: proc.h:324
#define DELAY_CHKPT_COMPLETE
Definition: proc.h:120
#define INVALID_PROC_NUMBER
Definition: procnumber.h:26
int ProcNumber
Definition: procnumber.h:24
tree ctl
Definition: radixtree.h:1855
static SMgrRelation RelationGetSmgr(Relation rel)
Definition: rel.h:567
#define RelationNeedsWAL(relation)
Definition: rel.h:628
static void RelationCloseSmgr(Relation relation)
Definition: rel.h:582
#define AssertPendingSyncs_RelationCache()
Definition: relcache.h:135
struct RelFileLocator RelFileLocator
#define RelFileLocatorEquals(locator1, locator2)
ForkNumber
Definition: relpath.h:56
@ FSM_FORKNUM
Definition: relpath.h:59
@ VISIBILITYMAP_FORKNUM
Definition: relpath.h:60
@ MAIN_FORKNUM
Definition: relpath.h:58
@ INIT_FORKNUM
Definition: relpath.h:61
#define MAX_FORKNUM
Definition: relpath.h:70
#define relpath(rlocator, forknum)
Definition: relpath.h:102
#define relpathbackend(rlocator, backend, forknum)
Definition: relpath.h:93
Size mul_size(Size s1, Size s2)
Definition: shmem.c:505
BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:677
SMgrRelation smgropen(RelFileLocator rlocator, ProcNumber backend)
Definition: smgr.c:201
void smgrdosyncall(SMgrRelation *rels, int nrels)
Definition: smgr.c:429
void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
Definition: smgr.c:414
void smgrclose(SMgrRelation reln)
Definition: smgr.c:323
void smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber *old_nblocks, BlockNumber *nblocks)
Definition: smgr.c:729
bool smgrexists(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:401
void smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo)
Definition: smgr.c:465
static void smgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void *buffer)
Definition: smgr.h:117
PGPROC * MyProc
Definition: proc.c:66
void RelationPreserveStorage(RelFileLocator rlocator, bool atCommit)
Definition: storage.c:251
void RelationPreTruncate(Relation rel)
Definition: storage.c:449
struct PendingRelDelete PendingRelDelete
void SerializePendingSyncs(Size maxSize, char *startAddress)
Definition: storage.c:584
void AtSubCommit_smgr(void)
Definition: storage.c:939
int wal_skip_threshold
Definition: storage.c:39
Size EstimatePendingSyncsSpace(void)
Definition: storage.c:571
static HTAB * pendingSyncHash
Definition: storage.c:77
struct PendingRelSync PendingRelSync
static PendingRelDelete * pendingDeletes
Definition: storage.c:76
void AtSubAbort_smgr(void)
Definition: storage.c:959
void RelationCopyStorage(SMgrRelation src, SMgrRelation dst, ForkNumber forkNum, char relpersistence)
Definition: storage.c:477
void smgr_redo(XLogReaderState *record)
Definition: storage.c:965
bool RelFileLocatorSkippingWAL(RelFileLocator rlocator)
Definition: storage.c:557
int smgrGetPendingDeletes(bool forCommit, RelFileLocator **ptr)
Definition: storage.c:877
void PostPrepare_smgr(void)
Definition: storage.c:918
SMgrRelation RelationCreateStorage(RelFileLocator rlocator, char relpersistence, bool register_delete)
Definition: storage.c:121
static void AddPendingSync(const RelFileLocator *rlocator)
Definition: storage.c:85
void RestorePendingSyncs(char *startAddress)
Definition: storage.c:635
void log_smgrcreate(const RelFileLocator *rlocator, ForkNumber forkNum)
Definition: storage.c:186
void RelationDropStorage(Relation rel)
Definition: storage.c:206
void RelationTruncate(Relation rel, BlockNumber nblocks)
Definition: storage.c:288
void smgrDoPendingSyncs(bool isCommit, bool isParallelWorker)
Definition: storage.c:725
void smgrDoPendingDeletes(bool isCommit)
Definition: storage.c:657
#define SMGR_TRUNCATE_ALL
Definition: storage_xlog.h:43
#define SMGR_TRUNCATE_VM
Definition: storage_xlog.h:41
#define XLOG_SMGR_CREATE
Definition: storage_xlog.h:30
#define XLOG_SMGR_TRUNCATE
Definition: storage_xlog.h:31
#define SMGR_TRUNCATE_HEAP
Definition: storage_xlog.h:40
#define SMGR_TRUNCATE_FSM
Definition: storage_xlog.h:42
HTAB
Definition: dynahash.c:220
int delayChkptFlags
Definition: proc.h:240
ProcNumber procNumber
Definition: storage.c:64
struct PendingRelDelete * next
Definition: storage.c:67
RelFileLocator rlocator
Definition: storage.c:63
RelFileLocator rlocator
Definition: storage.c:72
bool is_truncated
Definition: storage.c:73
RelFileLocator locator
RelFileNumber relNumber
ProcNumber rd_backend
Definition: rel.h:60
RelFileLocator rd_locator
Definition: rel.h:57
BlockNumber smgr_targblock
Definition: smgr.h:45
BlockNumber smgr_cached_nblocks[MAX_FORKNUM+1]
Definition: smgr.h:46
RelFileLocatorBackend smgr_rlocator
Definition: smgr.h:37
XLogRecPtr EndRecPtr
Definition: xlogreader.h:207
ForkNumber forkNum
Definition: storage_xlog.h:36
RelFileLocator rlocator
Definition: storage_xlog.h:35
RelFileLocator rlocator
Definition: storage_xlog.h:49
BlockNumber blkno
Definition: storage_xlog.h:48
BlockNumber visibilitymap_prepare_truncate(Relation rel, BlockNumber nheapblocks)
int GetCurrentTransactionNestLevel(void)
Definition: xact.c:928
bool IsInParallelMode(void)
Definition: xact.c:1088
void XLogFlush(XLogRecPtr record)
Definition: xlog.c:2802
#define XLogIsNeeded()
Definition: xlog.h:109
uint64 XLogRecPtr
Definition: xlogdefs.h:21
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition: xloginsert.c:474
void XLogRegisterData(const char *data, uint32 len)
Definition: xloginsert.c:364
void log_newpage_range(Relation rel, ForkNumber forknum, BlockNumber startblk, BlockNumber endblk, bool page_std)
Definition: xloginsert.c:1270
void XLogBeginInsert(void)
Definition: xloginsert.c:149
#define XLogRecGetInfo(decoder)
Definition: xlogreader.h:410
#define XLogRecGetData(decoder)
Definition: xlogreader.h:415
#define XLogRecHasAnyBlockRefs(decoder)
Definition: xlogreader.h:417
#define XLR_SPECIAL_REL_UPDATE
Definition: xlogrecord.h:82
void FreeFakeRelcacheEntry(Relation fakerel)
Definition: xlogutils.c:629
void XLogTruncateRelation(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber nblocks)
Definition: xlogutils.c:671
Relation CreateFakeRelcacheEntry(RelFileLocator rlocator)
Definition: xlogutils.c:582