PostgreSQL Source Code git master
All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
smgr.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * smgr.c
4 * public interface routines to storage manager switch.
5 *
6 * All file system operations on relations dispatch through these routines.
7 * An SMgrRelation represents physical on-disk relation files that are open
8 * for reading and writing.
9 *
10 * When a relation is first accessed through the relation cache, the
11 * corresponding SMgrRelation entry is opened by calling smgropen(), and the
12 * reference is stored in the relation cache entry.
13 *
14 * Accesses that don't go through the relation cache open the SMgrRelation
15 * directly. That includes flushing buffers from the buffer cache, as well as
16 * all accesses in auxiliary processes like the checkpointer or the WAL redo
17 * in the startup process.
18 *
19 * Operations like CREATE, DROP, ALTER TABLE also hold SMgrRelation references
20 * independent of the relation cache. They need to prepare the physical files
21 * before updating the relation cache.
22 *
23 * There is a hash table that holds all the SMgrRelation entries in the
24 * backend. If you call smgropen() twice for the same rel locator, you get a
25 * reference to the same SMgrRelation. The reference is valid until the end of
26 * transaction. This makes repeated access to the same relation efficient,
27 * and allows caching things like the relation size in the SMgrRelation entry.
28 *
29 * At end of transaction, all SMgrRelation entries that haven't been pinned
30 * are removed. An SMgrRelation can hold kernel file system descriptors for
31 * the underlying files, and we'd like to close those reasonably soon if the
32 * file gets deleted. The SMgrRelation references held by the relcache are
33 * pinned to prevent them from being closed.
34 *
35 * There is another mechanism to close file descriptors early:
36 * PROCSIGNAL_BARRIER_SMGRRELEASE. It is a request to immediately close all
37 * file descriptors. Upon receiving that signal, the backend closes all file
38 * descriptors held open by SMgrRelations, but because it can happen in the
39 * middle of a transaction, we cannot destroy the SMgrRelation objects
40 * themselves, as there could be pointers to them in active use. See
41 * smgrrelease() and smgrreleaseall().
42 *
43 * NB: We need to hold interrupts across most of the functions in this file,
44 * as otherwise interrupt processing, e.g. due to a < ERROR elog/ereport, can
45 * trigger procsignal processing, which in turn can trigger
46 * smgrreleaseall(). Most of the relevant code is not reentrant. It seems
47 * better to put the HOLD_INTERRUPTS()/RESUME_INTERRUPTS() here, instead of
48 * trying to push them down to md.c where possible: For one, every smgr
49 * implementation would be vulnerable, for another, a good bit of smgr.c code
50 * itself is affected too. Eventually we might want a more targeted solution,
51 * allowing e.g. a networked smgr implementation to be interrupted, but many
52 * other, more complicated, problems would need to be fixed for that to be
53 * viable (e.g. smgr.c is often called with interrupts already held).
54 *
55 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
56 * Portions Copyright (c) 1994, Regents of the University of California
57 *
58 *
59 * IDENTIFICATION
60 * src/backend/storage/smgr/smgr.c
61 *
62 *-------------------------------------------------------------------------
63 */
64#include "postgres.h"
65
66#include "access/xlogutils.h"
67#include "lib/ilist.h"
68#include "miscadmin.h"
69#include "storage/aio.h"
70#include "storage/bufmgr.h"
71#include "storage/ipc.h"
72#include "storage/md.h"
73#include "storage/smgr.h"
74#include "utils/hsearch.h"
75#include "utils/inval.h"
76
77
78/*
79 * This struct of function pointers defines the API between smgr.c and
80 * any individual storage manager module. Note that smgr subfunctions are
81 * generally expected to report problems via elog(ERROR). An exception is
82 * that smgr_unlink should use elog(WARNING), rather than erroring out,
83 * because we normally unlink relations during post-commit/abort cleanup,
84 * and so it's too late to raise an error. Also, various conditions that
85 * would normally be errors should be allowed during bootstrap and/or WAL
86 * recovery --- see comments in md.c for details.
87 */
88typedef struct f_smgr
89{
90 void (*smgr_init) (void); /* may be NULL */
91 void (*smgr_shutdown) (void); /* may be NULL */
92 void (*smgr_open) (SMgrRelation reln);
93 void (*smgr_close) (SMgrRelation reln, ForkNumber forknum);
94 void (*smgr_create) (SMgrRelation reln, ForkNumber forknum,
95 bool isRedo);
96 bool (*smgr_exists) (SMgrRelation reln, ForkNumber forknum);
97 void (*smgr_unlink) (RelFileLocatorBackend rlocator, ForkNumber forknum,
98 bool isRedo);
99 void (*smgr_extend) (SMgrRelation reln, ForkNumber forknum,
100 BlockNumber blocknum, const void *buffer, bool skipFsync);
102 BlockNumber blocknum, int nblocks, bool skipFsync);
103 bool (*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum,
104 BlockNumber blocknum, int nblocks);
106 BlockNumber blocknum);
107 void (*smgr_readv) (SMgrRelation reln, ForkNumber forknum,
108 BlockNumber blocknum,
109 void **buffers, BlockNumber nblocks);
111 SMgrRelation reln, ForkNumber forknum,
112 BlockNumber blocknum,
113 void **buffers, BlockNumber nblocks);
114 void (*smgr_writev) (SMgrRelation reln, ForkNumber forknum,
115 BlockNumber blocknum,
116 const void **buffers, BlockNumber nblocks,
117 bool skipFsync);
118 void (*smgr_writeback) (SMgrRelation reln, ForkNumber forknum,
119 BlockNumber blocknum, BlockNumber nblocks);
121 void (*smgr_truncate) (SMgrRelation reln, ForkNumber forknum,
122 BlockNumber old_blocks, BlockNumber nblocks);
123 void (*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum);
125 int (*smgr_fd) (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, uint32 *off);
127
128static const f_smgr smgrsw[] = {
129 /* magnetic disk */
130 {
131 .smgr_init = mdinit,
132 .smgr_shutdown = NULL,
133 .smgr_open = mdopen,
134 .smgr_close = mdclose,
135 .smgr_create = mdcreate,
136 .smgr_exists = mdexists,
137 .smgr_unlink = mdunlink,
138 .smgr_extend = mdextend,
139 .smgr_zeroextend = mdzeroextend,
140 .smgr_prefetch = mdprefetch,
141 .smgr_maxcombine = mdmaxcombine,
142 .smgr_readv = mdreadv,
143 .smgr_startreadv = mdstartreadv,
144 .smgr_writev = mdwritev,
145 .smgr_writeback = mdwriteback,
146 .smgr_nblocks = mdnblocks,
147 .smgr_truncate = mdtruncate,
148 .smgr_immedsync = mdimmedsync,
149 .smgr_registersync = mdregistersync,
150 .smgr_fd = mdfd,
151 }
152};
153
154static const int NSmgr = lengthof(smgrsw);
155
156/*
157 * Each backend has a hashtable that stores all extant SMgrRelation objects.
158 * In addition, "unpinned" SMgrRelation objects are chained together in a list.
159 */
160static HTAB *SMgrRelationHash = NULL;
161
163
164/* local function prototypes */
165static void smgrshutdown(int code, Datum arg);
166static void smgrdestroy(SMgrRelation reln);
167
168static void smgr_aio_reopen(PgAioHandle *ioh);
169static char *smgr_aio_describe_identity(const PgAioTargetData *sd);
170
171
173 .name = "smgr",
174 .reopen = smgr_aio_reopen,
175 .describe_identity = smgr_aio_describe_identity,
176};
177
178
179/*
180 * smgrinit(), smgrshutdown() -- Initialize or shut down storage
181 * managers.
182 *
183 * Note: smgrinit is called during backend startup (normal or standalone
184 * case), *not* during postmaster start. Therefore, any resources created
185 * here or destroyed in smgrshutdown are backend-local.
186 */
187void
189{
190 int i;
191
193
194 for (i = 0; i < NSmgr; i++)
195 {
196 if (smgrsw[i].smgr_init)
197 smgrsw[i].smgr_init();
198 }
199
201
202 /* register the shutdown proc */
204}
205
206/*
207 * on_proc_exit hook for smgr cleanup during backend shutdown
208 */
209static void
211{
212 int i;
213
215
216 for (i = 0; i < NSmgr; i++)
217 {
218 if (smgrsw[i].smgr_shutdown)
220 }
221
223}
224
225/*
226 * smgropen() -- Return an SMgrRelation object, creating it if need be.
227 *
228 * In versions of PostgreSQL prior to 17, this function returned an object
229 * with no defined lifetime. Now, however, the object remains valid for the
230 * lifetime of the transaction, up to the point where AtEOXact_SMgr() is
231 * called, making it much easier for callers to know for how long they can
232 * hold on to a pointer to the returned object. If this function is called
233 * outside of a transaction, the object remains valid until smgrdestroy() or
234 * smgrdestroyall() is called. Background processes that use smgr but not
235 * transactions typically do this once per checkpoint cycle.
236 *
237 * This does not attempt to actually open the underlying files.
238 */
241{
242 RelFileLocatorBackend brlocator;
243 SMgrRelation reln;
244 bool found;
245
247
249
250 if (SMgrRelationHash == NULL)
251 {
252 /* First time through: initialize the hash table */
253 HASHCTL ctl;
254
255 ctl.keysize = sizeof(RelFileLocatorBackend);
256 ctl.entrysize = sizeof(SMgrRelationData);
257 SMgrRelationHash = hash_create("smgr relation table", 400,
260 }
261
262 /* Look up or create an entry */
263 brlocator.locator = rlocator;
264 brlocator.backend = backend;
266 &brlocator,
267 HASH_ENTER, &found);
268
269 /* Initialize it if not present before */
270 if (!found)
271 {
272 /* hash_search already filled in the lookup key */
274 for (int i = 0; i <= MAX_FORKNUM; ++i)
276 reln->smgr_which = 0; /* we only have md.c at present */
277
278 /* it is not pinned yet */
279 reln->pincount = 0;
281
282 /* implementation-specific initialization */
283 smgrsw[reln->smgr_which].smgr_open(reln);
284 }
285
287
288 return reln;
289}
290
291/*
292 * smgrpin() -- Prevent an SMgrRelation object from being destroyed at end of
293 * transaction
294 */
295void
297{
298 if (reln->pincount == 0)
299 dlist_delete(&reln->node);
300 reln->pincount++;
301}
302
303/*
304 * smgrunpin() -- Allow an SMgrRelation object to be destroyed at end of
305 * transaction
306 *
307 * The object remains valid, but if there are no other pins on it, it is moved
308 * to the unpinned list where it will be destroyed by AtEOXact_SMgr().
309 */
310void
312{
313 Assert(reln->pincount > 0);
314 reln->pincount--;
315 if (reln->pincount == 0)
317}
318
319/*
320 * smgrdestroy() -- Delete an SMgrRelation object.
321 */
322static void
324{
325 ForkNumber forknum;
326
327 Assert(reln->pincount == 0);
328
330
331 for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
332 smgrsw[reln->smgr_which].smgr_close(reln, forknum);
333
334 dlist_delete(&reln->node);
335
337 &(reln->smgr_rlocator),
338 HASH_REMOVE, NULL) == NULL)
339 elog(ERROR, "SMgrRelation hashtable corrupted");
340
342}
343
344/*
345 * smgrrelease() -- Release all resources used by this object.
346 *
347 * The object remains valid.
348 */
349void
351{
353
354 for (ForkNumber forknum = 0; forknum <= MAX_FORKNUM; forknum++)
355 {
356 smgrsw[reln->smgr_which].smgr_close(reln, forknum);
358 }
360
362}
363
364/*
365 * smgrclose() -- Close an SMgrRelation object.
366 *
367 * The SMgrRelation reference should not be used after this call. However,
368 * because we don't keep track of the references returned by smgropen(), we
369 * don't know if there are other references still pointing to the same object,
370 * so we cannot remove the SMgrRelation object yet. Therefore, this is just a
371 * synonym for smgrrelease() at the moment.
372 */
373void
375{
376 smgrrelease(reln);
377}
378
379/*
380 * smgrdestroyall() -- Release resources used by all unpinned objects.
381 *
382 * It must be known that there are no pointers to SMgrRelations, other than
383 * those pinned with smgrpin().
384 */
385void
387{
389
390 /* seems unsafe to accept interrupts while in a dlist_foreach_modify() */
392
393 /*
394 * Zap all unpinned SMgrRelations. We rely on smgrdestroy() to remove
395 * each one from the list.
396 */
398 {
400 iter.cur);
401
402 smgrdestroy(rel);
403 }
404
406}
407
408/*
409 * smgrreleaseall() -- Release resources used by all objects.
410 */
411void
413{
414 HASH_SEQ_STATUS status;
415 SMgrRelation reln;
416
417 /* Nothing to do if hashtable not set up */
418 if (SMgrRelationHash == NULL)
419 return;
420
421 /* seems unsafe to accept interrupts while iterating */
423
425
426 while ((reln = (SMgrRelation) hash_seq_search(&status)) != NULL)
427 {
428 smgrrelease(reln);
429 }
430
432}
433
434/*
435 * smgrreleaserellocator() -- Release resources for given RelFileLocator, if
436 * it's open.
437 *
438 * This has the same effects as smgrrelease(smgropen(rlocator)), but avoids
439 * uselessly creating a hashtable entry only to drop it again when no
440 * such entry exists already.
441 */
442void
444{
445 SMgrRelation reln;
446
447 /* Nothing to do if hashtable not set up */
448 if (SMgrRelationHash == NULL)
449 return;
450
452 &rlocator,
453 HASH_FIND, NULL);
454 if (reln != NULL)
455 smgrrelease(reln);
456}
457
458/*
459 * smgrexists() -- Does the underlying file for a fork exist?
460 */
461bool
463{
464 bool ret;
465
467 ret = smgrsw[reln->smgr_which].smgr_exists(reln, forknum);
469
470 return ret;
471}
472
473/*
474 * smgrcreate() -- Create a new relation.
475 *
476 * Given an already-created (but presumably unused) SMgrRelation,
477 * cause the underlying disk file or other storage for the fork
478 * to be created.
479 */
480void
481smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
482{
484 smgrsw[reln->smgr_which].smgr_create(reln, forknum, isRedo);
486}
487
488/*
489 * smgrdosyncall() -- Immediately sync all forks of all given relations
490 *
491 * All forks of all given relations are synced out to the store.
492 *
493 * This is equivalent to FlushRelationBuffers() for each smgr relation,
494 * then calling smgrimmedsync() for all forks of each relation, but it's
495 * significantly quicker so should be preferred when possible.
496 */
497void
499{
500 int i = 0;
501 ForkNumber forknum;
502
503 if (nrels == 0)
504 return;
505
506 FlushRelationsAllBuffers(rels, nrels);
507
509
510 /*
511 * Sync the physical file(s).
512 */
513 for (i = 0; i < nrels; i++)
514 {
515 int which = rels[i]->smgr_which;
516
517 for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
518 {
519 if (smgrsw[which].smgr_exists(rels[i], forknum))
520 smgrsw[which].smgr_immedsync(rels[i], forknum);
521 }
522 }
523
525}
526
527/*
528 * smgrdounlinkall() -- Immediately unlink all forks of all given relations
529 *
530 * All forks of all given relations are removed from the store. This
531 * should not be used during transactional operations, since it can't be
532 * undone.
533 *
534 * If isRedo is true, it is okay for the underlying file(s) to be gone
535 * already.
536 */
537void
538smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo)
539{
540 int i = 0;
541 RelFileLocatorBackend *rlocators;
542 ForkNumber forknum;
543
544 if (nrels == 0)
545 return;
546
547 /*
548 * It would be unsafe to process interrupts between DropRelationBuffers()
549 * and unlinking the underlying files. This probably should be a critical
550 * section, but we're not there yet.
551 */
553
554 /*
555 * Get rid of any remaining buffers for the relations. bufmgr will just
556 * drop them without bothering to write the contents.
557 */
558 DropRelationsAllBuffers(rels, nrels);
559
560 /*
561 * create an array which contains all relations to be dropped, and close
562 * each relation's forks at the smgr level while at it
563 */
564 rlocators = palloc(sizeof(RelFileLocatorBackend) * nrels);
565 for (i = 0; i < nrels; i++)
566 {
567 RelFileLocatorBackend rlocator = rels[i]->smgr_rlocator;
568 int which = rels[i]->smgr_which;
569
570 rlocators[i] = rlocator;
571
572 /* Close the forks at smgr level */
573 for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
574 smgrsw[which].smgr_close(rels[i], forknum);
575 }
576
577 /*
578 * Send a shared-inval message to force other backends to close any
579 * dangling smgr references they may have for these rels. We should do
580 * this before starting the actual unlinking, in case we fail partway
581 * through that step. Note that the sinval messages will eventually come
582 * back to this backend, too, and thereby provide a backstop that we
583 * closed our own smgr rel.
584 */
585 for (i = 0; i < nrels; i++)
586 CacheInvalidateSmgr(rlocators[i]);
587
588 /*
589 * Delete the physical file(s).
590 *
591 * Note: smgr_unlink must treat deletion failure as a WARNING, not an
592 * ERROR, because we've already decided to commit or abort the current
593 * xact.
594 */
595
596 for (i = 0; i < nrels; i++)
597 {
598 int which = rels[i]->smgr_which;
599
600 for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
601 smgrsw[which].smgr_unlink(rlocators[i], forknum, isRedo);
602 }
603
604 pfree(rlocators);
605
607}
608
609
610/*
611 * smgrextend() -- Add a new block to a file.
612 *
613 * The semantics are nearly the same as smgrwrite(): write at the
614 * specified position. However, this is to be used for the case of
615 * extending a relation (i.e., blocknum is at or beyond the current
616 * EOF). Note that we assume writing a block beyond current EOF
617 * causes intervening file space to become filled with zeroes.
618 */
619void
621 const void *buffer, bool skipFsync)
622{
624
625 smgrsw[reln->smgr_which].smgr_extend(reln, forknum, blocknum,
626 buffer, skipFsync);
627
628 /*
629 * Normally we expect this to increase nblocks by one, but if the cached
630 * value isn't as expected, just invalidate it so the next call asks the
631 * kernel.
632 */
633 if (reln->smgr_cached_nblocks[forknum] == blocknum)
634 reln->smgr_cached_nblocks[forknum] = blocknum + 1;
635 else
637
639}
640
641/*
642 * smgrzeroextend() -- Add new zeroed out blocks to a file.
643 *
644 * Similar to smgrextend(), except the relation can be extended by
645 * multiple blocks at once and the added blocks will be filled with
646 * zeroes.
647 */
648void
650 int nblocks, bool skipFsync)
651{
653
654 smgrsw[reln->smgr_which].smgr_zeroextend(reln, forknum, blocknum,
655 nblocks, skipFsync);
656
657 /*
658 * Normally we expect this to increase the fork size by nblocks, but if
659 * the cached value isn't as expected, just invalidate it so the next call
660 * asks the kernel.
661 */
662 if (reln->smgr_cached_nblocks[forknum] == blocknum)
663 reln->smgr_cached_nblocks[forknum] = blocknum + nblocks;
664 else
666
668}
669
670/*
671 * smgrprefetch() -- Initiate asynchronous read of the specified block of a relation.
672 *
673 * In recovery only, this can return false to indicate that a file
674 * doesn't exist (presumably it has been dropped by a later WAL
675 * record).
676 */
677bool
679 int nblocks)
680{
681 bool ret;
682
684 ret = smgrsw[reln->smgr_which].smgr_prefetch(reln, forknum, blocknum, nblocks);
686
687 return ret;
688}
689
690/*
691 * smgrmaxcombine() - Return the maximum number of total blocks that can be
692 * combined with an IO starting at blocknum.
693 *
694 * The returned value includes the IO for blocknum itself.
695 */
696uint32
698 BlockNumber blocknum)
699{
700 uint32 ret;
701
703 ret = smgrsw[reln->smgr_which].smgr_maxcombine(reln, forknum, blocknum);
705
706 return ret;
707}
708
709/*
710 * smgrreadv() -- read a particular block range from a relation into the
711 * supplied buffers.
712 *
713 * This routine is called from the buffer manager in order to
714 * instantiate pages in the shared buffer cache. All storage managers
715 * return pages in the format that POSTGRES expects.
716 *
717 * If more than one block is intended to be read, callers need to use
718 * smgrmaxcombine() to check how many blocks can be combined into one IO.
719 */
720void
722 void **buffers, BlockNumber nblocks)
723{
725 smgrsw[reln->smgr_which].smgr_readv(reln, forknum, blocknum, buffers,
726 nblocks);
728}
729
730/*
731 * smgrstartreadv() -- asynchronous version of smgrreadv()
732 *
733 * This starts an asynchronous readv IO using the IO handle `ioh`. Other than
734 * `ioh` all parameters are the same as smgrreadv().
735 *
736 * Completion callbacks above smgr will be passed the result as the number of
737 * successfully read blocks if the read [partially] succeeds (Buffers for
738 * blocks not successfully read might bear unspecified modifications, up to
739 * the full nblocks). This maintains the abstraction that smgr operates on the
740 * level of blocks, rather than bytes.
741 *
742 * Compared to smgrreadv(), more responsibilities fall on the caller:
743 * - Partial reads need to be handled by the caller re-issuing IO for the
744 * unread blocks
745 * - smgr will ereport(LOG_SERVER_ONLY) some problems, but higher layers are
746 * responsible for pgaio_result_report() to mirror that news to the user (if
747 * the IO results in PGAIO_RS_WARNING) or abort the (sub)transaction (if
748 * PGAIO_RS_ERROR).
749 * - Under Valgrind, the "buffers" memory may or may not change status to
750 * DEFINED, depending on io_method and concurrent activity.
751 */
752void
754 SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
755 void **buffers, BlockNumber nblocks)
756{
759 reln, forknum, blocknum, buffers,
760 nblocks);
762}
763
764/*
765 * smgrwritev() -- Write the supplied buffers out.
766 *
767 * This is to be used only for updating already-existing blocks of a
768 * relation (ie, those before the current EOF). To extend a relation,
769 * use smgrextend().
770 *
771 * This is not a synchronous write -- the block is not necessarily
772 * on disk at return, only dumped out to the kernel. However,
773 * provisions will be made to fsync the write before the next checkpoint.
774 *
775 * NB: The mechanism to ensure fsync at next checkpoint assumes that there is
776 * something that prevents a concurrent checkpoint from "racing ahead" of the
777 * write. One way to prevent that is by holding a lock on the buffer; the
778 * buffer manager's writes are protected by that. The bulk writer facility
779 * in bulk_write.c checks the redo pointer and calls smgrimmedsync() if a
780 * checkpoint happened; that relies on the fact that no other backend can be
781 * concurrently modifying the page.
782 *
783 * skipFsync indicates that the caller will make other provisions to
784 * fsync the relation, so we needn't bother. Temporary relations also
785 * do not require fsync.
786 *
787 * If more than one block is intended to be written, callers need to use
788 * smgrmaxcombine() to check how many blocks can be combined into one IO.
789 */
790void
792 const void **buffers, BlockNumber nblocks, bool skipFsync)
793{
795 smgrsw[reln->smgr_which].smgr_writev(reln, forknum, blocknum,
796 buffers, nblocks, skipFsync);
798}
799
800/*
801 * smgrwriteback() -- Trigger kernel writeback for the supplied range of
802 * blocks.
803 */
804void
806 BlockNumber nblocks)
807{
809 smgrsw[reln->smgr_which].smgr_writeback(reln, forknum, blocknum,
810 nblocks);
812}
813
814/*
815 * smgrnblocks() -- Calculate the number of blocks in the
816 * supplied relation.
817 */
820{
821 BlockNumber result;
822
823 /* Check and return if we get the cached value for the number of blocks. */
824 result = smgrnblocks_cached(reln, forknum);
825 if (result != InvalidBlockNumber)
826 return result;
827
829
830 result = smgrsw[reln->smgr_which].smgr_nblocks(reln, forknum);
831
832 reln->smgr_cached_nblocks[forknum] = result;
833
835
836 return result;
837}
838
839/*
840 * smgrnblocks_cached() -- Get the cached number of blocks in the supplied
841 * relation.
842 *
843 * Returns InvalidBlockNumber when not in recovery or when the relation
844 * fork size is not cached.
845 */
848{
849 /*
850 * For now, this function uses cached values only in recovery due to lack
851 * of a shared invalidation mechanism for changes in file size. Code
852 * elsewhere reads smgr_cached_nblocks and copes with stale data.
853 */
854 if (InRecovery && reln->smgr_cached_nblocks[forknum] != InvalidBlockNumber)
855 return reln->smgr_cached_nblocks[forknum];
856
857 return InvalidBlockNumber;
858}
859
860/*
861 * smgrtruncate() -- Truncate the given forks of supplied relation to
862 * each specified numbers of blocks
863 *
864 * The truncation is done immediately, so this can't be rolled back.
865 *
866 * The caller must hold AccessExclusiveLock on the relation, to ensure that
867 * other backends receive the smgr invalidation event that this function sends
868 * before they access any forks of the relation again. The current size of
869 * the forks should be provided in old_nblocks. This function should normally
870 * be called in a critical section, but the current size must be checked
871 * outside the critical section, and no interrupts or smgr functions relating
872 * to this relation should be called in between.
873 */
874void
875smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks,
876 BlockNumber *old_nblocks, BlockNumber *nblocks)
877{
878 int i;
879
880 /*
881 * Get rid of any buffers for the about-to-be-deleted blocks. bufmgr will
882 * just drop them without bothering to write the contents.
883 */
884 DropRelationBuffers(reln, forknum, nforks, nblocks);
885
886 /*
887 * Send a shared-inval message to force other backends to close any smgr
888 * references they may have for this rel. This is useful because they
889 * might have open file pointers to segments that got removed, and/or
890 * smgr_targblock variables pointing past the new rel end. (The inval
891 * message will come back to our backend, too, causing a
892 * probably-unnecessary local smgr flush. But we don't expect that this
893 * is a performance-critical path.) As in the unlink code, we want to be
894 * sure the message is sent before we start changing things on-disk.
895 */
897
898 /* Do the truncation */
899 for (i = 0; i < nforks; i++)
900 {
901 /* Make the cached size is invalid if we encounter an error. */
902 reln->smgr_cached_nblocks[forknum[i]] = InvalidBlockNumber;
903
904 smgrsw[reln->smgr_which].smgr_truncate(reln, forknum[i],
905 old_nblocks[i], nblocks[i]);
906
907 /*
908 * We might as well update the local smgr_cached_nblocks values. The
909 * smgr cache inval message that this function sent will cause other
910 * backends to invalidate their copies of smgr_cached_nblocks, and
911 * these ones too at the next command boundary. But ensure they aren't
912 * outright wrong until then.
913 */
914 reln->smgr_cached_nblocks[forknum[i]] = nblocks[i];
915 }
916}
917
918/*
919 * smgrregistersync() -- Request a relation to be sync'd at next checkpoint
920 *
921 * This can be used after calling smgrwrite() or smgrextend() with skipFsync =
922 * true, to register the fsyncs that were skipped earlier.
923 *
924 * Note: be mindful that a checkpoint could already have happened between the
925 * smgrwrite or smgrextend calls and this! In that case, the checkpoint
926 * already missed fsyncing this relation, and you should use smgrimmedsync
927 * instead. Most callers should use the bulk loading facility in bulk_write.c
928 * which handles all that.
929 */
930void
932{
934 smgrsw[reln->smgr_which].smgr_registersync(reln, forknum);
936}
937
938/*
939 * smgrimmedsync() -- Force the specified relation to stable storage.
940 *
941 * Synchronously force all previous writes to the specified relation
942 * down to disk.
943 *
944 * This is useful for building completely new relations (eg, new
945 * indexes). Instead of incrementally WAL-logging the index build
946 * steps, we can just write completed index pages to disk with smgrwrite
947 * or smgrextend, and then fsync the completed index file before
948 * committing the transaction. (This is sufficient for purposes of
949 * crash recovery, since it effectively duplicates forcing a checkpoint
950 * for the completed index. But it is *not* sufficient if one wishes
951 * to use the WAL log for PITR or replication purposes: in that case
952 * we have to make WAL entries as well.)
953 *
954 * The preceding writes should specify skipFsync = true to avoid
955 * duplicative fsyncs.
956 *
957 * Note that you need to do FlushRelationBuffers() first if there is
958 * any possibility that there are dirty buffers for the relation;
959 * otherwise the sync is not very meaningful.
960 *
961 * Most callers should use the bulk loading facility in bulk_write.c
962 * instead of calling this directly.
963 */
964void
966{
968 smgrsw[reln->smgr_which].smgr_immedsync(reln, forknum);
970}
971
972/*
973 * Return fd for the specified block number and update *off to the appropriate
974 * position.
975 *
976 * This is only to be used for when AIO needs to perform the IO in a different
977 * process than where it was issued (e.g. in an IO worker).
978 */
979static int
980smgrfd(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, uint32 *off)
981{
982 int fd;
983
984 /*
985 * The caller needs to prevent interrupts from being processed, otherwise
986 * the FD could be closed prematurely.
987 */
989
990 fd = smgrsw[reln->smgr_which].smgr_fd(reln, forknum, blocknum, off);
991
992 return fd;
993}
994
995/*
996 * AtEOXact_SMgr
997 *
998 * This routine is called during transaction commit or abort (it doesn't
999 * particularly care which). All unpinned SMgrRelation objects are destroyed.
1000 *
1001 * We do this as a compromise between wanting transient SMgrRelations to
1002 * live awhile (to amortize the costs of blind writes of multiple blocks)
1003 * and needing them to not live forever (since we're probably holding open
1004 * a kernel file descriptor for the underlying file, and we need to ensure
1005 * that gets closed reasonably soon if the file gets deleted).
1006 */
1007void
1009{
1011}
1012
1013/*
1014 * This routine is called when we are ordered to release all open files by a
1015 * ProcSignalBarrier.
1016 */
1017bool
1019{
1021 return true;
1022}
1023
1024/*
1025 * Set target of the IO handle to be smgr and initialize all the relevant
1026 * pieces of data.
1027 */
1028void
1030 SMgrRelationData *smgr,
1031 ForkNumber forknum,
1032 BlockNumber blocknum,
1033 int nblocks,
1034 bool skip_fsync)
1035{
1037
1039
1040 /* backend is implied via IO owner */
1041 sd->smgr.rlocator = smgr->smgr_rlocator.locator;
1042 sd->smgr.forkNum = forknum;
1043 sd->smgr.blockNum = blocknum;
1044 sd->smgr.nblocks = nblocks;
1045 sd->smgr.is_temp = SmgrIsTemp(smgr);
1046 /* Temp relations should never be fsync'd */
1047 sd->smgr.skip_fsync = skip_fsync && !SmgrIsTemp(smgr);
1048}
1049
1050/*
1051 * Callback for the smgr AIO target, to reopen the file (e.g. because the IO
1052 * is executed in a worker).
1053 */
1054static void
1056{
1059 SMgrRelation reln;
1060 ProcNumber procno;
1061 uint32 off;
1062
1063 /*
1064 * The caller needs to prevent interrupts from being processed, otherwise
1065 * the FD could be closed again before we get to executing the IO.
1066 */
1068
1069 if (sd->smgr.is_temp)
1070 procno = pgaio_io_get_owner(ioh);
1071 else
1072 procno = INVALID_PROC_NUMBER;
1073
1074 reln = smgropen(sd->smgr.rlocator, procno);
1075 switch (pgaio_io_get_op(ioh))
1076 {
1077 case PGAIO_OP_INVALID:
1079 break;
1080 case PGAIO_OP_READV:
1081 od->read.fd = smgrfd(reln, sd->smgr.forkNum, sd->smgr.blockNum, &off);
1082 Assert(off == od->read.offset);
1083 break;
1084 case PGAIO_OP_WRITEV:
1085 od->write.fd = smgrfd(reln, sd->smgr.forkNum, sd->smgr.blockNum, &off);
1086 Assert(off == od->write.offset);
1087 break;
1088 }
1089}
1090
1091/*
1092 * Callback for the smgr AIO target, describing the target of the IO.
1093 */
1094static char *
1096{
1097 RelPathStr path;
1098 char *desc;
1099
1100 path = relpathbackend(sd->smgr.rlocator,
1101 sd->smgr.is_temp ?
1103 sd->smgr.forkNum);
1104
1105 if (sd->smgr.nblocks == 0)
1106 desc = psprintf(_("file \"%s\""), path.str);
1107 else if (sd->smgr.nblocks == 1)
1108 desc = psprintf(_("block %u in file \"%s\""),
1109 sd->smgr.blockNum,
1110 path.str);
1111 else
1112 desc = psprintf(_("blocks %u..%u in file \"%s\""),
1113 sd->smgr.blockNum,
1114 sd->smgr.blockNum + sd->smgr.nblocks - 1,
1115 path.str);
1116
1117 return desc;
1118}
ProcNumber pgaio_io_get_owner(PgAioHandle *ioh)
Definition: aio.c:343
@ PGAIO_TID_SMGR
Definition: aio.h:120
@ PGAIO_OP_WRITEV
Definition: aio.h:93
@ PGAIO_OP_INVALID
Definition: aio.h:90
@ PGAIO_OP_READV
Definition: aio.h:92
PgAioOpData * pgaio_io_get_op_data(PgAioHandle *ioh)
Definition: aio_io.c:58
PgAioOp pgaio_io_get_op(PgAioHandle *ioh)
Definition: aio_io.c:52
PgAioTargetData * pgaio_io_get_target_data(PgAioHandle *ioh)
Definition: aio_target.c:72
void pgaio_io_set_target(PgAioHandle *ioh, PgAioTargetID targetid)
Definition: aio_target.c:63
uint32 BlockNumber
Definition: block.h:31
#define InvalidBlockNumber
Definition: block.h:33
void FlushRelationsAllBuffers(SMgrRelation *smgrs, int nrels)
Definition: bufmgr.c:5033
void DropRelationBuffers(SMgrRelation smgr_reln, ForkNumber *forkNum, int nforks, BlockNumber *firstDelBlock)
Definition: bufmgr.c:4540
void DropRelationsAllBuffers(SMgrRelation *smgr_reln, int nlocators)
Definition: bufmgr.c:4663
#define pg_unreachable()
Definition: c.h:332
uint32_t uint32
Definition: c.h:502
#define lengthof(array)
Definition: c.h:759
void * hash_search(HTAB *hashp, const void *keyPtr, HASHACTION action, bool *foundPtr)
Definition: dynahash.c:955
void * hash_seq_search(HASH_SEQ_STATUS *status)
Definition: dynahash.c:1420
HTAB * hash_create(const char *tabname, long nelem, const HASHCTL *info, int flags)
Definition: dynahash.c:352
void hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
Definition: dynahash.c:1385
#define _(x)
Definition: elog.c:91
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:226
ProcNumber MyProcNumber
Definition: globals.c:91
Assert(PointerIsAligned(start, uint64))
@ HASH_FIND
Definition: hsearch.h:113
@ HASH_REMOVE
Definition: hsearch.h:115
@ HASH_ENTER
Definition: hsearch.h:114
#define HASH_ELEM
Definition: hsearch.h:95
#define HASH_BLOBS
Definition: hsearch.h:97
static void dlist_init(dlist_head *head)
Definition: ilist.h:314
static void dlist_delete(dlist_node *node)
Definition: ilist.h:405
#define dlist_foreach_modify(iter, lhead)
Definition: ilist.h:640
static void dlist_push_tail(dlist_head *head, dlist_node *node)
Definition: ilist.h:364
#define dlist_container(type, membername, ptr)
Definition: ilist.h:593
void CacheInvalidateSmgr(RelFileLocatorBackend rlocator)
Definition: inval.c:1751
void on_proc_exit(pg_on_exit_callback function, Datum arg)
Definition: ipc.c:309
int i
Definition: isn.c:77
void pfree(void *pointer)
Definition: mcxt.c:2146
void * palloc(Size size)
Definition: mcxt.c:1939
void mdunlink(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo)
Definition: md.c:327
void mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber curnblk, BlockNumber nblocks)
Definition: md.c:1277
void mdwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void **buffers, BlockNumber nblocks, bool skipFsync)
Definition: md.c:1049
bool mdexists(SMgrRelation reln, ForkNumber forknum)
Definition: md.c:193
void mdreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void **buffers, BlockNumber nblocks)
Definition: md.c:837
BlockNumber mdnblocks(SMgrRelation reln, ForkNumber forknum)
Definition: md.c:1213
void mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
Definition: md.c:212
int mdfd(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, uint32 *off)
Definition: md.c:1470
void mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync)
Definition: md.c:477
void mdinit(void)
Definition: md.c:180
void mdclose(SMgrRelation reln, ForkNumber forknum)
Definition: md.c:703
void mdzeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks, bool skipFsync)
Definition: md.c:542
void mdwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks)
Definition: md.c:1154
uint32 mdmaxcombine(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
Definition: md.c:823
void mdstartreadv(PgAioHandle *ioh, SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void **buffers, BlockNumber nblocks)
Definition: md.c:975
bool mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks)
Definition: md.c:726
void mdregistersync(SMgrRelation reln, ForkNumber forknum)
Definition: md.c:1366
void mdopen(SMgrRelation reln)
Definition: md.c:692
void mdimmedsync(SMgrRelation reln, ForkNumber forknum)
Definition: md.c:1417
#define RESUME_INTERRUPTS()
Definition: miscadmin.h:136
#define INTERRUPTS_CAN_BE_PROCESSED()
Definition: miscadmin.h:130
#define HOLD_INTERRUPTS()
Definition: miscadmin.h:134
void * arg
uintptr_t Datum
Definition: postgres.h:69
static int fd(const char *x, int i)
Definition: preproc-init.c:105
#define INVALID_PROC_NUMBER
Definition: procnumber.h:26
int ProcNumber
Definition: procnumber.h:24
char * psprintf(const char *fmt,...)
Definition: psprintf.c:43
tree ctl
Definition: radixtree.h:1838
struct RelFileLocatorBackend RelFileLocatorBackend
ForkNumber
Definition: relpath.h:56
#define MAX_FORKNUM
Definition: relpath.h:70
#define relpathbackend(rlocator, backend, forknum)
Definition: relpath.h:141
#define RelFileNumberIsValid(relnumber)
Definition: relpath.h:27
BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:819
void smgrstartreadv(PgAioHandle *ioh, SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void **buffers, BlockNumber nblocks)
Definition: smgr.c:753
static HTAB * SMgrRelationHash
Definition: smgr.c:160
void smgrwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks)
Definition: smgr.c:805
void smgrrelease(SMgrRelation reln)
Definition: smgr.c:350
static void smgrdestroy(SMgrRelation reln)
Definition: smgr.c:323
SMgrRelation smgropen(RelFileLocator rlocator, ProcNumber backend)
Definition: smgr.c:240
void smgrinit(void)
Definition: smgr.c:188
void smgrdestroyall(void)
Definition: smgr.c:386
void smgrreleaseall(void)
Definition: smgr.c:412
static char * smgr_aio_describe_identity(const PgAioTargetData *sd)
Definition: smgr.c:1095
static dlist_head unpinned_relns
Definition: smgr.c:162
void smgrpin(SMgrRelation reln)
Definition: smgr.c:296
void smgrunpin(SMgrRelation reln)
Definition: smgr.c:311
void smgrdosyncall(SMgrRelation *rels, int nrels)
Definition: smgr.c:498
void smgrimmedsync(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:965
void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
Definition: smgr.c:481
BlockNumber smgrnblocks_cached(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:847
void smgrclose(SMgrRelation reln)
Definition: smgr.c:374
uint32 smgrmaxcombine(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
Definition: smgr.c:697
void smgrzeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks, bool skipFsync)
Definition: smgr.c:649
void smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber *old_nblocks, BlockNumber *nblocks)
Definition: smgr.c:875
static const int NSmgr
Definition: smgr.c:154
bool ProcessBarrierSmgrRelease(void)
Definition: smgr.c:1018
void AtEOXact_SMgr(void)
Definition: smgr.c:1008
static void smgrshutdown(int code, Datum arg)
Definition: smgr.c:210
static int smgrfd(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, uint32 *off)
Definition: smgr.c:980
void smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync)
Definition: smgr.c:620
void smgrwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void **buffers, BlockNumber nblocks, bool skipFsync)
Definition: smgr.c:791
void smgrreleaserellocator(RelFileLocatorBackend rlocator)
Definition: smgr.c:443
bool smgrexists(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:462
void smgrreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void **buffers, BlockNumber nblocks)
Definition: smgr.c:721
void smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo)
Definition: smgr.c:538
void smgrregistersync(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:931
bool smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks)
Definition: smgr.c:678
static void smgr_aio_reopen(PgAioHandle *ioh)
Definition: smgr.c:1055
const PgAioTargetInfo aio_smgr_target_info
Definition: smgr.c:172
static const f_smgr smgrsw[]
Definition: smgr.c:128
struct f_smgr f_smgr
void pgaio_io_set_target_smgr(PgAioHandle *ioh, SMgrRelationData *smgr, ForkNumber forknum, BlockNumber blocknum, int nblocks, bool skip_fsync)
Definition: smgr.c:1029
#define SmgrIsTemp(smgr)
Definition: smgr.h:74
struct SMgrRelationData SMgrRelationData
SMgrRelationData * SMgrRelation
Definition: smgr.h:72
Definition: dynahash.c:220
const char * name
Definition: aio.h:170
RelFileLocator locator
RelFileNumber relNumber
char str[REL_PATH_STR_MAXLEN+1]
Definition: relpath.h:123
BlockNumber smgr_targblock
Definition: smgr.h:46
BlockNumber smgr_cached_nblocks[MAX_FORKNUM+1]
Definition: smgr.h:47
RelFileLocatorBackend smgr_rlocator
Definition: smgr.h:38
dlist_node node
Definition: smgr.h:69
int smgr_which
Definition: smgr.h:55
int pincount
Definition: smgr.h:68
dlist_node * cur
Definition: ilist.h:200
Definition: smgr.c:89
bool(* smgr_prefetch)(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks)
Definition: smgr.c:103
void(* smgr_writeback)(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks)
Definition: smgr.c:118
void(* smgr_extend)(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync)
Definition: smgr.c:99
void(* smgr_create)(SMgrRelation reln, ForkNumber forknum, bool isRedo)
Definition: smgr.c:94
BlockNumber(* smgr_nblocks)(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:120
void(* smgr_truncate)(SMgrRelation reln, ForkNumber forknum, BlockNumber old_blocks, BlockNumber nblocks)
Definition: smgr.c:121
uint32(* smgr_maxcombine)(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
Definition: smgr.c:105
void(* smgr_registersync)(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:124
void(* smgr_immedsync)(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:123
void(* smgr_zeroextend)(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks, bool skipFsync)
Definition: smgr.c:101
void(* smgr_readv)(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void **buffers, BlockNumber nblocks)
Definition: smgr.c:107
void(* smgr_unlink)(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo)
Definition: smgr.c:97
void(* smgr_startreadv)(PgAioHandle *ioh, SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void **buffers, BlockNumber nblocks)
Definition: smgr.c:110
int(* smgr_fd)(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, uint32 *off)
Definition: smgr.c:125
void(* smgr_open)(SMgrRelation reln)
Definition: smgr.c:92
void(* smgr_shutdown)(void)
Definition: smgr.c:91
void(* smgr_writev)(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void **buffers, BlockNumber nblocks, bool skipFsync)
Definition: smgr.c:114
void(* smgr_init)(void)
Definition: smgr.c:90
bool(* smgr_exists)(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:96
void(* smgr_close)(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:93
uint64 offset
Definition: aio.h:140
int fd
Definition: aio.h:138
struct PgAioOpData::@123 write
struct PgAioOpData::@122 read
BlockNumber blockNum
Definition: aio_types.h:66
RelFileLocator rlocator
Definition: aio_types.h:65
BlockNumber nblocks
Definition: aio_types.h:67
struct PgAioTargetData::@124 smgr
ForkNumber forkNum
Definition: aio_types.h:68
bool InRecovery
Definition: xlogutils.c:50