PostgreSQL Source Code git master
All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
aio.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * aio.c
4 * AIO - Core Logic
5 *
6 * For documentation about how AIO works on a higher level, including a
7 * schematic example, see README.md.
8 *
9 *
10 * AIO is a complicated subsystem. To keep things navigable, it is split
11 * across a number of files:
12 *
13 * - method_*.c - different ways of executing AIO (e.g. worker process)
14 *
15 * - aio_target.c - IO on different kinds of targets
16 *
17 * - aio_io.c - method-independent code for specific IO ops (e.g. readv)
18 *
19 * - aio_callback.c - callbacks at IO operation lifecycle events
20 *
21 * - aio_init.c - per-server and per-backend initialization
22 *
23 * - aio.c - all other topics
24 *
25 * - read_stream.c - helper for reading buffered relation data
26 *
27 * - README.md - higher-level overview over AIO
28 *
29 *
30 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
31 * Portions Copyright (c) 1994, Regents of the University of California
32 *
33 * IDENTIFICATION
34 * src/backend/storage/aio/aio.c
35 *
36 *-------------------------------------------------------------------------
37 */
38
39#include "postgres.h"
40
41#include "lib/ilist.h"
42#include "miscadmin.h"
43#include "port/atomics.h"
44#include "storage/aio.h"
46#include "storage/aio_subsys.h"
47#include "utils/guc.h"
48#include "utils/guc_hooks.h"
49#include "utils/resowner.h"
50#include "utils/wait_event_types.h"
51
52#ifdef USE_INJECTION_POINTS
54#endif
55
56
57static inline void pgaio_io_update_state(PgAioHandle *ioh, PgAioHandleState new_state);
58static void pgaio_io_reclaim(PgAioHandle *ioh);
60static void pgaio_io_wait_for_free(void);
61static PgAioHandle *pgaio_io_from_wref(PgAioWaitRef *iow, uint64 *ref_generation);
62static const char *pgaio_io_state_get_name(PgAioHandleState s);
63static void pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation);
64
65
66/* Options for io_method. */
68 {"sync", IOMETHOD_SYNC, false},
69 {"worker", IOMETHOD_WORKER, false},
70#ifdef IOMETHOD_IO_URING_ENABLED
71 {"io_uring", IOMETHOD_IO_URING, false},
72#endif
73 {NULL, 0, false}
74};
75
76/* GUCs */
79
80/* global control for AIO */
82
83/* current backend's per-backend state */
85
86
87static const IoMethodOps *const pgaio_method_ops_table[] = {
90#ifdef IOMETHOD_IO_URING_ENABLED
91 [IOMETHOD_IO_URING] = &pgaio_uring_ops,
92#endif
93};
94
95/* callbacks for the configured io_method, set by assign_io_method */
97
98
99/*
100 * Currently there's no infrastructure to pass arguments to injection points,
101 * so we instead set this up for the duration of the injection point
102 * invocation. See pgaio_io_call_inj().
103 */
104#ifdef USE_INJECTION_POINTS
105static PgAioHandle *pgaio_inj_cur_handle;
106#endif
107
108
109
110/* --------------------------------------------------------------------------------
111 * Public Functions related to PgAioHandle
112 * --------------------------------------------------------------------------------
113 */
114
115/*
116 * Acquire an AioHandle, waiting for IO completion if necessary.
117 *
118 * Each backend can only have one AIO handle that has been "handed out" to
119 * code, but not yet submitted or released. This restriction is necessary to
120 * ensure that it is possible for code to wait for an unused handle by waiting
121 * for in-flight IO to complete. There is a limited number of handles in each
122 * backend, if multiple handles could be handed out without being submitted,
123 * waiting for all in-flight IO to complete would not guarantee that handles
124 * free up.
125 *
126 * It is cheap to acquire an IO handle, unless all handles are in use. In that
127 * case this function waits for the oldest IO to complete. If that is not
128 * desirable, use pgaio_io_acquire_nb().
129 *
130 * If a handle was acquired but then does not turn out to be needed,
131 * e.g. because pgaio_io_acquire() is called before starting an IO in a
132 * critical section, the handle needs to be released with pgaio_io_release().
133 *
134 *
135 * To react to the completion of the IO as soon as it is known to have
136 * completed, callbacks can be registered with pgaio_io_register_callbacks().
137 *
138 * To actually execute IO using the returned handle, the pgaio_io_start_*()
139 * family of functions is used. In many cases the pgaio_io_start_*() call will
140 * not be done directly by code that acquired the handle, but by lower level
141 * code that gets passed the handle. E.g. if code in bufmgr.c wants to perform
142 * AIO, it typically will pass the handle to smgr.c, which will pass it on to
143 * md.c, on to fd.c, which then finally calls pgaio_io_start_*(). This
144 * forwarding allows the various layers to react to the IO's completion by
145 * registering callbacks. These callbacks in turn can translate a lower
146 * layer's result into a result understandable by a higher layer.
147 *
148 * During pgaio_io_start_*() the IO is staged (i.e. prepared for execution but
149 * not submitted to the kernel). Unless in batchmode
150 * (c.f. pgaio_enter_batchmode()), the IO will also get submitted for
151 * execution. Note that, whether in batchmode or not, the IO might even
152 * complete before the functions return.
153 *
154 * After pgaio_io_start_*() the AioHandle is "consumed" and may not be
155 * referenced by the IO issuing code. To e.g. wait for IO, references to the
156 * IO can be established with pgaio_io_get_wref() *before* pgaio_io_start_*()
157 * is called. pgaio_wref_wait() can be used to wait for the IO to complete.
158 *
159 *
160 * To know if the IO [partially] succeeded or failed, a PgAioReturn * can be
161 * passed to pgaio_io_acquire(). Once the issuing backend has called
162 * pgaio_wref_wait(), the PgAioReturn contains information about whether the
163 * operation succeeded and details about the first failure, if any. The error
164 * can be raised / logged with pgaio_result_report().
165 *
166 * The lifetime of the memory pointed to be *ret needs to be at least as long
167 * as the passed in resowner. If the resowner releases resources before the IO
168 * completes (typically due to an error), the reference to *ret will be
169 * cleared. In case of resowner cleanup *ret will not be updated with the
170 * results of the IO operation.
171 */
174{
175 PgAioHandle *h;
176
177 while (true)
178 {
179 h = pgaio_io_acquire_nb(resowner, ret);
180
181 if (h != NULL)
182 return h;
183
184 /*
185 * Evidently all handles by this backend are in use. Just wait for
186 * some to complete.
187 */
189 }
190}
191
192/*
193 * Acquire an AioHandle, returning NULL if no handles are free.
194 *
195 * See pgaio_io_acquire(). The only difference is that this function will return
196 * NULL if there are no idle handles, instead of blocking.
197 */
200{
202 {
205 }
206
208 elog(ERROR, "API violation: Only one IO can be handed out");
209
211 {
213 PgAioHandle *ioh = dclist_container(PgAioHandle, node, ion);
214
215 Assert(ioh->state == PGAIO_HS_IDLE);
217
220
221 if (resowner)
223
224 if (ret)
225 {
226 ioh->report_return = ret;
228 }
229
230 return ioh;
231 }
232
233 return NULL;
234}
235
236/*
237 * Release IO handle that turned out to not be required.
238 *
239 * See pgaio_io_acquire() for more details.
240 */
241void
243{
245 {
247 Assert(ioh->resowner);
248
250 pgaio_io_reclaim(ioh);
251 }
252 else
253 {
254 elog(ERROR, "release in unexpected state");
255 }
256}
257
258/*
259 * Release IO handle during resource owner cleanup.
260 */
261void
262pgaio_io_release_resowner(dlist_node *ioh_node, bool on_error)
263{
264 PgAioHandle *ioh = dlist_container(PgAioHandle, resowner_node, ioh_node);
265
266 Assert(ioh->resowner);
267
269 ioh->resowner = NULL;
270
271 switch (ioh->state)
272 {
273 case PGAIO_HS_IDLE:
274 elog(ERROR, "unexpected");
275 break;
278
280 {
282 if (!on_error)
283 elog(WARNING, "leaked AIO handle");
284 }
285
286 pgaio_io_reclaim(ioh);
287 break;
288 case PGAIO_HS_DEFINED:
289 case PGAIO_HS_STAGED:
290 if (!on_error)
291 elog(WARNING, "AIO handle was not submitted");
293 break;
298 /* this is expected to happen */
299 break;
300 }
301
302 /*
303 * Need to unregister the reporting of the IO's result, the memory it's
304 * referencing likely has gone away.
305 */
306 if (ioh->report_return)
307 ioh->report_return = NULL;
308}
309
310/*
311 * Add a [set of] flags to the IO.
312 *
313 * Note that this combines flags with already set flags, rather than set flags
314 * to explicitly the passed in parameters. This is to allow multiple callsites
315 * to set flags.
316 */
317void
319{
321
322 ioh->flags |= flag;
323}
324
325/*
326 * Returns an ID uniquely identifying the IO handle. This is only really
327 * useful for logging, as handles are reused across multiple IOs.
328 */
329int
331{
332 Assert(ioh >= pgaio_ctl->io_handles &&
334 return ioh - pgaio_ctl->io_handles;
335}
336
337/*
338 * Return the ProcNumber for the process that can use an IO handle. The
339 * mapping from IO handles to PGPROCs is static, therefore this even works
340 * when the corresponding PGPROC is not in use.
341 */
344{
345 return ioh->owner_procno;
346}
347
348/*
349 * Return a wait reference for the IO. Only wait references can be used to
350 * wait for an IOs completion, as handles themselves can be reused after
351 * completion. See also the comment above pgaio_io_acquire().
352 */
353void
355{
357 ioh->state == PGAIO_HS_DEFINED ||
358 ioh->state == PGAIO_HS_STAGED);
359 Assert(ioh->generation != 0);
360
361 iow->aio_index = ioh - pgaio_ctl->io_handles;
362 iow->generation_upper = (uint32) (ioh->generation >> 32);
363 iow->generation_lower = (uint32) ioh->generation;
364}
365
366
367
368/* --------------------------------------------------------------------------------
369 * Internal Functions related to PgAioHandle
370 * --------------------------------------------------------------------------------
371 */
372
373static inline void
375{
377 "updating state to %s",
378 pgaio_io_state_get_name(new_state));
379
380 /*
381 * Ensure the changes signified by the new state are visible before the
382 * new state becomes visible.
383 */
385
386 ioh->state = new_state;
387}
388
389static void
391{
392 Assert(!ioh->resowner);
394
397}
398
399/*
400 * Stage IO for execution and, if appropriate, submit it immediately.
401 *
402 * Should only be called from pgaio_io_start_*().
403 */
404void
406{
407 bool needs_synchronous;
408
412
413 ioh->op = op;
414 ioh->result = 0;
415
417
418 /* allow a new IO to be staged */
420
422
424
425 /*
426 * Synchronous execution has to be executed, well, synchronously, so check
427 * that first.
428 */
429 needs_synchronous = pgaio_io_needs_synchronous_execution(ioh);
430
432 "staged (synchronous: %d, in_batch: %d)",
433 needs_synchronous, pgaio_my_backend->in_batchmode);
434
435 if (!needs_synchronous)
436 {
439
440 /*
441 * Unless code explicitly opted into batching IOs, submit the IO
442 * immediately.
443 */
446 }
447 else
448 {
451 }
452}
453
454bool
456{
457 /*
458 * If the caller said to execute the IO synchronously, do so.
459 *
460 * XXX: We could optimize the logic when to execute synchronously by first
461 * checking if there are other IOs in flight and only synchronously
462 * executing if not. Unclear whether that'll be sufficiently common to be
463 * worth worrying about.
464 */
465 if (ioh->flags & PGAIO_HF_SYNCHRONOUS)
466 return true;
467
468 /* Check if the IO method requires synchronous execution of IO */
471
472 return false;
473}
474
475/*
476 * Handle IO being processed by IO method.
477 *
478 * Should be called by IO methods / synchronous IO execution, just before the
479 * IO is performed.
480 */
481void
483{
485
487}
488
489/*
490 * Handle IO getting completed by a method.
491 *
492 * Should be called by IO methods / synchronous IO execution, just after the
493 * IO has been performed.
494 *
495 * Expects to be called in a critical section. We expect IOs to be usable for
496 * WAL etc, which requires being able to execute completion callbacks in a
497 * critical section.
498 */
499void
501{
503
505
506 ioh->result = result;
507
509
510 pgaio_io_call_inj(ioh, "AIO_PROCESS_COMPLETION_BEFORE_SHARED");
511
513
515
516 /* condition variable broadcast ensures state is visible before wakeup */
518
519 /* contains call to pgaio_io_call_complete_local() */
520 if (ioh->owner_procno == MyProcNumber)
521 pgaio_io_reclaim(ioh);
522}
523
524/*
525 * Has the IO completed and thus the IO handle been reused?
526 *
527 * This is useful when waiting for IO completion at a low level (e.g. in an IO
528 * method's ->wait_one() callback).
529 */
530bool
532{
533 *state = ioh->state;
535
536 return ioh->generation != ref_generation;
537}
538
539/*
540 * Wait for IO to complete. External code should never use this, outside of
541 * the AIO subsystem waits are only allowed via pgaio_wref_wait().
542 */
543static void
544pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation)
545{
547 bool am_owner;
548
549 am_owner = ioh->owner_procno == MyProcNumber;
550
551 if (pgaio_io_was_recycled(ioh, ref_generation, &state))
552 return;
553
554 if (am_owner)
555 {
560 {
561 elog(PANIC, "waiting for own IO in wrong state: %d",
562 state);
563 }
564 }
565
566 while (true)
567 {
568 if (pgaio_io_was_recycled(ioh, ref_generation, &state))
569 return;
570
571 switch (state)
572 {
573 case PGAIO_HS_IDLE:
575 elog(ERROR, "IO in wrong state: %d", state);
576 break;
577
579
580 /*
581 * If we need to wait via the IO method, do so now. Don't
582 * check via the IO method if the issuing backend is executing
583 * the IO synchronously.
584 */
586 {
587 pgaio_method_ops->wait_one(ioh, ref_generation);
588 continue;
589 }
590 /* fallthrough */
591
592 /* waiting for owner to submit */
593 case PGAIO_HS_DEFINED:
594 case PGAIO_HS_STAGED:
595 /* waiting for reaper to complete */
596 /* fallthrough */
598 /* shouldn't be able to hit this otherwise */
600 /* ensure we're going to get woken up */
602
603 while (!pgaio_io_was_recycled(ioh, ref_generation, &state))
604 {
607 break;
608 ConditionVariableSleep(&ioh->cv, WAIT_EVENT_AIO_IO_COMPLETION);
609 }
610
612 break;
613
616 /* see above */
617 if (am_owner)
618 pgaio_io_reclaim(ioh);
619 return;
620 }
621 }
622}
623
624/*
625 * Make IO handle ready to be reused after IO has completed or after the
626 * handle has been released without being used.
627 */
628static void
630{
631 /* This is only ok if it's our IO */
633 Assert(ioh->state != PGAIO_HS_IDLE);
634
635 /*
636 * It's a bit ugly, but right now the easiest place to put the execution
637 * of local completion callbacks is this function, as we need to execute
638 * local callbacks just before reclaiming at multiple callsites.
639 */
641 {
642 PgAioResult local_result;
643
644 local_result = pgaio_io_call_complete_local(ioh);
646
647 if (ioh->report_return)
648 {
649 ioh->report_return->result = local_result;
651 }
652 }
653
655 "reclaiming: distilled_result: (status %s, id %u, error_data %d), raw_result: %d",
657 ioh->distilled_result.id,
659 ioh->result);
660
661 /* if the IO has been defined, it's on the in-flight list, remove */
662 if (ioh->state != PGAIO_HS_HANDED_OUT)
664
665 if (ioh->resowner)
666 {
668 ioh->resowner = NULL;
669 }
670
671 Assert(!ioh->resowner);
672
673 ioh->op = PGAIO_OP_INVALID;
675 ioh->flags = 0;
676 ioh->num_callbacks = 0;
677 ioh->handle_data_len = 0;
678 ioh->report_return = NULL;
679 ioh->result = 0;
681
682 /* XXX: the barrier is probably superfluous */
684 ioh->generation++;
685
687
688 /*
689 * We push the IO to the head of the idle IO list, that seems more cache
690 * efficient in cases where only a few IOs are used.
691 */
693}
694
695/*
696 * Wait for an IO handle to become usable.
697 *
698 * This only really is useful for pgaio_io_acquire().
699 */
700static void
702{
703 int reclaimed = 0;
704
705 pgaio_debug(DEBUG2, "waiting for self with %d pending",
707
708 /*
709 * First check if any of our IOs actually have completed - when using
710 * worker, that'll often be the case. We could do so as part of the loop
711 * below, but that'd potentially lead us to wait for some IO submitted
712 * before.
713 */
714 for (int i = 0; i < io_max_concurrency; i++)
715 {
717
719 {
720 pgaio_io_reclaim(ioh);
721 reclaimed++;
722 }
723 }
724
725 if (reclaimed > 0)
726 return;
727
728 /*
729 * If we have any unsubmitted IOs, submit them now. We'll start waiting in
730 * a second, so it's better they're in flight. This also addresses the
731 * edge-case that all IOs are unsubmitted.
732 */
735
737 elog(ERROR, "no free IOs despite no in-flight IOs");
738
739 /*
740 * Wait for the oldest in-flight IO to complete.
741 *
742 * XXX: Reusing the general IO wait is suboptimal, we don't need to wait
743 * for that specific IO to complete, we just need *any* IO to complete.
744 */
745 {
748
749 switch (ioh->state)
750 {
751 /* should not be in in-flight list */
752 case PGAIO_HS_IDLE:
753 case PGAIO_HS_DEFINED:
755 case PGAIO_HS_STAGED:
757 elog(ERROR, "shouldn't get here with io:%d in state %d",
758 pgaio_io_get_id(ioh), ioh->state);
759 break;
760
764 "waiting for free io with %d in flight",
766
767 /*
768 * In a more general case this would be racy, because the
769 * generation could increase after we read ioh->state above.
770 * But we are only looking at IOs by the current backend and
771 * the IO can only be recycled by this backend.
772 */
773 pgaio_io_wait(ioh, ioh->generation);
774 break;
775
777 /* it's possible that another backend just finished this IO */
778 pgaio_io_reclaim(ioh);
779 break;
780 }
781
783 elog(PANIC, "no idle IO after waiting for IO to terminate");
784 return;
785 }
786}
787
788/*
789 * Internal - code outside of AIO should never need this and it'd be hard for
790 * such code to be safe.
791 */
792static PgAioHandle *
794{
795 PgAioHandle *ioh;
796
798
799 ioh = &pgaio_ctl->io_handles[iow->aio_index];
800
801 *ref_generation = ((uint64) iow->generation_upper) << 32 |
802 iow->generation_lower;
803
804 Assert(*ref_generation != 0);
805
806 return ioh;
807}
808
809static const char *
811{
812#define PGAIO_HS_TOSTR_CASE(sym) case PGAIO_HS_##sym: return #sym
813 switch (s)
814 {
816 PGAIO_HS_TOSTR_CASE(HANDED_OUT);
817 PGAIO_HS_TOSTR_CASE(DEFINED);
818 PGAIO_HS_TOSTR_CASE(STAGED);
819 PGAIO_HS_TOSTR_CASE(SUBMITTED);
820 PGAIO_HS_TOSTR_CASE(COMPLETED_IO);
821 PGAIO_HS_TOSTR_CASE(COMPLETED_SHARED);
822 PGAIO_HS_TOSTR_CASE(COMPLETED_LOCAL);
823 }
824#undef PGAIO_HS_TOSTR_CASE
825
826 return NULL; /* silence compiler */
827}
828
/*
 * Return the name of the handle's current state as a string, for debugging
 * and log output.  Thin wrapper around pgaio_io_state_get_name().
 *
 * NOTE(review): the signature line is missing from this extraction; per the
 * file's symbol index it is pgaio_io_get_state_name(PgAioHandle *ioh).
 */
829const char *
831{
832 return pgaio_io_state_get_name(ioh->state);
833}
834
835const char *
837{
838 switch (rs)
839 {
840 case PGAIO_RS_UNKNOWN:
841 return "UNKNOWN";
842 case PGAIO_RS_OK:
843 return "OK";
844 case PGAIO_RS_WARNING:
845 return "WARNING";
846 case PGAIO_RS_PARTIAL:
847 return "PARTIAL";
848 case PGAIO_RS_ERROR:
849 return "ERROR";
850 }
851
852 return NULL; /* silence compiler */
853}
854
855
856
857/* --------------------------------------------------------------------------------
858 * Functions primarily related to IO Wait References
859 * --------------------------------------------------------------------------------
860 */
861
862/*
863 * Mark a wait reference as invalid
864 */
865void
867{
869}
870
871/*
 * Is the wait reference valid, i.e. does it currently reference an IO?
 *
 * A reference whose aio_index is PG_UINT32_MAX is treated as cleared;
 * presumably pgaio_wref_clear() sets that sentinel (its body is not visible
 * in this extraction — verify against pgaio_wref_clear()).
 */
872bool
874{
875 return iow->aio_index != PG_UINT32_MAX;
876}
877
878/*
879 * Similar to pgaio_io_get_id(), just for wait references.
 *
 * The reference's aio_index is the offset of the referenced handle in the
 * shared io_handles array (set in pgaio_io_get_wref()), so it serves as a
 * stable ID for logging.
 *
 * NOTE(review): one interior line (original line 884) is missing from this
 * extraction — likely a validity Assert; confirm against upstream source.
880 */
881int
883{
885 return iow->aio_index;
886}
887
888/*
889 * Wait for the IO to have completed. Can be called in any process, not just
890 * in the issuing backend.
891 */
892void
894{
895 uint64 ref_generation;
896 PgAioHandle *ioh;
897
898 ioh = pgaio_io_from_wref(iow, &ref_generation);
899
900 pgaio_io_wait(ioh, ref_generation);
901}
902
903/*
904 * Check if the referenced IO completed, without blocking.
905 */
906bool
908{
909 uint64 ref_generation;
911 bool am_owner;
912 PgAioHandle *ioh;
913
914 ioh = pgaio_io_from_wref(iow, &ref_generation);
915
916 if (pgaio_io_was_recycled(ioh, ref_generation, &state))
917 return true;
918
919 if (state == PGAIO_HS_IDLE)
920 return true;
921
922 am_owner = ioh->owner_procno == MyProcNumber;
923
926 {
927 if (am_owner)
928 pgaio_io_reclaim(ioh);
929 return true;
930 }
931
932 /*
933 * XXX: It likely would be worth checking in with the io method, to give
934 * the IO method a chance to check if there are completion events queued.
935 */
936
937 return false;
938}
939
940
941
942/* --------------------------------------------------------------------------------
943 * Actions on multiple IOs.
944 * --------------------------------------------------------------------------------
945 */
946
947/*
948 * Submit IOs in batches going forward.
949 *
950 * Submitting multiple IOs at once can be substantially faster than doing so
951 * one-by-one. At the same time, submitting multiple IOs at once requires more
952 * care to avoid deadlocks.
953 *
954 * Consider backend A staging an IO for buffer 1 and then trying to start IO
955 * on buffer 2, while backend B does the inverse. If A submitted the IO before
956 * moving on to buffer 2, this works just fine, B will wait for the IO to
957 * complete. But if batching were used, each backend will wait for IO that has
958 * not yet been submitted to complete, i.e. forever.
959 *
960 * End batch submission mode with pgaio_exit_batchmode(). (Throwing errors is
961 * allowed; error recovery will end the batch.)
962 *
963 * To avoid deadlocks, code needs to ensure that it will not wait for another
964 * backend while there is unsubmitted IO. E.g. by using conditional lock
965 * acquisition when acquiring buffer locks. To check if there currently are
966 * staged IOs, call pgaio_have_staged() and to submit all staged IOs call
967 * pgaio_submit_staged().
968 *
969 * It is not allowed to enter batchmode while already in batchmode, it's
970 * unlikely to ever be needed, as code needs to be explicitly aware of being
971 * called in batchmode, to avoid the deadlock risks explained above.
972 *
973 * Note that IOs may get submitted before pgaio_exit_batchmode() is called,
974 * e.g. because too many IOs have been staged or because pgaio_submit_staged()
975 * was called.
976 */
977void
979{
981 elog(ERROR, "starting batch while batch already in progress");
983}
984
985/*
986 * Stop submitting IOs in batches.
987 */
988void
990{
992
995}
996
997/*
998 * Are there staged but unsubmitted IOs?
999 *
1000 * See comment above pgaio_enter_batchmode() for why code may need to check if
1001 * there is IO in that state.
1002 */
1003bool
1005{
1008 return pgaio_my_backend->num_staged_ios > 0;
1009}
1010
1011/*
1012 * Submit all staged but not yet submitted IOs.
1013 *
1014 * Unless in batch mode, this never needs to be called, as IOs get submitted
1015 * as soon as possible. While in batchmode pgaio_submit_staged() can be called
1016 * before waiting on another backend, to avoid the risk of deadlocks. See
1017 * pgaio_enter_batchmode().
1018 */
1019void
1021{
1022 int total_submitted = 0;
1023 int did_submit;
1024
1026 return;
1027
1028
1030
1033
1035
1036 total_submitted += did_submit;
1037
1038 Assert(total_submitted == did_submit);
1039
1041
1043 "aio: submitted %d IOs",
1044 total_submitted);
1045}
1046
1047
1048
1049/* --------------------------------------------------------------------------------
1050 * Other
1051 * --------------------------------------------------------------------------------
1052 */
1053
1054
1055/*
1056 * Perform AIO related cleanup after an error.
1057 *
1058 * This should be called early in the error recovery paths, as later steps may
1059 * need to issue AIO (e.g. to record a transaction abort WAL record).
1060 */
1061void
1063{
1064 /*
1065 * It is possible that code errored out after pgaio_enter_batchmode() but
1066 * before pgaio_exit_batchmode() was called. In that case we need to
1067 * submit the IO now.
1068 */
1070 {
1072
1074 }
1075
1076 /*
1077 * As we aren't in batchmode, there shouldn't be any unsubmitted IOs.
1078 */
1080}
1081
1082/*
1083 * Perform AIO related checks at (sub-)transactional boundaries.
1084 *
1085 * This should be called late during (sub-)transactional commit/abort, after
1086 * all steps that might need to perform AIO, so that we can verify that the
1087 * AIO subsystem is in a valid state at the end of a transaction.
1088 */
1089void
1090AtEOXact_Aio(bool is_commit)
1091{
1092 /*
1093 * We should never be in batch mode at transactional boundaries. In case
1094 * an error was thrown while in batch mode, pgaio_error_cleanup() should
1095 * have exited batchmode.
1096 *
1097 * In case we are in batchmode somehow, make sure to submit all staged
1098 * IOs, other backends may need them to complete to continue.
1099 */
1101 {
1103 elog(WARNING, "open AIO batch at end of (sub-)transaction");
1104 }
1105
1106 /*
1107 * As we aren't in batchmode, there shouldn't be any unsubmitted IOs.
1108 */
1110}
1111
1112/*
1113 * Need to submit staged but not yet submitted IOs using the fd, otherwise
1114 * the IO would end up targeting something bogus.
1115 */
1116void
1118{
1119 /*
1120 * Might be called before AIO is initialized or in a subprocess that
1121 * doesn't use AIO.
1122 */
1123 if (!pgaio_my_backend)
1124 return;
1125
1126 /*
1127 * For now just submit all staged IOs - we could be more selective, but
1128 * it's probably not worth it.
1129 */
1131
1132 /*
1133 * If requested by the IO method, wait for all IOs that use the
1134 * to-be-closed FD.
1135 */
1137 {
1138 /*
1139 * As waiting for one IO to complete may complete multiple IOs, we
1140 * can't just use a mutable list iterator. The maximum number of
1141 * in-flight IOs is fairly small, so just restart the loop after
1142 * waiting for an IO.
1143 */
1145 {
1146 dlist_iter iter;
1147 PgAioHandle *ioh = NULL;
1148
1150 {
1151 ioh = dclist_container(PgAioHandle, node, iter.cur);
1152
1153 if (pgaio_io_uses_fd(ioh, fd))
1154 break;
1155 else
1156 ioh = NULL;
1157 }
1158
1159 if (!ioh)
1160 break;
1161
1162 /* see comment in pgaio_io_wait_for_free() about raciness */
1163 pgaio_io_wait(ioh, ioh->generation);
1164 }
1165 }
1166}
1167
1168/*
1169 * Registered as before_shmem_exit() callback in pgaio_init_backend()
1170 */
1171void
1173{
1176
1177 /* first clean up resources as we would at a transaction boundary */
1178 AtEOXact_Aio(code == 0);
1179
1180 /*
1181 * Before exiting, make sure that all IOs are finished. That has two main
1182 * purposes:
1183 *
1184 * - Some kernel-level AIO mechanisms don't deal well with the issuer of
1185 * an AIO exiting before IO completed
1186 *
1187 * - It'd be confusing to see partially finished IOs in stats views etc
1188 */
1190 {
1192
1193 /* see comment in pgaio_io_wait_for_free() about raciness */
1194 pgaio_io_wait(ioh, ioh->generation);
1195 }
1196
1197 pgaio_my_backend = NULL;
1198}
1199
1200void
1201assign_io_method(int newval, void *extra)
1202{
1205
1207}
1208
1209bool
1211{
1212 if (*newval == -1)
1213 {
1214 /*
1215 * Auto-tuning will be applied later during startup, as auto-tuning
1216 * depends on the value of various GUCs.
1217 */
1218 return true;
1219 }
1220 else if (*newval == 0)
1221 {
1222 GUC_check_errdetail("Only -1 or values bigger than 0 are valid.");
1223 return false;
1224 }
1225
1226 return true;
1227}
1228
1229
1230
1231/* --------------------------------------------------------------------------------
1232 * Injection point support
1233 * --------------------------------------------------------------------------------
1234 */
1235
1236#ifdef USE_INJECTION_POINTS
1237
1238/*
1239 * Call injection point with support for pgaio_inj_io_get().
 *
 * Injection points cannot (currently) receive arguments, so the handle is
 * stashed in the file-global pgaio_inj_cur_handle for the duration of the
 * invocation, where pgaio_inj_io_get() can retrieve it.
1240 */
1241void
1242pgaio_io_call_inj(PgAioHandle *ioh, const char *injection_point)
1243{
1244 pgaio_inj_cur_handle = ioh;
1245
1246 PG_TRY();
1247 {
1248 InjectionPointCached(injection_point);
1249 }
	/* reset the stashed handle even if the injection point errors out */
1250 PG_FINALLY();
1251 {
1252 pgaio_inj_cur_handle = NULL;
1253 }
1254 PG_END_TRY();
1255}
1256
1257/*
1258 * Return IO associated with injection point invocation. This is only needed
1259 * as injection points currently don't support arguments.
 *
 * Returns the handle stashed by pgaio_io_call_inj(); NULL outside of an
 * injection point invocation.
 *
 * NOTE(review): the return-type line (original line 1261) is missing from
 * this extraction; per the symbol index the return type is PgAioHandle *.
1260 */
1262pgaio_inj_io_get(void)
1263{
1264 return pgaio_inj_cur_handle;
1265}
1266
1267#endif
void pgaio_io_process_completion(PgAioHandle *ioh, int result)
Definition: aio.c:500
int io_method
Definition: aio.c:77
bool pgaio_wref_valid(PgAioWaitRef *iow)
Definition: aio.c:873
int pgaio_io_get_id(PgAioHandle *ioh)
Definition: aio.c:330
PgAioBackend * pgaio_my_backend
Definition: aio.c:84
const char * pgaio_result_status_string(PgAioResultStatus rs)
Definition: aio.c:836
PgAioHandle * pgaio_io_acquire(struct ResourceOwnerData *resowner, PgAioReturn *ret)
Definition: aio.c:173
void assign_io_method(int newval, void *extra)
Definition: aio.c:1201
static void pgaio_io_update_state(PgAioHandle *ioh, PgAioHandleState new_state)
Definition: aio.c:374
void pgaio_wref_clear(PgAioWaitRef *iow)
Definition: aio.c:866
bool pgaio_io_needs_synchronous_execution(PgAioHandle *ioh)
Definition: aio.c:455
static void pgaio_io_wait_for_free(void)
Definition: aio.c:701
#define PGAIO_HS_TOSTR_CASE(sym)
static const char * pgaio_io_state_get_name(PgAioHandleState s)
Definition: aio.c:810
void pgaio_io_release_resowner(dlist_node *ioh_node, bool on_error)
Definition: aio.c:262
static void pgaio_io_resowner_register(PgAioHandle *ioh)
Definition: aio.c:390
static PgAioHandle * pgaio_io_from_wref(PgAioWaitRef *iow, uint64 *ref_generation)
Definition: aio.c:793
void pgaio_io_get_wref(PgAioHandle *ioh, PgAioWaitRef *iow)
Definition: aio.c:354
void pgaio_closing_fd(int fd)
Definition: aio.c:1117
void pgaio_io_stage(PgAioHandle *ioh, PgAioOp op)
Definition: aio.c:405
int io_max_concurrency
Definition: aio.c:78
void pgaio_io_set_flag(PgAioHandle *ioh, PgAioHandleFlags flag)
Definition: aio.c:318
bool pgaio_have_staged(void)
Definition: aio.c:1004
PgAioCtl * pgaio_ctl
Definition: aio.c:81
const IoMethodOps * pgaio_method_ops
Definition: aio.c:96
bool pgaio_wref_check_done(PgAioWaitRef *iow)
Definition: aio.c:907
static const IoMethodOps *const pgaio_method_ops_table[]
Definition: aio.c:87
static void pgaio_io_reclaim(PgAioHandle *ioh)
Definition: aio.c:629
ProcNumber pgaio_io_get_owner(PgAioHandle *ioh)
Definition: aio.c:343
void pgaio_enter_batchmode(void)
Definition: aio.c:978
void pgaio_submit_staged(void)
Definition: aio.c:1020
const char * pgaio_io_get_state_name(PgAioHandle *ioh)
Definition: aio.c:830
const struct config_enum_entry io_method_options[]
Definition: aio.c:67
bool pgaio_io_was_recycled(PgAioHandle *ioh, uint64 ref_generation, PgAioHandleState *state)
Definition: aio.c:531
void pgaio_io_prepare_submit(PgAioHandle *ioh)
Definition: aio.c:482
void pgaio_wref_wait(PgAioWaitRef *iow)
Definition: aio.c:893
void pgaio_error_cleanup(void)
Definition: aio.c:1062
void pgaio_io_release(PgAioHandle *ioh)
Definition: aio.c:242
int pgaio_wref_get_id(PgAioWaitRef *iow)
Definition: aio.c:882
void AtEOXact_Aio(bool is_commit)
Definition: aio.c:1090
void pgaio_shutdown(int code, Datum arg)
Definition: aio.c:1172
bool check_io_max_concurrency(int *newval, void **extra, GucSource source)
Definition: aio.c:1210
static void pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation)
Definition: aio.c:544
void pgaio_exit_batchmode(void)
Definition: aio.c:989
PgAioHandle * pgaio_io_acquire_nb(struct ResourceOwnerData *resowner, PgAioReturn *ret)
Definition: aio.c:199
@ IOMETHOD_WORKER
Definition: aio.h:35
@ IOMETHOD_SYNC
Definition: aio.h:34
@ PGAIO_TID_INVALID
Definition: aio.h:119
PgAioOp
Definition: aio.h:88
@ PGAIO_OP_INVALID
Definition: aio.h:90
PgAioHandleFlags
Definition: aio.h:49
@ PGAIO_HF_SYNCHRONOUS
Definition: aio.h:70
#define DEFAULT_IO_METHOD
Definition: aio.h:42
void pgaio_io_call_stage(PgAioHandle *ioh)
Definition: aio_callback.c:197
PgAioResult pgaio_io_call_complete_local(PgAioHandle *ioh)
Definition: aio_callback.c:280
void pgaio_io_call_complete_shared(PgAioHandle *ioh)
Definition: aio_callback.c:223
PgAioHandleState
Definition: aio_internal.h:44
@ PGAIO_HS_STAGED
Definition: aio_internal.h:66
@ PGAIO_HS_COMPLETED_SHARED
Definition: aio_internal.h:82
@ PGAIO_HS_DEFINED
Definition: aio_internal.h:59
@ PGAIO_HS_SUBMITTED
Definition: aio_internal.h:69
@ PGAIO_HS_IDLE
Definition: aio_internal.h:46
@ PGAIO_HS_HANDED_OUT
Definition: aio_internal.h:53
@ PGAIO_HS_COMPLETED_IO
Definition: aio_internal.h:72
@ PGAIO_HS_COMPLETED_LOCAL
Definition: aio_internal.h:89
#define pgaio_io_call_inj(ioh, injection_point)
Definition: aio_internal.h:407
#define pgaio_debug(elevel, msg,...)
Definition: aio_internal.h:376
#define pgaio_debug_io(elevel, ioh, msg,...)
Definition: aio_internal.h:389
#define PGAIO_SUBMIT_BATCH_SIZE
Definition: aio_internal.h:28
void pgaio_io_perform_synchronously(PgAioHandle *ioh)
Definition: aio_io.c:116
bool pgaio_io_uses_fd(PgAioHandle *ioh, int fd)
Definition: aio_io.c:197
bool pgaio_io_has_target(PgAioHandle *ioh)
Definition: aio_target.c:40
PgAioResultStatus
Definition: aio_types.h:79
@ PGAIO_RS_OK
Definition: aio_types.h:81
@ PGAIO_RS_UNKNOWN
Definition: aio_types.h:80
@ PGAIO_RS_PARTIAL
Definition: aio_types.h:82
@ PGAIO_RS_ERROR
Definition: aio_types.h:84
@ PGAIO_RS_WARNING
Definition: aio_types.h:83
#define pg_read_barrier()
Definition: atomics.h:156
#define pg_write_barrier()
Definition: atomics.h:157
#define PG_UINT32_MAX
Definition: c.h:561
uint64_t uint64
Definition: c.h:503
uint32_t uint32
Definition: c.h:502
#define lengthof(array)
Definition: c.h:759
bool ConditionVariableCancelSleep(void)
void ConditionVariableBroadcast(ConditionVariable *cv)
void ConditionVariablePrepareToSleep(ConditionVariable *cv)
void ConditionVariableSleep(ConditionVariable *cv, uint32 wait_event_info)
#define DEBUG3
Definition: elog.h:28
#define PG_TRY(...)
Definition: elog.h:372
#define WARNING
Definition: elog.h:36
#define DEBUG2
Definition: elog.h:29
#define PG_END_TRY(...)
Definition: elog.h:397
#define PANIC
Definition: elog.h:42
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:226
#define PG_FINALLY(...)
Definition: elog.h:389
#define DEBUG5
Definition: elog.h:26
#define DEBUG4
Definition: elog.h:27
ProcNumber MyProcNumber
Definition: globals.c:91
bool IsUnderPostmaster
Definition: globals.c:121
volatile uint32 CritSectionCount
Definition: globals.c:46
#define newval
#define GUC_check_errdetail
Definition: guc.h:481
GucSource
Definition: guc.h:112
Assert(PointerIsAligned(start, uint64))
#define dclist_container(type, membername, ptr)
Definition: ilist.h:947
#define dclist_head_element(type, membername, lhead)
Definition: ilist.h:955
static void dclist_push_tail(dclist_head *head, dlist_node *node)
Definition: ilist.h:709
static uint32 dclist_count(const dclist_head *head)
Definition: ilist.h:932
static bool dclist_is_empty(const dclist_head *head)
Definition: ilist.h:682
static void dclist_delete_from(dclist_head *head, dlist_node *node)
Definition: ilist.h:763
static dlist_node * dclist_pop_head_node(dclist_head *head)
Definition: ilist.h:789
static void dclist_push_head(dclist_head *head, dlist_node *node)
Definition: ilist.h:693
#define dlist_container(type, membername, ptr)
Definition: ilist.h:593
#define dclist_foreach(iter, lhead)
Definition: ilist.h:970
void InjectionPointCached(const char *name)
int i
Definition: isn.c:77
const IoMethodOps pgaio_sync_ops
Definition: method_sync.c:28
const IoMethodOps pgaio_worker_ops
Definition: method_worker.c:83
#define START_CRIT_SECTION()
Definition: miscadmin.h:150
#define END_CRIT_SECTION()
Definition: miscadmin.h:152
void * arg
static rewind_source * source
Definition: pg_rewind.c:89
uintptr_t Datum
Definition: postgres.h:69
static int fd(const char *x, int i)
Definition: preproc-init.c:105
int ProcNumber
Definition: procnumber.h:24
ResourceOwner CurrentResourceOwner
Definition: resowner.c:173
void ResourceOwnerRememberAioHandle(ResourceOwner owner, struct dlist_node *ioh_node)
Definition: resowner.c:1104
void ResourceOwnerForgetAioHandle(ResourceOwner owner, struct dlist_node *ioh_node)
Definition: resowner.c:1110
bool wait_on_fd_before_close
Definition: aio_internal.h:262
int(* submit)(uint16 num_staged_ios, PgAioHandle **staged_ios)
Definition: aio_internal.h:302
void(* wait_one)(PgAioHandle *ioh, uint64 ref_generation)
Definition: aio_internal.h:323
bool(* needs_synchronous_execution)(PgAioHandle *ioh)
Definition: aio_internal.h:288
uint32 io_handle_off
Definition: aio_internal.h:188
dclist_head in_flight_ios
Definition: aio_internal.h:219
uint16 num_staged_ios
Definition: aio_internal.h:208
dclist_head idle_ios
Definition: aio_internal.h:191
PgAioHandle * staged_ios[PGAIO_SUBMIT_BATCH_SIZE]
Definition: aio_internal.h:209
PgAioHandle * handed_out_io
Definition: aio_internal.h:200
PgAioHandle * io_handles
Definition: aio_internal.h:246
uint32 io_handle_count
Definition: aio_internal.h:245
PgAioTargetData target_data
Definition: aio_internal.h:181
struct ResourceOwnerData * resowner
Definition: aio_internal.h:142
int32 owner_procno
Definition: aio_internal.h:125
PgAioResult distilled_result
Definition: aio_internal.h:156
dlist_node node
Definition: aio_internal.h:140
uint8 handle_data_len
Definition: aio_internal.h:122
PgAioOp op
Definition: aio_internal.h:105
PgAioReturn * report_return
Definition: aio_internal.h:171
uint64 generation
Definition: aio_internal.h:146
uint8 num_callbacks
Definition: aio_internal.h:110
PgAioHandleState state
Definition: aio_internal.h:99
dlist_node resowner_node
Definition: aio_internal.h:143
PgAioTargetID target
Definition: aio_internal.h:102
ConditionVariable cv
Definition: aio_internal.h:153
uint32 status
Definition: aio_types.h:108
uint32 error_data
Definition: aio_types.h:111
uint32 id
Definition: aio_types.h:105
PgAioResult result
Definition: aio_types.h:132
PgAioTargetData target_data
Definition: aio_types.h:133
uint32 generation_upper
Definition: aio_types.h:45
uint32 aio_index
Definition: aio_types.h:35
uint32 generation_lower
Definition: aio_types.h:46
Definition: guc.h:174
dlist_node * cur
Definition: ilist.h:179
Definition: regguts.h:323
char * flag(int b)
Definition: test-ctype.c:33