PostgreSQL Source Code git master
All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
aio.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * aio.c
4 * AIO - Core Logic
5 *
6 * For documentation about how AIO works on a higher level, including a
7 * schematic example, see README.md.
8 *
9 *
10 * AIO is a complicated subsystem. To keep things navigable, it is split
11 * across a number of files:
12 *
13 * - method_*.c - different ways of executing AIO (e.g. worker process)
14 *
15 * - aio_target.c - IO on different kinds of targets
16 *
17 * - aio_io.c - method-independent code for specific IO ops (e.g. readv)
18 *
19 * - aio_callback.c - callbacks at IO operation lifecycle events
20 *
21 * - aio_init.c - per-server and per-backend initialization
22 *
23 * - aio.c - all other topics
24 *
25 * - read_stream.c - helper for reading buffered relation data
26 *
27 *
28 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
29 * Portions Copyright (c) 1994, Regents of the University of California
30 *
31 * IDENTIFICATION
32 * src/backend/storage/aio/aio.c
33 *
34 *-------------------------------------------------------------------------
35 */
36
37#include "postgres.h"
38
39#include "lib/ilist.h"
40#include "miscadmin.h"
41#include "port/atomics.h"
42#include "storage/aio.h"
44#include "storage/aio_subsys.h"
45#include "utils/guc.h"
46#include "utils/guc_hooks.h"
47#include "utils/resowner.h"
48#include "utils/wait_event_types.h"
49
50#ifdef USE_INJECTION_POINTS
52#endif
53
54
55static inline void pgaio_io_update_state(PgAioHandle *ioh, PgAioHandleState new_state);
56static void pgaio_io_reclaim(PgAioHandle *ioh);
58static void pgaio_io_wait_for_free(void);
59static PgAioHandle *pgaio_io_from_wref(PgAioWaitRef *iow, uint64 *ref_generation);
60static const char *pgaio_io_state_get_name(PgAioHandleState s);
61static void pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation);
62
63
64/* Options for io_method. */
66 {"sync", IOMETHOD_SYNC, false},
67 {"worker", IOMETHOD_WORKER, false},
68 {NULL, 0, false}
69};
70
71/* GUCs */
74
75/* global control for AIO */
77
78/* current backend's per-backend state */
80
81
82static const IoMethodOps *const pgaio_method_ops_table[] = {
85};
86
87/* callbacks for the configured io_method, set by assign_io_method */
89
90
91/*
92 * Currently there's no infrastructure to pass arguments to injection points,
93 * so we instead set this up for the duration of the injection point
94 * invocation. See pgaio_io_call_inj().
95 */
96#ifdef USE_INJECTION_POINTS
97static PgAioHandle *pgaio_inj_cur_handle;
98#endif
99
100
101
102/* --------------------------------------------------------------------------------
103 * Public Functions related to PgAioHandle
104 * --------------------------------------------------------------------------------
105 */
106
107/*
108 * Acquire an AioHandle, waiting for IO completion if necessary.
109 *
110 * Each backend can only have one AIO handle that has been "handed out" to
111 * code, but not yet submitted or released. This restriction is necessary to
112 * ensure that it is possible for code to wait for an unused handle by waiting
113 * for in-flight IO to complete. There is a limited number of handles in each
114 * backend, if multiple handles could be handed out without being submitted,
115 * waiting for all in-flight IO to complete would not guarantee that handles
116 * free up.
117 *
118 * It is cheap to acquire an IO handle, unless all handles are in use. In that
119 * case this function waits for the oldest IO to complete. If that is not
120 * desirable, use pgaio_io_acquire_nb().
121 *
122 * If a handle was acquired but then does not turn out to be needed,
123 * e.g. because pgaio_io_acquire() is called before starting an IO in a
124 * critical section, the handle needs to be released with pgaio_io_release().
125 *
126 *
127 * To react to the completion of the IO as soon as it is known to have
128 * completed, callbacks can be registered with pgaio_io_register_callbacks().
129 *
130 * To actually execute IO using the returned handle, the pgaio_io_prep_*()
131 * family of functions is used. In many cases the pgaio_io_prep_*() call will
132 * not be done directly by code that acquired the handle, but by lower level
133 * code that gets passed the handle. E.g. if code in bufmgr.c wants to perform
134 * AIO, it typically will pass the handle to smgr.c, which will pass it on to
135 * md.c, on to fd.c, which then finally calls pgaio_io_prep_*(). This
136 * forwarding allows the various layers to react to the IO's completion by
137 * registering callbacks. These callbacks in turn can translate a lower
138 * layer's result into a result understandable by a higher layer.
139 *
140 * During pgaio_io_prep_*() the IO is staged (i.e. prepared for execution but
141 * not submitted to the kernel). Unless in batchmode
142 * (c.f. pgaio_enter_batchmode()), the IO will also get submitted for
143 * execution. Note that, whether in batchmode or not, the IO might even
144 * complete before the functions return.
145 *
146 * After pgaio_io_prep_*() the AioHandle is "consumed" and may not be
147 * referenced by the IO issuing code. To e.g. wait for IO, references to the
148 * IO can be established with pgaio_io_get_wref() *before* pgaio_io_prep_*()
149 * is called. pgaio_wref_wait() can be used to wait for the IO to complete.
150 *
151 *
152 * To know if the IO [partially] succeeded or failed, a PgAioReturn * can be
153 * passed to pgaio_io_acquire(). Once the issuing backend has called
154 * pgaio_wref_wait(), the PgAioReturn contains information about whether the
155 * operation succeeded and details about the first failure, if any. The error
156 * can be raised / logged with pgaio_result_report().
157 *
158 * The lifetime of the memory pointed to by *ret needs to be at least as long
159 * as the passed in resowner. If the resowner releases resources before the IO
160 * completes (typically due to an error), the reference to *ret will be
161 * cleared. In case of resowner cleanup *ret will not be updated with the
162 * results of the IO operation.
163 */
166{
167 PgAioHandle *h;
168
169 while (true)
170 {
171 h = pgaio_io_acquire_nb(resowner, ret);
172
173 if (h != NULL)
174 return h;
175
176 /*
177 * Evidently all handles by this backend are in use. Just wait for
178 * some to complete.
179 */
181 }
182}
183
184/*
185 * Acquire an AioHandle, returning NULL if no handles are free.
186 *
187 * See pgaio_io_acquire(). The only difference is that this function will return
188 * NULL if there are no idle handles, instead of blocking.
189 */
192{
194 {
197 }
198
200 elog(ERROR, "API violation: Only one IO can be handed out");
201
203 {
205 PgAioHandle *ioh = dclist_container(PgAioHandle, node, ion);
206
207 Assert(ioh->state == PGAIO_HS_IDLE);
209
212
213 if (resowner)
215
216 if (ret)
217 {
218 ioh->report_return = ret;
220 }
221
222 return ioh;
223 }
224
225 return NULL;
226}
227
228/*
229 * Release IO handle that turned out to not be required.
230 *
231 * See pgaio_io_acquire() for more details.
232 */
233void
235{
237 {
239 Assert(ioh->resowner);
240
242 pgaio_io_reclaim(ioh);
243 }
244 else
245 {
246 elog(ERROR, "release in unexpected state");
247 }
248}
249
250/*
251 * Release IO handle during resource owner cleanup.
252 */
253void
254pgaio_io_release_resowner(dlist_node *ioh_node, bool on_error)
255{
256 PgAioHandle *ioh = dlist_container(PgAioHandle, resowner_node, ioh_node);
257
258 Assert(ioh->resowner);
259
261 ioh->resowner = NULL;
262
263 switch (ioh->state)
264 {
265 case PGAIO_HS_IDLE:
266 elog(ERROR, "unexpected");
267 break;
270
272 {
274 if (!on_error)
275 elog(WARNING, "leaked AIO handle");
276 }
277
278 pgaio_io_reclaim(ioh);
279 break;
280 case PGAIO_HS_DEFINED:
281 case PGAIO_HS_STAGED:
282 if (!on_error)
283 elog(WARNING, "AIO handle was not submitted");
285 break;
290 /* this is expected to happen */
291 break;
292 }
293
294 /*
295 * Need to unregister the reporting of the IO's result, the memory it's
296 * referencing likely has gone away.
297 */
298 if (ioh->report_return)
299 ioh->report_return = NULL;
300}
301
302/*
303 * Add a [set of] flags to the IO.
304 *
305 * Note that this combines flags with already set flags, rather than set flags
306 * to explicitly the passed in parameters. This is to allow multiple callsites
307 * to set flags.
308 */
309void
311{
313
314 ioh->flags |= flag;
315}
316
317/*
318 * Returns an ID uniquely identifying the IO handle. This is only really
319 * useful for logging, as handles are reused across multiple IOs.
320 */
321int
323{
324 Assert(ioh >= pgaio_ctl->io_handles &&
326 return ioh - pgaio_ctl->io_handles;
327}
328
329/*
330 * Return the ProcNumber for the process that can use an IO handle. The
331 * mapping from IO handles to PGPROCs is static, therefore this even works
332 * when the corresponding PGPROC is not in use.
333 */
336{
337 return ioh->owner_procno;
338}
339
340/*
341 * Return a wait reference for the IO. Only wait references can be used to
342 * wait for an IOs completion, as handles themselves can be reused after
343 * completion. See also the comment above pgaio_io_acquire().
344 */
345void
347{
349 ioh->state == PGAIO_HS_DEFINED ||
350 ioh->state == PGAIO_HS_STAGED);
351 Assert(ioh->generation != 0);
352
353 iow->aio_index = ioh - pgaio_ctl->io_handles;
354 iow->generation_upper = (uint32) (ioh->generation >> 32);
355 iow->generation_lower = (uint32) ioh->generation;
356}
357
358
359
360/* --------------------------------------------------------------------------------
361 * Internal Functions related to PgAioHandle
362 * --------------------------------------------------------------------------------
363 */
364
365static inline void
367{
369 "updating state to %s",
370 pgaio_io_state_get_name(new_state));
371
372 /*
373 * Ensure the changes signified by the new state are visible before the
374 * new state becomes visible.
375 */
377
378 ioh->state = new_state;
379}
380
381static void
383{
384 Assert(!ioh->resowner);
386
389}
390
391/*
392 * Stage IO for execution and, if appropriate, submit it immediately.
393 *
394 * Should only be called from pgaio_io_prep_*().
395 */
396void
398{
399 bool needs_synchronous;
400
404
405 ioh->op = op;
406 ioh->result = 0;
407
409
410 /* allow a new IO to be staged */
412
414
416
417 /*
418 * Synchronous execution has to be executed, well, synchronously, so check
419 * that first.
420 */
421 needs_synchronous = pgaio_io_needs_synchronous_execution(ioh);
422
424 "prepared (synchronous: %d, in_batch: %d)",
425 needs_synchronous, pgaio_my_backend->in_batchmode);
426
427 if (!needs_synchronous)
428 {
431
432 /*
433 * Unless code explicitly opted into batching IOs, submit the IO
434 * immediately.
435 */
438 }
439 else
440 {
443 }
444}
445
446bool
448{
449 /*
450 * If the caller said to execute the IO synchronously, do so.
451 *
452 * XXX: We could optimize the logic when to execute synchronously by first
453 * checking if there are other IOs in flight and only synchronously
454 * executing if not. Unclear whether that'll be sufficiently common to be
455 * worth worrying about.
456 */
457 if (ioh->flags & PGAIO_HF_SYNCHRONOUS)
458 return true;
459
460 /* Check if the IO method requires synchronous execution of IO */
463
464 return false;
465}
466
467/*
468 * Handle IO being processed by IO method.
469 *
470 * Should be called by IO methods / synchronous IO execution, just before the
471 * IO is performed.
472 */
473void
475{
477
479}
480
481/*
482 * Handle IO getting completed by a method.
483 *
484 * Should be called by IO methods / synchronous IO execution, just after the
485 * IO has been performed.
486 *
487 * Expects to be called in a critical section. We expect IOs to be usable for
488 * WAL etc, which requires being able to execute completion callbacks in a
489 * critical section.
490 */
491void
493{
495
497
498 ioh->result = result;
499
501
502 pgaio_io_call_inj(ioh, "AIO_PROCESS_COMPLETION_BEFORE_SHARED");
503
505
507
508 /* condition variable broadcast ensures state is visible before wakeup */
510
511 /* contains call to pgaio_io_call_complete_local() */
512 if (ioh->owner_procno == MyProcNumber)
513 pgaio_io_reclaim(ioh);
514}
515
516/*
517 * Has the IO completed and thus the IO handle been reused?
518 *
519 * This is useful when waiting for IO completion at a low level (e.g. in an IO
520 * method's ->wait_one() callback).
521 */
522bool
524{
525 *state = ioh->state;
527
528 return ioh->generation != ref_generation;
529}
530
531/*
532 * Wait for IO to complete. External code should never use this, outside of
533 * the AIO subsystem waits are only allowed via pgaio_wref_wait().
534 */
535static void
536pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation)
537{
539 bool am_owner;
540
541 am_owner = ioh->owner_procno == MyProcNumber;
542
543 if (pgaio_io_was_recycled(ioh, ref_generation, &state))
544 return;
545
546 if (am_owner)
547 {
552 {
553 elog(PANIC, "waiting for own IO in wrong state: %d",
554 state);
555 }
556 }
557
558 while (true)
559 {
560 if (pgaio_io_was_recycled(ioh, ref_generation, &state))
561 return;
562
563 switch (state)
564 {
565 case PGAIO_HS_IDLE:
567 elog(ERROR, "IO in wrong state: %d", state);
568 break;
569
571
572 /*
573 * If we need to wait via the IO method, do so now. Don't
574 * check via the IO method if the issuing backend is executing
575 * the IO synchronously.
576 */
578 {
579 pgaio_method_ops->wait_one(ioh, ref_generation);
580 continue;
581 }
582 /* fallthrough */
583
584 /* waiting for owner to submit */
585 case PGAIO_HS_DEFINED:
586 case PGAIO_HS_STAGED:
587 /* waiting for reaper to complete */
588 /* fallthrough */
590 /* shouldn't be able to hit this otherwise */
592 /* ensure we're going to get woken up */
594
595 while (!pgaio_io_was_recycled(ioh, ref_generation, &state))
596 {
599 break;
600 ConditionVariableSleep(&ioh->cv, WAIT_EVENT_AIO_IO_COMPLETION);
601 }
602
604 break;
605
608 /* see above */
609 if (am_owner)
610 pgaio_io_reclaim(ioh);
611 return;
612 }
613 }
614}
615
616/*
617 * Make IO handle ready to be reused after IO has completed or after the
618 * handle has been released without being used.
619 */
620static void
622{
623 /* This is only ok if it's our IO */
625 Assert(ioh->state != PGAIO_HS_IDLE);
626
627 /*
628 * It's a bit ugly, but right now the easiest place to put the execution
629 * of shared completion callbacks is this function, as we need to execute
630 * local callbacks just before reclaiming at multiple callsites.
631 */
633 {
636 }
637
639 "reclaiming: distilled_result: (status %s, id %u, error_data %d), raw_result: %d",
641 ioh->distilled_result.id,
643 ioh->result);
644
645 /* if the IO has been defined, we might need to do more work */
646 if (ioh->state != PGAIO_HS_HANDED_OUT)
647 {
649
650 if (ioh->report_return)
651 {
654 }
655 }
656
657 if (ioh->resowner)
658 {
660 ioh->resowner = NULL;
661 }
662
663 Assert(!ioh->resowner);
664
665 ioh->op = PGAIO_OP_INVALID;
667 ioh->flags = 0;
668 ioh->num_callbacks = 0;
669 ioh->handle_data_len = 0;
670 ioh->report_return = NULL;
671 ioh->result = 0;
673
674 /* XXX: the barrier is probably superfluous */
676 ioh->generation++;
677
679
680 /*
681 * We push the IO to the head of the idle IO list, that seems more cache
682 * efficient in cases where only a few IOs are used.
683 */
685}
686
687/*
688 * Wait for an IO handle to become usable.
689 *
690 * This only really is useful for pgaio_io_acquire().
691 */
692static void
694{
695 int reclaimed = 0;
696
697 pgaio_debug(DEBUG2, "waiting for self with %d pending",
699
700 /*
701 * First check if any of our IOs actually have completed - when using
702 * worker, that'll often be the case. We could do so as part of the loop
703 * below, but that'd potentially lead us to wait for some IO submitted
704 * before.
705 */
706 for (int i = 0; i < io_max_concurrency; i++)
707 {
709
711 {
712 pgaio_io_reclaim(ioh);
713 reclaimed++;
714 }
715 }
716
717 if (reclaimed > 0)
718 return;
719
720 /*
721 * If we have any unsubmitted IOs, submit them now. We'll start waiting in
722 * a second, so it's better they're in flight. This also addresses the
723 * edge-case that all IOs are unsubmitted.
724 */
727
729 elog(ERROR, "no free IOs despite no in-flight IOs");
730
731 /*
732 * Wait for the oldest in-flight IO to complete.
733 *
734 * XXX: Reusing the general IO wait is suboptimal, we don't need to wait
735 * for that specific IO to complete, we just need *any* IO to complete.
736 */
737 {
740
741 switch (ioh->state)
742 {
743 /* should not be in in-flight list */
744 case PGAIO_HS_IDLE:
745 case PGAIO_HS_DEFINED:
747 case PGAIO_HS_STAGED:
749 elog(ERROR, "shouldn't get here with io:%d in state %d",
750 pgaio_io_get_id(ioh), ioh->state);
751 break;
752
756 "waiting for free io with %d in flight",
758
759 /*
760 * In a more general case this would be racy, because the
761 * generation could increase after we read ioh->state above.
762 * But we are only looking at IOs by the current backend and
763 * the IO can only be recycled by this backend.
764 */
765 pgaio_io_wait(ioh, ioh->generation);
766 break;
767
769 /* it's possible that another backend just finished this IO */
770 pgaio_io_reclaim(ioh);
771 break;
772 }
773
775 elog(PANIC, "no idle IO after waiting for IO to terminate");
776 return;
777 }
778}
779
780/*
781 * Internal - code outside of AIO should never need this and it'd be hard for
782 * such code to be safe.
783 */
784static PgAioHandle *
786{
787 PgAioHandle *ioh;
788
790
791 ioh = &pgaio_ctl->io_handles[iow->aio_index];
792
793 *ref_generation = ((uint64) iow->generation_upper) << 32 |
794 iow->generation_lower;
795
796 Assert(*ref_generation != 0);
797
798 return ioh;
799}
800
801static const char *
803{
804#define PGAIO_HS_TOSTR_CASE(sym) case PGAIO_HS_##sym: return #sym
805 switch (s)
806 {
808 PGAIO_HS_TOSTR_CASE(HANDED_OUT);
809 PGAIO_HS_TOSTR_CASE(DEFINED);
810 PGAIO_HS_TOSTR_CASE(STAGED);
811 PGAIO_HS_TOSTR_CASE(SUBMITTED);
812 PGAIO_HS_TOSTR_CASE(COMPLETED_IO);
813 PGAIO_HS_TOSTR_CASE(COMPLETED_SHARED);
814 PGAIO_HS_TOSTR_CASE(COMPLETED_LOCAL);
815 }
816#undef PGAIO_HS_TOSTR_CASE
817
818 return NULL; /* silence compiler */
819}
820
/*
 * Return a human-readable name for the handle's current state, for use in
 * debug/log output.  Thin wrapper around pgaio_io_state_get_name().
 *
 * NOTE(review): the function-name line (aio.c:822, pgaio_io_get_state_name
 * per the index at the bottom of this dump) was dropped by the extraction.
 */
821const char *
823{
824 return pgaio_io_state_get_name(ioh->state);
825}
826
/*
 * Map a PgAioResultStatus value to a static string, for error reporting and
 * logging (see pgaio_result_report() callers mentioned above).
 *
 * NOTE(review): the function-name line (aio.c:828, pgaio_result_status_string
 * per the index at the bottom of this dump) was dropped by the extraction.
 */
827const char *
829{
830 switch (rs)
831 {
832 case PGAIO_RS_UNKNOWN:
833 return "UNKNOWN";
834 case PGAIO_RS_OK:
835 return "OK";
836 case PGAIO_RS_PARTIAL:
837 return "PARTIAL";
838 case PGAIO_RS_ERROR:
839 return "ERROR";
840 }
841
/* unreachable for valid enum values; keeps non-exhaustiveness warnings quiet */
842 return NULL; /* silence compiler */
843}
844
845
846
847/* --------------------------------------------------------------------------------
848 * Functions primarily related to IO Wait References
849 * --------------------------------------------------------------------------------
850 */
851
852/*
853 * Mark a wait reference as invalid
854 */
855void
857{
859}
860
861/* Is the wait reference valid? */
/*
 * Report whether the wait reference points at a (potentially live) IO.
 * An aio_index of PG_UINT32_MAX marks a cleared reference — presumably set
 * by pgaio_wref_clear(), whose body was dropped from this dump; verify
 * against the full source.
 *
 * NOTE(review): the function-name line (aio.c:863, pgaio_wref_valid per the
 * index at the bottom of this dump) was dropped by the extraction.
 */
862bool
864{
865 return iow->aio_index != PG_UINT32_MAX;
866}
867
868/*
869 * Similar to pgaio_io_get_id(), just for wait references.
870 */
871int
873{
875 return iow->aio_index;
876}
877
878/*
879 * Wait for the IO to have completed. Can be called in any process, not just
880 * in the issuing backend.
881 */
/*
 * Block until the IO referenced by *iow has completed.  Works from any
 * process, not just the issuing backend (see comment above).
 *
 * Resolves the wait reference into a handle + generation pair, then
 * delegates to pgaio_io_wait(); if the handle was already recycled for a
 * newer IO (generation mismatch), pgaio_io_wait() returns immediately.
 *
 * NOTE(review): the function-name line (aio.c:883, pgaio_wref_wait per the
 * index at the bottom of this dump) was dropped by the extraction.
 */
882void
884{
885 uint64 ref_generation;
886 PgAioHandle *ioh;
887
888 ioh = pgaio_io_from_wref(iow, &ref_generation);
889
890 pgaio_io_wait(ioh, ref_generation);
891}
892
893/*
894 * Check if the referenced IO completed, without blocking.
895 */
896bool
898{
899 uint64 ref_generation;
901 bool am_owner;
902 PgAioHandle *ioh;
903
904 ioh = pgaio_io_from_wref(iow, &ref_generation);
905
906 if (pgaio_io_was_recycled(ioh, ref_generation, &state))
907 return true;
908
909 if (state == PGAIO_HS_IDLE)
910 return true;
911
912 am_owner = ioh->owner_procno == MyProcNumber;
913
916 {
917 if (am_owner)
918 pgaio_io_reclaim(ioh);
919 return true;
920 }
921
922 /*
923 * XXX: It likely would be worth checking in with the io method, to give
924 * the IO method a chance to check if there are completion events queued.
925 */
926
927 return false;
928}
929
930
931
932/* --------------------------------------------------------------------------------
933 * Actions on multiple IOs.
934 * --------------------------------------------------------------------------------
935 */
936
937/*
938 * Submit IOs in batches going forward.
939 *
940 * Submitting multiple IOs at once can be substantially faster than doing so
941 * one-by-one. At the same time, submitting multiple IOs at once requires more
942 * care to avoid deadlocks.
943 *
944 * Consider backend A staging an IO for buffer 1 and then trying to start IO
945 * on buffer 2, while backend B does the inverse. If A submitted the IO before
946 * moving on to buffer 2, this works just fine, B will wait for the IO to
947 * complete. But if batching were used, each backend will wait for IO that has
948 * not yet been submitted to complete, i.e. forever.
949 *
950 * End batch submission mode with pgaio_exit_batchmode(). (Throwing errors is
951 * allowed; error recovery will end the batch.)
952 *
953 * To avoid deadlocks, code needs to ensure that it will not wait for another
954 * backend while there is unsubmitted IO. E.g. by using conditional lock
955 * acquisition when acquiring buffer locks. To check if there currently are
956 * staged IOs, call pgaio_have_staged() and to submit all staged IOs call
957 * pgaio_submit_staged().
958 *
959 * It is not allowed to enter batchmode while already in batchmode, it's
960 * unlikely to ever be needed, as code needs to be explicitly aware of being
961 * called in batchmode, to avoid the deadlock risks explained above.
962 *
963 * Note that IOs may get submitted before pgaio_exit_batchmode() is called,
964 * e.g. because too many IOs have been staged or because pgaio_submit_staged()
965 * was called.
966 */
967void
969{
971 elog(ERROR, "starting batch while batch already in progress");
973}
974
975/*
976 * Stop submitting IOs in batches.
977 */
978void
980{
982
985}
986
987/*
988 * Are there staged but unsubmitted IOs?
989 *
990 * See comment above pgaio_enter_batchmode() for why code may need to check if
991 * there is IO in that state.
992 */
993bool
995{
999}
1000
1001/*
1002 * Submit all staged but not yet submitted IOs.
1003 *
1004 * Unless in batch mode, this never needs to be called, as IOs get submitted
1005 * as soon as possible. While in batchmode pgaio_submit_staged() can be called
1006 * before waiting on another backend, to avoid the risk of deadlocks. See
1007 * pgaio_enter_batchmode().
1008 */
1009void
1011{
1012 int total_submitted = 0;
1013 int did_submit;
1014
1016 return;
1017
1018
1020
1023
1025
1026 total_submitted += did_submit;
1027
1028 Assert(total_submitted == did_submit);
1029
1031
1033 "aio: submitted %d IOs",
1034 total_submitted);
1035}
1036
1037
1038
1039/* --------------------------------------------------------------------------------
1040 * Other
1041 * --------------------------------------------------------------------------------
1042 */
1043
1044
1045/*
1046 * Perform AIO related cleanup after an error.
1047 *
1048 * This should be called early in the error recovery paths, as later steps may
1049 * need to issue AIO (e.g. to record a transaction abort WAL record).
1050 */
1051void
1053{
1054 /*
1055 * It is possible that code errored out after pgaio_enter_batchmode() but
1056 * before pgaio_exit_batchmode() was called. In that case we need to
1057 * submit the IO now.
1058 */
1060 {
1062
1064 }
1065
1066 /*
1067 * As we aren't in batchmode, there shouldn't be any unsubmitted IOs.
1068 */
1070}
1071
1072/*
1073 * Perform AIO related checks at (sub-)transactional boundaries.
1074 *
1075 * This should be called late during (sub-)transactional commit/abort, after
1076 * all steps that might need to perform AIO, so that we can verify that the
1077 * AIO subsystem is in a valid state at the end of a transaction.
1078 */
/*
 * Verify AIO state at (sub-)transaction end; see the comment block above.
 * is_commit is not used in the lines visible here — TODO confirm against the
 * full source.
 */
1079void
1080AtEOXact_Aio(bool is_commit)
1081{
1082 /*
1083 * We should never be in batch mode at transactional boundaries. In case
1084 * an error was thrown while in batch mode, pgaio_error_cleanup() should
1085 * have exited batchmode.
1086 *
1087 * In case we are in batchmode somehow, make sure to submit all staged
1088 * IOs, other backends may need them to complete to continue.
1089 */
/*
 * NOTE(review): the extraction dropped line 1090 (presumably the
 * in_batchmode condition guarding this block) and line 1092 (presumably
 * the call that exits batchmode / submits staged IOs) — verify against the
 * full source.
 */
1091 {
1093 elog(WARNING, "open AIO batch at end of (sub-)transaction");
1094 }
1095
1096 /*
1097 * As we aren't in batchmode, there shouldn't be any unsubmitted IOs.
1098 */
/*
 * NOTE(review): line 1099 (presumably an assertion that no IOs remain
 * staged) was dropped by the extraction — verify against the full source.
 */
1100}
1101
1102/*
1103 * Need to submit staged but not yet submitted IOs using the fd, otherwise
1104 * the IO would end up targeting something bogus.
1105 */
1106void
1108{
1109 /*
1110 * Might be called before AIO is initialized or in a subprocess that
1111 * doesn't use AIO.
1112 */
1113 if (!pgaio_my_backend)
1114 return;
1115
1116 /*
1117 * For now just submit all staged IOs - we could be more selective, but
1118 * it's probably not worth it.
1119 */
1121}
1122
1123/*
1124 * Registered as before_shmem_exit() callback in pgaio_init_backend()
1125 */
1126void
1128{
1131
1132 /* first clean up resources as we would at a transaction boundary */
1133 AtEOXact_Aio(code == 0);
1134
1135 /*
1136 * Before exiting, make sure that all IOs are finished. That has two main
1137 * purposes:
1138 *
1139 * - Some kernel-level AIO mechanisms don't deal well with the issuer of
1140 * an AIO exiting before IO completed
1141 *
1142 * - It'd be confusing to see partially finished IOs in stats views etc
1143 */
1145 {
1147
1148 /* see comment in pgaio_io_wait_for_free() about raciness */
1149 pgaio_io_wait(ioh, ioh->generation);
1150 }
1151
1152 pgaio_my_backend = NULL;
1153}
1154
1155void
1156assign_io_method(int newval, void *extra)
1157{
1160
1162}
1163
/*
 * GUC check hook for io_max_concurrency (signature per the index below:
 * bool check_io_max_concurrency(int *newval, void **extra, GucSource source)).
 *
 * Accepts -1 (auto-tuned later during startup) and any positive value;
 * rejects 0 with an errdetail.  Other negative values are not handled here —
 * presumably excluded by the GUC's declared range; TODO confirm.
 *
 * NOTE(review): the function-name line (aio.c:1165) was dropped by the
 * extraction.
 */
1164bool
1166{
1167 if (*newval == -1)
1168 {
1169 /*
1170 * Auto-tuning will be applied later during startup, as auto-tuning
1171 * depends on the value of various GUCs.
1172 */
1173 return true;
1174 }
1175 else if (*newval == 0)
1176 {
1177 GUC_check_errdetail("Only -1 or values bigger than 0 are valid.");
1178 return false;
1179 }
1180
1181 return true;
1182}
1183
1184
1185
1186/* --------------------------------------------------------------------------------
1187 * Injection point support
1188 * --------------------------------------------------------------------------------
1189 */
1190
1191#ifdef USE_INJECTION_POINTS
1192
1193/*
1194 * Call injection point with support for pgaio_inj_io_get().
1195 */
/*
 * Invoke the named injection point with 'ioh' stashed in
 * pgaio_inj_cur_handle, so the callback can retrieve it via
 * pgaio_inj_io_get() — injection points currently can't take arguments
 * (see the comment near the top of this file).
 *
 * PG_FINALLY guarantees the stash is cleared even if the injection point
 * throws, so a stale handle cannot leak into a later invocation.
 */
1196void
1197pgaio_io_call_inj(PgAioHandle *ioh, const char *injection_point)
1198{
1199 pgaio_inj_cur_handle = ioh;
1200
1201 PG_TRY();
1202 {
1203 InjectionPointCached(injection_point);
1204 }
1205 PG_FINALLY();
1206 {
1207 pgaio_inj_cur_handle = NULL;
1208 }
1209 PG_END_TRY();
1210}
1211
1212/*
1213 * Return IO associated with injection point invocation. This is only needed
1214 * as injection points currently don't support arguments.
1215 */
1217pgaio_inj_io_get(void)
1218{
1219 return pgaio_inj_cur_handle;
1220}
1221
1222#endif
void pgaio_io_process_completion(PgAioHandle *ioh, int result)
Definition: aio.c:492
int io_method
Definition: aio.c:72
bool pgaio_wref_valid(PgAioWaitRef *iow)
Definition: aio.c:863
int pgaio_io_get_id(PgAioHandle *ioh)
Definition: aio.c:322
PgAioBackend * pgaio_my_backend
Definition: aio.c:79
const char * pgaio_result_status_string(PgAioResultStatus rs)
Definition: aio.c:828
PgAioHandle * pgaio_io_acquire(struct ResourceOwnerData *resowner, PgAioReturn *ret)
Definition: aio.c:165
void assign_io_method(int newval, void *extra)
Definition: aio.c:1156
static void pgaio_io_update_state(PgAioHandle *ioh, PgAioHandleState new_state)
Definition: aio.c:366
void pgaio_wref_clear(PgAioWaitRef *iow)
Definition: aio.c:856
bool pgaio_io_needs_synchronous_execution(PgAioHandle *ioh)
Definition: aio.c:447
static void pgaio_io_wait_for_free(void)
Definition: aio.c:693
#define PGAIO_HS_TOSTR_CASE(sym)
static const char * pgaio_io_state_get_name(PgAioHandleState s)
Definition: aio.c:802
void pgaio_io_release_resowner(dlist_node *ioh_node, bool on_error)
Definition: aio.c:254
static void pgaio_io_resowner_register(PgAioHandle *ioh)
Definition: aio.c:382
static PgAioHandle * pgaio_io_from_wref(PgAioWaitRef *iow, uint64 *ref_generation)
Definition: aio.c:785
void pgaio_io_get_wref(PgAioHandle *ioh, PgAioWaitRef *iow)
Definition: aio.c:346
void pgaio_closing_fd(int fd)
Definition: aio.c:1107
void pgaio_io_stage(PgAioHandle *ioh, PgAioOp op)
Definition: aio.c:397
int io_max_concurrency
Definition: aio.c:73
void pgaio_io_set_flag(PgAioHandle *ioh, PgAioHandleFlags flag)
Definition: aio.c:310
bool pgaio_have_staged(void)
Definition: aio.c:994
PgAioCtl * pgaio_ctl
Definition: aio.c:76
const IoMethodOps * pgaio_method_ops
Definition: aio.c:88
bool pgaio_wref_check_done(PgAioWaitRef *iow)
Definition: aio.c:897
static const IoMethodOps *const pgaio_method_ops_table[]
Definition: aio.c:82
static void pgaio_io_reclaim(PgAioHandle *ioh)
Definition: aio.c:621
ProcNumber pgaio_io_get_owner(PgAioHandle *ioh)
Definition: aio.c:335
void pgaio_enter_batchmode(void)
Definition: aio.c:968
void pgaio_submit_staged(void)
Definition: aio.c:1010
const char * pgaio_io_get_state_name(PgAioHandle *ioh)
Definition: aio.c:822
const struct config_enum_entry io_method_options[]
Definition: aio.c:65
bool pgaio_io_was_recycled(PgAioHandle *ioh, uint64 ref_generation, PgAioHandleState *state)
Definition: aio.c:523
void pgaio_io_prepare_submit(PgAioHandle *ioh)
Definition: aio.c:474
void pgaio_wref_wait(PgAioWaitRef *iow)
Definition: aio.c:883
void pgaio_error_cleanup(void)
Definition: aio.c:1052
void pgaio_io_release(PgAioHandle *ioh)
Definition: aio.c:234
int pgaio_wref_get_id(PgAioWaitRef *iow)
Definition: aio.c:872
void AtEOXact_Aio(bool is_commit)
Definition: aio.c:1080
void pgaio_shutdown(int code, Datum arg)
Definition: aio.c:1127
bool check_io_max_concurrency(int *newval, void **extra, GucSource source)
Definition: aio.c:1165
static void pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation)
Definition: aio.c:536
void pgaio_exit_batchmode(void)
Definition: aio.c:979
PgAioHandle * pgaio_io_acquire_nb(struct ResourceOwnerData *resowner, PgAioReturn *ret)
Definition: aio.c:191
@ IOMETHOD_WORKER
Definition: aio.h:30
@ IOMETHOD_SYNC
Definition: aio.h:29
@ PGAIO_TID_INVALID
Definition: aio.h:111
PgAioOp
Definition: aio.h:80
@ PGAIO_OP_INVALID
Definition: aio.h:82
PgAioHandleFlags
Definition: aio.h:41
@ PGAIO_HF_SYNCHRONOUS
Definition: aio.h:62
#define DEFAULT_IO_METHOD
Definition: aio.h:34
void pgaio_io_call_stage(PgAioHandle *ioh)
Definition: aio_callback.c:188
void pgaio_io_call_complete_local(PgAioHandle *ioh)
Definition: aio_callback.c:268
void pgaio_io_call_complete_shared(PgAioHandle *ioh)
Definition: aio_callback.c:214
PgAioHandleState
Definition: aio_internal.h:39
@ PGAIO_HS_STAGED
Definition: aio_internal.h:61
@ PGAIO_HS_COMPLETED_SHARED
Definition: aio_internal.h:77
@ PGAIO_HS_DEFINED
Definition: aio_internal.h:54
@ PGAIO_HS_SUBMITTED
Definition: aio_internal.h:64
@ PGAIO_HS_IDLE
Definition: aio_internal.h:41
@ PGAIO_HS_HANDED_OUT
Definition: aio_internal.h:48
@ PGAIO_HS_COMPLETED_IO
Definition: aio_internal.h:67
@ PGAIO_HS_COMPLETED_LOCAL
Definition: aio_internal.h:84
#define pgaio_io_call_inj(ioh, injection_point)
Definition: aio_internal.h:376
#define pgaio_debug(elevel, msg,...)
Definition: aio_internal.h:345
#define pgaio_debug_io(elevel, ioh, msg,...)
Definition: aio_internal.h:358
#define PGAIO_SUBMIT_BATCH_SIZE
Definition: aio_internal.h:28
void pgaio_io_perform_synchronously(PgAioHandle *ioh)
Definition: aio_io.c:116
bool pgaio_io_has_target(PgAioHandle *ioh)
Definition: aio_target.c:38
PgAioResultStatus
Definition: aio_types.h:75
@ PGAIO_RS_OK
Definition: aio_types.h:77
@ PGAIO_RS_UNKNOWN
Definition: aio_types.h:76
@ PGAIO_RS_PARTIAL
Definition: aio_types.h:78
@ PGAIO_RS_ERROR
Definition: aio_types.h:79
#define pg_read_barrier()
Definition: atomics.h:156
#define pg_write_barrier()
Definition: atomics.h:157
#define PG_UINT32_MAX
Definition: c.h:561
uint64_t uint64
Definition: c.h:503
uint32_t uint32
Definition: c.h:502
#define lengthof(array)
Definition: c.h:759
bool ConditionVariableCancelSleep(void)
void ConditionVariableBroadcast(ConditionVariable *cv)
void ConditionVariablePrepareToSleep(ConditionVariable *cv)
void ConditionVariableSleep(ConditionVariable *cv, uint32 wait_event_info)
#define DEBUG3
Definition: elog.h:28
#define PG_TRY(...)
Definition: elog.h:371
#define WARNING
Definition: elog.h:36
#define DEBUG2
Definition: elog.h:29
#define PG_END_TRY(...)
Definition: elog.h:396
#define PANIC
Definition: elog.h:42
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:225
#define PG_FINALLY(...)
Definition: elog.h:388
#define DEBUG5
Definition: elog.h:26
#define DEBUG4
Definition: elog.h:27
ProcNumber MyProcNumber
Definition: globals.c:89
bool IsUnderPostmaster
Definition: globals.c:119
volatile uint32 CritSectionCount
Definition: globals.c:44
#define newval
#define GUC_check_errdetail
Definition: guc.h:481
GucSource
Definition: guc.h:112
Assert(PointerIsAligned(start, uint64))
#define dclist_container(type, membername, ptr)
Definition: ilist.h:947
#define dclist_head_element(type, membername, lhead)
Definition: ilist.h:955
static void dclist_push_tail(dclist_head *head, dlist_node *node)
Definition: ilist.h:709
static uint32 dclist_count(const dclist_head *head)
Definition: ilist.h:932
static bool dclist_is_empty(const dclist_head *head)
Definition: ilist.h:682
static void dclist_delete_from(dclist_head *head, dlist_node *node)
Definition: ilist.h:763
static dlist_node * dclist_pop_head_node(dclist_head *head)
Definition: ilist.h:789
static void dclist_push_head(dclist_head *head, dlist_node *node)
Definition: ilist.h:693
#define dlist_container(type, membername, ptr)
Definition: ilist.h:593
void InjectionPointCached(const char *name)
int i
Definition: isn.c:74
const IoMethodOps pgaio_sync_ops
Definition: method_sync.c:28
const IoMethodOps pgaio_worker_ops
Definition: method_worker.c:82
#define START_CRIT_SECTION()
Definition: miscadmin.h:149
#define END_CRIT_SECTION()
Definition: miscadmin.h:151
void * arg
static rewind_source * source
Definition: pg_rewind.c:89
uintptr_t Datum
Definition: postgres.h:69
static int fd(const char *x, int i)
Definition: preproc-init.c:105
int ProcNumber
Definition: procnumber.h:24
ResourceOwner CurrentResourceOwner
Definition: resowner.c:173
void ResourceOwnerRememberAioHandle(ResourceOwner owner, struct dlist_node *ioh_node)
Definition: resowner.c:1104
void ResourceOwnerForgetAioHandle(ResourceOwner owner, struct dlist_node *ioh_node)
Definition: resowner.c:1110
int(* submit)(uint16 num_staged_ios, PgAioHandle **staged_ios)
Definition: aio_internal.h:285
void(* wait_one)(PgAioHandle *ioh, uint64 ref_generation)
Definition: aio_internal.h:294
bool(* needs_synchronous_execution)(PgAioHandle *ioh)
Definition: aio_internal.h:274
uint32 io_handle_off
Definition: aio_internal.h:183
dclist_head in_flight_ios
Definition: aio_internal.h:214
uint16 num_staged_ios
Definition: aio_internal.h:203
dclist_head idle_ios
Definition: aio_internal.h:186
PgAioHandle * staged_ios[PGAIO_SUBMIT_BATCH_SIZE]
Definition: aio_internal.h:204
PgAioHandle * handed_out_io
Definition: aio_internal.h:195
PgAioHandle * io_handles
Definition: aio_internal.h:241
uint32 io_handle_count
Definition: aio_internal.h:240
PgAioTargetData target_data
Definition: aio_internal.h:176
struct ResourceOwnerData * resowner
Definition: aio_internal.h:137
int32 owner_procno
Definition: aio_internal.h:120
PgAioResult distilled_result
Definition: aio_internal.h:151
dlist_node node
Definition: aio_internal.h:135
uint8 handle_data_len
Definition: aio_internal.h:117
PgAioOp op
Definition: aio_internal.h:100
PgAioReturn * report_return
Definition: aio_internal.h:166
uint64 generation
Definition: aio_internal.h:141
uint8 num_callbacks
Definition: aio_internal.h:105
PgAioHandleState state
Definition: aio_internal.h:94
dlist_node resowner_node
Definition: aio_internal.h:138
PgAioTargetID target
Definition: aio_internal.h:97
ConditionVariable cv
Definition: aio_internal.h:148
uint32 status
Definition: aio_types.h:95
uint32 error_data
Definition: aio_types.h:98
uint32 id
Definition: aio_types.h:92
PgAioResult result
Definition: aio_types.h:112
PgAioTargetData target_data
Definition: aio_types.h:113
uint32 generation_upper
Definition: aio_types.h:45
uint32 aio_index
Definition: aio_types.h:35
uint32 generation_lower
Definition: aio_types.h:46
Definition: guc.h:174
Definition: regguts.h:323
char * flag(int b)
Definition: test-ctype.c:33