PostgreSQL Source Code git master
All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
aio_internal.h
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * aio_internal.h
4 * AIO related declarations that should only be used by the AIO subsystem
5 * internally.
6 *
7 *
8 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
9 * Portions Copyright (c) 1994, Regents of the University of California
10 *
11 * src/include/storage/aio_internal.h
12 *
13 *-------------------------------------------------------------------------
14 */
15#ifndef AIO_INTERNAL_H
16#define AIO_INTERNAL_H
17
18
19#include "lib/ilist.h"
20#include "port/pg_iovec.h"
21#include "storage/aio.h"
23
24
25/*
26 * The maximum number of IOs that can be batch submitted at once.
27 */
28#define PGAIO_SUBMIT_BATCH_SIZE 32
29
30
31
32/*
33 * State machine for handles. With some exceptions, noted below, handles move
34 * linearly through all states.
35 *
36 * State changes should all go through pgaio_io_update_state().
37 *
38 * Note that the externally visible functions to start IO
39 * (e.g. FileStartReadV(), via pgaio_io_start_readv()) move an IO from
40 * PGAIO_HS_HANDED_OUT to at least PGAIO_HS_STAGED and at most
41 * PGAIO_HS_COMPLETED_LOCAL (at which point the handle will be reused).
42 */
43typedef enum PgAioHandleState
44{
45 /* not in use */
47
48 /*
49 * Returned by pgaio_io_acquire(). The next state is either DEFINED (if
50 * pgaio_io_start_*() is called), or IDLE (if pgaio_io_release() is
51 * called).
52 */
54
55 /*
56 * pgaio_io_start_*() has been called, but IO is not yet staged. At this
57 * point the handle has all the information for the IO to be executed.
58 */
60
61 /*
62 * stage() callbacks have been called, handle ready to be submitted for
63 * execution. Unless in batchmode (see pgaio_enter_batchmode()), the
64 * IO will be submitted immediately after.
65 */
67
68 /* IO has been submitted to the IO method for execution */
70
71 /* IO finished, but result has not yet been processed */
73
74 /*
75 * IO completed, shared completion has been called.
76 *
77 * If the IO completion occurs in the issuing backend, local callbacks
78 * will immediately be called. Otherwise the handle stays in
79 * COMPLETED_SHARED until the issuing backend waits for the completion of
80 * the IO.
81 */
83
84 /*
85 * IO completed, local completion has been called.
86 *
87 * After this the handle will be made reusable and go into IDLE state.
88 */
91
92
94
95/* typedef is in aio_types.h */
97{
98 /* all state updates should go through pgaio_io_update_state() */
100
101 /* what are we operating on */
103
104 /* which IO operation */
106
107 /* bitfield of PgAioHandleFlags */
109
111
112 /* using the proper type here would use more space */
114
115 /* data forwarded to each callback */
117
118 /*
119 * Length of data associated with handle using
120 * pgaio_io_set_handle_data_*().
121 */
123
124 /* XXX: could be optimized out with some pointer math */
126
127 /* raw result of the IO operation */
129
141
144
145 /* incremented every time the IO handle is reused */
147
148 /*
149 * To wait for the IO to complete other backends can wait on this CV. Note
150 * that, if in SUBMITTED state, a waiter first needs to check if it needs
151 * to do work via IoMethodOps->wait_one().
152 */
154
155 /* result of shared callback, passed to issuer callback */
157
158 /*
159 * Index into PgAioCtl->iovecs and PgAioCtl->handle_data.
160 *
161 * At the moment there's no need to differentiate between the two, but
162 * that won't necessarily stay that way.
163 */
165
166 /*
167 * If not NULL, this memory location will be updated with information
168 * about the IO's completion iff the issuing backend learns about the IO's
169 * completion.
170 */
172
173 /* Data necessary for the IO to be performed */
175
176 /*
177 * Data necessary to identify the object undergoing IO to higher-level
178 * code. Needs to be sufficient to allow another backend to reopen the
179 * file.
180 */
182};
183
184
185typedef struct PgAioBackend
186{
187 /* index into PgAioCtl->io_handles */
189
190 /* IO Handles that currently are not used */
192
193 /*
194 * Only one IO may be returned by pgaio_io_acquire()/pgaio_io_acquire_nb()
195 * without having been either defined (by actually associating it with IO)
196 * or released (with pgaio_io_release()). This restriction is necessary to
197 * guarantee that we always can acquire an IO. ->handed_out_io is used to
198 * enforce that rule.
199 */
201
202 /* Are we currently in batchmode? See pgaio_enter_batchmode(). */
204
205 /*
206 * IOs that are defined, but not yet submitted.
207 */
210
211 /*
212 * List of in-flight IOs. Also contains IOs that aren't strictly speaking
213 * in-flight anymore, but have been waited-for and completed by another
214 * backend. Once this backend sees such an IO it'll be reclaimed.
215 *
216 * The list is ordered by submission time, with more recently submitted
217 * IOs being appended at the end.
218 */
221
222
223typedef struct PgAioCtl
224{
227
228 /*
229 * Array of iovec structs. Each iovec is owned by a specific backend. The
230 * allocation is in PgAioCtl to allow the maximum number of iovecs for
231 * individual IOs to be configurable with PGC_POSTMASTER GUC.
232 */
234 struct iovec *iovecs;
235
236 /*
237 * For, e.g., an IO covering multiple buffers in shared / temp buffers, we
238 * need to get Buffer IDs during completion to be able to change the
239 * BufferDesc state accordingly. This space can be used to store e.g.
240 * Buffer IDs. Note that the actual iovec might be shorter than this,
241 * because we combine neighboring pages into one larger iovec entry.
242 */
244
248
249
250
251/*
252 * Callbacks used to implement an IO method.
253 */
254typedef struct IoMethodOps
255{
256 /* properties */
257
258 /*
259 * If an FD is about to be closed, do we need to wait for all in-flight
260 * IOs referencing that FD?
261 */
263
264
265 /* global initialization */
266
267 /*
268 * Amount of additional shared memory to reserve for the io_method. Called
269 * just like a normal ipci.c style *Size() function. Optional.
270 */
271 size_t (*shmem_size) (void);
272
273 /*
274 * Initialize shared memory. first_time is true if AIO's shared memory was
275 * just initialized, false otherwise. Optional.
276 */
277 void (*shmem_init) (bool first_time);
278
279 /*
280 * Per-backend initialization. Optional.
281 */
282 void (*init_backend) (void);
283
284
285 /* handling of IOs */
286
287 /* optional */
289
290 /*
291 * Start executing passed in IOs.
292 *
293 * Shall advance state to at least PGAIO_HS_SUBMITTED. (By the time this
294 * returns, other backends might have advanced the state further.)
295 *
296 * Will not be called if ->needs_synchronous_execution() returned true.
297 *
298 * num_staged_ios is <= PGAIO_SUBMIT_BATCH_SIZE.
299 *
300 * Always called in a critical section.
301 */
302 int (*submit) (uint16 num_staged_ios, PgAioHandle **staged_ios);
303
304 /* ---
305 * Wait for the IO to complete. Optional.
306 *
307 * On return, state shall be one of
308 * - PGAIO_HS_COMPLETED_IO
309 * - PGAIO_HS_COMPLETED_SHARED
310 * - PGAIO_HS_COMPLETED_LOCAL
311 *
312 * The callback must not block if the handle is already in one of those
313 * states, or has been reused (see pgaio_io_was_recycled()). If, on
314 * return, the state is PGAIO_HS_COMPLETED_IO, state will reach
315 * PGAIO_HS_COMPLETED_SHARED without further intervention by the IO
316 * method.
317 *
318 * If not provided, it needs to be guaranteed that the IO method calls
319 * pgaio_io_process_completion() without further interaction by the
320 * issuing backend.
321 * ---
322 */
323 void (*wait_one) (PgAioHandle *ioh,
324 uint64 ref_generation);
326
327
328/* aio.c */
329extern bool pgaio_io_was_recycled(PgAioHandle *ioh, uint64 ref_generation, PgAioHandleState *state);
330extern void pgaio_io_stage(PgAioHandle *ioh, PgAioOp op);
331extern void pgaio_io_process_completion(PgAioHandle *ioh, int result);
332extern void pgaio_io_prepare_submit(PgAioHandle *ioh);
334extern const char *pgaio_io_get_state_name(PgAioHandle *ioh);
336extern void pgaio_shutdown(int code, Datum arg);
337
338/* aio_callback.c */
339extern void pgaio_io_call_stage(PgAioHandle *ioh);
342
343/* aio_io.c */
345extern const char *pgaio_io_get_op_name(PgAioHandle *ioh);
346extern bool pgaio_io_uses_fd(PgAioHandle *ioh, int fd);
347extern int pgaio_io_get_iovec_length(PgAioHandle *ioh, struct iovec **iov);
348
349/* aio_target.c */
350extern bool pgaio_io_can_reopen(PgAioHandle *ioh);
351extern void pgaio_io_reopen(PgAioHandle *ioh);
352extern const char *pgaio_io_get_target_name(PgAioHandle *ioh);
353
354
355/*
356 * The AIO subsystem has fairly verbose debug logging support. This can be
357 * enabled/disabled at build time. The reason for this is that
358 * a) the verbosity can make debugging things on higher levels hard
359 * b) even if logging can be skipped due to elevel checks, it still causes a
360 * measurable slowdown
361 *
362 * XXX: This likely should eventually be disabled by default, at least in
363 * non-assert builds.
364 */
365#define PGAIO_VERBOSE 1
366
367/*
368 * Simple ereport() wrapper that only logs if PGAIO_VERBOSE is nonzero.
369 *
370 * This intentionally still compiles the code, guarded by a constant if (0),
371 * if verbose logging is disabled, to make it less likely that debug logging
372 * is silently broken.
373 *
374 * The current definition requires passing at least one argument.
375 */
376#define pgaio_debug(elevel, msg, ...) \
377 do { \
378 if (PGAIO_VERBOSE) \
379 ereport(elevel, \
380 errhidestmt(true), errhidecontext(true), \
381 errmsg_internal(msg, \
382 __VA_ARGS__)); \
383 } while(0)
384
385/*
386 * pgaio_debug() wrapper that prefixes the message with the IO's id, op,
387 * target and state. Note that the definition requires passing at least one argument.
388 */
389#define pgaio_debug_io(elevel, ioh, msg, ...) \
390 pgaio_debug(elevel, "io %-10d|op %-5s|target %-4s|state %-16s: " msg, \
391 pgaio_io_get_id(ioh), \
392 pgaio_io_get_op_name(ioh), \
393 pgaio_io_get_target_name(ioh), \
394 pgaio_io_get_state_name(ioh), \
395 __VA_ARGS__)
396
397
398#ifdef USE_INJECTION_POINTS
399
400extern void pgaio_io_call_inj(PgAioHandle *ioh, const char *injection_point);
401
402/* just for use in tests, from within injection points */
403extern PgAioHandle *pgaio_inj_io_get(void);
404
405#else
406
407#define pgaio_io_call_inj(ioh, injection_point) (void) 0
408
409/*
410 * no fallback for pgaio_inj_io_get, all code using injection points better be
411 * guarded by USE_INJECTION_POINTS.
412 */
413
414#endif
415
416
417/* Declarations for the tables of function pointers exposed by each IO method. */
420#ifdef IOMETHOD_IO_URING_ENABLED
421extern PGDLLIMPORT const IoMethodOps pgaio_uring_ops;
422#endif
423
427
428
429
430#endif /* AIO_INTERNAL_H */
#define PGAIO_HANDLE_MAX_CALLBACKS
Definition: aio.h:267
PgAioTargetID
Definition: aio.h:117
PgAioOp
Definition: aio.h:88
void pgaio_io_process_completion(PgAioHandle *ioh, int result)
Definition: aio.c:500
void pgaio_io_perform_synchronously(PgAioHandle *ioh)
Definition: aio_io.c:116
struct IoMethodOps IoMethodOps
const char * pgaio_result_status_string(PgAioResultStatus rs)
Definition: aio.c:836
void pgaio_io_call_stage(PgAioHandle *ioh)
Definition: aio_callback.c:197
PGDLLIMPORT const IoMethodOps pgaio_worker_ops
Definition: method_worker.c:83
PgAioHandleState
Definition: aio_internal.h:44
@ PGAIO_HS_STAGED
Definition: aio_internal.h:66
@ PGAIO_HS_COMPLETED_SHARED
Definition: aio_internal.h:82
@ PGAIO_HS_DEFINED
Definition: aio_internal.h:59
@ PGAIO_HS_SUBMITTED
Definition: aio_internal.h:69
@ PGAIO_HS_IDLE
Definition: aio_internal.h:46
@ PGAIO_HS_HANDED_OUT
Definition: aio_internal.h:53
@ PGAIO_HS_COMPLETED_IO
Definition: aio_internal.h:72
@ PGAIO_HS_COMPLETED_LOCAL
Definition: aio_internal.h:89
bool pgaio_io_needs_synchronous_execution(PgAioHandle *ioh)
Definition: aio.c:455
const char * pgaio_io_get_op_name(PgAioHandle *ioh)
Definition: aio_io.c:175
PgAioResult pgaio_io_call_complete_local(PgAioHandle *ioh)
Definition: aio_callback.c:280
#define pgaio_io_call_inj(ioh, injection_point)
Definition: aio_internal.h:407
void pgaio_io_reopen(PgAioHandle *ioh)
Definition: aio_target.c:110
struct PgAioCtl PgAioCtl
bool pgaio_io_uses_fd(PgAioHandle *ioh, int fd)
Definition: aio_io.c:197
bool pgaio_io_can_reopen(PgAioHandle *ioh)
Definition: aio_target.c:99
void pgaio_io_call_complete_shared(PgAioHandle *ioh)
Definition: aio_callback.c:223
void pgaio_io_stage(PgAioHandle *ioh, PgAioOp op)
Definition: aio.c:405
PGDLLIMPORT PgAioBackend * pgaio_my_backend
Definition: aio.c:84
struct PgAioBackend PgAioBackend
int pgaio_io_get_iovec_length(PgAioHandle *ioh, struct iovec **iov)
Definition: aio_io.c:219
PGDLLIMPORT PgAioCtl * pgaio_ctl
Definition: aio.c:81
PGDLLIMPORT const IoMethodOps pgaio_sync_ops
Definition: method_sync.c:28
PGDLLIMPORT const IoMethodOps * pgaio_method_ops
Definition: aio.c:96
const char * pgaio_io_get_target_name(PgAioHandle *ioh)
Definition: aio_target.c:50
const char * pgaio_io_get_state_name(PgAioHandle *ioh)
Definition: aio.c:830
bool pgaio_io_was_recycled(PgAioHandle *ioh, uint64 ref_generation, PgAioHandleState *state)
Definition: aio.c:531
void pgaio_io_prepare_submit(PgAioHandle *ioh)
Definition: aio.c:482
void pgaio_shutdown(int code, Datum arg)
Definition: aio.c:1172
#define PGAIO_SUBMIT_BATCH_SIZE
Definition: aio_internal.h:28
PgAioResultStatus
Definition: aio_types.h:79
#define PGDLLIMPORT
Definition: c.h:1291
uint8_t uint8
Definition: c.h:500
int32_t int32
Definition: c.h:498
uint64_t uint64
Definition: c.h:503
uint16_t uint16
Definition: c.h:501
uint32_t uint32
Definition: c.h:502
void * arg
uintptr_t Datum
Definition: postgres.h:69
static int fd(const char *x, int i)
Definition: preproc-init.c:105
size_t(* shmem_size)(void)
Definition: aio_internal.h:271
bool wait_on_fd_before_close
Definition: aio_internal.h:262
void(* shmem_init)(bool first_time)
Definition: aio_internal.h:277
void(* init_backend)(void)
Definition: aio_internal.h:282
int(* submit)(uint16 num_staged_ios, PgAioHandle **staged_ios)
Definition: aio_internal.h:302
void(* wait_one)(PgAioHandle *ioh, uint64 ref_generation)
Definition: aio_internal.h:323
bool(* needs_synchronous_execution)(PgAioHandle *ioh)
Definition: aio_internal.h:288
uint32 io_handle_off
Definition: aio_internal.h:188
dclist_head in_flight_ios
Definition: aio_internal.h:219
uint16 num_staged_ios
Definition: aio_internal.h:208
dclist_head idle_ios
Definition: aio_internal.h:191
PgAioHandle * staged_ios[PGAIO_SUBMIT_BATCH_SIZE]
Definition: aio_internal.h:209
PgAioHandle * handed_out_io
Definition: aio_internal.h:200
uint32 iovec_count
Definition: aio_internal.h:233
struct iovec * iovecs
Definition: aio_internal.h:234
PgAioHandle * io_handles
Definition: aio_internal.h:246
uint32 io_handle_count
Definition: aio_internal.h:245
int backend_state_count
Definition: aio_internal.h:225
uint64 * handle_data
Definition: aio_internal.h:243
PgAioBackend * backend_state
Definition: aio_internal.h:226
PgAioTargetData target_data
Definition: aio_internal.h:181
struct ResourceOwnerData * resowner
Definition: aio_internal.h:142
int32 owner_procno
Definition: aio_internal.h:125
PgAioResult distilled_result
Definition: aio_internal.h:156
uint8 callbacks[PGAIO_HANDLE_MAX_CALLBACKS]
Definition: aio_internal.h:113
dlist_node node
Definition: aio_internal.h:140
uint8 handle_data_len
Definition: aio_internal.h:122
PgAioOp op
Definition: aio_internal.h:105
PgAioReturn * report_return
Definition: aio_internal.h:171
PgAioOpData op_data
Definition: aio_internal.h:174
uint32 iovec_off
Definition: aio_internal.h:164
uint64 generation
Definition: aio_internal.h:146
uint8 callbacks_data[PGAIO_HANDLE_MAX_CALLBACKS]
Definition: aio_internal.h:116
uint8 num_callbacks
Definition: aio_internal.h:110
PgAioHandleState state
Definition: aio_internal.h:99
dlist_node resowner_node
Definition: aio_internal.h:143
PgAioTargetID target
Definition: aio_internal.h:102
ConditionVariable cv
Definition: aio_internal.h:153
Definition: regguts.h:323