PostgreSQL Source Code git master
Loading...
Searching...
No Matches
aio_internal.h
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * aio_internal.h
4 * AIO related declarations that should only be used by the AIO subsystem
5 * internally.
6 *
7 *
8 * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
9 * Portions Copyright (c) 1994, Regents of the University of California
10 *
11 * src/include/storage/aio_internal.h
12 *
13 *-------------------------------------------------------------------------
14 */
15#ifndef AIO_INTERNAL_H
16#define AIO_INTERNAL_H
17
18
19#include "lib/ilist.h"
20#include "port/pg_iovec.h"
21#include "storage/aio.h"
23#include "storage/ipc.h"
24#include "storage/shmem.h"
25
26
27/*
28 * The maximum number of IOs that can be batch submitted at once.
29 */
30#define PGAIO_SUBMIT_BATCH_SIZE 32
31
32
33
34/*
35 * State machine for handles. With some exceptions, noted below, handles move
36 * linearly through all states.
37 *
38 * State changes should all go through pgaio_io_update_state().
39 *
40 * Note that the externally visible functions to start IO
41 * (e.g. FileStartReadV(), via pgaio_io_start_readv()) move an IO from
42 * PGAIO_HS_HANDED_OUT to at least PGAIO_HS_STAGED and at most
43 * PGAIO_HS_COMPLETED_LOCAL (at which point the handle will be reused).
44 */
45typedef enum PgAioHandleState
46{
47 /* not in use */
49
50 /*
51 * Returned by pgaio_io_acquire(). The next state is either DEFINED (if
52 * pgaio_io_start_*() is called), or IDLE (if pgaio_io_release() is
53 * called).
54 */
56
57 /*
58 * pgaio_io_start_*() has been called, but IO is not yet staged. At this
59 * point the handle has all the information for the IO to be executed.
60 */
62
63 /*
64 * stage() callbacks have been called, handle ready to be submitted for
65 * execution. Unless in batchmode (see pgaio_enter_batchmode()), the
66 * IO will be submitted immediately after.
67 */
69
70 /* IO has been submitted to the IO method for execution */
72
73 /* IO finished, but result has not yet been processed */
75
76 /*
77 * IO completed, shared completion has been called.
78 *
79 * If the IO completion occurs in the issuing backend, local callbacks
80 * will immediately be called. Otherwise the handle stays in
81 * COMPLETED_SHARED until the issuing backend waits for the completion of
82 * the IO.
83 */
85
86 /*
87 * IO completed, local completion has been called.
88 *
89 * After this the handle will be made reusable and go into IDLE state.
90 */
93
94
96
97/*
98 * Typedef is in aio_types.h
99 *
100 * We don't use the underlying enums for state, target and op to avoid wasting
101 * space. We tried using bitfields, but several compilers generate rather
102 * horrid code for that.
103 */
105{
106 /* all state updates should go through pgaio_io_update_state() */
108
109 /* what are we operating on */
111
112 /* which IO operation */
114
115 /* bitfield of PgAioHandleFlags */
117
119
120 /* using the proper type here would use more space */
122
123 /* data forwarded to each callback */
125
126 /*
127 * Length of data associated with handle using
128 * pgaio_io_set_handle_data_*().
129 */
131
132 /* XXX: could be optimized out with some pointer math */
134
135 /* raw result of the IO operation */
137
149
152
153 /* incremented every time the IO handle is reused */
155
156 /*
157 * To wait for the IO to complete other backends can wait on this CV. Note
158 * that, if in SUBMITTED state, a waiter first needs to check if it needs
159 * to do work via IoMethodOps->wait_one().
160 */
162
163 /* result of shared callback, passed to issuer callback */
165
166 /*
167 * Index into PgAioCtl->iovecs and PgAioCtl->handle_data.
168 *
169 * At the moment there's no need to differentiate between the two, but
170 * that won't necessarily stay that way.
171 */
173
174 /*
175 * If not NULL, this memory location will be updated with information
176 * about the IO's completion iff the issuing backend learns about the IO's
177 * completion.
178 */
180
181 /* Data necessary for the IO to be performed */
183
184 /*
185 * Data necessary to identify the object undergoing IO to higher-level
186 * code. Needs to be sufficient to allow another backend to reopen the
187 * file.
188 */
190};
191
192
193typedef struct PgAioBackend
194{
195 /* index into PgAioCtl->io_handles */
197
198 /* IO Handles that currently are not used */
200
201 /*
202 * Only one IO may be returned by pgaio_io_acquire()/pgaio_io_acquire_nb()
203 * without having been either defined (by actually associating it with IO)
204 * or released (with pgaio_io_release()). This restriction is necessary to
205 * guarantee that we always can acquire an IO. ->handed_out_io is used to
206 * enforce that rule.
207 */
209
210 /* Are we currently in batchmode? See pgaio_enter_batchmode(). */
212
213 /*
214 * IOs that are defined, but not yet submitted.
215 */
218
219 /*
220 * List of in-flight IOs. Also contains IOs that aren't strictly speaking
221 * in-flight anymore, but have been waited-for and completed by another
222 * backend. Once this backend sees such an IO it'll be reclaimed.
223 *
224 * The list is ordered by submission time, with more recently submitted
225 * IOs being appended at the end.
226 */
229
230
231typedef struct PgAioCtl
232{
235
236 /*
237 * Array of iovec structs. Each iovec is owned by a specific backend. The
238 * allocation is in PgAioCtl to allow the maximum number of iovecs for
239 * individual IOs to be configurable with PGC_POSTMASTER GUC.
240 */
242 struct iovec *iovecs;
243
244 /*
245 * For, e.g., an IO covering multiple buffers in shared / temp buffers, we
246 * need to get Buffer IDs during completion to be able to change the
247 * BufferDesc state accordingly. This space can be used to store e.g.
248 * Buffer IDs. Note that the actual iovec might be shorter than this,
249 * because we combine neighboring pages into one larger iovec entry.
250 */
252
256
257
258
259/*
260 * Callbacks used to implement an IO method.
261 */
262typedef struct IoMethodOps
263{
264 /* properties */
265
266 /*
267 * If an FD is about to be closed, do we need to wait for all in-flight
268 * IOs referencing that FD?
269 */
271
272 /* global initialization */
274
275 /*
276 * Per-backend initialization. Optional.
277 */
279
280
281 /* handling of IOs */
282
283 /* optional */
285
286 /*
287 * Start executing passed in IOs.
288 *
289 * Shall advance state to at least PGAIO_HS_SUBMITTED. (By the time this
290 * returns, other backends might have advanced the state further.)
291 *
292 * Will not be called if ->needs_synchronous_execution() returned true.
293 *
294 * num_staged_ios is <= PGAIO_SUBMIT_BATCH_SIZE.
295 *
296 * Always called in a critical section.
297 */
298 int (*submit) (uint16 num_staged_ios, PgAioHandle **staged_ios);
299
300 /* ---
301 * Wait for the IO to complete. Optional.
302 *
303 * On return, state shall be one of
304 * - PGAIO_HS_COMPLETED_IO
305 * - PGAIO_HS_COMPLETED_SHARED
306 * - PGAIO_HS_COMPLETED_LOCAL
307 *
308 * The callback must not block if the handle is already in one of those
309 * states, or has been reused (see pgaio_io_was_recycled()). If, on
310 * return, the state is PGAIO_HS_COMPLETED_IO, state will reach
311 * PGAIO_HS_COMPLETED_SHARED without further intervention by the IO
312 * method.
313 *
314 * If not provided, it needs to be guaranteed that the IO method calls
315 * pgaio_io_process_completion() without further interaction by the
316 * issuing backend.
317 * ---
318 */
321
322 /* ---
323 * Check if IO has already completed. Optional.
324 *
325 * Some IO methods need to poll a kernel object to see if IO has already
326 * completed in the background. This callback allows them to do so.
327 *
328 * This callback may not wait for IO to complete, however it is allowed,
329 * although not desirable, to wait for short-lived locks. It is ok from a
330 * correctness perspective to not process any/all available completions,
331 * it just can lead to inferior performance.
332 * ---
333 */
337
338
339/* aio.c */
341extern void pgaio_io_stage(PgAioHandle *ioh, PgAioOp op);
345extern const char *pgaio_io_get_state_name(PgAioHandle *ioh);
347extern void pgaio_shutdown(int code, Datum arg);
348
349/* aio_callback.c */
353
354/* aio_io.c */
356extern const char *pgaio_io_get_op_name(PgAioHandle *ioh);
357extern bool pgaio_io_uses_fd(PgAioHandle *ioh, int fd);
358extern int pgaio_io_get_iovec_length(PgAioHandle *ioh, struct iovec **iov);
359
360/* aio_target.c */
362extern void pgaio_io_reopen(PgAioHandle *ioh);
363extern const char *pgaio_io_get_target_name(PgAioHandle *ioh);
364
365
366/*
367 * The AIO subsystem has fairly verbose debug logging support. This can be
368 * enabled/disabled at build time. The reason for this is that
369 * a) the verbosity can make debugging things on higher levels hard
370 * b) even if logging can be skipped due to elevel checks, it still causes a
371 * measurable slowdown
372 *
373 * XXX: This likely should eventually be disabled by default, at least in
374 * non-assert builds.
375 */
376#define PGAIO_VERBOSE 1
377
378/*
379 * Simple ereport() wrapper that only logs if PGAIO_VERBOSE is nonzero.
380 *
381 * This intentionally still compiles the code, guarded by a constant if (0),
382 * if verbose logging is disabled, to make it less likely that debug logging
383 * is silently broken.
384 *
385 * The current definition requires passing at least one argument.
386 */
387#define pgaio_debug(elevel, msg, ...) \
388 do { \
389 if (PGAIO_VERBOSE) \
390 ereport(elevel, \
391 errhidestmt(true), errhidecontext(true), \
392 errmsg_internal(msg, \
393 __VA_ARGS__)); \
394 } while(0)
395
396/*
397 * pgaio_debug() wrapper that prefixes the message with the IO's id, op,
398 * target and state. Note that the definition requires passing at least one argument.
399 */
400#define pgaio_debug_io(elevel, ioh, msg, ...) \
401 pgaio_debug(elevel, "io %-10d|op %-5s|target %-4s|state %-16s: " msg, \
402 pgaio_io_get_id(ioh), \
403 pgaio_io_get_op_name(ioh), \
404 pgaio_io_get_target_name(ioh), \
405 pgaio_io_get_state_name(ioh), \
406 __VA_ARGS__)
407
408/* Declarations for the tables of function pointers exposed by each IO method. */
411#ifdef IOMETHOD_IO_URING_ENABLED
413#endif
414
418
419
420
421#endif /* AIO_INTERNAL_H */
#define PGAIO_HANDLE_MAX_CALLBACKS
Definition aio.h:267
PgAioOp
Definition aio.h:88
void pgaio_io_process_completion(PgAioHandle *ioh, int result)
Definition aio.c:528
void pgaio_io_perform_synchronously(PgAioHandle *ioh)
Definition aio_io.c:116
const char * pgaio_result_status_string(PgAioResultStatus rs)
Definition aio.c:934
void pgaio_io_call_stage(PgAioHandle *ioh)
PGDLLIMPORT const IoMethodOps pgaio_worker_ops
PgAioHandleState
@ PGAIO_HS_STAGED
@ PGAIO_HS_COMPLETED_SHARED
@ PGAIO_HS_DEFINED
@ PGAIO_HS_SUBMITTED
@ PGAIO_HS_IDLE
@ PGAIO_HS_HANDED_OUT
@ PGAIO_HS_COMPLETED_IO
@ PGAIO_HS_COMPLETED_LOCAL
bool pgaio_io_needs_synchronous_execution(PgAioHandle *ioh)
Definition aio.c:483
const char * pgaio_io_get_op_name(PgAioHandle *ioh)
Definition aio_io.c:175
PgAioResult pgaio_io_call_complete_local(PgAioHandle *ioh)
void pgaio_io_reopen(PgAioHandle *ioh)
Definition aio_target.c:116
bool pgaio_io_uses_fd(PgAioHandle *ioh, int fd)
Definition aio_io.c:197
bool pgaio_io_can_reopen(PgAioHandle *ioh)
Definition aio_target.c:103
void pgaio_io_call_complete_shared(PgAioHandle *ioh)
void pgaio_io_stage(PgAioHandle *ioh, PgAioOp op)
Definition aio.c:424
PGDLLIMPORT PgAioBackend * pgaio_my_backend
Definition aio.c:81
int pgaio_io_get_iovec_length(PgAioHandle *ioh, struct iovec **iov)
Definition aio_io.c:219
PGDLLIMPORT PgAioCtl * pgaio_ctl
Definition aio.c:78
PGDLLIMPORT const IoMethodOps pgaio_sync_ops
Definition method_sync.c:28
PGDLLIMPORT const IoMethodOps * pgaio_method_ops
Definition aio.c:96
const char * pgaio_io_get_target_name(PgAioHandle *ioh)
Definition aio_target.c:50
const char * pgaio_io_get_state_name(PgAioHandle *ioh)
Definition aio.c:928
bool pgaio_io_was_recycled(PgAioHandle *ioh, uint64 ref_generation, PgAioHandleState *state)
Definition aio.c:559
void pgaio_io_prepare_submit(PgAioHandle *ioh)
Definition aio.c:510
void pgaio_shutdown(int code, Datum arg)
Definition aio.c:1298
#define PGAIO_SUBMIT_BATCH_SIZE
PgAioResultStatus
Definition aio_types.h:79
#define PGDLLIMPORT
Definition c.h:1421
uint8_t uint8
Definition c.h:622
int32_t int32
Definition c.h:620
uint64_t uint64
Definition c.h:625
uint16_t uint16
Definition c.h:623
uint32_t uint32
Definition c.h:624
uint32 result
Datum arg
Definition elog.c:1322
uint64_t Datum
Definition postgres.h:70
static int fd(const char *x, int i)
static int fb(int x)
bool wait_on_fd_before_close
void(* init_backend)(void)
void(* check_one)(PgAioHandle *ioh, uint64 ref_generation)
ShmemCallbacks shmem_callbacks
int(* submit)(uint16 num_staged_ios, PgAioHandle **staged_ios)
void(* wait_one)(PgAioHandle *ioh, uint64 ref_generation)
bool(* needs_synchronous_execution)(PgAioHandle *ioh)
uint32 io_handle_off
dclist_head in_flight_ios
uint16 num_staged_ios
dclist_head idle_ios
PgAioHandle * staged_ios[PGAIO_SUBMIT_BATCH_SIZE]
PgAioHandle * handed_out_io
uint32 iovec_count
struct iovec * iovecs
PgAioHandle * io_handles
uint32 io_handle_count
int backend_state_count
uint64 * handle_data
PgAioBackend * backend_state
PgAioTargetData target_data
struct ResourceOwnerData * resowner
int32 owner_procno
PgAioResult distilled_result
uint8 callbacks[PGAIO_HANDLE_MAX_CALLBACKS]
dlist_node node
uint8 handle_data_len
PgAioReturn * report_return
PgAioOpData op_data
uint32 iovec_off
uint64 generation
uint8 callbacks_data[PGAIO_HANDLE_MAX_CALLBACKS]
uint8 num_callbacks
dlist_node resowner_node
ConditionVariable cv