PostgreSQL Source Code git master
aio.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * aio.c
4 * AIO - Core Logic
5 *
6 * For documentation about how AIO works on a higher level, including a
7 * schematic example, see README.md.
8 *
9 *
10 * AIO is a complicated subsystem. To keep things navigable, it is split
11 * across a number of files:
12 *
13 * - method_*.c - different ways of executing AIO (e.g. worker process)
14 *
15 * - aio_target.c - IO on different kinds of targets
16 *
17 * - aio_io.c - method-independent code for specific IO ops (e.g. readv)
18 *
19 * - aio_callback.c - callbacks at IO operation lifecycle events
20 *
21 * - aio_init.c - per-server and per-backend initialization
22 *
23 * - aio.c - all other topics
24 *
25 * - read_stream.c - helper for reading buffered relation data
26 *
27 * - README.md - higher-level overview of AIO
28 *
29 *
30 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
31 * Portions Copyright (c) 1994, Regents of the University of California
32 *
33 * IDENTIFICATION
34 * src/backend/storage/aio/aio.c
35 *
36 *-------------------------------------------------------------------------
37 */
38
39#include "postgres.h"
40
41#include "lib/ilist.h"
42#include "miscadmin.h"
43#include "port/atomics.h"
44#include "storage/aio.h"
46#include "storage/aio_subsys.h"
47#include "utils/guc.h"
48#include "utils/guc_hooks.h"
50#include "utils/resowner.h"
51#include "utils/wait_event_types.h"
52
53
54static inline void pgaio_io_update_state(PgAioHandle *ioh, PgAioHandleState new_state);
55static void pgaio_io_reclaim(PgAioHandle *ioh);
57static void pgaio_io_wait_for_free(void);
58static PgAioHandle *pgaio_io_from_wref(PgAioWaitRef *iow, uint64 *ref_generation);
59static const char *pgaio_io_state_get_name(PgAioHandleState s);
60static void pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation);
61
62
63/* Options for io_method. */
65 {"sync", IOMETHOD_SYNC, false},
66 {"worker", IOMETHOD_WORKER, false},
67#ifdef IOMETHOD_IO_URING_ENABLED
68 {"io_uring", IOMETHOD_IO_URING, false},
69#endif
70 {NULL, 0, false}
71};
72
73/* GUCs */
76
77/* global control for AIO */
79
80/* current backend's per-backend state */
82
83
84static const IoMethodOps *const pgaio_method_ops_table[] = {
87#ifdef IOMETHOD_IO_URING_ENABLED
88 [IOMETHOD_IO_URING] = &pgaio_uring_ops,
89#endif
90};
91
92/* callbacks for the configured io_method, set by assign_io_method */
94
95
96/* --------------------------------------------------------------------------------
97 * Public Functions related to PgAioHandle
98 * --------------------------------------------------------------------------------
99 */
100
101/*
102 * Acquire an AioHandle, waiting for IO completion if necessary.
103 *
104 * Each backend can only have one AIO handle that has been "handed out" to
105 * code, but not yet submitted or released. This restriction is necessary to
106 * ensure that it is possible for code to wait for an unused handle by waiting
107 * for in-flight IO to complete. There is a limited number of handles in each
108 * backend; if multiple handles could be handed out without being submitted,
109 * waiting for all in-flight IO to complete would not guarantee that handles
110 * free up.
111 *
112 * It is cheap to acquire an IO handle, unless all handles are in use. In that
113 * case this function waits for the oldest IO to complete. If that is not
114 * desirable, use pgaio_io_acquire_nb().
115 *
116 * If a handle was acquired but then does not turn out to be needed,
117 * e.g. because pgaio_io_acquire() is called before starting an IO in a
118 * critical section, the handle needs to be released with pgaio_io_release().
119 *
120 *
121 * To react to the completion of the IO as soon as it is known to have
122 * completed, callbacks can be registered with pgaio_io_register_callbacks().
123 *
124 * To actually execute IO using the returned handle, the pgaio_io_start_*()
125 * family of functions is used. In many cases the pgaio_io_start_*() call will
126 * not be done directly by code that acquired the handle, but by lower level
127 * code that gets passed the handle. E.g. if code in bufmgr.c wants to perform
128 * AIO, it typically will pass the handle to smgr.c, which will pass it on to
129 * md.c, on to fd.c, which then finally calls pgaio_io_start_*(). This
130 * forwarding allows the various layers to react to the IO's completion by
131 * registering callbacks. These callbacks in turn can translate a lower
132 * layer's result into a result understandable by a higher layer.
133 *
134 * During pgaio_io_start_*() the IO is staged (i.e. prepared for execution but
135 * not submitted to the kernel). Unless in batchmode
136 * (c.f. pgaio_enter_batchmode()), the IO will also get submitted for
137 * execution. Note that, whether in batchmode or not, the IO might even
138 * complete before the functions return.
139 *
140 * After pgaio_io_start_*() the AioHandle is "consumed" and may not be
141 * referenced by the IO issuing code. To e.g. wait for IO, references to the
142 * IO can be established with pgaio_io_get_wref() *before* pgaio_io_start_*()
143 * is called. pgaio_wref_wait() can be used to wait for the IO to complete.
144 *
145 *
146 * To know if the IO [partially] succeeded or failed, a PgAioReturn * can be
147 * passed to pgaio_io_acquire(). Once the issuing backend has called
148 * pgaio_wref_wait(), the PgAioReturn contains information about whether the
149 * operation succeeded and details about the first failure, if any. The error
150 * can be raised / logged with pgaio_result_report().
151 *
152 * The lifetime of the memory pointed to by *ret needs to be at least as long
153 * as the passed in resowner. If the resowner releases resources before the IO
154 * completes (typically due to an error), the reference to *ret will be
155 * cleared. In case of resowner cleanup *ret will not be updated with the
156 * results of the IO operation.
157 */
160{
161 PgAioHandle *h;
162
163 while (true)
164 {
165 h = pgaio_io_acquire_nb(resowner, ret);
166
167 if (h != NULL)
168 return h;
169
170 /*
171 * Evidently all handles by this backend are in use. Just wait for
172 * some to complete.
173 */
175 }
176}
177
178/*
179 * Acquire an AioHandle, returning NULL if no handles are free.
180 *
181 * See pgaio_io_acquire(). The only difference is that this function will return
182 * NULL if there are no idle handles, instead of blocking.
183 */
186{
188 {
191 }
192
194 elog(ERROR, "API violation: Only one IO can be handed out");
195
197 {
199 PgAioHandle *ioh = dclist_container(PgAioHandle, node, ion);
200
201 Assert(ioh->state == PGAIO_HS_IDLE);
203
206
207 if (resowner)
209
210 if (ret)
211 {
212 ioh->report_return = ret;
214 }
215
216 return ioh;
217 }
218
219 return NULL;
220}
221
222/*
223 * Release IO handle that turned out to not be required.
224 *
225 * See pgaio_io_acquire() for more details.
226 */
227void
229{
231 {
233 Assert(ioh->resowner);
234
236 pgaio_io_reclaim(ioh);
237 }
238 else
239 {
240 elog(ERROR, "release in unexpected state");
241 }
242}
243
244/*
245 * Release IO handle during resource owner cleanup.
246 */
247void
248pgaio_io_release_resowner(dlist_node *ioh_node, bool on_error)
249{
250 PgAioHandle *ioh = dlist_container(PgAioHandle, resowner_node, ioh_node);
251
252 Assert(ioh->resowner);
253
255 ioh->resowner = NULL;
256
257 switch (ioh->state)
258 {
259 case PGAIO_HS_IDLE:
260 elog(ERROR, "unexpected");
261 break;
264
266 {
268 if (!on_error)
269 elog(WARNING, "leaked AIO handle");
270 }
271
272 pgaio_io_reclaim(ioh);
273 break;
274 case PGAIO_HS_DEFINED:
275 case PGAIO_HS_STAGED:
276 if (!on_error)
277 elog(WARNING, "AIO handle was not submitted");
279 break;
284 /* this is expected to happen */
285 break;
286 }
287
288 /*
289 * Need to unregister the reporting of the IO's result, the memory it's
290 * referencing likely has gone away.
291 */
292 if (ioh->report_return)
293 ioh->report_return = NULL;
294}
295
296/*
297 * Add a [set of] flags to the IO.
298 *
299 * Note that this combines the passed-in flags with any already-set flags,
300 * rather than overwriting the flags with exactly the passed-in value. This
301 * is to allow multiple call sites to set flags.
302 */
303void
305{
307
308 ioh->flags |= flag;
309}
310
311/*
312 * Returns an ID uniquely identifying the IO handle. This is only really
313 * useful for logging, as handles are reused across multiple IOs.
314 */
315int
317{
318 Assert(ioh >= pgaio_ctl->io_handles &&
320 return ioh - pgaio_ctl->io_handles;
321}
322
323/*
324 * Return the ProcNumber for the process that can use an IO handle. The
325 * mapping from IO handles to PGPROCs is static, therefore this even works
326 * when the corresponding PGPROC is not in use.
327 */
330{
331 return ioh->owner_procno;
332}
333
334/*
335 * Return a wait reference for the IO. Only wait references can be used to
336 * wait for an IO's completion, as handles themselves can be reused after
337 * completion. See also the comment above pgaio_io_acquire().
338 */
339void
341{
343 ioh->state == PGAIO_HS_DEFINED ||
344 ioh->state == PGAIO_HS_STAGED);
345 Assert(ioh->generation != 0);
346
347 iow->aio_index = ioh - pgaio_ctl->io_handles;
348 iow->generation_upper = (uint32) (ioh->generation >> 32);
349 iow->generation_lower = (uint32) ioh->generation;
350}
351
352
353
354/* --------------------------------------------------------------------------------
355 * Internal Functions related to PgAioHandle
356 * --------------------------------------------------------------------------------
357 */
358
359static inline void
361{
363 "updating state to %s",
364 pgaio_io_state_get_name(new_state));
365
366 /*
367 * Ensure the changes signified by the new state are visible before the
368 * new state becomes visible.
369 */
371
372 ioh->state = new_state;
373}
374
375static void
377{
378 Assert(!ioh->resowner);
380
383}
384
385/*
386 * Stage IO for execution and, if appropriate, submit it immediately.
387 *
388 * Should only be called from pgaio_io_start_*().
389 */
390void
392{
393 bool needs_synchronous;
394
398
399 ioh->op = op;
400 ioh->result = 0;
401
403
404 /* allow a new IO to be staged */
406
408
410
411 /*
412 * Synchronous execution has to be executed, well, synchronously, so check
413 * that first.
414 */
415 needs_synchronous = pgaio_io_needs_synchronous_execution(ioh);
416
418 "staged (synchronous: %d, in_batch: %d)",
419 needs_synchronous, pgaio_my_backend->in_batchmode);
420
421 if (!needs_synchronous)
422 {
425
426 /*
427 * Unless code explicitly opted into batching IOs, submit the IO
428 * immediately.
429 */
432 }
433 else
434 {
437 }
438}
439
440bool
442{
443 /*
444 * If the caller said to execute the IO synchronously, do so.
445 *
446 * XXX: We could optimize the logic when to execute synchronously by first
447 * checking if there are other IOs in flight and only synchronously
448 * executing if not. Unclear whether that'll be sufficiently common to be
449 * worth worrying about.
450 */
451 if (ioh->flags & PGAIO_HF_SYNCHRONOUS)
452 return true;
453
454 /* Check if the IO method requires synchronous execution of IO */
457
458 return false;
459}
460
461/*
462 * Handle IO being processed by IO method.
463 *
464 * Should be called by IO methods / synchronous IO execution, just before the
465 * IO is performed.
466 */
467void
469{
471
473}
474
475/*
476 * Handle IO getting completed by a method.
477 *
478 * Should be called by IO methods / synchronous IO execution, just after the
479 * IO has been performed.
480 *
481 * Expects to be called in a critical section. We expect IOs to be usable for
482 * WAL etc, which requires being able to execute completion callbacks in a
483 * critical section.
484 */
485void
487{
489
491
492 ioh->result = result;
493
495
496 INJECTION_POINT("aio-process-completion-before-shared", ioh);
497
499
501
502 /* condition variable broadcast ensures state is visible before wakeup */
504
505 /* contains call to pgaio_io_call_complete_local() */
506 if (ioh->owner_procno == MyProcNumber)
507 pgaio_io_reclaim(ioh);
508}
509
510/*
511 * Has the IO completed and thus the IO handle been reused?
512 *
513 * This is useful when waiting for IO completion at a low level (e.g. in an IO
514 * method's ->wait_one() callback).
515 */
516bool
518{
519 *state = ioh->state;
521
522 return ioh->generation != ref_generation;
523}
524
525/*
526 * Wait for IO to complete. External code should never use this, outside of
527 * the AIO subsystem waits are only allowed via pgaio_wref_wait().
528 */
529static void
530pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation)
531{
533 bool am_owner;
534
535 am_owner = ioh->owner_procno == MyProcNumber;
536
537 if (pgaio_io_was_recycled(ioh, ref_generation, &state))
538 return;
539
540 if (am_owner)
541 {
546 {
547 elog(PANIC, "waiting for own IO in wrong state: %d",
548 state);
549 }
550 }
551
552 while (true)
553 {
554 if (pgaio_io_was_recycled(ioh, ref_generation, &state))
555 return;
556
557 switch (state)
558 {
559 case PGAIO_HS_IDLE:
561 elog(ERROR, "IO in wrong state: %d", state);
562 break;
563
565
566 /*
567 * If we need to wait via the IO method, do so now. Don't
568 * check via the IO method if the issuing backend is executing
569 * the IO synchronously.
570 */
572 {
573 pgaio_method_ops->wait_one(ioh, ref_generation);
574 continue;
575 }
576 /* fallthrough */
577
578 /* waiting for owner to submit */
579 case PGAIO_HS_DEFINED:
580 case PGAIO_HS_STAGED:
581 /* waiting for reaper to complete */
582 /* fallthrough */
584 /* shouldn't be able to hit this otherwise */
586 /* ensure we're going to get woken up */
588
589 while (!pgaio_io_was_recycled(ioh, ref_generation, &state))
590 {
593 break;
594 ConditionVariableSleep(&ioh->cv, WAIT_EVENT_AIO_IO_COMPLETION);
595 }
596
598 break;
599
602 /* see above */
603 if (am_owner)
604 pgaio_io_reclaim(ioh);
605 return;
606 }
607 }
608}
609
610/*
611 * Make IO handle ready to be reused after IO has completed or after the
612 * handle has been released without being used.
613 */
614static void
616{
617 /* This is only ok if it's our IO */
619 Assert(ioh->state != PGAIO_HS_IDLE);
620
621 /*
622 * It's a bit ugly, but right now the easiest place to put the execution
623 * of local completion callbacks is this function, as we need to execute
624 * local callbacks just before reclaiming at multiple callsites.
625 */
627 {
628 PgAioResult local_result;
629
630 local_result = pgaio_io_call_complete_local(ioh);
632
633 if (ioh->report_return)
634 {
635 ioh->report_return->result = local_result;
637 }
638 }
639
641 "reclaiming: distilled_result: (status %s, id %u, error_data %d), raw_result: %d",
643 ioh->distilled_result.id,
645 ioh->result);
646
647 /* if the IO has been defined, it's on the in-flight list, remove */
648 if (ioh->state != PGAIO_HS_HANDED_OUT)
650
651 if (ioh->resowner)
652 {
654 ioh->resowner = NULL;
655 }
656
657 Assert(!ioh->resowner);
658
659 /*
660 * Update generation & state first, before resetting the IO's fields,
661 * otherwise a concurrent "viewer" could think the fields are valid, even
662 * though they are being reset. Increment the generation first, so that
663 * we can assert elsewhere that we never wait for an IDLE IO. While it's
664 * a bit weird for the state to go backwards for a generation, it's OK
665 * here, as there cannot be references to the "reborn" IO yet. Can't
666 * update both at once, so something has to give.
667 */
668 ioh->generation++;
670
671 /* ensure the state update is visible before we reset fields */
673
674 ioh->op = PGAIO_OP_INVALID;
676 ioh->flags = 0;
677 ioh->num_callbacks = 0;
678 ioh->handle_data_len = 0;
679 ioh->report_return = NULL;
680 ioh->result = 0;
682
683 /*
684 * We push the IO to the head of the idle IO list, that seems more cache
685 * efficient in cases where only a few IOs are used.
686 */
688}
689
690/*
691 * Wait for an IO handle to become usable.
692 *
693 * This only really is useful for pgaio_io_acquire().
694 */
695static void
697{
698 int reclaimed = 0;
699
700 pgaio_debug(DEBUG2, "waiting for free IO with %d pending, %d in-flight, %d idle IOs",
704
705 /*
706 * First check if any of our IOs actually have completed - when using
707 * worker, that'll often be the case. We could do so as part of the loop
708 * below, but that'd potentially lead us to wait for some IO submitted
709 * before.
710 */
711 for (int i = 0; i < io_max_concurrency; i++)
712 {
714
716 {
717 pgaio_io_reclaim(ioh);
718 reclaimed++;
719 }
720 }
721
722 if (reclaimed > 0)
723 return;
724
725 /*
726 * If we have any unsubmitted IOs, submit them now. We'll start waiting in
727 * a second, so it's better they're in flight. This also addresses the
728 * edge-case that all IOs are unsubmitted.
729 */
732
735 errmsg_internal("no free IOs despite no in-flight IOs"),
736 errdetail_internal("%d pending, %d in-flight, %d idle IOs",
740
741 /*
742 * Wait for the oldest in-flight IO to complete.
743 *
744 * XXX: Reusing the general IO wait is suboptimal, we don't need to wait
745 * for that specific IO to complete, we just need *any* IO to complete.
746 */
747 {
750
751 switch (ioh->state)
752 {
753 /* should not be in in-flight list */
754 case PGAIO_HS_IDLE:
755 case PGAIO_HS_DEFINED:
757 case PGAIO_HS_STAGED:
759 elog(ERROR, "shouldn't get here with io:%d in state %d",
760 pgaio_io_get_id(ioh), ioh->state);
761 break;
762
766 "waiting for free io with %d in flight",
768
769 /*
770 * In a more general case this would be racy, because the
771 * generation could increase after we read ioh->state above.
772 * But we are only looking at IOs by the current backend and
773 * the IO can only be recycled by this backend.
774 */
775 pgaio_io_wait(ioh, ioh->generation);
776 break;
777
779 /* it's possible that another backend just finished this IO */
780 pgaio_io_reclaim(ioh);
781 break;
782 }
783
785 elog(PANIC, "no idle IO after waiting for IO to terminate");
786 return;
787 }
788}
789
790/*
791 * Internal - code outside of AIO should never need this and it'd be hard for
792 * such code to be safe.
793 */
794static PgAioHandle *
796{
797 PgAioHandle *ioh;
798
800
801 ioh = &pgaio_ctl->io_handles[iow->aio_index];
802
803 *ref_generation = ((uint64) iow->generation_upper) << 32 |
804 iow->generation_lower;
805
806 Assert(*ref_generation != 0);
807
808 return ioh;
809}
810
811static const char *
813{
814#define PGAIO_HS_TOSTR_CASE(sym) case PGAIO_HS_##sym: return #sym
815 switch (s)
816 {
818 PGAIO_HS_TOSTR_CASE(HANDED_OUT);
819 PGAIO_HS_TOSTR_CASE(DEFINED);
820 PGAIO_HS_TOSTR_CASE(STAGED);
821 PGAIO_HS_TOSTR_CASE(SUBMITTED);
822 PGAIO_HS_TOSTR_CASE(COMPLETED_IO);
823 PGAIO_HS_TOSTR_CASE(COMPLETED_SHARED);
824 PGAIO_HS_TOSTR_CASE(COMPLETED_LOCAL);
825 }
826#undef PGAIO_HS_TOSTR_CASE
827
828 return NULL; /* silence compiler */
829}
830
831const char *
833{
834 return pgaio_io_state_get_name(ioh->state);
835}
836
837const char *
839{
840 switch (rs)
841 {
842 case PGAIO_RS_UNKNOWN:
843 return "UNKNOWN";
844 case PGAIO_RS_OK:
845 return "OK";
846 case PGAIO_RS_WARNING:
847 return "WARNING";
848 case PGAIO_RS_PARTIAL:
849 return "PARTIAL";
850 case PGAIO_RS_ERROR:
851 return "ERROR";
852 }
853
854 return NULL; /* silence compiler */
855}
856
857
858
859/* --------------------------------------------------------------------------------
860 * Functions primarily related to IO Wait References
861 * --------------------------------------------------------------------------------
862 */
863
864/*
865 * Mark a wait reference as invalid
866 */
867void
869{
871}
872
873/* Is the wait reference valid? */
874bool
876{
877 return iow->aio_index != PG_UINT32_MAX;
878}
879
880/*
881 * Similar to pgaio_io_get_id(), just for wait references.
882 */
883int
885{
887 return iow->aio_index;
888}
889
890/*
891 * Wait for the IO to have completed. Can be called in any process, not just
892 * in the issuing backend.
893 */
894void
896{
897 uint64 ref_generation;
898 PgAioHandle *ioh;
899
900 ioh = pgaio_io_from_wref(iow, &ref_generation);
901
902 pgaio_io_wait(ioh, ref_generation);
903}
904
905/*
906 * Check if the referenced IO completed, without blocking.
907 */
908bool
910{
911 uint64 ref_generation;
913 bool am_owner;
914 PgAioHandle *ioh;
915
916 ioh = pgaio_io_from_wref(iow, &ref_generation);
917
918 if (pgaio_io_was_recycled(ioh, ref_generation, &state))
919 return true;
920
921 if (state == PGAIO_HS_IDLE)
922 return true;
923
924 am_owner = ioh->owner_procno == MyProcNumber;
925
928 {
929 if (am_owner)
930 pgaio_io_reclaim(ioh);
931 return true;
932 }
933
934 /*
935 * XXX: It likely would be worth checking in with the io method, to give
936 * the IO method a chance to check if there are completion events queued.
937 */
938
939 return false;
940}
941
942
943
944/* --------------------------------------------------------------------------------
945 * Actions on multiple IOs.
946 * --------------------------------------------------------------------------------
947 */
948
949/*
950 * Submit IOs in batches going forward.
951 *
952 * Submitting multiple IOs at once can be substantially faster than doing so
953 * one-by-one. At the same time, submitting multiple IOs at once requires more
954 * care to avoid deadlocks.
955 *
956 * Consider backend A staging an IO for buffer 1 and then trying to start IO
957 * on buffer 2, while backend B does the inverse. If A submitted the IO before
958 * moving on to buffer 2, this works just fine, B will wait for the IO to
959 * complete. But if batching were used, each backend will wait for IO that has
960 * not yet been submitted to complete, i.e. forever.
961 *
962 * End batch submission mode with pgaio_exit_batchmode(). (Throwing errors is
963 * allowed; error recovery will end the batch.)
964 *
965 * To avoid deadlocks, code needs to ensure that it will not wait for another
966 * backend while there is unsubmitted IO. E.g. by using conditional lock
967 * acquisition when acquiring buffer locks. To check if there currently are
968 * staged IOs, call pgaio_have_staged() and to submit all staged IOs call
969 * pgaio_submit_staged().
970 *
971 * It is not allowed to enter batchmode while already in batchmode. That is
972 * unlikely to ever be needed, as code needs to be explicitly aware of being
973 * called in batchmode, to avoid the deadlock risks explained above.
974 *
975 * Note that IOs may get submitted before pgaio_exit_batchmode() is called,
976 * e.g. because too many IOs have been staged or because pgaio_submit_staged()
977 * was called.
978 */
979void
981{
983 elog(ERROR, "starting batch while batch already in progress");
985}
986
987/*
988 * Stop submitting IOs in batches.
989 */
990void
992{
994
997}
998
999/*
1000 * Are there staged but unsubmitted IOs?
1001 *
1002 * See comment above pgaio_enter_batchmode() for why code may need to check if
1003 * there is IO in that state.
1004 */
1005bool
1007{
1010 return pgaio_my_backend->num_staged_ios > 0;
1011}
1012
1013/*
1014 * Submit all staged but not yet submitted IOs.
1015 *
1016 * Unless in batch mode, this never needs to be called, as IOs get submitted
1017 * as soon as possible. While in batchmode pgaio_submit_staged() can be called
1018 * before waiting on another backend, to avoid the risk of deadlocks. See
1019 * pgaio_enter_batchmode().
1020 */
1021void
1023{
1024 int total_submitted = 0;
1025 int did_submit;
1026
1028 return;
1029
1030
1032
1035
1037
1038 total_submitted += did_submit;
1039
1040 Assert(total_submitted == did_submit);
1041
1043
1045 "aio: submitted %d IOs",
1046 total_submitted);
1047}
1048
1049
1050
1051/* --------------------------------------------------------------------------------
1052 * Other
1053 * --------------------------------------------------------------------------------
1054 */
1055
1056
1057/*
1058 * Perform AIO related cleanup after an error.
1059 *
1060 * This should be called early in the error recovery paths, as later steps may
1061 * need to issue AIO (e.g. to record a transaction abort WAL record).
1062 */
1063void
1065{
1066 /*
1067 * It is possible that code errored out after pgaio_enter_batchmode() but
1068 * before pgaio_exit_batchmode() was called. In that case we need to
1069 * submit the IO now.
1070 */
1072 {
1074
1076 }
1077
1078 /*
1079 * As we aren't in batchmode, there shouldn't be any unsubmitted IOs.
1080 */
1082}
1083
1084/*
1085 * Perform AIO related checks at (sub-)transactional boundaries.
1086 *
1087 * This should be called late during (sub-)transactional commit/abort, after
1088 * all steps that might need to perform AIO, so that we can verify that the
1089 * AIO subsystem is in a valid state at the end of a transaction.
1090 */
1091void
1092AtEOXact_Aio(bool is_commit)
1093{
1094 /*
1095 * We should never be in batch mode at transactional boundaries. In case
1096 * an error was thrown while in batch mode, pgaio_error_cleanup() should
1097 * have exited batchmode.
1098 *
1099 * In case we are in batchmode somehow, make sure to submit all staged
1100 * IOs, other backends may need them to complete to continue.
1101 */
1103 {
1105 elog(WARNING, "open AIO batch at end of (sub-)transaction");
1106 }
1107
1108 /*
1109 * As we aren't in batchmode, there shouldn't be any unsubmitted IOs.
1110 */
1112}
1113
1114/*
1115 * Need to submit staged but not yet submitted IOs using the fd, otherwise
1116 * the IO would end up targeting something bogus.
1117 */
1118void
1120{
1121 /*
1122 * Might be called before AIO is initialized or in a subprocess that
1123 * doesn't use AIO.
1124 */
1125 if (!pgaio_my_backend)
1126 return;
1127
1128 /*
1129 * For now just submit all staged IOs - we could be more selective, but
1130 * it's probably not worth it.
1131 */
1133 {
1135 "submitting %d IOs before FD %d gets closed",
1138 }
1139
1140 /*
1141 * If requested by the IO method, wait for all IOs that use the
1142 * to-be-closed FD.
1143 */
1145 {
1146 /*
1147 * As waiting for one IO to complete may complete multiple IOs, we
1148 * can't just use a mutable list iterator. The maximum number of
1149 * in-flight IOs is fairly small, so just restart the loop after
1150 * waiting for an IO.
1151 */
1153 {
1154 dlist_iter iter;
1155 PgAioHandle *ioh = NULL;
1156
1158 {
1159 ioh = dclist_container(PgAioHandle, node, iter.cur);
1160
1161 if (pgaio_io_uses_fd(ioh, fd))
1162 break;
1163 else
1164 ioh = NULL;
1165 }
1166
1167 if (!ioh)
1168 break;
1169
1171 "waiting for IO before FD %d gets closed, %d in-flight IOs",
1173
1174 /* see comment in pgaio_io_wait_for_free() about raciness */
1175 pgaio_io_wait(ioh, ioh->generation);
1176 }
1177 }
1178}
1179
1180/*
1181 * Registered as before_shmem_exit() callback in pgaio_init_backend()
1182 */
1183void
1185{
1188
1189 /* first clean up resources as we would at a transaction boundary */
1190 AtEOXact_Aio(code == 0);
1191
1192 /*
1193 * Before exiting, make sure that all IOs are finished. That has two main
1194 * purposes:
1195 *
1196 * - Some kernel-level AIO mechanisms don't deal well with the issuer of
1197 * an AIO exiting before IO completed
1198 *
1199 * - It'd be confusing to see partially finished IOs in stats views etc
1200 */
1202 {
1204
1206 "waiting for IO to complete during shutdown, %d in-flight IOs",
1208
1209 /* see comment in pgaio_io_wait_for_free() about raciness */
1210 pgaio_io_wait(ioh, ioh->generation);
1211 }
1212
1213 pgaio_my_backend = NULL;
1214}
1215
1216void
1217assign_io_method(int newval, void *extra)
1218{
1221
1223}
1224
1225bool
1227{
1228 if (*newval == -1)
1229 {
1230 /*
1231 * Auto-tuning will be applied later during startup, as auto-tuning
1232 * depends on the value of various GUCs.
1233 */
1234 return true;
1235 }
1236 else if (*newval == 0)
1237 {
1238 GUC_check_errdetail("Only -1 or values bigger than 0 are valid.");
1239 return false;
1240 }
1241
1242 return true;
1243}
void pgaio_io_process_completion(PgAioHandle *ioh, int result)
Definition: aio.c:486
int io_method
Definition: aio.c:74
bool pgaio_wref_valid(PgAioWaitRef *iow)
Definition: aio.c:875
int pgaio_io_get_id(PgAioHandle *ioh)
Definition: aio.c:316
PgAioBackend * pgaio_my_backend
Definition: aio.c:81
const char * pgaio_result_status_string(PgAioResultStatus rs)
Definition: aio.c:838
PgAioHandle * pgaio_io_acquire(struct ResourceOwnerData *resowner, PgAioReturn *ret)
Definition: aio.c:159
void assign_io_method(int newval, void *extra)
Definition: aio.c:1217
static void pgaio_io_update_state(PgAioHandle *ioh, PgAioHandleState new_state)
Definition: aio.c:360
void pgaio_wref_clear(PgAioWaitRef *iow)
Definition: aio.c:868
bool pgaio_io_needs_synchronous_execution(PgAioHandle *ioh)
Definition: aio.c:441
static void pgaio_io_wait_for_free(void)
Definition: aio.c:696
#define PGAIO_HS_TOSTR_CASE(sym)
static const char * pgaio_io_state_get_name(PgAioHandleState s)
Definition: aio.c:812
void pgaio_io_release_resowner(dlist_node *ioh_node, bool on_error)
Definition: aio.c:248
static void pgaio_io_resowner_register(PgAioHandle *ioh)
Definition: aio.c:376
static PgAioHandle * pgaio_io_from_wref(PgAioWaitRef *iow, uint64 *ref_generation)
Definition: aio.c:795
void pgaio_io_get_wref(PgAioHandle *ioh, PgAioWaitRef *iow)
Definition: aio.c:340
void pgaio_closing_fd(int fd)
Definition: aio.c:1119
void pgaio_io_stage(PgAioHandle *ioh, PgAioOp op)
Definition: aio.c:391
int io_max_concurrency
Definition: aio.c:75
void pgaio_io_set_flag(PgAioHandle *ioh, PgAioHandleFlags flag)
Definition: aio.c:304
bool pgaio_have_staged(void)
Definition: aio.c:1006
PgAioCtl * pgaio_ctl
Definition: aio.c:78
const IoMethodOps * pgaio_method_ops
Definition: aio.c:93
bool pgaio_wref_check_done(PgAioWaitRef *iow)
Definition: aio.c:909
static const IoMethodOps *const pgaio_method_ops_table[]
Definition: aio.c:84
static void pgaio_io_reclaim(PgAioHandle *ioh)
Definition: aio.c:615
ProcNumber pgaio_io_get_owner(PgAioHandle *ioh)
Definition: aio.c:329
void pgaio_enter_batchmode(void)
Definition: aio.c:980
void pgaio_submit_staged(void)
Definition: aio.c:1022
const char * pgaio_io_get_state_name(PgAioHandle *ioh)
Definition: aio.c:832
const struct config_enum_entry io_method_options[]
Definition: aio.c:64
bool pgaio_io_was_recycled(PgAioHandle *ioh, uint64 ref_generation, PgAioHandleState *state)
Definition: aio.c:517
void pgaio_io_prepare_submit(PgAioHandle *ioh)
Definition: aio.c:468
void pgaio_wref_wait(PgAioWaitRef *iow)
Definition: aio.c:895
void pgaio_error_cleanup(void)
Definition: aio.c:1064
void pgaio_io_release(PgAioHandle *ioh)
Definition: aio.c:228
int pgaio_wref_get_id(PgAioWaitRef *iow)
Definition: aio.c:884
void AtEOXact_Aio(bool is_commit)
Definition: aio.c:1092
void pgaio_shutdown(int code, Datum arg)
Definition: aio.c:1184
bool check_io_max_concurrency(int *newval, void **extra, GucSource source)
Definition: aio.c:1226
static void pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation)
Definition: aio.c:530
void pgaio_exit_batchmode(void)
Definition: aio.c:991
PgAioHandle * pgaio_io_acquire_nb(struct ResourceOwnerData *resowner, PgAioReturn *ret)
Definition: aio.c:185
@ IOMETHOD_WORKER
Definition: aio.h:35
@ IOMETHOD_SYNC
Definition: aio.h:34
@ PGAIO_TID_INVALID
Definition: aio.h:119
PgAioOp
Definition: aio.h:88
@ PGAIO_OP_INVALID
Definition: aio.h:90
PgAioHandleFlags
Definition: aio.h:49
@ PGAIO_HF_SYNCHRONOUS
Definition: aio.h:70
#define DEFAULT_IO_METHOD
Definition: aio.h:42
void pgaio_io_call_stage(PgAioHandle *ioh)
Definition: aio_callback.c:199
PgAioResult pgaio_io_call_complete_local(PgAioHandle *ioh)
Definition: aio_callback.c:282
void pgaio_io_call_complete_shared(PgAioHandle *ioh)
Definition: aio_callback.c:225
PgAioHandleState
Definition: aio_internal.h:44
@ PGAIO_HS_STAGED
Definition: aio_internal.h:66
@ PGAIO_HS_COMPLETED_SHARED
Definition: aio_internal.h:82
@ PGAIO_HS_DEFINED
Definition: aio_internal.h:59
@ PGAIO_HS_SUBMITTED
Definition: aio_internal.h:69
@ PGAIO_HS_IDLE
Definition: aio_internal.h:46
@ PGAIO_HS_HANDED_OUT
Definition: aio_internal.h:53
@ PGAIO_HS_COMPLETED_IO
Definition: aio_internal.h:72
@ PGAIO_HS_COMPLETED_LOCAL
Definition: aio_internal.h:89
#define pgaio_debug(elevel, msg,...)
Definition: aio_internal.h:376
#define pgaio_debug_io(elevel, ioh, msg,...)
Definition: aio_internal.h:389
#define PGAIO_SUBMIT_BATCH_SIZE
Definition: aio_internal.h:28
void pgaio_io_perform_synchronously(PgAioHandle *ioh)
Definition: aio_io.c:116
bool pgaio_io_uses_fd(PgAioHandle *ioh, int fd)
Definition: aio_io.c:197
bool pgaio_io_has_target(PgAioHandle *ioh)
Definition: aio_target.c:40
PgAioResultStatus
Definition: aio_types.h:79
@ PGAIO_RS_OK
Definition: aio_types.h:81
@ PGAIO_RS_UNKNOWN
Definition: aio_types.h:80
@ PGAIO_RS_PARTIAL
Definition: aio_types.h:82
@ PGAIO_RS_ERROR
Definition: aio_types.h:84
@ PGAIO_RS_WARNING
Definition: aio_types.h:83
#define pg_read_barrier()
Definition: atomics.h:156
#define pg_write_barrier()
Definition: atomics.h:157
#define PG_UINT32_MAX
Definition: c.h:561
uint64_t uint64
Definition: c.h:503
uint32_t uint32
Definition: c.h:502
#define lengthof(array)
Definition: c.h:759
bool ConditionVariableCancelSleep(void)
void ConditionVariableBroadcast(ConditionVariable *cv)
void ConditionVariablePrepareToSleep(ConditionVariable *cv)
void ConditionVariableSleep(ConditionVariable *cv, uint32 wait_event_info)
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1158
int errdetail_internal(const char *fmt,...)
Definition: elog.c:1231
#define DEBUG3
Definition: elog.h:28
#define WARNING
Definition: elog.h:36
#define DEBUG2
Definition: elog.h:29
#define PANIC
Definition: elog.h:42
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:225
#define ereport(elevel,...)
Definition: elog.h:149
#define DEBUG5
Definition: elog.h:26
#define DEBUG4
Definition: elog.h:27
ProcNumber MyProcNumber
Definition: globals.c:91
bool IsUnderPostmaster
Definition: globals.c:121
volatile uint32 CritSectionCount
Definition: globals.c:46
#define newval
#define GUC_check_errdetail
Definition: guc.h:481
GucSource
Definition: guc.h:112
Assert(PointerIsAligned(start, uint64))
#define dclist_container(type, membername, ptr)
Definition: ilist.h:947
#define dclist_head_element(type, membername, lhead)
Definition: ilist.h:955
static void dclist_push_tail(dclist_head *head, dlist_node *node)
Definition: ilist.h:709
static uint32 dclist_count(const dclist_head *head)
Definition: ilist.h:932
static bool dclist_is_empty(const dclist_head *head)
Definition: ilist.h:682
static void dclist_delete_from(dclist_head *head, dlist_node *node)
Definition: ilist.h:763
static dlist_node * dclist_pop_head_node(dclist_head *head)
Definition: ilist.h:789
static void dclist_push_head(dclist_head *head, dlist_node *node)
Definition: ilist.h:693
#define dlist_container(type, membername, ptr)
Definition: ilist.h:593
#define dclist_foreach(iter, lhead)
Definition: ilist.h:970
#define INJECTION_POINT(name, arg)
int i
Definition: isn.c:77
const IoMethodOps pgaio_sync_ops
Definition: method_sync.c:28
const IoMethodOps pgaio_worker_ops
Definition: method_worker.c:84
#define START_CRIT_SECTION()
Definition: miscadmin.h:150
#define END_CRIT_SECTION()
Definition: miscadmin.h:152
void * arg
static rewind_source * source
Definition: pg_rewind.c:89
uintptr_t Datum
Definition: postgres.h:69
static int fd(const char *x, int i)
Definition: preproc-init.c:105
int ProcNumber
Definition: procnumber.h:24
ResourceOwner CurrentResourceOwner
Definition: resowner.c:173
void ResourceOwnerRememberAioHandle(ResourceOwner owner, struct dlist_node *ioh_node)
Definition: resowner.c:1104
void ResourceOwnerForgetAioHandle(ResourceOwner owner, struct dlist_node *ioh_node)
Definition: resowner.c:1110
bool wait_on_fd_before_close
Definition: aio_internal.h:262
int(* submit)(uint16 num_staged_ios, PgAioHandle **staged_ios)
Definition: aio_internal.h:302
void(* wait_one)(PgAioHandle *ioh, uint64 ref_generation)
Definition: aio_internal.h:323
bool(* needs_synchronous_execution)(PgAioHandle *ioh)
Definition: aio_internal.h:288
uint32 io_handle_off
Definition: aio_internal.h:188
dclist_head in_flight_ios
Definition: aio_internal.h:219
uint16 num_staged_ios
Definition: aio_internal.h:208
dclist_head idle_ios
Definition: aio_internal.h:191
PgAioHandle * staged_ios[PGAIO_SUBMIT_BATCH_SIZE]
Definition: aio_internal.h:209
PgAioHandle * handed_out_io
Definition: aio_internal.h:200
PgAioHandle * io_handles
Definition: aio_internal.h:246
uint32 io_handle_count
Definition: aio_internal.h:245
PgAioTargetData target_data
Definition: aio_internal.h:181
struct ResourceOwnerData * resowner
Definition: aio_internal.h:142
int32 owner_procno
Definition: aio_internal.h:125
PgAioResult distilled_result
Definition: aio_internal.h:156
dlist_node node
Definition: aio_internal.h:140
uint8 handle_data_len
Definition: aio_internal.h:122
PgAioOp op
Definition: aio_internal.h:105
PgAioReturn * report_return
Definition: aio_internal.h:171
uint64 generation
Definition: aio_internal.h:146
uint8 num_callbacks
Definition: aio_internal.h:110
PgAioHandleState state
Definition: aio_internal.h:99
dlist_node resowner_node
Definition: aio_internal.h:143
PgAioTargetID target
Definition: aio_internal.h:102
ConditionVariable cv
Definition: aio_internal.h:153
uint32 status
Definition: aio_types.h:108
uint32 error_data
Definition: aio_types.h:111
uint32 id
Definition: aio_types.h:105
PgAioResult result
Definition: aio_types.h:132
PgAioTargetData target_data
Definition: aio_types.h:133
uint32 generation_upper
Definition: aio_types.h:45
uint32 aio_index
Definition: aio_types.h:35
uint32 generation_lower
Definition: aio_types.h:46
Definition: guc.h:174
dlist_node * cur
Definition: ilist.h:179
Definition: regguts.h:323
char * flag(int b)
Definition: test-ctype.c:33