@@ -420,6 +420,7 @@ static void TerminateChildren(int signal);
420
420
#define SignalChildren (sig ) SignalSomeChildren(sig, BACKEND_TYPE_ALL)
421
421
422
422
static int CountChildren (int target );
423
+ static bool assign_backendlist_entry (RegisteredBgWorker * rw );
423
424
static void maybe_start_bgworker (void );
424
425
static bool CreateOptsFile (int argc , char * argv [], char * fullprogname );
425
426
static pid_t StartChildProcess (AuxProcType type );
@@ -5531,13 +5532,33 @@ bgworker_forkexec(int shmem_slot)
5531
5532
* Start a new bgworker.
5532
5533
* Starting time conditions must have been checked already.
5533
5534
*
5535
+ * Returns true on success, false on failure.
5536
+ * In either case, update the RegisteredBgWorker's state appropriately.
5537
+ *
5534
5538
* This code is heavily based on autovacuum.c, q.v.
5535
5539
*/
5536
- static void
5540
+ static bool
5537
5541
do_start_bgworker (RegisteredBgWorker * rw )
5538
5542
{
5539
5543
pid_t worker_pid ;
5540
5544
5545
+ Assert (rw -> rw_pid == 0 );
5546
+
5547
+ /*
5548
+ * Allocate and assign the Backend element. Note we must do this before
5549
+ * forking, so that we can handle out of memory properly.
5550
+ *
5551
+ * Treat failure as though the worker had crashed. That way, the
5552
+ * postmaster will wait a bit before attempting to start it again; if it
5553
+ * tried again right away, most likely it'd find itself repeating the
5554
+ * out-of-memory or fork failure condition.
5555
+ */
5556
+ if (!assign_backendlist_entry (rw ))
5557
+ {
5558
+ rw -> rw_crashed_at = GetCurrentTimestamp ();
5559
+ return false;
5560
+ }
5561
+
5541
5562
ereport (DEBUG1 ,
5542
5563
(errmsg ("starting background worker process \"%s\"" ,
5543
5564
rw -> rw_worker .bgw_name )));
@@ -5549,9 +5570,17 @@ do_start_bgworker(RegisteredBgWorker *rw)
5549
5570
#endif
5550
5571
{
5551
5572
case -1 :
5573
+ /* in postmaster, fork failed ... */
5552
5574
ereport (LOG ,
5553
5575
(errmsg ("could not fork worker process: %m" )));
5554
- return ;
5576
+ /* undo what assign_backendlist_entry did */
5577
+ ReleasePostmasterChildSlot (rw -> rw_child_slot );
5578
+ rw -> rw_child_slot = 0 ;
5579
+ free (rw -> rw_backend );
5580
+ rw -> rw_backend = NULL ;
5581
+ /* mark entry as crashed, so we'll try again later */
5582
+ rw -> rw_crashed_at = GetCurrentTimestamp ();
5583
+ break ;
5555
5584
5556
5585
#ifndef EXEC_BACKEND
5557
5586
case 0 :
@@ -5575,14 +5604,24 @@ do_start_bgworker(RegisteredBgWorker *rw)
5575
5604
PostmasterContext = NULL ;
5576
5605
5577
5606
StartBackgroundWorker ();
5607
+
5608
+ exit (1 ); /* should not get here */
5578
5609
break ;
5579
5610
#endif
5580
5611
default :
5612
+ /* in postmaster, fork successful ... */
5581
5613
rw -> rw_pid = worker_pid ;
5582
5614
rw -> rw_backend -> pid = rw -> rw_pid ;
5583
5615
ReportBackgroundWorkerPID (rw );
5584
- break ;
5616
+ /* add new worker to lists of backends */
5617
+ dlist_push_head (& BackendList , & rw -> rw_backend -> elem );
5618
+ #ifdef EXEC_BACKEND
5619
+ ShmemBackendArrayAdd (rw -> rw_backend );
5620
+ #endif
5621
+ return true;
5585
5622
}
5623
+
5624
+ return false;
5586
5625
}
5587
5626
5588
5627
/*
@@ -5629,6 +5668,8 @@ bgworker_should_start_now(BgWorkerStartTime start_time)
5629
5668
* Allocate the Backend struct for a connected background worker, but don't
5630
5669
* add it to the list of backends just yet.
5631
5670
*
5671
+ * On failure, return false without changing any worker state.
5672
+ *
5632
5673
* Some info from the Backend is copied into the passed rw.
5633
5674
*/
5634
5675
static bool
@@ -5647,8 +5688,6 @@ assign_backendlist_entry(RegisteredBgWorker *rw)
5647
5688
ereport (LOG ,
5648
5689
(errcode (ERRCODE_INTERNAL_ERROR ),
5649
5690
errmsg ("could not generate random cancel key" )));
5650
-
5651
- rw -> rw_crashed_at = GetCurrentTimestamp ();
5652
5691
return false;
5653
5692
}
5654
5693
@@ -5658,14 +5697,6 @@ assign_backendlist_entry(RegisteredBgWorker *rw)
5658
5697
ereport (LOG ,
5659
5698
(errcode (ERRCODE_OUT_OF_MEMORY ),
5660
5699
errmsg ("out of memory" )));
5661
-
5662
- /*
5663
- * The worker didn't really crash, but setting this nonzero makes
5664
- * postmaster wait a bit before attempting to start it again; if it
5665
- * tried again right away, most likely it'd find itself under the same
5666
- * memory pressure.
5667
- */
5668
- rw -> rw_crashed_at = GetCurrentTimestamp ();
5669
5700
return false;
5670
5701
}
5671
5702
@@ -5687,20 +5718,31 @@ assign_backendlist_entry(RegisteredBgWorker *rw)
5687
5718
* As a side effect, the bgworker control variables are set or reset whenever
5688
5719
* there are more workers to start after this one, and whenever the overall
5689
5720
* system state requires it.
5721
+ *
5722
+ * The reason we start at most one worker per call is to avoid consuming the
5723
+ * postmaster's attention for too long when many such requests are pending.
5724
+ * As long as StartWorkerNeeded is true, ServerLoop will not block and will
5725
+ * call this function again after dealing with any other issues.
5690
5726
*/
5691
5727
static void
5692
5728
maybe_start_bgworker (void )
5693
5729
{
5694
5730
slist_mutable_iter iter ;
5695
5731
TimestampTz now = 0 ;
5696
5732
5733
+ /*
5734
+ * During crash recovery, we have no need to be called until the state
5735
+ * transition out of recovery.
5736
+ */
5697
5737
if (FatalError )
5698
5738
{
5699
5739
StartWorkerNeeded = false;
5700
5740
HaveCrashedWorker = false;
5701
- return ; /* not yet */
5741
+ return ;
5702
5742
}
5703
5743
5744
+ /* Don't need to be called again unless we find a reason for it below */
5745
+ StartWorkerNeeded = false;
5704
5746
HaveCrashedWorker = false;
5705
5747
5706
5748
slist_foreach_modify (iter , & BackgroundWorkerList )
@@ -5709,11 +5751,11 @@ maybe_start_bgworker(void)
5709
5751
5710
5752
rw = slist_container (RegisteredBgWorker , rw_lnode , iter .cur );
5711
5753
5712
- /* already running? */
5754
+ /* ignore if already running */
5713
5755
if (rw -> rw_pid != 0 )
5714
5756
continue ;
5715
5757
5716
- /* marked for death? */
5758
+ /* if marked for death, clean up and remove from list */
5717
5759
if (rw -> rw_terminate )
5718
5760
{
5719
5761
ForgetBackgroundWorker (& iter );
@@ -5735,48 +5777,50 @@ maybe_start_bgworker(void)
5735
5777
continue ;
5736
5778
}
5737
5779
5780
+ /* read system time only when needed */
5738
5781
if (now == 0 )
5739
5782
now = GetCurrentTimestamp ();
5740
5783
5741
5784
if (!TimestampDifferenceExceeds (rw -> rw_crashed_at , now ,
5742
5785
rw -> rw_worker .bgw_restart_time * 1000 ))
5743
5786
{
5787
+ /* Set flag to remember that we have workers to start later */
5744
5788
HaveCrashedWorker = true;
5745
5789
continue ;
5746
5790
}
5747
5791
}
5748
5792
5749
5793
if (bgworker_should_start_now (rw -> rw_worker .bgw_start_time ))
5750
5794
{
5751
- /* reset crash time before calling assign_backendlist_entry */
5795
+ /* reset crash time before trying to start worker */
5752
5796
rw -> rw_crashed_at = 0 ;
5753
5797
5754
5798
/*
5755
- * Allocate and assign the Backend element. Note we must do this
5756
- * before forking, so that we can handle out of memory properly.
5799
+ * Try to start the worker.
5800
+ *
5801
+ * On failure, give up processing workers for now, but set
5802
+ * StartWorkerNeeded so we'll come back here on the next iteration
5803
+ * of ServerLoop to try again. (We don't want to wait, because
5804
+ * there might be additional ready-to-run workers.) We could set
5805
+ * HaveCrashedWorker as well, since this worker is now marked
5806
+ * crashed, but there's no need because the next run of this
5807
+ * function will do that.
5757
5808
*/
5758
- if (!assign_backendlist_entry (rw ))
5809
+ if (!do_start_bgworker (rw ))
5810
+ {
5811
+ StartWorkerNeeded = true;
5759
5812
return ;
5760
-
5761
- do_start_bgworker (rw ); /* sets rw->rw_pid */
5762
-
5763
- dlist_push_head (& BackendList , & rw -> rw_backend -> elem );
5764
- #ifdef EXEC_BACKEND
5765
- ShmemBackendArrayAdd (rw -> rw_backend );
5766
- #endif
5813
+ }
5767
5814
5768
5815
/*
5769
- * Have ServerLoop call us again. Note that there might not
5770
- * actually *be* another runnable worker , but we don't care all
5771
- * that much; we will find out the next time we run.
5816
+ * Quit, but have ServerLoop call us again to look for additional
5817
+ * ready-to-run workers. There might not be any, but we'll find
5818
+ * out the next time we run.
5772
5819
*/
5773
5820
StartWorkerNeeded = true;
5774
5821
return ;
5775
5822
}
5776
5823
}
5777
-
5778
- /* no runnable worker found */
5779
- StartWorkerNeeded = false;
5780
5824
}
5781
5825
5782
5826
/*
0 commit comments