@@ -408,6 +408,7 @@ static void TerminateChildren(int signal);
408
408
/*
 * Shorthand for SignalSomeChildren() with BACKEND_TYPE_ALL as the target.
 *
 * The parameter list must follow the macro name with no intervening
 * whitespace; otherwise the preprocessor treats this as an object-like macro
 * whose replacement text starts with "(sig)".
 */
#define SignalChildren(sig) SignalSomeChildren(sig, BACKEND_TYPE_ALL)
409
409
410
410
static int CountChildren (int target );
411
+ static bool assign_backendlist_entry (RegisteredBgWorker * rw );
411
412
static void maybe_start_bgworker (void );
412
413
static bool CreateOptsFile (int argc , char * argv [], char * fullprogname );
413
414
static pid_t StartChildProcess (AuxProcType type );
@@ -5465,13 +5466,33 @@ bgworker_forkexec(int shmem_slot)
5465
5466
* Start a new bgworker.
5466
5467
* Starting time conditions must have been checked already.
5467
5468
*
5469
+ * Returns true on success, false on failure.
5470
+ * In either case, update the RegisteredBgWorker's state appropriately.
5471
+ *
5468
5472
* This code is heavily based on autovacuum.c, q.v.
5469
5473
*/
5470
- static void
5474
+ static bool
5471
5475
do_start_bgworker (RegisteredBgWorker * rw )
5472
5476
{
5473
5477
pid_t worker_pid ;
5474
5478
5479
+ Assert (rw -> rw_pid == 0 );
5480
+
5481
+ /*
5482
+ * Allocate and assign the Backend element. Note we must do this before
5483
+ * forking, so that we can handle out of memory properly.
5484
+ *
5485
+ * Treat failure as though the worker had crashed. That way, the
5486
+ * postmaster will wait a bit before attempting to start it again; if it
5487
+ * tried again right away, most likely it'd find itself repeating the
5488
+ * out-of-memory or fork failure condition.
5489
+ */
5490
+ if (!assign_backendlist_entry (rw ))
5491
+ {
5492
+ rw -> rw_crashed_at = GetCurrentTimestamp ();
5493
+ return false;
5494
+ }
5495
+
5475
5496
ereport (DEBUG1 ,
5476
5497
(errmsg ("starting background worker process \"%s\"" ,
5477
5498
rw -> rw_worker .bgw_name )));
@@ -5483,9 +5504,17 @@ do_start_bgworker(RegisteredBgWorker *rw)
5483
5504
#endif
5484
5505
{
5485
5506
case -1 :
5507
+ /* in postmaster, fork failed ... */
5486
5508
ereport (LOG ,
5487
5509
(errmsg ("could not fork worker process: %m" )));
5488
- return ;
5510
+ /* undo what assign_backendlist_entry did */
5511
+ ReleasePostmasterChildSlot (rw -> rw_child_slot );
5512
+ rw -> rw_child_slot = 0 ;
5513
+ free (rw -> rw_backend );
5514
+ rw -> rw_backend = NULL ;
5515
+ /* mark entry as crashed, so we'll try again later */
5516
+ rw -> rw_crashed_at = GetCurrentTimestamp ();
5517
+ break ;
5489
5518
5490
5519
#ifndef EXEC_BACKEND
5491
5520
case 0 :
@@ -5499,13 +5528,24 @@ do_start_bgworker(RegisteredBgWorker *rw)
5499
5528
5500
5529
MyBgworkerEntry = & rw -> rw_worker ;
5501
5530
StartBackgroundWorker ();
5531
+
5532
+ exit (1 ); /* should not get here */
5502
5533
break ;
5503
5534
#endif
5504
5535
default :
5536
+ /* in postmaster, fork successful ... */
5505
5537
rw -> rw_pid = worker_pid ;
5506
5538
rw -> rw_backend -> pid = rw -> rw_pid ;
5507
5539
ReportBackgroundWorkerPID (rw );
5540
+ /* add new worker to lists of backends */
5541
+ dlist_push_head (& BackendList , & rw -> rw_backend -> elem );
5542
+ #ifdef EXEC_BACKEND
5543
+ ShmemBackendArrayAdd (rw -> rw_backend );
5544
+ #endif
5545
+ return true;
5508
5546
}
5547
+
5548
+ return false;
5509
5549
}
5510
5550
5511
5551
/*
@@ -5552,6 +5592,8 @@ bgworker_should_start_now(BgWorkerStartTime start_time)
5552
5592
* Allocate the Backend struct for a connected background worker, but don't
5553
5593
* add it to the list of backends just yet.
5554
5594
*
5595
+ * On failure, return false without changing any worker state.
5596
+ *
5555
5597
* Some info from the Backend is copied into the passed rw.
5556
5598
*/
5557
5599
static bool
@@ -5564,14 +5606,6 @@ assign_backendlist_entry(RegisteredBgWorker *rw)
5564
5606
ereport (LOG ,
5565
5607
(errcode (ERRCODE_OUT_OF_MEMORY ),
5566
5608
errmsg ("out of memory" )));
5567
-
5568
- /*
5569
- * The worker didn't really crash, but setting this nonzero makes
5570
- * postmaster wait a bit before attempting to start it again; if it
5571
- * tried again right away, most likely it'd find itself under the same
5572
- * memory pressure.
5573
- */
5574
- rw -> rw_crashed_at = GetCurrentTimestamp ();
5575
5609
return false;
5576
5610
}
5577
5611
@@ -5601,20 +5635,31 @@ assign_backendlist_entry(RegisteredBgWorker *rw)
5601
5635
* As a side effect, the bgworker control variables are set or reset whenever
5602
5636
* there are more workers to start after this one, and whenever the overall
5603
5637
* system state requires it.
5638
+ *
5639
+ * The reason we start at most one worker per call is to avoid consuming the
5640
+ * postmaster's attention for too long when many such requests are pending.
5641
+ * As long as StartWorkerNeeded is true, ServerLoop will not block and will
5642
+ * call this function again after dealing with any other issues.
5604
5643
*/
5605
5644
static void
5606
5645
maybe_start_bgworker (void )
5607
5646
{
5608
5647
slist_mutable_iter iter ;
5609
5648
TimestampTz now = 0 ;
5610
5649
5650
+ /*
5651
+ * During crash recovery, we have no need to be called until the state
5652
+ * transition out of recovery.
5653
+ */
5611
5654
if (FatalError )
5612
5655
{
5613
5656
StartWorkerNeeded = false;
5614
5657
HaveCrashedWorker = false;
5615
- return ; /* not yet */
5658
+ return ;
5616
5659
}
5617
5660
5661
+ /* Don't need to be called again unless we find a reason for it below */
5662
+ StartWorkerNeeded = false;
5618
5663
HaveCrashedWorker = false;
5619
5664
5620
5665
slist_foreach_modify (iter , & BackgroundWorkerList )
@@ -5623,11 +5668,11 @@ maybe_start_bgworker(void)
5623
5668
5624
5669
rw = slist_container (RegisteredBgWorker , rw_lnode , iter .cur );
5625
5670
5626
- /* already running? */
5671
+ /* ignore if already running */
5627
5672
if (rw -> rw_pid != 0 )
5628
5673
continue ;
5629
5674
5630
- /* marked for death? */
5675
+ /* if marked for death, clean up and remove from list */
5631
5676
if (rw -> rw_terminate )
5632
5677
{
5633
5678
ForgetBackgroundWorker (& iter );
@@ -5649,49 +5694,50 @@ maybe_start_bgworker(void)
5649
5694
continue ;
5650
5695
}
5651
5696
5697
+ /* read system time only when needed */
5652
5698
if (now == 0 )
5653
5699
now = GetCurrentTimestamp ();
5654
5700
5655
5701
if (!TimestampDifferenceExceeds (rw -> rw_crashed_at , now ,
5656
5702
rw -> rw_worker .bgw_restart_time * 1000 ))
5657
5703
{
5704
+ /* Set flag to remember that we have workers to start later */
5658
5705
HaveCrashedWorker = true;
5659
5706
continue ;
5660
5707
}
5661
5708
}
5662
5709
5663
5710
if (bgworker_should_start_now (rw -> rw_worker .bgw_start_time ))
5664
5711
{
5665
- /* reset crash time before calling assign_backendlist_entry */
5712
+ /* reset crash time before trying to start worker */
5666
5713
rw -> rw_crashed_at = 0 ;
5667
5714
5668
5715
/*
5669
- * Allocate and assign the Backend element. Note we
5670
- * must do this before forking, so that we can handle out of
5671
- * memory properly.
5716
+ * Try to start the worker.
5717
+ *
5718
+ * On failure, give up processing workers for now, but set
5719
+ * StartWorkerNeeded so we'll come back here on the next iteration
5720
+ * of ServerLoop to try again. (We don't want to wait, because
5721
+ * there might be additional ready-to-run workers.) We could set
5722
+ * HaveCrashedWorker as well, since this worker is now marked
5723
+ * crashed, but there's no need because the next run of this
5724
+ * function will do that.
5672
5725
*/
5673
- if (!assign_backendlist_entry (rw ))
5726
+ if (!do_start_bgworker (rw ))
5727
+ {
5728
+ StartWorkerNeeded = true;
5674
5729
return ;
5675
-
5676
- do_start_bgworker (rw ); /* sets rw->rw_pid */
5677
-
5678
- dlist_push_head (& BackendList , & rw -> rw_backend -> elem );
5679
- #ifdef EXEC_BACKEND
5680
- ShmemBackendArrayAdd (rw -> rw_backend );
5681
- #endif
5730
+ }
5682
5731
5683
5732
/*
5684
- * Have ServerLoop call us again. Note that there might not
5685
* actually *be* another runnable worker, but we don't care all
5686
- * that much; we will find out the next time we run.
5733
+ * Quit, but have ServerLoop call us again to look for additional
5734
* ready-to-run workers. There might not be any, but we'll find
5735
+ * out the next time we run.
5687
5736
*/
5688
5737
StartWorkerNeeded = true;
5689
5738
return ;
5690
5739
}
5691
5740
}
5692
-
5693
- /* no runnable worker found */
5694
- StartWorkerNeeded = false;
5695
5741
}
5696
5742
5697
5743
/*
0 commit comments