Skip to content

Commit 668308b

Browse files
committed
Add support for scheduled retry to start minion
Adds support for creating a scheduled job that retries starting the minion process if it has died and did not restart correctly. This works on Windows and on systemd-based systems.
1 parent 84b8c85 commit 668308b

File tree

1 file changed

+197
-23
lines changed

1 file changed

+197
-23
lines changed

salt/modules/minion.py

Lines changed: 197 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,20 @@
33
minions
44
"""
55

6+
import datetime
7+
import logging
68
import os
79
import sys
810
import time
911

1012
import salt.key
1113
import salt.utils.data
14+
import salt.utils.path
1215
import salt.utils.systemd
1316
from salt.exceptions import CommandExecutionError
1417

18+
log = logging.getLogger(__name__)
19+
1520
# Don't shadow built-ins.
1621
__func_alias__ = {"list_": "list"}
1722

@@ -34,6 +39,75 @@ def _is_windows_system():
3439
return __grains__.get("kernel") == "Windows"
3540

3641

42+
def _schedule_retry_systemd(retry_delay):
    """
    Schedule a retry for the minion restart on systemd systems

    A transient timer is created with ``systemd-run --on-active``; when it
    fires, ``salt-minion`` is started if ``systemctl is-active`` reports that
    it is not running.

    retry_delay
        Delay before the check runs. The value is interpolated unchanged
        into the ``--on-active=`` option.

    Returns ``False`` if this is not a systemd system, otherwise the
    dictionary returned by ``cmd.run_all``.

    Raises ``CommandExecutionError`` if the ``systemd-run`` executable cannot
    be located or if the command exits with a non-zero return code.
    """
    if not _is_systemd_system():
        return False

    # which() returns a falsy value when the executable is not found; fail
    # with a clear message instead of putting that value into the argv list.
    systemd_run = salt.utils.path.which("systemd-run")
    if not systemd_run:
        raise CommandExecutionError(
            "Unable to schedule retry: systemd-run executable not found"
        )

    # Equivalent to:
    #   systemd-run --on-active=180 /bin/sh -c \
    #       "systemctl is-active salt-minion || systemctl start salt-minion"
    cmd = [
        systemd_run,
        f"--on-active={retry_delay}",
        "/bin/sh",
        "-c",
        "systemctl is-active salt-minion || systemctl start salt-minion",
    ]

    out = __salt__["cmd.run_all"](cmd, python_shell=False)
    if out["retcode"] != 0:
        raise CommandExecutionError(out["stderr"])
    return out
62+
63+
64+
def _schedule_retry_windows(retry_delay):
    """
    Schedule a retry for the minion restart on Windows systems

    A one-shot scheduled task named ``retry-minion-restart`` is created (or
    overwritten, since ``force=True``) that runs a PowerShell command which
    starts the ``salt-minion`` service if its status is not ``Running``.

    retry_delay
        Number of seconds from now at which the task should fire. Anything
        accepted by ``int()`` is allowed; fractional values are truncated.

    Returns ``False`` if this is not a Windows system, otherwise the value
    returned by ``task.create_task``.

    Raises ``CommandExecutionError`` if ``retry_delay`` is not a number, if
    ``powershell.exe`` cannot be located, or if the task could not be created.
    """
    if not _is_windows_system():
        return False

    # timedelta() rejects strings, so a delay such as "180" must be coerced
    # first. Raising CommandExecutionError keeps the failure inside the
    # exception type this function already uses.
    try:
        delay_seconds = int(retry_delay)
    except (ValueError, TypeError, OverflowError):
        raise CommandExecutionError(
            "Invalid retry_delay value: {}. Must be a number of seconds.".format(
                retry_delay
            )
        )

    # which() returns a falsy value when the executable is not found; do not
    # create a task without a command to run.
    cmd = salt.utils.path.which("powershell.exe")
    if not cmd:
        raise CommandExecutionError(
            "Unable to schedule retry: powershell.exe executable not found"
        )

    when = datetime.datetime.now() + datetime.timedelta(seconds=delay_seconds)
    args = "-ExecutionPolicy Bypass -NoProfile -WindowStyle Hidden -Command \"if ((Get-Service -Name salt-minion).Status -ne 'Running') { Start-Service -Name salt-minion }\""
    out = __salt__["task.create_task"](
        name="retry-minion-restart",
        user_name="System",
        force=True,
        action_type="Execute",
        cmd=cmd,
        arguments=args,
        trigger_type="Once",
        start_date=when.strftime("%Y-%m-%d"),
        start_time=when.strftime("%H:%M:%S"),
    )
    if not out:
        raise CommandExecutionError("Failed to add retry task")
    return out
88+
89+
90+
def _schedule_retry(retry_delay):
    """
    Schedule a retry for the minion restart

    Validates ``retry_delay`` and dispatches to the systemd or Windows
    implementation, whichever matches the current system.

    retry_delay
        Number of seconds to wait before the retry. Must be convertible with
        ``int()`` and must not be negative; the converted integer is what is
        handed to the platform helper.

    Returns the platform helper's result, or ``False`` if the system is
    neither systemd-based nor Windows.

    Raises ``CommandExecutionError`` if ``retry_delay`` is invalid, or if the
    platform helper fails to schedule the retry.
    """
    # Keep the converted value: validating with int() and then passing the
    # raw input on would let e.g. the string "180" reach code that needs a
    # real number. OverflowError covers float("inf").
    try:
        delay = int(retry_delay)
    except (ValueError, TypeError, OverflowError):
        delay = None

    # A negative delay cannot be scheduled meaningfully, so treat it the same
    # as a non-numeric value.
    if delay is None or delay < 0:
        raise CommandExecutionError(
            f"Invalid retry_delay value: {retry_delay}. Must be a number of seconds."
        )

    if _is_systemd_system():
        return _schedule_retry_systemd(delay)
    if _is_windows_system():
        return _schedule_retry_windows(delay)
    return False
109+
110+
37111
def list_():
38112
"""
39113
Return a list of accepted, denied, unaccepted and rejected keys.
@@ -156,31 +230,31 @@ def kill(timeout=15):
156230
return ret
157231

158232

159-
def restart(systemd=True, win_service=True):
233+
def restart(systemd=True, win_service=True, schedule_retry=False, retry_delay=180):
160234
"""
161235
Restart the salt minion.
162236
163237
The method to restart the minion will be chosen as follows:
164238
165-
- If ``minion_restart_command`` is set in the minion configuration then
166-
the command specified will be used to restart the minion.
239+
If ``minion_restart_command`` is set in the minion configuration then
240+
the command specified will be used to restart the minion.
167241
168-
- If the minion is running as a systemd service then the minion will be
169-
restarted using the systemd_service module, unless ``systemd`` is
170-
set to ``False``
242+
If the minion is running as a systemd service then the minion will be
243+
restarted using the systemd_service module, unless ``systemd`` is
244+
set to ``False``
171245
172-
- If the minion is running as a Windows service then the minion will be
173-
restarted using the win_service module, unless ``win_service`` is
174-
set to ``False``
246+
If the minion is running as a Windows service then the minion will be
247+
restarted using the win_service module, unless ``win_service`` is
248+
set to ``False``
175249
176-
- If the salt-minion process is running in daemon mode (the ``-d``
177-
argument is present in ``argv``) then the minion will be killed and
178-
restarted using the same command line arguments, if possible.
250+
If the salt-minion process is running in daemon mode (the ``-d``
251+
argument is present in ``argv``) then the minion will be killed and
252+
restarted using the same command line arguments, if possible.
179253
180-
- If the salt-minion process is running in the foreground (the ``-d``
181-
argument is not present in ``argv``) then the minion will be killed but not
182-
restarted. This behavior is intended for minion processes that are managed
183-
by a process supervisor.
254+
If the salt-minion process is running in the foreground (the ``-d``
255+
argument is not present in ``argv``) then the minion will be killed but
256+
not restarted. This behavior is intended for minion processes that are
257+
managed by a process supervisor.
184258
185259
systemd
186260
If set to ``False`` then systemd will not be used to restart the minion.
@@ -190,7 +264,58 @@ def restart(systemd=True, win_service=True):
190264
If set to ``False`` then the Windows service manager will not be used to
191265
restart the minion. Defaults to ``True``.
192266
193-
CLI Example:
267+
schedule_retry
268+
If set to ``True`` then a scheduled job will be added to start the
269+
minion if it has failed to restart after the retry_delay
270+
271+
retry_delay
272+
The amount of time to wait before attempting to start the minion if it
273+
has failed to restart. Defaults to 180 seconds.
274+
275+
CLI Examples:
276+
277+
.. code-block:: bash
278+
279+
salt minion[12] minion.restart
280+
281+
minion1:
282+
----------
283+
comment:
284+
- Using systemctl to restart salt-minion
285+
- Service restart successful
286+
killed:
287+
None
288+
restart:
289+
----------
290+
retcode:
291+
0
292+
service_restart:
293+
----------
294+
result:
295+
True
296+
minion2:
297+
----------
298+
comment:
299+
- Using windows service manager to restart salt-minion
300+
- Service restart successful
301+
killed:
302+
None
303+
restart:
304+
----------
305+
retcode:
306+
0
307+
service_restart:
308+
----------
309+
result:
310+
True
311+
312+
The result shows that ``minion1`` was restarted using systemd and
313+
``minion2`` was restarted using the Windows service manager. The
314+
``service_restart`` field indicates the result of the service restart
315+
operation. The ``killed`` field is ``None`` because the minion was restarted
316+
using the service manager and not by killing the process. The ``restart``
317+
field is empty because the minion was restarted using the service manager
318+
and not by running the command line arguments of the minion process.
194319
195320
.. code-block:: bash
196321
@@ -305,21 +430,70 @@ def restart(systemd=True, win_service=True):
305430

306431
if not restart_cmd:
307432
if systemd and _is_systemd_system():
433+
schedule_retry_failed = False
308434
try:
435+
if schedule_retry:
436+
ret["service_restart"]["schedule_retry"] = {}
437+
438+
schedule_retry_failed = True
439+
sched = _schedule_retry(retry_delay)
440+
schedule_retry_failed = False
441+
442+
ret["service_restart"]["schedule_retry"]["detail"] = sched.get(
443+
"stderr", ""
444+
)
445+
ret["service_restart"]["schedule_retry"]["delay"] = retry_delay
446+
comment.append(
447+
"Scheduled retry for minion restart in {} seconds".format(
448+
retry_delay
449+
)
450+
)
451+
309452
ret["service_restart"]["result"] = __salt__["service.restart"](
310453
"salt-minion", no_block=True
311454
)
455+
comment.append("Service restart successful")
312456
except CommandExecutionError as e:
313-
comment.append("Service restart failed")
457+
comment.append(
458+
"Adding scheduled retry failed"
459+
if schedule_retry_failed
460+
else "Service restart failed"
461+
)
314462
ret["service_restart"]["result"] = False
315463
ret["service_restart"]["stderr"] = str(e)
316464
ret["retcode"] = salt.defaults.exitcodes.EX_SOFTWARE
465+
317466
elif win_service and _is_windows_system():
318-
ret["service_restart"]["result"] = __salt__["service.restart"](
319-
"salt-minion"
320-
)
321-
if not ret["service_restart"]:
322-
comment.append("Service restart failed")
467+
schedule_retry_failed = False
468+
try:
469+
if schedule_retry:
470+
ret["service_restart"]["schedule_retry"] = {}
471+
472+
schedule_retry_failed = True
473+
sched = _schedule_retry(retry_delay)
474+
schedule_retry_failed = False
475+
476+
ret["service_restart"]["schedule_retry"]["delay"] = retry_delay
477+
comment.append(
478+
"Scheduled retry for minion restart in {} seconds".format(
479+
retry_delay
480+
)
481+
)
482+
483+
ret["service_restart"]["result"] = __salt__["service.restart"](
484+
"salt-minion"
485+
)
486+
if not ret["service_restart"]:
487+
raise CommandExecutionError("Failed to restart salt-minion service")
488+
comment.append("Service restart successful")
489+
except CommandExecutionError as e:
490+
comment.append(
491+
"Adding scheduled retry failed"
492+
if schedule_retry_failed
493+
else "Service restart failed"
494+
)
495+
ret["service_restart"]["result"] = False
496+
ret["service_restart"]["stderr"] = str(e)
323497
ret["retcode"] = salt.defaults.exitcodes.EX_SOFTWARE
324498

325499
if comment:

0 commit comments

Comments
 (0)