bind10_src.py.in 37 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981
  1. #!@PYTHON@
  2. # Copyright (C) 2010,2011 Internet Systems Consortium.
  3. #
  4. # Permission to use, copy, modify, and distribute this software for any
  5. # purpose with or without fee is hereby granted, provided that the above
  6. # copyright notice and this permission notice appear in all copies.
  7. #
  8. # THE SOFTWARE IS PROVIDED "AS IS" AND INTERNET SYSTEMS CONSORTIUM
  9. # DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL
  10. # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL
  11. # INTERNET SYSTEMS CONSORTIUM BE LIABLE FOR ANY SPECIAL, DIRECT,
  12. # INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING
  13. # FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
  14. # NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
  15. # WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  16. """
  17. This file implements the Boss of Bind (BoB, or bob) program.
  18. Its purpose is to start up the BIND 10 system, and then manage the
  19. processes, by starting and stopping processes, plus restarting
  20. processes that exit.
  21. To start the system, it first runs the c-channel program (msgq), then
  22. connects to that. It then runs the configuration manager, and reads
  23. its own configuration. Then it proceeds to starting other modules.
  24. The Python subprocess module is used for starting processes, but
  25. because this is not efficient for managing groups of processes,
  26. SIGCHLD signals are caught and processed using the signal module.
  27. Most of the logic is contained in the BoB class. However, since Python
  28. requires that signal processing happen in the main thread, we do
  29. signal handling outside of that class, in the code running for
  30. __main__.
  31. """
  32. import sys; sys.path.append ('@@PYTHONPATH@@')
  33. import os
# If B10_FROM_SOURCE is set in the environment, we use data files
# from a directory relative to that, otherwise we use the ones
# installed on the system
if "B10_FROM_SOURCE" in os.environ:
    # Running out of a source tree: the spec file sits at a fixed path
    # below the directory named by B10_FROM_SOURCE.
    SPECFILE_LOCATION = os.environ["B10_FROM_SOURCE"] + "/src/bin/bind10/bob.spec"
else:
    # NOTE(review): the @...@ tokens look like build-time substitution
    # placeholders (this file is a ".in" template) -- confirm against the
    # build system.
    PREFIX = "@prefix@"
    DATAROOTDIR = "@datarootdir@"
    # Expand any literal "${datarootdir}" / "${prefix}" references left in
    # the path; datarootdir is expanded first, then prefix.
    SPECFILE_LOCATION = "@datadir@/@PACKAGE@/bob.spec".replace("${datarootdir}", DATAROOTDIR).replace("${prefix}", PREFIX)
  43. import subprocess
  44. import signal
  45. import re
  46. import errno
  47. import time
  48. import select
  49. import random
  50. import socket
  51. from optparse import OptionParser, OptionValueError
  52. import io
  53. import pwd
  54. import posix
  55. import isc.cc
  56. import isc.util.process
  57. import isc.net.parse
  58. import isc.log
  59. from isc.log_messages.bind10_messages import *
  60. import isc.bind10.component
  61. import isc.bind10.special_component
# Initialise logging for this program and create the module-level logger
# used by everything below.
isc.log.init("b10-boss")
logger = isc.log.Logger("boss")

# Pending system-wide debug level definitions, the ones we
# use here are hardcoded for now
DBG_PROCESS = 10
DBG_COMMANDS = 30

# Assign this process some longer name
isc.util.process.rename(sys.argv[0])

# This is the version that gets displayed to the user.
# The VERSION string consists of the module name, the module version
# number, and the overall BIND 10 version number (set in configure.ac).
VERSION = "bind10 20110223 (BIND 10 @PACKAGE_VERSION@)"

# Captured once at import time (UTC); reported by BoB._get_stats_data()
# as the 'bind10.boot_time' item for the stats module.
_BASETIME = time.gmtime()
  76. class RestartSchedule:
  77. """
  78. Keeps state when restarting something (in this case, a process).
  79. When a process dies unexpectedly, we need to restart it. However, if
  80. it fails to restart for some reason, then we should not simply keep
  81. restarting it at high speed.
  82. A more sophisticated algorithm can be developed, but for now we choose
  83. a simple set of rules:
  84. * If a process was been running for >=10 seconds, we restart it
  85. right away.
  86. * If a process was running for <10 seconds, we wait until 10 seconds
  87. after it was started.
  88. To avoid programs getting into lockstep, we use a normal distribution
  89. to avoid being restarted at exactly 10 seconds."""
  90. def __init__(self, restart_frequency=10.0):
  91. self.restart_frequency = restart_frequency
  92. self.run_start_time = None
  93. self.run_stop_time = None
  94. self.restart_time = None
  95. def set_run_start_time(self, when=None):
  96. if when is None:
  97. when = time.time()
  98. self.run_start_time = when
  99. sigma = self.restart_frequency * 0.05
  100. self.restart_time = when + random.normalvariate(self.restart_frequency,
  101. sigma)
  102. def set_run_stop_time(self, when=None):
  103. """We don't actually do anything with stop time now, but it
  104. might be useful for future algorithms."""
  105. if when is None:
  106. when = time.time()
  107. self.run_stop_time = when
  108. def get_restart_time(self, when=None):
  109. if when is None:
  110. when = time.time()
  111. return max(when, self.restart_time)
  112. class ProcessInfoError(Exception): pass
  113. class ProcessInfo:
  114. """Information about a process"""
  115. dev_null = open(os.devnull, "w")
  116. def __init__(self, name, args, env={}, dev_null_stdout=False,
  117. dev_null_stderr=False, uid=None, username=None):
  118. self.name = name
  119. self.args = args
  120. self.env = env
  121. self.dev_null_stdout = dev_null_stdout
  122. self.dev_null_stderr = dev_null_stderr
  123. self.restart_schedule = RestartSchedule()
  124. self.uid = uid
  125. self.username = username
  126. self.process = None
  127. self.pid = None
  128. def _preexec_work(self):
  129. """Function used before running a program that needs to run as a
  130. different user."""
  131. # First, put us into a separate process group so we don't get
  132. # SIGINT signals on Ctrl-C (the boss will shut everthing down by
  133. # other means).
  134. os.setpgrp()
  135. # Second, set the user ID if one has been specified
  136. if self.uid is not None:
  137. try:
  138. posix.setuid(self.uid)
  139. except OSError as e:
  140. if e.errno == errno.EPERM:
  141. # if we failed to change user due to permission report that
  142. raise ProcessInfoError("Unable to change to user %s (uid %d)" % (self.username, self.uid))
  143. else:
  144. # otherwise simply re-raise whatever error we found
  145. raise
  146. def _spawn(self):
  147. if self.dev_null_stdout:
  148. spawn_stdout = self.dev_null
  149. else:
  150. spawn_stdout = None
  151. if self.dev_null_stderr:
  152. spawn_stderr = self.dev_null
  153. else:
  154. spawn_stderr = None
  155. # Environment variables for the child process will be a copy of those
  156. # of the boss process with any additional specific variables given
  157. # on construction (self.env).
  158. spawn_env = os.environ
  159. spawn_env.update(self.env)
  160. if 'B10_FROM_SOURCE' not in os.environ:
  161. spawn_env['PATH'] = "@@LIBEXECDIR@@:" + spawn_env['PATH']
  162. self.process = subprocess.Popen(self.args,
  163. stdin=subprocess.PIPE,
  164. stdout=spawn_stdout,
  165. stderr=spawn_stderr,
  166. close_fds=True,
  167. env=spawn_env,
  168. preexec_fn=self._preexec_work)
  169. self.pid = self.process.pid
  170. self.restart_schedule.set_run_start_time()
  171. # spawn() and respawn() are the same for now, but in the future they
  172. # may have different functionality
  173. def spawn(self):
  174. self._spawn()
  175. def respawn(self):
  176. self._spawn()
  177. class CChannelConnectError(Exception): pass
  178. class BoB:
  179. """Boss of BIND class."""
  180. def __init__(self, msgq_socket_file=None, data_path=None,
  181. config_filename=None, nocache=False, verbose=False, setuid=None,
  182. username=None, cmdctl_port=None, brittle=False):
  183. """
  184. Initialize the Boss of BIND. This is a singleton (only one can run).
  185. The msgq_socket_file specifies the UNIX domain socket file that the
  186. msgq process listens on. If verbose is True, then the boss reports
  187. what it is doing.
  188. Data path and config filename are passed trough to config manager
  189. (if provided) and specify the config file to be used.
  190. The cmdctl_port is passed to cmdctl and specify on which port it
  191. should listen.
  192. """
  193. self.cc_session = None
  194. self.ccs = None
  195. self.curproc = None
  196. self.dead_processes = {}
  197. self.msgq_socket_file = msgq_socket_file
  198. self.nocache = nocache
  199. self.processes = {}
  200. self.runnable = False
  201. self.uid = setuid
  202. self.username = username
  203. self.verbose = verbose
  204. self.data_path = data_path
  205. self.config_filename = config_filename
  206. self.cmdctl_port = cmdctl_port
  207. self.brittle = brittle
  208. self._component_configurator = isc.bind10.component.Configurator(self,
  209. isc.bind10.special_component.get_specials())
  210. self.__core_components = {
  211. 'sockcreator': {
  212. 'kind': 'core',
  213. 'special': 'sockcreator',
  214. 'priority': 200
  215. },
  216. 'msgq': {
  217. 'kind': 'core',
  218. 'special': 'msgq',
  219. 'priority': 199
  220. },
  221. 'cfgmgr': {
  222. 'kind': 'core',
  223. 'special': 'cfgmgr',
  224. 'priority': 198
  225. }
  226. }
  227. self.__started = False
  228. self.__stopping = False
  229. self.exitcode = 0
  230. def __propagate_component_config(self, config):
  231. comps = dict(config)
  232. # Fill in the core components, so they stay alive
  233. for comp in self.__core_components:
  234. if comp in comps:
  235. raise Exception(comp + " is core component managed by " +
  236. "bind10 boss, do not set it")
  237. comps[comp] = self.__core_components[comp]
  238. # Update the configuration
  239. self._component_configurator.reconfigure(comps)
  240. def config_handler(self, new_config):
  241. # If this is initial update, don't do anything now, leave it to startup
  242. if not self.runnable:
  243. return
  244. logger.debug(DBG_COMMANDS, BIND10_RECEIVED_NEW_CONFIGURATION,
  245. new_config)
  246. try:
  247. if 'components' in new_config:
  248. self.__propagate_component_config(new_config['components'])
  249. return isc.config.ccsession.create_answer(0)
  250. except Exception as e:
  251. return isc.config.ccsession.create_answer(1, str(e))
  252. def get_processes(self):
  253. pids = list(self.processes.keys())
  254. pids.sort()
  255. process_list = [ ]
  256. for pid in pids:
  257. process_list.append([pid, self.processes[pid].name()])
  258. return process_list
  259. def _get_stats_data(self):
  260. return { "stats_data": {
  261. 'bind10.boot_time': time.strftime('%Y-%m-%dT%H:%M:%SZ', _BASETIME)
  262. }}
  263. def command_handler(self, command, args):
  264. logger.debug(DBG_COMMANDS, BIND10_RECEIVED_COMMAND, command)
  265. answer = isc.config.ccsession.create_answer(1, "command not implemented")
  266. if type(command) != str:
  267. answer = isc.config.ccsession.create_answer(1, "bad command")
  268. else:
  269. if command == "shutdown":
  270. self.runnable = False
  271. answer = isc.config.ccsession.create_answer(0)
  272. elif command == "getstats":
  273. answer = isc.config.ccsession.create_answer(0, self._get_stats_data())
  274. elif command == "sendstats":
  275. # send statistics data to the stats daemon immediately
  276. cmd = isc.config.ccsession.create_command(
  277. 'set', self._get_stats_data())
  278. seq = self.cc_session.group_sendmsg(cmd, 'Stats')
  279. # Consume the answer, in case it becomes a orphan message.
  280. try:
  281. self.cc_session.group_recvmsg(False, seq)
  282. except isc.cc.session.SessionTimeout:
  283. pass
  284. answer = isc.config.ccsession.create_answer(0)
  285. elif command == "ping":
  286. answer = isc.config.ccsession.create_answer(0, "pong")
  287. elif command == "show_processes":
  288. answer = isc.config.ccsession. \
  289. create_answer(0, self.get_processes())
  290. else:
  291. answer = isc.config.ccsession.create_answer(1,
  292. "Unknown command")
  293. return answer
  294. def kill_started_processes(self):
  295. """
  296. Called as part of the exception handling when a process fails to
  297. start, this runs through the list of started processes, killing
  298. each one. It then clears that list.
  299. """
  300. logger.info(BIND10_KILLING_ALL_PROCESSES)
  301. for pid in self.processes:
  302. logger.info(BIND10_KILL_PROCESS, self.processes[pid].name())
  303. self.processes[pid].kill()
  304. self.processes = {}
  305. if self._component_configurator.running():
  306. self._component_configurator.shutdown()
  307. def read_bind10_config(self):
  308. """
  309. Reads the parameters associated with the BoB module itself.
  310. This means the the list of components we should be running.
  311. """
  312. logger.info(BIND10_READING_BOSS_CONFIGURATION)
  313. config_data = self.ccs.get_full_config()
  314. self.__propagate_component_config(config_data['components'])
  315. # Propagate the config to the config manager, first reconfigure
  316. def log_starting(self, process, port = None, address = None):
  317. """
  318. A convenience function to output a "Starting xxx" message if the
  319. logging is set to DEBUG with debuglevel DBG_PROCESS or higher.
  320. Putting this into a separate method ensures
  321. that the output form is consistent across all processes.
  322. The process name (passed as the first argument) is put into
  323. self.curproc, and is used to indicate which process failed to
  324. start if there is an error (and is used in the "Started" message
  325. on success). The optional port and address information are
  326. appended to the message (if present).
  327. """
  328. self.curproc = process
  329. if port is None and address is None:
  330. logger.info(BIND10_STARTING_PROCESS, self.curproc)
  331. elif address is None:
  332. logger.info(BIND10_STARTING_PROCESS_PORT, self.curproc,
  333. port)
  334. else:
  335. logger.info(BIND10_STARTING_PROCESS_PORT_ADDRESS,
  336. self.curproc, address, port)
  337. def log_started(self, pid = None):
  338. """
  339. A convenience function to output a 'Started xxxx (PID yyyy)'
  340. message. As with starting_message(), this ensures a consistent
  341. format.
  342. """
  343. if pid is None:
  344. logger.debug(DBG_PROCESS, BIND10_STARTED_PROCESS, self.curproc)
  345. else:
  346. logger.debug(DBG_PROCESS, BIND10_STARTED_PROCESS_PID, self.curproc, pid)
  347. # The next few methods start the individual processes of BIND-10. They
  348. # are called via start_all_processes(). If any fail, an exception is
  349. # raised which is caught by the caller of start_all_processes(); this kills
  350. # processes started up to that point before terminating the program.
  351. def start_msgq(self):
  352. """
  353. Start the message queue and connect to the command channel.
  354. """
  355. self.log_starting("b10-msgq")
  356. msgq_proc = ProcessInfo("b10-msgq", ["b10-msgq"], self.c_channel_env,
  357. True, not self.verbose, uid=self.uid,
  358. username=self.username)
  359. msgq_proc.spawn()
  360. self.log_started(msgq_proc.pid)
  361. # Now connect to the c-channel
  362. cc_connect_start = time.time()
  363. while self.cc_session is None:
  364. # if we have been trying for "a while" give up
  365. if (time.time() - cc_connect_start) > 5:
  366. raise CChannelConnectError("Unable to connect to c-channel after 5 seconds")
  367. # try to connect, and if we can't, wait a short while
  368. try:
  369. self.cc_session = isc.cc.Session(self.msgq_socket_file)
  370. except isc.cc.session.SessionError:
  371. time.sleep(0.1)
  372. return msgq_proc
  373. def start_cfgmgr(self):
  374. """
  375. Starts the configuration manager process
  376. """
  377. self.log_starting("b10-cfgmgr")
  378. args = ["b10-cfgmgr"]
  379. if self.data_path is not None:
  380. args.append("--data-path=" + self.data_path)
  381. if self.config_filename is not None:
  382. args.append("--config-filename=" + self.config_filename)
  383. bind_cfgd = ProcessInfo("b10-cfgmgr", args,
  384. self.c_channel_env, uid=self.uid,
  385. username=self.username)
  386. bind_cfgd.spawn()
  387. self.log_started(bind_cfgd.pid)
  388. # sleep until b10-cfgmgr is fully up and running, this is a good place
  389. # to have a (short) timeout on synchronized groupsend/receive
  390. # TODO: replace the sleep by a listen for ConfigManager started
  391. # message
  392. time.sleep(1)
  393. return bind_cfgd
  394. def start_ccsession(self, c_channel_env):
  395. """
  396. Start the CC Session
  397. The argument c_channel_env is unused but is supplied to keep the
  398. argument list the same for all start_xxx methods.
  399. """
  400. self.log_starting("ccsession") #FIXME This is not a process, can't tell a process is starting
  401. self.ccs = isc.config.ModuleCCSession(SPECFILE_LOCATION,
  402. self.config_handler,
  403. self.command_handler)
  404. self.ccs.start()
  405. self.log_started()
  406. # A couple of utility methods for starting processes...
  407. def start_process(self, name, args, c_channel_env, port=None, address=None):
  408. """
  409. Given a set of command arguments, start the process and output
  410. appropriate log messages. If the start is successful, the process
  411. is added to the list of started processes.
  412. The port and address arguments are for log messages only.
  413. """
  414. self.log_starting(name, port, address)
  415. newproc = ProcessInfo(name, args, c_channel_env)
  416. newproc.spawn()
  417. self.log_started(newproc.pid)
  418. return newproc
  419. def start_simple(self, name):
  420. """
  421. Most of the BIND-10 processes are started with the command:
  422. <process-name> [-v]
  423. ... where -v is appended if verbose is enabled. This method
  424. generates the arguments from the name and starts the process.
  425. """
  426. # Set up the command arguments.
  427. args = [name]
  428. if self.verbose:
  429. args += ['-v']
  430. # ... and start the process
  431. return self.start_process(name, args, self.c_channel_env)
  432. # The next few methods start up some of the BIND-10 processes.
  433. # These are the ones that need to be passed some parameters, so
  434. # using a start_simple is not enough. However, in future, we should
  435. # get rid of these parameters and they could be removed then.
  436. def start_auth(self):
  437. """
  438. Start the Authoritative server
  439. """
  440. authargs = ['b10-auth']
  441. if self.nocache:
  442. authargs += ['-n']
  443. if self.uid:
  444. authargs += ['-u', str(self.uid)]
  445. if self.verbose:
  446. authargs += ['-v']
  447. # ... and start
  448. return self.start_process("b10-auth", authargs, self.c_channel_env)
  449. def start_resolver(self):
  450. """
  451. Start the Resolver. At present, all these arguments and switches
  452. are pure speculation. As with the auth daemon, they should be
  453. read from the configuration database.
  454. """
  455. self.curproc = "b10-resolver"
  456. # XXX: this must be read from the configuration manager in the future
  457. resargs = ['b10-resolver']
  458. if self.uid:
  459. resargs += ['-u', str(self.uid)]
  460. if self.verbose:
  461. resargs += ['-v']
  462. # ... and start
  463. return self.start_process("b10-resolver", resargs, self.c_channel_env)
  464. def start_cmdctl(self):
  465. """
  466. Starts the command control process
  467. """
  468. args = ["b10-cmdctl"]
  469. if self.cmdctl_port is not None:
  470. args.append("--port=" + str(self.cmdctl_port))
  471. return self.start_process("b10-cmdctl", args, self.c_channel_env,
  472. self.cmdctl_port)
  473. def start_all_processes(self):
  474. """
  475. Starts up all the processes. Any exception generated during the
  476. starting of the processes is handled by the caller.
  477. """
  478. # Start the real core (sockcreator, msgq, cfgmgr)
  479. self._component_configurator.startup(self.__core_components)
  480. # Connect to the msgq. This is not a process, so it's not handled
  481. # inside the configurator.
  482. c_channel_env = self.c_channel_env
  483. self.start_ccsession(c_channel_env)
  484. # Extract the parameters associated with Bob. This can only be
  485. # done after the CC Session is started.
  486. #
  487. # This will start all the other configured processes.
  488. self.read_bind10_config()
  489. # FIXME: This is currently the only place we can reasonably drop
  490. # root privileges. But that's wrong, as everything will run as root.
  491. # If we put it before the read_bind10_config, the auth and resolver
  492. # will not run as root, which means they can't get their privileged
  493. # sockets.
  494. #
  495. # Once the socket creator is working fully (and is used), this can go
  496. # directly to the function starting socket creator.
  497. if self.uid is not None:
  498. posix.setuid(self.uid)
  499. def startup(self):
  500. """
  501. Start the BoB instance.
  502. Returns None if successful, otherwise an string describing the
  503. problem.
  504. """
  505. # Try to connect to the c-channel daemon, to see if it is already
  506. # running
  507. c_channel_env = {}
  508. if self.msgq_socket_file is not None:
  509. c_channel_env["BIND10_MSGQ_SOCKET_FILE"] = self.msgq_socket_file
  510. logger.debug(DBG_PROCESS, BIND10_CHECK_MSGQ_ALREADY_RUNNING)
  511. # try to connect, and if we can't wait a short while
  512. try:
  513. self.cc_session = isc.cc.Session(self.msgq_socket_file)
  514. logger.fatal(BIND10_MSGQ_ALREADY_RUNNING)
  515. return "b10-msgq already running, or socket file not cleaned , cannot start"
  516. except isc.cc.session.SessionError:
  517. # this is the case we want, where the msgq is not running
  518. pass
  519. # Start all processes. If any one fails to start, kill all started
  520. # processes and exit with an error indication.
  521. try:
  522. self.c_channel_env = c_channel_env
  523. self.start_all_processes()
  524. except Exception as e:
  525. self.kill_started_processes()
  526. return "Unable to start " + self.curproc + ": " + str(e)
  527. # Started successfully
  528. self.runnable = True
  529. self.__started = True
  530. return None
  531. def stop_process(self, process, recipient):
  532. """
  533. Stop the given process, friendly-like. The process is the name it has
  534. (in logs, etc), the recipient is the address on msgq.
  535. """
  536. logger.info(BIND10_STOP_PROCESS, process)
  537. # Ask the process to die willingly
  538. self.cc_session.group_sendmsg({'command': ['shutdown']}, recipient,
  539. recipient)
  540. def component_shutdown(self, exitcode=0):
  541. """
  542. Stop the Boss instance from a components' request. The exitcode
  543. indicates the desired exit code.
  544. If we did not start yet, it raises an exception, which is meant
  545. to propagate through the component and configurator to the startup
  546. routine and abort the startup imediatelly. If it is started up already,
  547. we just mark it so we terminate soon.
  548. It does set the exit code in both cases.
  549. """
  550. if self.__stopping:
  551. return
  552. self.exitcode = exitcode
  553. if not self.__started:
  554. raise Exception("Component failed during startup");
  555. else:
  556. self.runnable = False
  557. def shutdown(self):
  558. """Stop the BoB instance."""
  559. logger.info(BIND10_SHUTDOWN)
  560. self.__stopping = True
  561. # first try using the BIND 10 request to stop
  562. try:
  563. self._component_configurator.shutdown()
  564. except:
  565. pass
  566. # XXX: some delay probably useful... how much is uncertain
  567. # I have changed the delay from 0.5 to 1, but sometime it's
  568. # still not enough.
  569. time.sleep(1)
  570. self.reap_children()
  571. # next try sending a SIGTERM
  572. processes_to_stop = list(self.processes.values())
  573. for component in processes_to_stop:
  574. if component.pid() is None:
  575. # This isn't running any more for some reason
  576. continue
  577. logger.info(BIND10_SEND_SIGTERM, component.name(),
  578. component.pid())
  579. try:
  580. component.kill()
  581. except OSError:
  582. # ignore these (usually ESRCH because the child
  583. # finally exited)
  584. pass
  585. # finally, send SIGKILL (unmaskable termination) until everybody dies
  586. alive = self.processes # Is there any process alive?
  587. # We set alive to false at the start of each killing and reset it
  588. # to true whenever we find a component that still lives.
  589. while alive:
  590. # XXX: some delay probably useful... how much is uncertain
  591. time.sleep(0.1)
  592. self.reap_children()
  593. processes_to_stop = list(self.processes.values())
  594. alive = False
  595. for component in processes_to_stop:
  596. if component.pid() is None:
  597. # This isn't running any more for some reason
  598. continue
  599. alive = True
  600. logger.info(BIND10_SEND_SIGKILL, component.name(),
  601. component.pid())
  602. try:
  603. component.kill(True)
  604. except OSError:
  605. # ignore these (usually ESRCH because the child
  606. # finally exited)
  607. pass
  608. logger.info(BIND10_SHUTDOWN_COMPLETE)
  609. def _get_process_exit_status(self):
  610. return os.waitpid(-1, os.WNOHANG)
  611. def reap_children(self):
  612. """Check to see if any of our child processes have exited,
  613. and note this for later handling.
  614. """
  615. while True:
  616. try:
  617. (pid, exit_status) = self._get_process_exit_status()
  618. except OSError as o:
  619. if o.errno == errno.ECHILD: break
  620. # XXX: should be impossible to get any other error here
  621. raise
  622. if pid == 0: break
  623. if pid in self.processes:
  624. # One of the processes we know about. Get information on it.
  625. component = self.processes.pop(pid)
  626. # Tell it it failed, but only if it matters at all (eg. it is
  627. # running and we are running - if not, it should stop anyway)
  628. if component.running() and self.runnable:
  629. component.failed()
  630. else:
  631. logger.info(BIND10_UNKNOWN_CHILD_PROCESS_ENDED, pid)
  632. def restart_processes(self):
  633. """
  634. Restart any dead processes:
  635. * Returns the time when the next process is ready to be restarted.
  636. * If the server is shutting down, returns 0.
  637. * If there are no processes, returns None.
  638. The values returned can be safely passed into select() as the
  639. timeout value.
  640. """
  641. next_restart = None
  642. # if we're shutting down, then don't restart
  643. if not self.runnable:
  644. return 0
  645. # otherwise look through each dead process and try to restart
  646. still_dead = {}
  647. now = time.time()
  648. for proc_info in self.dead_processes.values():
  649. restart_time = proc_info.restart_schedule.get_restart_time(now)
  650. if restart_time > now:
  651. if (next_restart is None) or (next_restart > restart_time):
  652. next_restart = restart_time
  653. still_dead[proc_info.pid] = proc_info
  654. else:
  655. logger.info(BIND10_RESURRECTING_PROCESS, proc_info.name)
  656. try:
  657. proc_info.respawn()
  658. self.processes[proc_info.pid] = proc_info
  659. logger.info(BIND10_RESURRECTED_PROCESS, proc_info.name, proc_info.pid)
  660. except:
  661. still_dead[proc_info.pid] = proc_info
  662. # remember any processes that refuse to be resurrected
  663. self.dead_processes = still_dead
  664. # return the time when the next process is ready to be restarted
  665. return next_restart
  666. def register_process(self, pid, info):
  667. """
  668. Put another process into boss to watch over it. When the process
  669. dies, the info.failed() is called with the exit code.
  670. """
  671. self.processes[pid] = info
# global variables, needed for signal handlers
# Both start out as None. fatal_signal() clears boss_of_bind.runnable,
# so boss_of_bind must refer to the BoB instance before that handler can
# run.
options = None
boss_of_bind = None
  675. def reaper(signal_number, stack_frame):
  676. """A child process has died (SIGCHLD received)."""
  677. # don't do anything...
  678. # the Python signal handler has been set up to write
  679. # down a pipe, waking up our select() bit
  680. pass
  681. def get_signame(signal_number):
  682. """Return the symbolic name for a signal."""
  683. for sig in dir(signal):
  684. if sig.startswith("SIG") and sig[3].isalnum():
  685. if getattr(signal, sig) == signal_number:
  686. return sig
  687. return "Unknown signal %d" % signal_number
  688. # XXX: perhaps register atexit() function and invoke that instead
  689. def fatal_signal(signal_number, stack_frame):
  690. """We need to exit (SIGINT or SIGTERM received)."""
  691. global options
  692. global boss_of_bind
  693. logger.info(BIND10_RECEIVED_SIGNAL, get_signame(signal_number))
  694. signal.signal(signal.SIGCHLD, signal.SIG_DFL)
  695. boss_of_bind.runnable = False
  696. def process_rename(option, opt_str, value, parser):
  697. """Function that renames the process if it is requested by a option."""
  698. isc.util.process.rename(value)
  699. def parse_args(args=sys.argv[1:], Parser=OptionParser):
  700. """
  701. Function for parsing command line arguments. Returns the
  702. options object from OptionParser.
  703. """
  704. parser = Parser(version=VERSION)
  705. parser.add_option("-m", "--msgq-socket-file", dest="msgq_socket_file",
  706. type="string", default=None,
  707. help="UNIX domain socket file the b10-msgq daemon will use")
  708. parser.add_option("-n", "--no-cache", action="store_true", dest="nocache",
  709. default=False, help="disable hot-spot cache in authoritative DNS server")
  710. parser.add_option("-u", "--user", dest="user", type="string", default=None,
  711. help="Change user after startup (must run as root)")
  712. parser.add_option("-v", "--verbose", dest="verbose", action="store_true",
  713. help="display more about what is going on")
  714. parser.add_option("--pretty-name", type="string", action="callback",
  715. callback=process_rename,
  716. help="Set the process name (displayed in ps, top, ...)")
  717. parser.add_option("-c", "--config-file", action="store",
  718. dest="config_file", default=None,
  719. help="Configuration database filename")
  720. parser.add_option("-p", "--data-path", dest="data_path",
  721. help="Directory to search for configuration files",
  722. default=None)
  723. parser.add_option("--cmdctl-port", dest="cmdctl_port", type="int",
  724. default=None, help="Port of command control")
  725. parser.add_option("--pid-file", dest="pid_file", type="string",
  726. default=None,
  727. help="file to dump the PID of the BIND 10 process")
  728. parser.add_option("--brittle", dest="brittle", action="store_true",
  729. help="debugging flag: exit if any component dies")
  730. (options, args) = parser.parse_args(args)
  731. if options.cmdctl_port is not None:
  732. try:
  733. isc.net.parse.port_parse(options.cmdctl_port)
  734. except ValueError as e:
  735. parser.error(e)
  736. if args:
  737. parser.print_help()
  738. sys.exit(1)
  739. return options
  740. def dump_pid(pid_file):
  741. """
  742. Dump the PID of the current process to the specified file. If the given
  743. file is None this function does nothing. If the file already exists,
  744. the existing content will be removed. If a system error happens in
  745. creating or writing to the file, the corresponding exception will be
  746. propagated to the caller.
  747. """
  748. if pid_file is None:
  749. return
  750. f = open(pid_file, "w")
  751. f.write('%d\n' % os.getpid())
  752. f.close()
  753. def unlink_pid_file(pid_file):
  754. """
  755. Remove the given file, which is basically expected to be the PID file
  756. created by dump_pid(). The specified may or may not exist; if it
  757. doesn't this function does nothing. Other system level errors in removing
  758. the file will be propagated as the corresponding exception.
  759. """
  760. if pid_file is None:
  761. return
  762. try:
  763. os.unlink(pid_file)
  764. except OSError as error:
  765. if error.errno is not errno.ENOENT:
  766. raise
  767. def main():
  768. global options
  769. global boss_of_bind
  770. # Enforce line buffering on stdout, even when not a TTY
  771. sys.stdout = io.TextIOWrapper(sys.stdout.detach(), line_buffering=True)
  772. options = parse_args()
  773. # Check user ID.
  774. setuid = None
  775. username = None
  776. if options.user:
  777. # Try getting information about the user, assuming UID passed.
  778. try:
  779. pw_ent = pwd.getpwuid(int(options.user))
  780. setuid = pw_ent.pw_uid
  781. username = pw_ent.pw_name
  782. except ValueError:
  783. pass
  784. except KeyError:
  785. pass
  786. # Next try getting information about the user, assuming user name
  787. # passed.
  788. # If the information is both a valid user name and user number, we
  789. # prefer the name because we try it second. A minor point, hopefully.
  790. try:
  791. pw_ent = pwd.getpwnam(options.user)
  792. setuid = pw_ent.pw_uid
  793. username = pw_ent.pw_name
  794. except KeyError:
  795. pass
  796. if setuid is None:
  797. logger.fatal(BIND10_INVALID_USER, options.user)
  798. sys.exit(1)
  799. # Announce startup.
  800. logger.info(BIND10_STARTING, VERSION)
  801. # Create wakeup pipe for signal handlers
  802. wakeup_pipe = os.pipe()
  803. signal.set_wakeup_fd(wakeup_pipe[1])
  804. # Set signal handlers for catching child termination, as well
  805. # as our own demise.
  806. signal.signal(signal.SIGCHLD, reaper)
  807. signal.siginterrupt(signal.SIGCHLD, False)
  808. signal.signal(signal.SIGINT, fatal_signal)
  809. signal.signal(signal.SIGTERM, fatal_signal)
  810. # Block SIGPIPE, as we don't want it to end this process
  811. signal.signal(signal.SIGPIPE, signal.SIG_IGN)
  812. # Go bob!
  813. boss_of_bind = BoB(options.msgq_socket_file, options.data_path,
  814. options.config_file, options.nocache, options.verbose,
  815. setuid, username, options.cmdctl_port, options.brittle)
  816. startup_result = boss_of_bind.startup()
  817. if startup_result:
  818. logger.fatal(BIND10_STARTUP_ERROR, startup_result)
  819. sys.exit(1)
  820. logger.info(BIND10_STARTUP_COMPLETE)
  821. dump_pid(options.pid_file)
  822. # In our main loop, we check for dead processes or messages
  823. # on the c-channel.
  824. wakeup_fd = wakeup_pipe[0]
  825. ccs_fd = boss_of_bind.ccs.get_socket().fileno()
  826. while boss_of_bind.runnable:
  827. # clean up any processes that exited
  828. boss_of_bind.reap_children()
  829. next_restart = boss_of_bind.restart_processes()
  830. if next_restart is None:
  831. wait_time = None
  832. else:
  833. wait_time = max(next_restart - time.time(), 0)
  834. # select() can raise EINTR when a signal arrives,
  835. # even if they are resumable, so we have to catch
  836. # the exception
  837. try:
  838. (rlist, wlist, xlist) = select.select([wakeup_fd, ccs_fd], [], [],
  839. wait_time)
  840. except select.error as err:
  841. if err.args[0] == errno.EINTR:
  842. (rlist, wlist, xlist) = ([], [], [])
  843. else:
  844. logger.fatal(BIND10_SELECT_ERROR, err)
  845. break
  846. for fd in rlist + xlist:
  847. if fd == ccs_fd:
  848. try:
  849. boss_of_bind.ccs.check_command()
  850. except isc.cc.session.ProtocolError:
  851. logger.fatal(BIND10_MSGQ_DISAPPEARED)
  852. self.runnable = False
  853. break
  854. elif fd == wakeup_fd:
  855. os.read(wakeup_fd, 32)
  856. # shutdown
  857. signal.signal(signal.SIGCHLD, signal.SIG_DFL)
  858. boss_of_bind.shutdown()
  859. unlink_pid_file(options.pid_file)
  860. sys.exit(boss_of_bind.exitcode)
  861. if __name__ == "__main__":
  862. main()