bind10_src.py.in 41 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078
  1. #!@PYTHON@
  2. # Copyright (C) 2010,2011 Internet Systems Consortium.
  3. #
  4. # Permission to use, copy, modify, and distribute this software for any
  5. # purpose with or without fee is hereby granted, provided that the above
  6. # copyright notice and this permission notice appear in all copies.
  7. #
  8. # THE SOFTWARE IS PROVIDED "AS IS" AND INTERNET SYSTEMS CONSORTIUM
  9. # DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL
  10. # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL
  11. # INTERNET SYSTEMS CONSORTIUM BE LIABLE FOR ANY SPECIAL, DIRECT,
  12. # INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING
  13. # FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
  14. # NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
  15. # WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  16. """
  17. This file implements the Boss of Bind (BoB, or bob) program.
  18. Its purpose is to start up the BIND 10 system, and then manage the
  19. processes, by starting and stopping processes, plus restarting
  20. processes that exit.
  21. To start the system, it first runs the c-channel program (msgq), then
  22. connects to that. It then runs the configuration manager, and reads
  23. its own configuration. Then it proceeds to starting other modules.
  24. The Python subprocess module is used for starting processes, but
  25. because this is not efficient for managing groups of processes,
  26. SIGCHLD signals are caught and processed using the signal module.
  27. Most of the logic is contained in the BoB class. However, since Python
  28. requires that signal processing happen in the main thread, we do
  29. signal handling outside of that class, in the code running for
  30. __main__.
  31. """
  32. import sys; sys.path.append ('@@PYTHONPATH@@')
  33. import os
# Locate the boss's module specification file (bob.spec).
#
# If B10_FROM_SOURCE is set in the environment, we use data files
# from a directory relative to that, otherwise we use the ones
# installed on the system.
if "B10_FROM_SOURCE" in os.environ:
    SPECFILE_LOCATION = os.environ["B10_FROM_SOURCE"] + "/src/bin/bind10/bob.spec"
else:
    # NOTE(review): the @...@ tokens look like build-time substitution
    # placeholders (this file is a ".in" template) -- confirm against the
    # build system.  The replace() calls below expand any literal
    # "${datarootdir}" and "${prefix}" references that remain in the
    # data directory string, in that order.
    PREFIX = "@prefix@"
    DATAROOTDIR = "@datarootdir@"
    SPECFILE_LOCATION = "@datadir@/@PACKAGE@/bob.spec".replace("${datarootdir}", DATAROOTDIR).replace("${prefix}", PREFIX)
  43. import subprocess
  44. import signal
  45. import re
  46. import errno
  47. import time
  48. import select
  49. import random
  50. import socket
  51. from optparse import OptionParser, OptionValueError
  52. import io
  53. import pwd
  54. import posix
  55. import isc.cc
  56. import isc.util.process
  57. import isc.net.parse
  58. import isc.log
  59. from bind10_messages import *
  60. import isc.bind10.sockcreator
# Set up logging for this program and create the module-wide logger used
# by everything below.
isc.log.init("b10-boss")
logger = isc.log.Logger("boss")
# Pending system-wide debug level definitions, the ones we
# use here are hardcoded for now.  They are passed as the first argument
# of logger.debug() calls in this file.
DBG_PROCESS = 10
DBG_COMMANDS = 30
# Assign this process some longer name
isc.util.process.rename(sys.argv[0])
# This is the version that gets displayed to the user.
# The VERSION string consists of the module name, the module version
# number, and the overall BIND 10 version number (set in configure.ac).
VERSION = "bind10 20110223 (BIND 10 @PACKAGE_VERSION@)"
# Time (UTC, as a struct_time) at which this module was loaded.  It is
# reported as 'bind10.boot_time' in the statistics data built by
# BoB._get_stats_data().
_BASETIME = time.gmtime()
  75. class RestartSchedule:
  76. """
  77. Keeps state when restarting something (in this case, a process).
  78. When a process dies unexpectedly, we need to restart it. However, if
  79. it fails to restart for some reason, then we should not simply keep
  80. restarting it at high speed.
  81. A more sophisticated algorithm can be developed, but for now we choose
  82. a simple set of rules:
  83. * If a process was been running for >=10 seconds, we restart it
  84. right away.
  85. * If a process was running for <10 seconds, we wait until 10 seconds
  86. after it was started.
  87. To avoid programs getting into lockstep, we use a normal distribution
  88. to avoid being restarted at exactly 10 seconds."""
  89. def __init__(self, restart_frequency=10.0):
  90. self.restart_frequency = restart_frequency
  91. self.run_start_time = None
  92. self.run_stop_time = None
  93. self.restart_time = None
  94. def set_run_start_time(self, when=None):
  95. if when is None:
  96. when = time.time()
  97. self.run_start_time = when
  98. sigma = self.restart_frequency * 0.05
  99. self.restart_time = when + random.normalvariate(self.restart_frequency,
  100. sigma)
  101. def set_run_stop_time(self, when=None):
  102. """We don't actually do anything with stop time now, but it
  103. might be useful for future algorithms."""
  104. if when is None:
  105. when = time.time()
  106. self.run_stop_time = when
  107. def get_restart_time(self, when=None):
  108. if when is None:
  109. when = time.time()
  110. return max(when, self.restart_time)
  111. class ProcessInfoError(Exception): pass
  112. class ProcessInfo:
  113. """Information about a process"""
  114. dev_null = open(os.devnull, "w")
  115. def __init__(self, name, args, env={}, dev_null_stdout=False,
  116. dev_null_stderr=False, uid=None, username=None):
  117. self.name = name
  118. self.args = args
  119. self.env = env
  120. self.dev_null_stdout = dev_null_stdout
  121. self.dev_null_stderr = dev_null_stderr
  122. self.restart_schedule = RestartSchedule()
  123. self.uid = uid
  124. self.username = username
  125. self.process = None
  126. self.pid = None
  127. def _preexec_work(self):
  128. """Function used before running a program that needs to run as a
  129. different user."""
  130. # First, put us into a separate process group so we don't get
  131. # SIGINT signals on Ctrl-C (the boss will shut everthing down by
  132. # other means).
  133. os.setpgrp()
  134. # Second, set the user ID if one has been specified
  135. if self.uid is not None:
  136. try:
  137. posix.setuid(self.uid)
  138. except OSError as e:
  139. if e.errno == errno.EPERM:
  140. # if we failed to change user due to permission report that
  141. raise ProcessInfoError("Unable to change to user %s (uid %d)" % (self.username, self.uid))
  142. else:
  143. # otherwise simply re-raise whatever error we found
  144. raise
  145. def _spawn(self):
  146. if self.dev_null_stdout:
  147. spawn_stdout = self.dev_null
  148. else:
  149. spawn_stdout = None
  150. if self.dev_null_stderr:
  151. spawn_stderr = self.dev_null
  152. else:
  153. spawn_stderr = None
  154. # Environment variables for the child process will be a copy of those
  155. # of the boss process with any additional specific variables given
  156. # on construction (self.env).
  157. spawn_env = os.environ
  158. spawn_env.update(self.env)
  159. if 'B10_FROM_SOURCE' not in os.environ:
  160. spawn_env['PATH'] = "@@LIBEXECDIR@@:" + spawn_env['PATH']
  161. self.process = subprocess.Popen(self.args,
  162. stdin=subprocess.PIPE,
  163. stdout=spawn_stdout,
  164. stderr=spawn_stderr,
  165. close_fds=True,
  166. env=spawn_env,
  167. preexec_fn=self._preexec_work)
  168. self.pid = self.process.pid
  169. self.restart_schedule.set_run_start_time()
  170. # spawn() and respawn() are the same for now, but in the future they
  171. # may have different functionality
  172. def spawn(self):
  173. self._spawn()
  174. def respawn(self):
  175. self._spawn()
  176. class CChannelConnectError(Exception): pass
  177. class BoB:
  178. """Boss of BIND class."""
  179. def __init__(self, msgq_socket_file=None, data_path=None,
  180. config_filename=None, nocache=False, verbose=False, setuid=None,
  181. username=None, cmdctl_port=None, brittle=False):
  182. """
  183. Initialize the Boss of BIND. This is a singleton (only one can run).
  184. The msgq_socket_file specifies the UNIX domain socket file that the
  185. msgq process listens on. If verbose is True, then the boss reports
  186. what it is doing.
  187. Data path and config filename are passed trough to config manager
  188. (if provided) and specify the config file to be used.
  189. The cmdctl_port is passed to cmdctl and specify on which port it
  190. should listen.
  191. """
  192. self.cc_session = None
  193. self.ccs = None
  194. self.cfg_start_auth = True
  195. self.cfg_start_resolver = False
  196. self.cfg_start_dhcp6 = False
  197. self.cfg_start_dhcp4 = False
  198. self.started_auth_family = False
  199. self.started_resolver_family = False
  200. self.curproc = None
  201. self.dead_processes = {}
  202. self.msgq_socket_file = msgq_socket_file
  203. self.nocache = nocache
  204. self.processes = {}
  205. self.expected_shutdowns = {}
  206. self.runnable = False
  207. self.uid = setuid
  208. self.username = username
  209. self.verbose = verbose
  210. self.data_path = data_path
  211. self.config_filename = config_filename
  212. self.cmdctl_port = cmdctl_port
  213. self.brittle = brittle
  214. self.sockcreator = None
  215. def config_handler(self, new_config):
  216. # If this is initial update, don't do anything now, leave it to startup
  217. if not self.runnable:
  218. return
  219. # Now we declare few functions used only internally here. Besides the
  220. # benefit of not polluting the name space, they are closures, so we
  221. # don't need to pass some variables
  222. def start_stop(name, started, start, stop):
  223. if not'start_' + name in new_config:
  224. return
  225. if new_config['start_' + name]:
  226. if not started:
  227. if self.uid is not None:
  228. logger.info(BIND10_START_AS_NON_ROOT, name)
  229. start()
  230. else:
  231. stop()
  232. # These four functions are passed to start_stop (smells like functional
  233. # programming little bit)
  234. def resolver_on():
  235. self.start_resolver(self.c_channel_env)
  236. self.started_resolver_family = True
  237. def resolver_off():
  238. self.stop_resolver()
  239. self.started_resolver_family = False
  240. def auth_on():
  241. self.start_auth(self.c_channel_env)
  242. self.start_xfrout(self.c_channel_env)
  243. self.start_xfrin(self.c_channel_env)
  244. self.start_zonemgr(self.c_channel_env)
  245. self.started_auth_family = True
  246. def auth_off():
  247. self.stop_zonemgr()
  248. self.stop_xfrin()
  249. self.stop_xfrout()
  250. self.stop_auth()
  251. self.started_auth_family = False
  252. # The real code of the config handler function follows here
  253. logger.debug(DBG_COMMANDS, BIND10_RECEIVED_NEW_CONFIGURATION,
  254. new_config)
  255. start_stop('resolver', self.started_resolver_family, resolver_on,
  256. resolver_off)
  257. start_stop('auth', self.started_auth_family, auth_on, auth_off)
  258. answer = isc.config.ccsession.create_answer(0)
  259. return answer
  260. def get_processes(self):
  261. pids = list(self.processes.keys())
  262. pids.sort()
  263. process_list = [ ]
  264. for pid in pids:
  265. process_list.append([pid, self.processes[pid].name])
  266. return process_list
  267. def _get_stats_data(self):
  268. return { "stats_data": {
  269. 'bind10.boot_time': time.strftime('%Y-%m-%dT%H:%M:%SZ', _BASETIME)
  270. }}
  271. def command_handler(self, command, args):
  272. logger.debug(DBG_COMMANDS, BIND10_RECEIVED_COMMAND, command)
  273. answer = isc.config.ccsession.create_answer(1, "command not implemented")
  274. if type(command) != str:
  275. answer = isc.config.ccsession.create_answer(1, "bad command")
  276. else:
  277. if command == "shutdown":
  278. self.runnable = False
  279. answer = isc.config.ccsession.create_answer(0)
  280. elif command == "getstats":
  281. answer = isc.config.ccsession.create_answer(0, self._get_stats_data())
  282. elif command == "sendstats":
  283. # send statistics data to the stats daemon immediately
  284. cmd = isc.config.ccsession.create_command(
  285. 'set', self._get_stats_data())
  286. seq = self.cc_session.group_sendmsg(cmd, 'Stats')
  287. # Consume the answer, in case it becomes a orphan message.
  288. try:
  289. self.cc_session.group_recvmsg(False, seq)
  290. except isc.cc.session.SessionTimeout:
  291. pass
  292. answer = isc.config.ccsession.create_answer(0)
  293. elif command == "ping":
  294. answer = isc.config.ccsession.create_answer(0, "pong")
  295. elif command == "show_processes":
  296. answer = isc.config.ccsession. \
  297. create_answer(0, self.get_processes())
  298. else:
  299. answer = isc.config.ccsession.create_answer(1,
  300. "Unknown command")
  301. return answer
  302. def start_creator(self):
  303. self.curproc = 'b10-sockcreator'
  304. self.sockcreator = isc.bind10.sockcreator.Creator("@@LIBEXECDIR@@:" +
  305. os.environ['PATH'])
  306. def stop_creator(self, kill=False):
  307. if self.sockcreator is None:
  308. return
  309. if kill:
  310. self.sockcreator.kill()
  311. else:
  312. self.sockcreator.terminate()
  313. self.sockcreator = None
  314. def kill_started_processes(self):
  315. """
  316. Called as part of the exception handling when a process fails to
  317. start, this runs through the list of started processes, killing
  318. each one. It then clears that list.
  319. """
  320. logger.info(BIND10_KILLING_ALL_PROCESSES)
  321. self.stop_creator(True)
  322. for pid in self.processes:
  323. logger.info(BIND10_KILL_PROCESS, self.processes[pid].name)
  324. self.processes[pid].process.kill()
  325. self.processes = {}
  326. def read_bind10_config(self):
  327. """
  328. Reads the parameters associated with the BoB module itself.
  329. At present these are the components to start although arguably this
  330. information should be in the configuration for the appropriate
  331. module itself. (However, this would cause difficulty in the case of
  332. xfrin/xfrout and zone manager as we don't need to start those if we
  333. are not running the authoritative server.)
  334. """
  335. logger.info(BIND10_READING_BOSS_CONFIGURATION)
  336. config_data = self.ccs.get_full_config()
  337. self.cfg_start_auth = config_data.get("start_auth")
  338. self.cfg_start_resolver = config_data.get("start_resolver")
  339. logger.info(BIND10_CONFIGURATION_START_AUTH, self.cfg_start_auth)
  340. logger.info(BIND10_CONFIGURATION_START_RESOLVER, self.cfg_start_resolver)
  341. def log_starting(self, process, port = None, address = None):
  342. """
  343. A convenience function to output a "Starting xxx" message if the
  344. logging is set to DEBUG with debuglevel DBG_PROCESS or higher.
  345. Putting this into a separate method ensures
  346. that the output form is consistent across all processes.
  347. The process name (passed as the first argument) is put into
  348. self.curproc, and is used to indicate which process failed to
  349. start if there is an error (and is used in the "Started" message
  350. on success). The optional port and address information are
  351. appended to the message (if present).
  352. """
  353. self.curproc = process
  354. if port is None and address is None:
  355. logger.info(BIND10_STARTING_PROCESS, self.curproc)
  356. elif address is None:
  357. logger.info(BIND10_STARTING_PROCESS_PORT, self.curproc,
  358. port)
  359. else:
  360. logger.info(BIND10_STARTING_PROCESS_PORT_ADDRESS,
  361. self.curproc, address, port)
  362. def log_started(self, pid = None):
  363. """
  364. A convenience function to output a 'Started xxxx (PID yyyy)'
  365. message. As with starting_message(), this ensures a consistent
  366. format.
  367. """
  368. if pid is None:
  369. logger.debug(DBG_PROCESS, BIND10_STARTED_PROCESS, self.curproc)
  370. else:
  371. logger.debug(DBG_PROCESS, BIND10_STARTED_PROCESS_PID, self.curproc, pid)
  372. # The next few methods start the individual processes of BIND-10. They
  373. # are called via start_all_processes(). If any fail, an exception is
  374. # raised which is caught by the caller of start_all_processes(); this kills
  375. # processes started up to that point before terminating the program.
  376. def start_msgq(self, c_channel_env):
  377. """
  378. Start the message queue and connect to the command channel.
  379. """
  380. self.log_starting("b10-msgq")
  381. c_channel = ProcessInfo("b10-msgq", ["b10-msgq"], c_channel_env,
  382. True, not self.verbose, uid=self.uid,
  383. username=self.username)
  384. c_channel.spawn()
  385. self.processes[c_channel.pid] = c_channel
  386. self.log_started(c_channel.pid)
  387. # Now connect to the c-channel
  388. cc_connect_start = time.time()
  389. while self.cc_session is None:
  390. # if we have been trying for "a while" give up
  391. if (time.time() - cc_connect_start) > 5:
  392. raise CChannelConnectError("Unable to connect to c-channel after 5 seconds")
  393. # try to connect, and if we can't wait a short while
  394. try:
  395. self.cc_session = isc.cc.Session(self.msgq_socket_file)
  396. except isc.cc.session.SessionError:
  397. time.sleep(0.1)
  398. def start_cfgmgr(self, c_channel_env):
  399. """
  400. Starts the configuration manager process
  401. """
  402. self.log_starting("b10-cfgmgr")
  403. args = ["b10-cfgmgr"]
  404. if self.data_path is not None:
  405. args.append("--data-path=" + self.data_path)
  406. if self.config_filename is not None:
  407. args.append("--config-filename=" + self.config_filename)
  408. bind_cfgd = ProcessInfo("b10-cfgmgr", args,
  409. c_channel_env, uid=self.uid,
  410. username=self.username)
  411. bind_cfgd.spawn()
  412. self.processes[bind_cfgd.pid] = bind_cfgd
  413. self.log_started(bind_cfgd.pid)
  414. # sleep until b10-cfgmgr is fully up and running, this is a good place
  415. # to have a (short) timeout on synchronized groupsend/receive
  416. # TODO: replace the sleep by a listen for ConfigManager started
  417. # message
  418. time.sleep(1)
  419. def start_ccsession(self, c_channel_env):
  420. """
  421. Start the CC Session
  422. The argument c_channel_env is unused but is supplied to keep the
  423. argument list the same for all start_xxx methods.
  424. """
  425. self.log_starting("ccsession")
  426. self.ccs = isc.config.ModuleCCSession(SPECFILE_LOCATION,
  427. self.config_handler,
  428. self.command_handler)
  429. self.ccs.start()
  430. self.log_started()
  431. # A couple of utility methods for starting processes...
  432. def start_process(self, name, args, c_channel_env, port=None, address=None):
  433. """
  434. Given a set of command arguments, start the process and output
  435. appropriate log messages. If the start is successful, the process
  436. is added to the list of started processes.
  437. The port and address arguments are for log messages only.
  438. """
  439. self.log_starting(name, port, address)
  440. newproc = ProcessInfo(name, args, c_channel_env)
  441. newproc.spawn()
  442. self.processes[newproc.pid] = newproc
  443. self.log_started(newproc.pid)
  444. def start_simple(self, name, c_channel_env, port=None, address=None):
  445. """
  446. Most of the BIND-10 processes are started with the command:
  447. <process-name> [-v]
  448. ... where -v is appended if verbose is enabled. This method
  449. generates the arguments from the name and starts the process.
  450. The port and address arguments are for log messages only.
  451. """
  452. # Set up the command arguments.
  453. args = [name]
  454. if self.verbose:
  455. args += ['-v']
  456. # ... and start the process
  457. self.start_process(name, args, c_channel_env, port, address)
  458. # The next few methods start up the rest of the BIND-10 processes.
  459. # Although many of these methods are little more than a call to
  460. # start_simple, they are retained (a) for testing reasons and (b) as a place
  461. # where modifications can be made if the process start-up sequence changes
  462. # for a given process.
  463. def start_auth(self, c_channel_env):
  464. """
  465. Start the Authoritative server
  466. """
  467. authargs = ['b10-auth']
  468. if self.nocache:
  469. authargs += ['-n']
  470. if self.uid:
  471. authargs += ['-u', str(self.uid)]
  472. if self.verbose:
  473. authargs += ['-v']
  474. # ... and start
  475. self.start_process("b10-auth", authargs, c_channel_env)
  476. def start_resolver(self, c_channel_env):
  477. """
  478. Start the Resolver. At present, all these arguments and switches
  479. are pure speculation. As with the auth daemon, they should be
  480. read from the configuration database.
  481. """
  482. self.curproc = "b10-resolver"
  483. # XXX: this must be read from the configuration manager in the future
  484. resargs = ['b10-resolver']
  485. if self.uid:
  486. resargs += ['-u', str(self.uid)]
  487. if self.verbose:
  488. resargs += ['-v']
  489. # ... and start
  490. self.start_process("b10-resolver", resargs, c_channel_env)
  491. def start_xfrout(self, c_channel_env):
  492. self.start_simple("b10-xfrout", c_channel_env)
  493. def start_xfrin(self, c_channel_env):
  494. self.start_simple("b10-xfrin", c_channel_env)
  495. def start_zonemgr(self, c_channel_env):
  496. self.start_simple("b10-zonemgr", c_channel_env)
  497. def start_stats(self, c_channel_env):
  498. self.start_simple("b10-stats", c_channel_env)
  499. def start_stats_httpd(self, c_channel_env):
  500. self.start_simple("b10-stats-httpd", c_channel_env)
  501. def start_dhcp6(self, c_channel_env):
  502. self.start_simple("b10-dhcp6", c_channel_env)
  503. def start_cmdctl(self, c_channel_env):
  504. """
  505. Starts the command control process
  506. """
  507. args = ["b10-cmdctl"]
  508. if self.cmdctl_port is not None:
  509. args.append("--port=" + str(self.cmdctl_port))
  510. self.start_process("b10-cmdctl", args, c_channel_env, self.cmdctl_port)
    def start_all_processes(self):
        """
        Starts up all the processes. Any exception generated during the
        starting of the processes is handled by the caller.

        The order is: socket creator, msgq, cfgmgr, the CC session, then
        (after reading the boss configuration) auth and resolver if
        selected.  After that the boss switches to self.uid, if one is
        set, and starts the remaining processes.

        The environment for the children is taken from self.c_channel_env,
        which must have been set before this method is called.
        """
        # The socket creator first, as it is the only thing that needs root
        self.start_creator()
        # TODO: Once everything uses the socket creator, we can drop root
        # privileges right now

        c_channel_env = self.c_channel_env
        self.start_msgq(c_channel_env)
        self.start_cfgmgr(c_channel_env)
        self.start_ccsession(c_channel_env)

        # Extract the parameters associated with Bob. This can only be
        # done after the CC Session is started.
        self.read_bind10_config()

        # Continue starting the processes. The authoritative server (if
        # selected):
        if self.cfg_start_auth:
            self.start_auth(c_channel_env)

        # ... and resolver (if selected):
        if self.cfg_start_resolver:
            self.start_resolver(c_channel_env)
            self.started_resolver_family = True

        # Everything after the main components can run as non-root.
        # TODO: this is only temporary - once the privileged socket creator is
        # fully working, nothing else will run as root.
        # Note that this changes the user ID of the boss process itself, so
        # every process started below this point is started under that ID.
        if self.uid is not None:
            posix.setuid(self.uid)

        # xfrin/xfrout and the zone manager are only meaningful if the
        # authoritative server has been started.
        if self.cfg_start_auth:
            self.start_xfrout(c_channel_env)
            self.start_xfrin(c_channel_env)
            self.start_zonemgr(c_channel_env)
            # The auth family only counts as started once all four of its
            # processes (auth above, plus these three) have been started.
            self.started_auth_family = True

        # ... and finally start the remaining processes
        self.start_stats(c_channel_env)
        self.start_stats_httpd(c_channel_env)
        self.start_cmdctl(c_channel_env)

        # cfg_start_dhcp6 is initialised to False in __init__ and is not
        # changed by read_bind10_config().
        if self.cfg_start_dhcp6:
            self.start_dhcp6(c_channel_env)
  553. def startup(self):
  554. """
  555. Start the BoB instance.
  556. Returns None if successful, otherwise an string describing the
  557. problem.
  558. """
  559. # Try to connect to the c-channel daemon, to see if it is already
  560. # running
  561. c_channel_env = {}
  562. if self.msgq_socket_file is not None:
  563. c_channel_env["BIND10_MSGQ_SOCKET_FILE"] = self.msgq_socket_file
  564. logger.debug(DBG_PROCESS, BIND10_CHECK_MSGQ_ALREADY_RUNNING)
  565. # try to connect, and if we can't wait a short while
  566. try:
  567. self.cc_session = isc.cc.Session(self.msgq_socket_file)
  568. logger.fatal(BIND10_MSGQ_ALREADY_RUNNING)
  569. return "b10-msgq already running, or socket file not cleaned , cannot start"
  570. except isc.cc.session.SessionError:
  571. # this is the case we want, where the msgq is not running
  572. pass
  573. # Start all processes. If any one fails to start, kill all started
  574. # processes and exit with an error indication.
  575. try:
  576. self.c_channel_env = c_channel_env
  577. self.start_all_processes()
  578. except Exception as e:
  579. self.kill_started_processes()
  580. return "Unable to start " + self.curproc + ": " + str(e)
  581. # Started successfully
  582. self.runnable = True
  583. return None
  584. def stop_all_processes(self):
  585. """Stop all processes."""
  586. cmd = { "command": ['shutdown']}
  587. self.cc_session.group_sendmsg(cmd, 'Cmdctl', 'Cmdctl')
  588. self.cc_session.group_sendmsg(cmd, "ConfigManager", "ConfigManager")
  589. self.cc_session.group_sendmsg(cmd, "Auth", "Auth")
  590. self.cc_session.group_sendmsg(cmd, "Resolver", "Resolver")
  591. self.cc_session.group_sendmsg(cmd, "Xfrout", "Xfrout")
  592. self.cc_session.group_sendmsg(cmd, "Xfrin", "Xfrin")
  593. self.cc_session.group_sendmsg(cmd, "Zonemgr", "Zonemgr")
  594. self.cc_session.group_sendmsg(cmd, "Stats", "Stats")
  595. self.cc_session.group_sendmsg(cmd, "StatsHttpd", "StatsHttpd")
  596. # Terminate the creator last
  597. self.stop_creator()
  598. def stop_process(self, process, recipient):
  599. """
  600. Stop the given process, friendly-like. The process is the name it has
  601. (in logs, etc), the recipient is the address on msgq.
  602. """
  603. logger.info(BIND10_STOP_PROCESS, process)
  604. # TODO: Some timeout to solve processes that don't want to die would
  605. # help. We can even store it in the dict, it is used only as a set
  606. self.expected_shutdowns[process] = 1
  607. # Ask the process to die willingly
  608. self.cc_session.group_sendmsg({'command': ['shutdown']}, recipient,
  609. recipient)
  610. # Series of stop_process wrappers
  611. def stop_resolver(self):
  612. self.stop_process('b10-resolver', 'Resolver')
  613. def stop_auth(self):
  614. self.stop_process('b10-auth', 'Auth')
  615. def stop_xfrout(self):
  616. self.stop_process('b10-xfrout', 'Xfrout')
  617. def stop_xfrin(self):
  618. self.stop_process('b10-xfrin', 'Xfrin')
  619. def stop_zonemgr(self):
  620. self.stop_process('b10-zonemgr', 'Zonemgr')
    def shutdown(self):
        """Stop the BoB instance.

        Escalates in three steps: first a 'shutdown' request over the
        command channel, then terminate() on every process still tracked,
        and finally kill() repeatedly until self.processes is empty.
        Exited children are collected with reap_children() between steps.
        """
        logger.info(BIND10_SHUTDOWN)
        # first try using the BIND 10 request to stop
        try:
            self.stop_all_processes()
        except:
            # Best effort: whatever goes wrong with the polite request, the
            # signal-based steps below still run.
            # NOTE(review): a bare except also swallows KeyboardInterrupt
            # and SystemExit -- confirm that this is intended.
            pass
        # XXX: some delay probably useful... how much is uncertain
        # I have changed the delay from 0.5 to 1, but sometime it's
        # still not enough.
        time.sleep(1)
        self.reap_children()
        # next try sending a SIGTERM
        # (iterate over a snapshot of the tracked processes)
        processes_to_stop = list(self.processes.values())
        for proc_info in processes_to_stop:
            logger.info(BIND10_SEND_SIGTERM, proc_info.name,
                        proc_info.pid)
            try:
                proc_info.process.terminate()
            except OSError:
                # ignore these (usually ESRCH because the child
                # finally exited)
                pass
        # finally, send SIGKILL (unmaskable termination) until everybody dies
        # The loop ends once reap_children() has removed every entry from
        # self.processes.
        while self.processes:
            # XXX: some delay probably useful... how much is uncertain
            time.sleep(0.1)
            self.reap_children()
            processes_to_stop = list(self.processes.values())
            for proc_info in processes_to_stop:
                logger.info(BIND10_SEND_SIGKILL, proc_info.name,
                            proc_info.pid)
                try:
                    proc_info.process.kill()
                except OSError:
                    # ignore these (usually ESRCH because the child
                    # finally exited)
                    pass
        logger.info(BIND10_SHUTDOWN_COMPLETE)
  661. def _get_process_exit_status(self):
  662. return os.waitpid(-1, os.WNOHANG)
    def reap_children(self):
        """Check to see if any of our child processes have exited,
        and note this for later handling.

        Each exited child that is known is moved from self.processes to
        self.dead_processes.  self.runnable is cleared when the socket
        creator or b10-msgq ends while the boss is running, and when any
        known process ends in 'brittle' mode.
        """
        # NOTE(review): the nesting in this method was reconstructed from a
        # copy that had lost its indentation -- confirm against the
        # original file.
        while True:
            try:
                (pid, exit_status) = self._get_process_exit_status()
            except OSError as o:
                # ECHILD: there are no child processes left to wait for
                if o.errno == errno.ECHILD: break
                # XXX: should be impossible to get any other error here
                raise
            # A PID of 0 ends the loop: nothing more to collect right now
            if pid == 0: break
            if self.sockcreator is not None and self.sockcreator.pid() == pid:
                # This is the socket creator, started and terminated
                # differently. This can't be restarted.
                if self.runnable:
                    logger.fatal(BIND10_SOCKCREATOR_CRASHED)
                    self.sockcreator = None
                    self.runnable = False
            elif pid in self.processes:
                # One of the processes we know about. Get information on it.
                proc_info = self.processes.pop(pid)
                proc_info.restart_schedule.set_run_stop_time()
                self.dead_processes[proc_info.pid] = proc_info

                # Write out message, but only if in the running state:
                # During startup and shutdown, these messages are handled
                # elsewhere.
                if self.runnable:
                    if exit_status is None:
                        logger.warn(BIND10_PROCESS_ENDED_NO_EXIT_STATUS,
                                    proc_info.name, proc_info.pid)
                    else:
                        logger.warn(BIND10_PROCESS_ENDED_WITH_EXIT_STATUS,
                                    proc_info.name, proc_info.pid,
                                    exit_status)

                    # Was it a special process?
                    if proc_info.name == "b10-msgq":
                        logger.fatal(BIND10_MSGQ_DAEMON_ENDED)
                        self.runnable = False

                # If we're in 'brittle' mode, we want to shutdown after
                # any process dies.
                if self.brittle:
                    self.runnable = False
            else:
                logger.info(BIND10_UNKNOWN_CHILD_PROCESS_ENDED, pid)
  708. def restart_processes(self):
  709. """
  710. Restart any dead processes:
  711. * Returns the time when the next process is ready to be restarted.
  712. * If the server is shutting down, returns 0.
  713. * If there are no processes, returns None.
  714. The values returned can be safely passed into select() as the
  715. timeout value.
  716. """
  717. next_restart = None
  718. # if we're shutting down, then don't restart
  719. if not self.runnable:
  720. return 0
  721. # otherwise look through each dead process and try to restart
  722. still_dead = {}
  723. now = time.time()
  724. for proc_info in self.dead_processes.values():
  725. if proc_info.name in self.expected_shutdowns:
  726. # We don't restart, we wanted it to die
  727. del self.expected_shutdowns[proc_info.name]
  728. continue
  729. restart_time = proc_info.restart_schedule.get_restart_time(now)
  730. if restart_time > now:
  731. if (next_restart is None) or (next_restart > restart_time):
  732. next_restart = restart_time
  733. still_dead[proc_info.pid] = proc_info
  734. else:
  735. logger.info(BIND10_RESURRECTING_PROCESS, proc_info.name)
  736. try:
  737. proc_info.respawn()
  738. self.processes[proc_info.pid] = proc_info
  739. logger.info(BIND10_RESURRECTED_PROCESS, proc_info.name, proc_info.pid)
  740. except:
  741. still_dead[proc_info.pid] = proc_info
  742. # remember any processes that refuse to be resurrected
  743. self.dead_processes = still_dead
  744. # return the time when the next process is ready to be restarted
  745. return next_restart
  746. # global variables, needed for signal handlers
  747. options = None
  748. boss_of_bind = None
  749. def reaper(signal_number, stack_frame):
  750. """A child process has died (SIGCHLD received)."""
  751. # don't do anything...
  752. # the Python signal handler has been set up to write
  753. # down a pipe, waking up our select() bit
  754. pass
  755. def get_signame(signal_number):
  756. """Return the symbolic name for a signal."""
  757. for sig in dir(signal):
  758. if sig.startswith("SIG") and sig[3].isalnum():
  759. if getattr(signal, sig) == signal_number:
  760. return sig
  761. return "Unknown signal %d" % signal_number
  762. # XXX: perhaps register atexit() function and invoke that instead
  763. def fatal_signal(signal_number, stack_frame):
  764. """We need to exit (SIGINT or SIGTERM received)."""
  765. global options
  766. global boss_of_bind
  767. logger.info(BIND10_RECEIVED_SIGNAL, get_signame(signal_number))
  768. signal.signal(signal.SIGCHLD, signal.SIG_DFL)
  769. boss_of_bind.runnable = False
  770. def process_rename(option, opt_str, value, parser):
  771. """Function that renames the process if it is requested by a option."""
  772. isc.util.process.rename(value)
  773. def parse_args(args=sys.argv[1:], Parser=OptionParser):
  774. """
  775. Function for parsing command line arguments. Returns the
  776. options object from OptionParser.
  777. """
  778. parser = Parser(version=VERSION)
  779. parser.add_option("-m", "--msgq-socket-file", dest="msgq_socket_file",
  780. type="string", default=None,
  781. help="UNIX domain socket file the b10-msgq daemon will use")
  782. parser.add_option("-n", "--no-cache", action="store_true", dest="nocache",
  783. default=False, help="disable hot-spot cache in authoritative DNS server")
  784. parser.add_option("-u", "--user", dest="user", type="string", default=None,
  785. help="Change user after startup (must run as root)")
  786. parser.add_option("-v", "--verbose", dest="verbose", action="store_true",
  787. help="display more about what is going on")
  788. parser.add_option("--pretty-name", type="string", action="callback",
  789. callback=process_rename,
  790. help="Set the process name (displayed in ps, top, ...)")
  791. parser.add_option("-c", "--config-file", action="store",
  792. dest="config_file", default=None,
  793. help="Configuration database filename")
  794. parser.add_option("-p", "--data-path", dest="data_path",
  795. help="Directory to search for configuration files",
  796. default=None)
  797. parser.add_option("--cmdctl-port", dest="cmdctl_port", type="int",
  798. default=None, help="Port of command control")
  799. parser.add_option("--pid-file", dest="pid_file", type="string",
  800. default=None,
  801. help="file to dump the PID of the BIND 10 process")
  802. parser.add_option("--brittle", dest="brittle", action="store_true",
  803. help="debugging flag: exit if any component dies")
  804. (options, args) = parser.parse_args(args)
  805. if options.cmdctl_port is not None:
  806. try:
  807. isc.net.parse.port_parse(options.cmdctl_port)
  808. except ValueError as e:
  809. parser.error(e)
  810. if args:
  811. parser.print_help()
  812. sys.exit(1)
  813. return options
  814. def dump_pid(pid_file):
  815. """
  816. Dump the PID of the current process to the specified file. If the given
  817. file is None this function does nothing. If the file already exists,
  818. the existing content will be removed. If a system error happens in
  819. creating or writing to the file, the corresponding exception will be
  820. propagated to the caller.
  821. """
  822. if pid_file is None:
  823. return
  824. f = open(pid_file, "w")
  825. f.write('%d\n' % os.getpid())
  826. f.close()
  827. def unlink_pid_file(pid_file):
  828. """
  829. Remove the given file, which is basically expected to be the PID file
  830. created by dump_pid(). The specified may or may not exist; if it
  831. doesn't this function does nothing. Other system level errors in removing
  832. the file will be propagated as the corresponding exception.
  833. """
  834. if pid_file is None:
  835. return
  836. try:
  837. os.unlink(pid_file)
  838. except OSError as error:
  839. if error.errno is not errno.ENOENT:
  840. raise
  841. def main():
  842. global options
  843. global boss_of_bind
  844. # Enforce line buffering on stdout, even when not a TTY
  845. sys.stdout = io.TextIOWrapper(sys.stdout.detach(), line_buffering=True)
  846. options = parse_args()
  847. # Check user ID.
  848. setuid = None
  849. username = None
  850. if options.user:
  851. # Try getting information about the user, assuming UID passed.
  852. try:
  853. pw_ent = pwd.getpwuid(int(options.user))
  854. setuid = pw_ent.pw_uid
  855. username = pw_ent.pw_name
  856. except ValueError:
  857. pass
  858. except KeyError:
  859. pass
  860. # Next try getting information about the user, assuming user name
  861. # passed.
  862. # If the information is both a valid user name and user number, we
  863. # prefer the name because we try it second. A minor point, hopefully.
  864. try:
  865. pw_ent = pwd.getpwnam(options.user)
  866. setuid = pw_ent.pw_uid
  867. username = pw_ent.pw_name
  868. except KeyError:
  869. pass
  870. if setuid is None:
  871. logger.fatal(BIND10_INVALID_USER, options.user)
  872. sys.exit(1)
  873. # Announce startup.
  874. logger.info(BIND10_STARTING, VERSION)
  875. # Create wakeup pipe for signal handlers
  876. wakeup_pipe = os.pipe()
  877. signal.set_wakeup_fd(wakeup_pipe[1])
  878. # Set signal handlers for catching child termination, as well
  879. # as our own demise.
  880. signal.signal(signal.SIGCHLD, reaper)
  881. signal.siginterrupt(signal.SIGCHLD, False)
  882. signal.signal(signal.SIGINT, fatal_signal)
  883. signal.signal(signal.SIGTERM, fatal_signal)
  884. # Block SIGPIPE, as we don't want it to end this process
  885. signal.signal(signal.SIGPIPE, signal.SIG_IGN)
  886. # Go bob!
  887. boss_of_bind = BoB(options.msgq_socket_file, options.data_path,
  888. options.config_file, options.nocache, options.verbose,
  889. setuid, username, options.cmdctl_port, options.brittle)
  890. startup_result = boss_of_bind.startup()
  891. if startup_result:
  892. logger.fatal(BIND10_STARTUP_ERROR, startup_result)
  893. sys.exit(1)
  894. logger.info(BIND10_STARTUP_COMPLETE)
  895. dump_pid(options.pid_file)
  896. # In our main loop, we check for dead processes or messages
  897. # on the c-channel.
  898. wakeup_fd = wakeup_pipe[0]
  899. ccs_fd = boss_of_bind.ccs.get_socket().fileno()
  900. while boss_of_bind.runnable:
  901. # clean up any processes that exited
  902. boss_of_bind.reap_children()
  903. next_restart = boss_of_bind.restart_processes()
  904. if next_restart is None:
  905. wait_time = None
  906. else:
  907. wait_time = max(next_restart - time.time(), 0)
  908. # select() can raise EINTR when a signal arrives,
  909. # even if they are resumable, so we have to catch
  910. # the exception
  911. try:
  912. (rlist, wlist, xlist) = select.select([wakeup_fd, ccs_fd], [], [],
  913. wait_time)
  914. except select.error as err:
  915. if err.args[0] == errno.EINTR:
  916. (rlist, wlist, xlist) = ([], [], [])
  917. else:
  918. logger.fatal(BIND10_SELECT_ERROR, err)
  919. break
  920. for fd in rlist + xlist:
  921. if fd == ccs_fd:
  922. try:
  923. boss_of_bind.ccs.check_command()
  924. except isc.cc.session.ProtocolError:
  925. logger.fatal(BIND10_MSGQ_DISAPPEARED)
  926. self.runnable = False
  927. break
  928. elif fd == wakeup_fd:
  929. os.read(wakeup_fd, 32)
  930. # shutdown
  931. signal.signal(signal.SIGCHLD, signal.SIG_DFL)
  932. boss_of_bind.shutdown()
  933. unlink_pid_file(options.pid_file)
  934. sys.exit(0)
  935. if __name__ == "__main__":
  936. main()