bind10_src.py.in 47 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192
  1. #!@PYTHON@
  2. # Copyright (C) 2010,2011 Internet Systems Consortium.
  3. #
  4. # Permission to use, copy, modify, and distribute this software for any
  5. # purpose with or without fee is hereby granted, provided that the above
  6. # copyright notice and this permission notice appear in all copies.
  7. #
  8. # THE SOFTWARE IS PROVIDED "AS IS" AND INTERNET SYSTEMS CONSORTIUM
  9. # DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL
  10. # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL
  11. # INTERNET SYSTEMS CONSORTIUM BE LIABLE FOR ANY SPECIAL, DIRECT,
  12. # INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING
  13. # FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
  14. # NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
  15. # WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  16. """
  17. This file implements the Boss of Bind (BoB, or bob) program.
  18. Its purpose is to start up the BIND 10 system, and then manage the
  19. processes, by starting and stopping processes, plus restarting
  20. processes that exit.
  21. To start the system, it first runs the c-channel program (msgq), then
  22. connects to that. It then runs the configuration manager, and reads
  23. its own configuration. Then it proceeds to starting other modules.
  24. The Python subprocess module is used for starting processes, but
  25. because this is not efficient for managing groups of processes,
  26. SIGCHLD signals are caught and processed using the signal module.
  27. Most of the logic is contained in the BoB class. However, since Python
  28. requires that signal processing happen in the main thread, we do
  29. signal handling outside of that class, in the code running for
  30. __main__.
  31. """
  32. import sys; sys.path.append ('@@PYTHONPATH@@')
  33. import os
# If B10_FROM_SOURCE is set in the environment, we use data files
# from a directory relative to that, otherwise we use the ones
# installed on the system.
#
# SPECFILE_LOCATION is the path of the boss module specification file
# (bob.spec).  ADD_LIBEXEC_PATH records whether we run from an installed
# tree; the process-spawning code consults it to decide whether extra
# directories must be added to the environment of child processes.
if "B10_FROM_SOURCE" in os.environ:
    SPECFILE_LOCATION = os.environ["B10_FROM_SOURCE"] + "/src/bin/bind10/bob.spec"
    ADD_LIBEXEC_PATH = False
else:
    # The "@...@" markers are template placeholders (this file is a ".in"
    # template).  The replace() calls expand any literal "${datarootdir}"
    # and "${prefix}" references that remain in the resulting path.
    PREFIX = "@prefix@"
    DATAROOTDIR = "@datarootdir@"
    SPECFILE_LOCATION = "@datadir@/@PACKAGE@/bob.spec".replace("${datarootdir}", DATAROOTDIR).replace("${prefix}", PREFIX)
    ADD_LIBEXEC_PATH = True
  45. import subprocess
  46. import signal
  47. import re
  48. import errno
  49. import time
  50. import select
  51. import random
  52. import socket
  53. from optparse import OptionParser, OptionValueError
  54. import io
  55. import pwd
  56. import posix
  57. import copy
  58. import isc.cc
  59. import isc.util.process
  60. import isc.net.parse
  61. import isc.log
  62. from isc.log_messages.bind10_messages import *
  63. import isc.bind10.component
  64. import isc.bind10.special_component
# Initialise the logging system for this program and create the logger
# that the rest of this file uses.
isc.log.init("b10-boss")
logger = isc.log.Logger("boss")

# Pending system-wide debug level definitions, the ones we
# use here are hardcoded for now
DBG_PROCESS = logger.DBGLVL_TRACE_BASIC
DBG_COMMANDS = logger.DBGLVL_TRACE_DETAIL

# Assign this process some longer name
isc.util.process.rename(sys.argv[0])

# This is the version that gets displayed to the user.
# The VERSION string consists of the module name, the module version
# number, and the overall BIND 10 version number (set in configure.ac).
VERSION = "bind10 20110223 (BIND 10 @PACKAGE_VERSION@)"

# This is for boot_time of Boss.  It is captured once, when this module is
# loaded, as a UTC struct_time; _get_stats_data() formats it for reporting.
_BASETIME = time.gmtime()
  79. class RestartSchedule:
  80. """
  81. Keeps state when restarting something (in this case, a process).
  82. When a process dies unexpectedly, we need to restart it. However, if
  83. it fails to restart for some reason, then we should not simply keep
  84. restarting it at high speed.
  85. A more sophisticated algorithm can be developed, but for now we choose
  86. a simple set of rules:
  87. * If a process was been running for >=10 seconds, we restart it
  88. right away.
  89. * If a process was running for <10 seconds, we wait until 10 seconds
  90. after it was started.
  91. To avoid programs getting into lockstep, we use a normal distribution
  92. to avoid being restarted at exactly 10 seconds."""
  93. def __init__(self, restart_frequency=10.0):
  94. self.restart_frequency = restart_frequency
  95. self.run_start_time = None
  96. self.run_stop_time = None
  97. self.restart_time = None
  98. def set_run_start_time(self, when=None):
  99. if when is None:
  100. when = time.time()
  101. self.run_start_time = when
  102. sigma = self.restart_frequency * 0.05
  103. self.restart_time = when + random.normalvariate(self.restart_frequency,
  104. sigma)
  105. def set_run_stop_time(self, when=None):
  106. """We don't actually do anything with stop time now, but it
  107. might be useful for future algorithms."""
  108. if when is None:
  109. when = time.time()
  110. self.run_stop_time = when
  111. def get_restart_time(self, when=None):
  112. if when is None:
  113. when = time.time()
  114. return max(when, self.restart_time)
  115. class ProcessInfoError(Exception): pass
  116. class ProcessInfo:
  117. """Information about a process"""
  118. dev_null = open(os.devnull, "w")
  119. def __init__(self, name, args, env={}, dev_null_stdout=False,
  120. dev_null_stderr=False, uid=None, username=None):
  121. self.name = name
  122. self.args = args
  123. self.env = env
  124. self.dev_null_stdout = dev_null_stdout
  125. self.dev_null_stderr = dev_null_stderr
  126. self.restart_schedule = RestartSchedule()
  127. self.uid = uid
  128. self.username = username
  129. self.process = None
  130. self.pid = None
  131. def _preexec_work(self):
  132. """Function used before running a program that needs to run as a
  133. different user."""
  134. # First, put us into a separate process group so we don't get
  135. # SIGINT signals on Ctrl-C (the boss will shut everthing down by
  136. # other means).
  137. os.setpgrp()
  138. # Second, set the user ID if one has been specified
  139. if self.uid is not None:
  140. try:
  141. posix.setuid(self.uid)
  142. except OSError as e:
  143. if e.errno == errno.EPERM:
  144. # if we failed to change user due to permission report that
  145. raise ProcessInfoError("Unable to change to user %s (uid %d)" % (self.username, self.uid))
  146. else:
  147. # otherwise simply re-raise whatever error we found
  148. raise
  149. def _spawn(self):
  150. if self.dev_null_stdout:
  151. spawn_stdout = self.dev_null
  152. else:
  153. spawn_stdout = None
  154. if self.dev_null_stderr:
  155. spawn_stderr = self.dev_null
  156. else:
  157. spawn_stderr = None
  158. # Environment variables for the child process will be a copy of those
  159. # of the boss process with any additional specific variables given
  160. # on construction (self.env).
  161. spawn_env = copy.deepcopy(os.environ)
  162. spawn_env.update(self.env)
  163. if ADD_LIBEXEC_PATH:
  164. spawn_env['PATH'] = "@@LIBEXECDIR@@:" + spawn_env['PATH']
  165. self.process = subprocess.Popen(self.args,
  166. stdin=subprocess.PIPE,
  167. stdout=spawn_stdout,
  168. stderr=spawn_stderr,
  169. close_fds=True,
  170. env=spawn_env,
  171. preexec_fn=self._preexec_work)
  172. self.pid = self.process.pid
  173. self.restart_schedule.set_run_start_time()
  174. # spawn() and respawn() are the same for now, but in the future they
  175. # may have different functionality
  176. def spawn(self):
  177. self._spawn()
  178. def respawn(self):
  179. self._spawn()
  180. class CChannelConnectError(Exception): pass
  181. class ProcessStartError(Exception): pass
  182. class BoB:
  183. """Boss of BIND class."""
    def __init__(self, msgq_socket_file=None, data_path=None,
    config_filename=None, nocache=False, verbose=False, setuid=None,
    username=None, cmdctl_port=None, brittle=False, wait_time=10):
        """
        Initialize the Boss of BIND. This is a singleton (only one can run).

        The msgq_socket_file specifies the UNIX domain socket file that the
        msgq process listens on.  If verbose is True, then the boss reports
        what it is doing.

        Data path and config filename are passed through to config manager
        (if provided) and specify the config file to be used.

        The cmdctl_port is passed to cmdctl and specify on which port it
        should listen.

        nocache is stored and, when set, makes start_auth() pass '-n' to
        the authoritative server.

        setuid is the numeric user ID to change to once the main
        components are running (None to keep the current user); username
        is the matching name and is only used in messages.

        brittle is a debug option that controls whether the Boss shuts down
        after any process dies.

        wait_time controls the amount of time (in seconds) that Boss waits
        for selected processes to initialize before continuing with the
        initialization.  Currently this is only the configuration manager.
        """
        # Connections to the command channel; both are created during
        # startup (start_msgq() and start_ccsession() respectively).
        self.cc_session = None
        self.ccs = None
        # Which optional servers to start.  The auth/resolver values are
        # overwritten by read_bind10_config().
        self.cfg_start_auth = True
        self.cfg_start_resolver = False
        self.cfg_start_dhcp6 = False
        self.cfg_start_dhcp4 = False
        # Whether the respective group of processes is currently started.
        self.started_auth_family = False
        self.started_resolver_family = False
        # Name of the process currently being started (for log messages).
        self.curproc = None
        # XXX: Not used now, waits for reintroduction of restarts.
        self.dead_processes = {}
        self.msgq_socket_file = msgq_socket_file
        self.nocache = nocache
        # The non-core component configuration currently in effect.
        self.component_config = {}
        # Maps PID -> component for everything registered with
        # register_process().
        self.processes = {}
        # False until the boss is up; config updates are ignored and the
        # "shutdown" command resets it to False.
        self.runnable = False
        self.uid = setuid
        self.username = username
        self.verbose = verbose
        self.data_path = data_path
        self.config_filename = config_filename
        self.cmdctl_port = cmdctl_port
        self.brittle = brittle
        self.wait_time = wait_time
        self.sockcreator = None
        # The configurator is handed this (still partially initialised)
        # object, so keep it after the attribute assignments above.
        self._component_configurator = isc.bind10.component.Configurator(self,
            isc.bind10.special_component.get_specials())
        # The priorities here make them start in the correct order. First
        # the socket creator (which would drop root privileges by then),
        # then message queue and after that the config manager (which uses
        # the message queue)
        self.__core_components = {
            'sockcreator': {
                'kind': 'core',
                'special': 'sockcreator',
                'priority': 200
            },
            'msgq': {
                'kind': 'core',
                'special': 'msgq',
                'priority': 199
            },
            'cfgmgr': {
                'kind': 'core',
                'special': 'cfgmgr',
                'priority': 198
            }
        }
        self.__started = False
        self.exitcode = 0

        # If -v was set, enable full debug logging.
        if self.verbose:
            logger.set_severity("DEBUG", 99)
  255. def __propagate_component_config(self, config):
  256. comps = dict(config)
  257. # Fill in the core components, so they stay alive
  258. for comp in self.__core_components:
  259. if comp in comps:
  260. raise Exception(comp + " is core component managed by " +
  261. "bind10 boss, do not set it")
  262. comps[comp] = self.__core_components[comp]
  263. # Update the configuration
  264. self._component_configurator.reconfigure(comps)
  265. def config_handler(self, new_config):
  266. # If this is initial update, don't do anything now, leave it to startup
  267. if not self.runnable:
  268. return
  269. # Now we declare few functions used only internally here. Besides the
  270. # benefit of not polluting the name space, they are closures, so we
  271. # don't need to pass some variables
  272. def start_stop(name, started, start, stop):
  273. if not'start_' + name in new_config:
  274. return
  275. if new_config['start_' + name]:
  276. if not started:
  277. if self.uid is not None:
  278. logger.info(BIND10_START_AS_NON_ROOT, name)
  279. start()
  280. else:
  281. stop()
  282. # These four functions are passed to start_stop (smells like functional
  283. # programming little bit)
  284. def resolver_on():
  285. self.component_config['b10-resolver'] = { 'kind': 'needed',
  286. 'special': 'resolver' }
  287. self.__propagate_component_config(self.component_config)
  288. self.started_resolver_family = True
  289. def resolver_off():
  290. if 'b10-resolver' in self.component_config:
  291. del self.component_config['b10-resolver']
  292. self.__propagate_component_config(self.component_config)
  293. self.started_resolver_family = False
  294. def auth_on():
  295. self.component_config['b10-auth'] = { 'kind': 'needed',
  296. 'special': 'auth' }
  297. self.component_config['b10-xfrout'] = { 'kind': 'dispensable',
  298. 'address': 'Xfrout' }
  299. self.component_config['b10-xfrin'] = { 'kind': 'dispensable',
  300. 'special': 'xfrin' }
  301. self.component_config['b10-zonemgr'] = { 'kind': 'dispensable',
  302. 'address': 'Zonemgr' }
  303. self.__propagate_component_config(self.component_config)
  304. self.started_auth_family = True
  305. def auth_off():
  306. if 'b10-zonemgr' in self.component_config:
  307. del self.component_config['b10-zonemgr']
  308. if 'b10-xfrin' in self.component_config:
  309. del self.component_config['b10-xfrin']
  310. if 'b10-xfrout' in self.component_config:
  311. del self.component_config['b10-xfrout']
  312. if 'b10-auth' in self.component_config:
  313. del self.component_config['b10-auth']
  314. self.__propagate_component_config(self.component_config)
  315. self.started_auth_family = False
  316. # The real code of the config handler function follows here
  317. logger.debug(DBG_COMMANDS, BIND10_RECEIVED_NEW_CONFIGURATION,
  318. new_config)
  319. start_stop('resolver', self.started_resolver_family, resolver_on,
  320. resolver_off)
  321. start_stop('auth', self.started_auth_family, auth_on, auth_off)
  322. answer = isc.config.ccsession.create_answer(0)
  323. return answer
  324. def get_processes(self):
  325. pids = list(self.processes.keys())
  326. pids.sort()
  327. process_list = [ ]
  328. for pid in pids:
  329. process_list.append([pid, self.processes[pid].name()])
  330. return process_list
  331. def _get_stats_data(self):
  332. return { "owner": "Boss",
  333. "data": { 'boot_time':
  334. time.strftime('%Y-%m-%dT%H:%M:%SZ', _BASETIME)
  335. }
  336. }
  337. def command_handler(self, command, args):
  338. logger.debug(DBG_COMMANDS, BIND10_RECEIVED_COMMAND, command)
  339. answer = isc.config.ccsession.create_answer(1, "command not implemented")
  340. if type(command) != str:
  341. answer = isc.config.ccsession.create_answer(1, "bad command")
  342. else:
  343. if command == "shutdown":
  344. self.runnable = False
  345. answer = isc.config.ccsession.create_answer(0)
  346. elif command == "getstats":
  347. answer = isc.config.ccsession.create_answer(0, self._get_stats_data())
  348. elif command == "sendstats":
  349. # send statistics data to the stats daemon immediately
  350. stats_data = self._get_stats_data()
  351. valid = self.ccs.get_module_spec().validate_statistics(
  352. True, stats_data["data"])
  353. if valid:
  354. cmd = isc.config.ccsession.create_command('set', stats_data)
  355. seq = self.cc_session.group_sendmsg(cmd, 'Stats')
  356. # Consume the answer, in case it becomes a orphan message.
  357. try:
  358. self.cc_session.group_recvmsg(False, seq)
  359. except isc.cc.session.SessionTimeout:
  360. pass
  361. answer = isc.config.ccsession.create_answer(0)
  362. else:
  363. logger.fatal(BIND10_INVALID_STATISTICS_DATA);
  364. answer = isc.config.ccsession.create_answer(
  365. 1, "specified statistics data is invalid")
  366. elif command == "ping":
  367. answer = isc.config.ccsession.create_answer(0, "pong")
  368. elif command == "show_processes":
  369. answer = isc.config.ccsession. \
  370. create_answer(0, self.get_processes())
  371. else:
  372. answer = isc.config.ccsession.create_answer(1,
  373. "Unknown command")
  374. return answer
  375. def kill_started_processes(self):
  376. """
  377. Called as part of the exception handling when a process fails to
  378. start, this runs through the list of started processes, killing
  379. each one. It then clears that list.
  380. """
  381. logger.info(BIND10_KILLING_ALL_PROCESSES)
  382. self.stop_creator(True)
  383. for pid in self.processes:
  384. logger.info(BIND10_KILL_PROCESS, self.processes[pid].name())
  385. self.processes[pid].kill(True)
  386. self.processes = {}
  387. def read_bind10_config(self):
  388. """
  389. Reads the parameters associated with the BoB module itself.
  390. At present these are the components to start although arguably this
  391. information should be in the configuration for the appropriate
  392. module itself. (However, this would cause difficulty in the case of
  393. xfrin/xfrout and zone manager as we don't need to start those if we
  394. are not running the authoritative server.)
  395. """
  396. logger.info(BIND10_READING_BOSS_CONFIGURATION)
  397. config_data = self.ccs.get_full_config()
  398. self.cfg_start_auth = config_data.get("start_auth")
  399. self.cfg_start_resolver = config_data.get("start_resolver")
  400. logger.info(BIND10_CONFIGURATION_START_AUTH, self.cfg_start_auth)
  401. logger.info(BIND10_CONFIGURATION_START_RESOLVER, self.cfg_start_resolver)
  402. def log_starting(self, process, port = None, address = None):
  403. """
  404. A convenience function to output a "Starting xxx" message if the
  405. logging is set to DEBUG with debuglevel DBG_PROCESS or higher.
  406. Putting this into a separate method ensures
  407. that the output form is consistent across all processes.
  408. The process name (passed as the first argument) is put into
  409. self.curproc, and is used to indicate which process failed to
  410. start if there is an error (and is used in the "Started" message
  411. on success). The optional port and address information are
  412. appended to the message (if present).
  413. """
  414. self.curproc = process
  415. if port is None and address is None:
  416. logger.info(BIND10_STARTING_PROCESS, self.curproc)
  417. elif address is None:
  418. logger.info(BIND10_STARTING_PROCESS_PORT, self.curproc,
  419. port)
  420. else:
  421. logger.info(BIND10_STARTING_PROCESS_PORT_ADDRESS,
  422. self.curproc, address, port)
  423. def log_started(self, pid = None):
  424. """
  425. A convenience function to output a 'Started xxxx (PID yyyy)'
  426. message. As with starting_message(), this ensures a consistent
  427. format.
  428. """
  429. if pid is None:
  430. logger.debug(DBG_PROCESS, BIND10_STARTED_PROCESS, self.curproc)
  431. else:
  432. logger.debug(DBG_PROCESS, BIND10_STARTED_PROCESS_PID, self.curproc, pid)
  433. def process_running(self, msg, who):
  434. """
  435. Some processes return a message to the Boss after they have
  436. started to indicate that they are running. The form of the
  437. message is a dictionary with contents {"running:", "<process>"}.
  438. This method checks the passed message and returns True if the
  439. "who" process is contained in the message (so is presumably
  440. running). It returns False for all other conditions and will
  441. log an error if appropriate.
  442. """
  443. if msg is not None:
  444. try:
  445. if msg["running"] == who:
  446. return True
  447. else:
  448. logger.error(BIND10_STARTUP_UNEXPECTED_MESSAGE, msg)
  449. except:
  450. logger.error(BIND10_STARTUP_UNRECOGNISED_MESSAGE, msg)
  451. return False
  452. # The next few methods start the individual processes of BIND-10. They
  453. # are called via start_all_processes(). If any fail, an exception is
  454. # raised which is caught by the caller of start_all_processes(); this kills
  455. # processes started up to that point before terminating the program.
  456. def start_msgq(self):
  457. """
  458. Start the message queue and connect to the command channel.
  459. """
  460. self.log_starting("b10-msgq")
  461. msgq_proc = ProcessInfo("b10-msgq", ["b10-msgq"], self.c_channel_env,
  462. True, not self.verbose, uid=self.uid,
  463. username=self.username)
  464. msgq_proc.spawn()
  465. self.log_started(msgq_proc.pid)
  466. # Now connect to the c-channel
  467. cc_connect_start = time.time()
  468. while self.cc_session is None:
  469. # if we have been trying for "a while" give up
  470. if (time.time() - cc_connect_start) > 5:
  471. raise CChannelConnectError("Unable to connect to c-channel after 5 seconds")
  472. # try to connect, and if we can't wait a short while
  473. try:
  474. self.cc_session = isc.cc.Session(self.msgq_socket_file)
  475. except isc.cc.session.SessionError:
  476. time.sleep(0.1)
  477. # Subscribe to the message queue. The only messages we expect to receive
  478. # on this channel are once relating to process startup.
  479. self.cc_session.group_subscribe("Boss")
  480. return msgq_proc
  481. def start_cfgmgr(self):
  482. """
  483. Starts the configuration manager process
  484. """
  485. self.log_starting("b10-cfgmgr")
  486. args = ["b10-cfgmgr"]
  487. if self.data_path is not None:
  488. args.append("--data-path=" + self.data_path)
  489. if self.config_filename is not None:
  490. args.append("--config-filename=" + self.config_filename)
  491. bind_cfgd = ProcessInfo("b10-cfgmgr", args,
  492. self.c_channel_env, uid=self.uid,
  493. username=self.username)
  494. bind_cfgd.spawn()
  495. self.log_started(bind_cfgd.pid)
  496. # Wait for the configuration manager to start up as subsequent initialization
  497. # cannot proceed without it. The time to wait can be set on the command line.
  498. time_remaining = self.wait_time
  499. msg, env = self.cc_session.group_recvmsg()
  500. while time_remaining > 0 and not self.process_running(msg, "ConfigManager"):
  501. logger.debug(DBG_PROCESS, BIND10_WAIT_CFGMGR)
  502. time.sleep(1)
  503. time_remaining = time_remaining - 1
  504. msg, env = self.cc_session.group_recvmsg()
  505. if not self.process_running(msg, "ConfigManager"):
  506. raise ProcessStartError("Configuration manager process has not started")
  507. return bind_cfgd
  508. def start_ccsession(self, c_channel_env):
  509. """
  510. Start the CC Session
  511. The argument c_channel_env is unused but is supplied to keep the
  512. argument list the same for all start_xxx methods.
  513. With regards to logging, note that as the CC session is not a
  514. process, the log_starting/log_started methods are not used.
  515. """
  516. logger.info(BIND10_STARTING_CC)
  517. self.ccs = isc.config.ModuleCCSession(SPECFILE_LOCATION,
  518. self.config_handler,
  519. self.command_handler,
  520. socket_file = self.msgq_socket_file)
  521. self.ccs.start()
  522. logger.debug(DBG_PROCESS, BIND10_STARTED_CC)
  523. # A couple of utility methods for starting processes...
  524. def start_process(self, name, args, c_channel_env, port=None, address=None):
  525. """
  526. Given a set of command arguments, start the process and output
  527. appropriate log messages. If the start is successful, the process
  528. is added to the list of started processes.
  529. The port and address arguments are for log messages only.
  530. """
  531. self.log_starting(name, port, address)
  532. newproc = ProcessInfo(name, args, c_channel_env)
  533. newproc.spawn()
  534. self.log_started(newproc.pid)
  535. return newproc
  536. def register_process(self, pid, component):
  537. """
  538. Put another process into boss to watch over it. When the process
  539. dies, the component.failed() is called with the exit code.
  540. """
  541. self.processes[pid] = component
  542. def start_simple(self, name):
  543. """
  544. Most of the BIND-10 processes are started with the command:
  545. <process-name> [-v]
  546. ... where -v is appended if verbose is enabled. This method
  547. generates the arguments from the name and starts the process.
  548. The port and address arguments are for log messages only.
  549. """
  550. # Set up the command arguments.
  551. args = [name]
  552. if self.verbose:
  553. args += ['-v']
  554. # ... and start the process
  555. return self.start_process(name, args, self.c_channel_env)
  556. # The next few methods start up the rest of the BIND-10 processes.
  557. # Although many of these methods are little more than a call to
  558. # start_simple, they are retained (a) for testing reasons and (b) as a place
  559. # where modifications can be made if the process start-up sequence changes
  560. # for a given process.
  561. def start_auth(self):
  562. """
  563. Start the Authoritative server
  564. """
  565. authargs = ['b10-auth']
  566. if self.nocache:
  567. authargs += ['-n']
  568. if self.uid:
  569. authargs += ['-u', str(self.uid)]
  570. if self.verbose:
  571. authargs += ['-v']
  572. # ... and start
  573. return self.start_process("b10-auth", authargs, self.c_channel_env)
  574. def start_resolver(self):
  575. """
  576. Start the Resolver. At present, all these arguments and switches
  577. are pure speculation. As with the auth daemon, they should be
  578. read from the configuration database.
  579. """
  580. self.curproc = "b10-resolver"
  581. # XXX: this must be read from the configuration manager in the future
  582. resargs = ['b10-resolver']
  583. if self.uid:
  584. resargs += ['-u', str(self.uid)]
  585. if self.verbose:
  586. resargs += ['-v']
  587. # ... and start
  588. return self.start_process("b10-resolver", resargs, self.c_channel_env)
  589. def start_cmdctl(self):
  590. """
  591. Starts the command control process
  592. """
  593. args = ["b10-cmdctl"]
  594. if self.cmdctl_port is not None:
  595. args.append("--port=" + str(self.cmdctl_port))
  596. return self.start_process("b10-cmdctl", args, self.c_channel_env,
  597. self.cmdctl_port)
  598. def start_xfrin(self):
  599. # XXX: a quick-hack workaround. xfrin will implicitly use dynamically
  600. # loadable data source modules, which will be installed in $(libdir).
  601. # On some OSes (including MacOS X and *BSDs) the main process (python)
  602. # cannot find the modules unless they are located in a common shared
  603. # object path or a path in the (DY)LD_LIBRARY_PATH. We should seek
  604. # a cleaner solution, but for a short term workaround we specify the
  605. # path here, unconditionally, and without even bothering which
  606. # environment variable should be used.
  607. #
  608. # We reuse the ADD_LIBEXEC_PATH variable to see whether we need to
  609. # do this, as the conditions that make this workaround needed are
  610. # the same as for the libexec path addition
  611. # TODO: Once #1292 is finished, remove this method and the special
  612. # component, use it as normal component.
  613. c_channel_env = dict(self.c_channel_env)
  614. if ADD_LIBEXEC_PATH:
  615. cur_path = os.getenv('DYLD_LIBRARY_PATH')
  616. cur_path = '' if cur_path is None else ':' + cur_path
  617. c_channel_env['DYLD_LIBRARY_PATH'] = "@@LIBDIR@@" + cur_path
  618. cur_path = os.getenv('LD_LIBRARY_PATH')
  619. cur_path = '' if cur_path is None else ':' + cur_path
  620. c_channel_env['LD_LIBRARY_PATH'] = "@@LIBDIR@@" + cur_path
  621. # Set up the command arguments.
  622. args = ['b10-xfrin']
  623. if self.verbose:
  624. args += ['-v']
  625. return self.start_process("b10-xfrin", args, c_channel_env)
    def start_all_processes(self):
        """
        Starts up all the processes.  Any exception generated during the
        starting of the processes is handled by the caller.

        The component configuration is built up step by step and handed
        to the configurator after each step, so the order of the steps
        below is the order in which the groups of components are started.
        The final configuration is stored in self.component_config.
        """
        # Start the real core (sockcreator, msgq, cfgmgr)
        self._component_configurator.startup(self.__core_components)

        # Connect to the msgq. This is not a process, so it's not handled
        # inside the configurator.
        c_channel_env = self.c_channel_env
        self.start_ccsession(c_channel_env)

        # Extract the parameters associated with Bob.  This can only be
        # done after the CC Session is started.  Note that the logging
        # configuration may override the "-v" switch set on the command line.
        self.read_bind10_config()

        # Continue starting the processes.  The authoritative server (if
        # selected):
        component_config = {}
        if self.cfg_start_auth:
            component_config['b10-auth'] = { 'kind': 'needed',
                                             'special': 'auth' }
            self.__propagate_component_config(component_config)

        # ... and resolver (if selected):
        if self.cfg_start_resolver:
            component_config['b10-resolver'] = { 'kind': 'needed',
                                                 'special': 'resolver' }
            self.started_resolver_family = True
            self.__propagate_component_config(component_config)

        # Everything after the main components can run as non-root.
        # TODO: this is only temporary - once the privileged socket creator is
        # fully working, nothing else will run as root.
        if self.uid is not None:
            posix.setuid(self.uid)

        # xfrin/xfrout and the zone manager are only meaningful if the
        # authoritative server has been started.
        if self.cfg_start_auth:
            component_config['b10-xfrout'] = { 'kind': 'dispensable',
                                               'address': 'Xfrout' }
            component_config['b10-xfrin'] = { 'kind': 'dispensable',
                                              'special': 'xfrin' }
            component_config['b10-zonemgr'] = { 'kind': 'dispensable',
                                                'address': 'Zonemgr' }
            self.__propagate_component_config(component_config)
            # The auth family counts as started only once its helper
            # processes have been configured as well.
            self.started_auth_family = True

        # ... and finally start the remaining processes
        component_config['b10-stats'] = { 'kind': 'dispensable',
                                          'address': 'Stats' }
        component_config['b10-stats-httpd'] = { 'kind': 'dispensable',
                                                'address': 'StatsHttpd' }
        component_config['b10-cmdctl'] = { 'kind': 'needed',
                                           'special': 'cmdctl' }

        # NOTE(review): cfg_start_dhcp6 is initialised to False in __init__
        # and is not set by read_bind10_config(); presumably it is changed
        # elsewhere -- confirm.
        if self.cfg_start_dhcp6:
            component_config['b10-dhcp6'] = { 'kind': 'dispensable',
                                              'address': 'DHCP6' }

        self.__propagate_component_config(component_config)

        # Remember what is configured, so config_handler() can modify it.
        self.component_config = component_config
  682. def startup(self):
  683. """
  684. Start the BoB instance.
  685. Returns None if successful, otherwise an string describing the
  686. problem.
  687. """
  688. # Try to connect to the c-channel daemon, to see if it is already
  689. # running
  690. c_channel_env = {}
  691. if self.msgq_socket_file is not None:
  692. c_channel_env["BIND10_MSGQ_SOCKET_FILE"] = self.msgq_socket_file
  693. logger.debug(DBG_PROCESS, BIND10_CHECK_MSGQ_ALREADY_RUNNING)
  694. # try to connect, and if we can't wait a short while
  695. try:
  696. self.cc_session = isc.cc.Session(self.msgq_socket_file)
  697. logger.fatal(BIND10_MSGQ_ALREADY_RUNNING)
  698. return "b10-msgq already running, or socket file not cleaned , cannot start"
  699. except isc.cc.session.SessionError:
  700. # this is the case we want, where the msgq is not running
  701. pass
  702. # Start all processes. If any one fails to start, kill all started
  703. # processes and exit with an error indication.
  704. try:
  705. self.c_channel_env = c_channel_env
  706. self.start_all_processes()
  707. except Exception as e:
  708. self.kill_started_processes()
  709. return "Unable to start " + self.curproc + ": " + str(e)
  710. # Started successfully
  711. self.runnable = True
  712. self.__started = True
  713. return None
  714. def stop_process(self, process, recipient):
  715. """
  716. Stop the given process, friendly-like. The process is the name it has
  717. (in logs, etc), the recipient is the address on msgq.
  718. """
  719. logger.info(BIND10_STOP_PROCESS, process)
  720. self.cc_session.group_sendmsg({'command': ['shutdown']}, recipient,
  721. recipient)
  722. def component_shutdown(self, exitcode=0):
  723. """
  724. Stop the Boss instance from a components' request. The exitcode
  725. indicates the desired exit code.
  726. If we did not start yet, it raises an exception, which is meant
  727. to propagate through the component and configurator to the startup
  728. routine and abort the startup imediatelly. If it is started up already,
  729. we just mark it so we terminate soon.
  730. It does set the exit code in both cases.
  731. """
  732. self.exitcode = exitcode
  733. if not self.__started:
  734. raise Exception("Component failed during startup");
  735. else:
  736. self.runnable = False
  737. # Series of stop_process wrappers
  738. def stop_resolver(self):
  739. self.stop_process('b10-resolver', 'Resolver')
  740. def stop_auth(self):
  741. self.stop_process('b10-auth', 'Auth')
  742. def stop_xfrout(self):
  743. self.stop_process('b10-xfrout', 'Xfrout')
  744. def stop_xfrin(self):
  745. self.stop_process('b10-xfrin', 'Xfrin')
  746. def stop_zonemgr(self):
  747. self.stop_process('b10-zonemgr', 'Zonemgr')
  748. def shutdown(self):
  749. """Stop the BoB instance."""
  750. logger.info(BIND10_SHUTDOWN)
  751. # first try using the BIND 10 request to stop
  752. try:
  753. self._component_configurator.shutdown()
  754. except:
  755. pass
  756. # XXX: some delay probably useful... how much is uncertain
  757. # I have changed the delay from 0.5 to 1, but sometime it's
  758. # still not enough.
  759. time.sleep(1)
  760. self.reap_children()
  761. # next try sending a SIGTERM
  762. components_to_stop = list(self.processes.values())
  763. for component in components_to_stop:
  764. logger.info(BIND10_SEND_SIGTERM, component.name(), component.pid())
  765. try:
  766. component.kill()
  767. except OSError:
  768. # ignore these (usually ESRCH because the child
  769. # finally exited)
  770. pass
  771. # finally, send SIGKILL (unmaskable termination) until everybody dies
  772. while self.processes:
  773. # XXX: some delay probably useful... how much is uncertain
  774. time.sleep(0.1)
  775. self.reap_children()
  776. components_to_stop = list(self.processes.values())
  777. for component in components_to_stop:
  778. logger.info(BIND10_SEND_SIGKILL, component.name(),
  779. component.pid())
  780. try:
  781. component.kill(True)
  782. except OSError:
  783. # ignore these (usually ESRCH because the child
  784. # finally exited)
  785. pass
  786. logger.info(BIND10_SHUTDOWN_COMPLETE)
  787. def _get_process_exit_status(self):
  788. return os.waitpid(-1, os.WNOHANG)
  789. def reap_children(self):
  790. """Check to see if any of our child processes have exited,
  791. and note this for later handling.
  792. """
  793. while True:
  794. try:
  795. (pid, exit_status) = self._get_process_exit_status()
  796. except OSError as o:
  797. if o.errno == errno.ECHILD: break
  798. # XXX: should be impossible to get any other error here
  799. raise
  800. if pid == 0: break
  801. if pid in self.processes:
  802. # One of the processes we know about. Get information on it.
  803. component = self.processes.pop(pid)
  804. if component.running() and self.runnable:
  805. # Tell it it failed. But only if it matters (we are
  806. # not shutting down and the component considers itself
  807. # to be running.
  808. component.failed(exit_status);
  809. else:
  810. logger.info(BIND10_UNKNOWN_CHILD_PROCESS_ENDED, pid)
  811. def restart_processes(self):
  812. """
  813. Restart any dead processes:
  814. * Returns the time when the next process is ready to be restarted.
  815. * If the server is shutting down, returns 0.
  816. * If there are no processes, returns None.
  817. The values returned can be safely passed into select() as the
  818. timeout value.
  819. """
  820. next_restart = None
  821. # if we're shutting down, then don't restart
  822. if not self.runnable:
  823. return 0
  824. # otherwise look through each dead process and try to restart
  825. still_dead = {}
  826. now = time.time()
  827. for proc_info in self.dead_processes.values():
  828. restart_time = proc_info.restart_schedule.get_restart_time(now)
  829. if restart_time > now:
  830. if (next_restart is None) or (next_restart > restart_time):
  831. next_restart = restart_time
  832. still_dead[proc_info.pid] = proc_info
  833. else:
  834. logger.info(BIND10_RESURRECTING_PROCESS, proc_info.name)
  835. try:
  836. proc_info.respawn()
  837. self.processes[proc_info.pid] = proc_info
  838. logger.info(BIND10_RESURRECTED_PROCESS, proc_info.name, proc_info.pid)
  839. except:
  840. still_dead[proc_info.pid] = proc_info
  841. # remember any processes that refuse to be resurrected
  842. self.dead_processes = still_dead
  843. # return the time when the next process is ready to be restarted
  844. return next_restart
# Global variables, needed for signal handlers: a handler can only reach
# the running state through module-level names.  Both are assigned in
# main(); until then they are None.
options = None
boss_of_bind = None
  848. def reaper(signal_number, stack_frame):
  849. """A child process has died (SIGCHLD received)."""
  850. # don't do anything...
  851. # the Python signal handler has been set up to write
  852. # down a pipe, waking up our select() bit
  853. pass
  854. def get_signame(signal_number):
  855. """Return the symbolic name for a signal."""
  856. for sig in dir(signal):
  857. if sig.startswith("SIG") and sig[3].isalnum():
  858. if getattr(signal, sig) == signal_number:
  859. return sig
  860. return "Unknown signal %d" % signal_number
  861. # XXX: perhaps register atexit() function and invoke that instead
  862. def fatal_signal(signal_number, stack_frame):
  863. """We need to exit (SIGINT or SIGTERM received)."""
  864. global options
  865. global boss_of_bind
  866. logger.info(BIND10_RECEIVED_SIGNAL, get_signame(signal_number))
  867. signal.signal(signal.SIGCHLD, signal.SIG_DFL)
  868. boss_of_bind.runnable = False
  869. def process_rename(option, opt_str, value, parser):
  870. """Function that renames the process if it is requested by a option."""
  871. isc.util.process.rename(value)
  872. def parse_args(args=sys.argv[1:], Parser=OptionParser):
  873. """
  874. Function for parsing command line arguments. Returns the
  875. options object from OptionParser.
  876. """
  877. parser = Parser(version=VERSION)
  878. parser.add_option("-m", "--msgq-socket-file", dest="msgq_socket_file",
  879. type="string", default=None,
  880. help="UNIX domain socket file the b10-msgq daemon will use")
  881. parser.add_option("-n", "--no-cache", action="store_true", dest="nocache",
  882. default=False, help="disable hot-spot cache in authoritative DNS server")
  883. parser.add_option("-u", "--user", dest="user", type="string", default=None,
  884. help="Change user after startup (must run as root)")
  885. parser.add_option("-v", "--verbose", dest="verbose", action="store_true",
  886. help="display more about what is going on")
  887. parser.add_option("--pretty-name", type="string", action="callback",
  888. callback=process_rename,
  889. help="Set the process name (displayed in ps, top, ...)")
  890. parser.add_option("-c", "--config-file", action="store",
  891. dest="config_file", default=None,
  892. help="Configuration database filename")
  893. parser.add_option("-p", "--data-path", dest="data_path",
  894. help="Directory to search for configuration files",
  895. default=None)
  896. parser.add_option("--cmdctl-port", dest="cmdctl_port", type="int",
  897. default=None, help="Port of command control")
  898. parser.add_option("--pid-file", dest="pid_file", type="string",
  899. default=None,
  900. help="file to dump the PID of the BIND 10 process")
  901. parser.add_option("--brittle", dest="brittle", action="store_true",
  902. help="debugging flag: exit if any component dies")
  903. parser.add_option("-w", "--wait", dest="wait_time", type="int",
  904. default=10, help="Time (in seconds) to wait for config manager to start up")
  905. (options, args) = parser.parse_args(args)
  906. if options.cmdctl_port is not None:
  907. try:
  908. isc.net.parse.port_parse(options.cmdctl_port)
  909. except ValueError as e:
  910. parser.error(e)
  911. if args:
  912. parser.print_help()
  913. sys.exit(1)
  914. return options
  915. def dump_pid(pid_file):
  916. """
  917. Dump the PID of the current process to the specified file. If the given
  918. file is None this function does nothing. If the file already exists,
  919. the existing content will be removed. If a system error happens in
  920. creating or writing to the file, the corresponding exception will be
  921. propagated to the caller.
  922. """
  923. if pid_file is None:
  924. return
  925. f = open(pid_file, "w")
  926. f.write('%d\n' % os.getpid())
  927. f.close()
  928. def unlink_pid_file(pid_file):
  929. """
  930. Remove the given file, which is basically expected to be the PID file
  931. created by dump_pid(). The specified may or may not exist; if it
  932. doesn't this function does nothing. Other system level errors in removing
  933. the file will be propagated as the corresponding exception.
  934. """
  935. if pid_file is None:
  936. return
  937. try:
  938. os.unlink(pid_file)
  939. except OSError as error:
  940. if error.errno is not errno.ENOENT:
  941. raise
  942. def main():
  943. global options
  944. global boss_of_bind
  945. # Enforce line buffering on stdout, even when not a TTY
  946. sys.stdout = io.TextIOWrapper(sys.stdout.detach(), line_buffering=True)
  947. options = parse_args()
  948. # Check user ID.
  949. setuid = None
  950. username = None
  951. if options.user:
  952. # Try getting information about the user, assuming UID passed.
  953. try:
  954. pw_ent = pwd.getpwuid(int(options.user))
  955. setuid = pw_ent.pw_uid
  956. username = pw_ent.pw_name
  957. except ValueError:
  958. pass
  959. except KeyError:
  960. pass
  961. # Next try getting information about the user, assuming user name
  962. # passed.
  963. # If the information is both a valid user name and user number, we
  964. # prefer the name because we try it second. A minor point, hopefully.
  965. try:
  966. pw_ent = pwd.getpwnam(options.user)
  967. setuid = pw_ent.pw_uid
  968. username = pw_ent.pw_name
  969. except KeyError:
  970. pass
  971. if setuid is None:
  972. logger.fatal(BIND10_INVALID_USER, options.user)
  973. sys.exit(1)
  974. # Announce startup.
  975. logger.info(BIND10_STARTING, VERSION)
  976. # Create wakeup pipe for signal handlers
  977. wakeup_pipe = os.pipe()
  978. signal.set_wakeup_fd(wakeup_pipe[1])
  979. # Set signal handlers for catching child termination, as well
  980. # as our own demise.
  981. signal.signal(signal.SIGCHLD, reaper)
  982. signal.siginterrupt(signal.SIGCHLD, False)
  983. signal.signal(signal.SIGINT, fatal_signal)
  984. signal.signal(signal.SIGTERM, fatal_signal)
  985. # Block SIGPIPE, as we don't want it to end this process
  986. signal.signal(signal.SIGPIPE, signal.SIG_IGN)
  987. # Go bob!
  988. boss_of_bind = BoB(options.msgq_socket_file, options.data_path,
  989. options.config_file, options.nocache, options.verbose,
  990. setuid, username, options.cmdctl_port, options.brittle,
  991. options.wait_time)
  992. startup_result = boss_of_bind.startup()
  993. if startup_result:
  994. logger.fatal(BIND10_STARTUP_ERROR, startup_result)
  995. sys.exit(1)
  996. logger.info(BIND10_STARTUP_COMPLETE)
  997. dump_pid(options.pid_file)
  998. # In our main loop, we check for dead processes or messages
  999. # on the c-channel.
  1000. wakeup_fd = wakeup_pipe[0]
  1001. ccs_fd = boss_of_bind.ccs.get_socket().fileno()
  1002. while boss_of_bind.runnable:
  1003. # clean up any processes that exited
  1004. boss_of_bind.reap_children()
  1005. # XXX: As we don't put anything into the processes to be restarted,
  1006. # this is really a complicated NOP. But we will try to reintroduce
  1007. # delayed restarts, so it stays here for now, until we find out if
  1008. # it's useful.
  1009. next_restart = boss_of_bind.restart_processes()
  1010. if next_restart is None:
  1011. wait_time = None
  1012. else:
  1013. wait_time = max(next_restart - time.time(), 0)
  1014. # select() can raise EINTR when a signal arrives,
  1015. # even if they are resumable, so we have to catch
  1016. # the exception
  1017. try:
  1018. (rlist, wlist, xlist) = select.select([wakeup_fd, ccs_fd], [], [],
  1019. wait_time)
  1020. except select.error as err:
  1021. if err.args[0] == errno.EINTR:
  1022. (rlist, wlist, xlist) = ([], [], [])
  1023. else:
  1024. logger.fatal(BIND10_SELECT_ERROR, err)
  1025. break
  1026. for fd in rlist + xlist:
  1027. if fd == ccs_fd:
  1028. try:
  1029. boss_of_bind.ccs.check_command()
  1030. except isc.cc.session.ProtocolError:
  1031. logger.fatal(BIND10_MSGQ_DISAPPEARED)
  1032. self.runnable = False
  1033. break
  1034. elif fd == wakeup_fd:
  1035. os.read(wakeup_fd, 32)
  1036. # shutdown
  1037. signal.signal(signal.SIGCHLD, signal.SIG_DFL)
  1038. boss_of_bind.shutdown()
  1039. unlink_pid_file(options.pid_file)
  1040. sys.exit(0)
# Run the boss only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()