bind10_src.py.in 49 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237
  1. #!@PYTHON@
  2. # Copyright (C) 2010,2011 Internet Systems Consortium.
  3. #
  4. # Permission to use, copy, modify, and distribute this software for any
  5. # purpose with or without fee is hereby granted, provided that the above
  6. # copyright notice and this permission notice appear in all copies.
  7. #
  8. # THE SOFTWARE IS PROVIDED "AS IS" AND INTERNET SYSTEMS CONSORTIUM
  9. # DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL
  10. # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL
  11. # INTERNET SYSTEMS CONSORTIUM BE LIABLE FOR ANY SPECIAL, DIRECT,
  12. # INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING
  13. # FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
  14. # NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
  15. # WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  16. """
  17. This file implements the Boss of Bind (BoB, or bob) program.
  18. Its purpose is to start up the BIND 10 system, and then manage the
  19. processes, by starting and stopping processes, plus restarting
  20. processes that exit.
  21. To start the system, it first runs the c-channel program (msgq), then
  22. connects to that. It then runs the configuration manager, and reads
  23. its own configuration. Then it proceeds to starting other modules.
  24. The Python subprocess module is used for starting processes, but
  25. because this is not efficient for managing groups of processes,
  26. SIGCHLD signals are caught and processed using the signal module.
  27. Most of the logic is contained in the BoB class. However, since Python
  28. requires that signal processing happen in the main thread, we do
  29. signal handling outside of that class, in the code running for
  30. __main__.
  31. """
  32. import sys; sys.path.append ('@@PYTHONPATH@@')
  33. import os
  34. # If B10_FROM_SOURCE is set in the environment, we use data files
  35. # from a directory relative to that, otherwise we use the ones
  36. # installed on the system
  # Choose where bob.spec is read from: the installed copy by default, or
  # the copy inside the source tree when B10_FROM_SOURCE is set in the
  # environment.
  if "B10_FROM_SOURCE" not in os.environ:
      PREFIX = "@prefix@"
      DATAROOTDIR = "@datarootdir@"
      SPECFILE_LOCATION = "@datadir@/@PACKAGE@/bob.spec"
      # Expand the placeholders one at a time, datarootdir before prefix.
      SPECFILE_LOCATION = SPECFILE_LOCATION.replace("${datarootdir}",
                                                    DATAROOTDIR)
      SPECFILE_LOCATION = SPECFILE_LOCATION.replace("${prefix}", PREFIX)
  else:
      SPECFILE_LOCATION = (os.environ["B10_FROM_SOURCE"] +
                           "/src/bin/bind10/bob.spec")
  43. import subprocess
  44. import signal
  45. import re
  46. import errno
  47. import time
  48. import select
  49. import random
  50. import socket
  51. from optparse import OptionParser, OptionValueError
  52. import io
  53. import pwd
  54. import posix
  55. import copy
  56. from bind10_config import LIBEXECPATH
  57. import bind10_config
  58. import isc.cc
  59. import isc.util.process
  60. import isc.net.parse
  61. import isc.log
  62. from isc.log_messages.bind10_messages import *
  63. import isc.bind10.component
  64. import isc.bind10.special_component
  65. import isc.bind10.socket_cache
  66. import libutil_io_python
  67. import tempfile
  # Initialise logging for this program and create the module's logger.
  isc.log.init("b10-boss")
  logger = isc.log.Logger("boss")

  # Debug levels used for the messages logged in this file. They are
  # hardcoded here until system-wide debug level definitions exist.
  DBG_PROCESS = logger.DBGLVL_TRACE_BASIC
  DBG_COMMANDS = logger.DBGLVL_TRACE_DETAIL

  # Messages sent over the unix domain socket to say whether a real socket
  # follows.
  CREATOR_SOCKET_OK = b"1\n"
  CREATOR_SOCKET_UNAVAILABLE = b"0\n"

  # Result codes for the known exceptions of the get_token command.
  CREATOR_SOCKET_ERROR = 2
  CREATOR_SHARE_ERROR = 3

  # Give this process a longer name.
  isc.util.process.rename(sys.argv[0])

  # The version shown to the user: module name, module version number and
  # the overall BIND 10 version number (set in configure.ac).
  VERSION = "bind10 20110223 (BIND 10 @PACKAGE_VERSION@)"

  # Reference time reported as the boot_time of Boss.
  _BASETIME = time.gmtime()
  class ProcessInfoError(Exception):
      """Exception class declared alongside ProcessInfo."""
      pass
  class ProcessInfo:
      """
      Information about a process.

      An instance stores the command line and the extra environment
      variables of a child program. Once spawn() or respawn() has been
      called it also holds the subprocess.Popen object (self.process) and
      the child's process ID (self.pid).
      """

      # One write handle on the null device, shared by all instances and
      # used to discard a child's stdout and/or stderr.
      dev_null = open(os.devnull, "w")

      def __init__(self, name, args, env=None, dev_null_stdout=False,
                   dev_null_stderr=False):
          """
          Remember how to start the process; nothing is started here.

          name is stored as the label of the process.
          args is the argument list handed to subprocess.Popen.
          env holds additional environment variables for the child. When
          omitted, a new empty dict is used.
          dev_null_stdout and dev_null_stderr select whether the child's
          stdout and stderr are sent to the null device.
          """
          self.name = name
          self.args = args
          # Create the default per instance. A literal {} in the signature
          # would be one dict shared by every ProcessInfo that did not
          # pass env, so a change made through one would leak into all.
          self.env = {} if env is None else env
          self.dev_null_stdout = dev_null_stdout
          self.dev_null_stderr = dev_null_stderr
          self.process = None
          self.pid = None

      def _preexec_work(self):
          """
          Run in the child just before the program is executed.

          Puts the child into a separate process group so it does not get
          SIGINT signals on Ctrl-C (the boss will shut everything down by
          other means).
          """
          os.setpgrp()

      def _spawn(self):
          """
          Start the program and record the Popen object and its PID.
          """
          spawn_stdout = self.dev_null if self.dev_null_stdout else None
          spawn_stderr = self.dev_null if self.dev_null_stderr else None

          # The child's environment is a copy of the boss's environment
          # plus the variables given on construction (self.env).
          # os.environ.copy() returns a plain dict, so the update below
          # stays local to the child's environment mapping.
          spawn_env = os.environ.copy()
          spawn_env.update(self.env)

          # Put LIBEXECPATH in front of the search path. PATH may be
          # missing from the environment altogether; in that case
          # LIBEXECPATH becomes the whole search path.
          if 'PATH' in spawn_env:
              spawn_env['PATH'] = LIBEXECPATH + ':' + spawn_env['PATH']
          else:
              spawn_env['PATH'] = LIBEXECPATH

          self.process = subprocess.Popen(self.args,
                                          stdin=subprocess.PIPE,
                                          stdout=spawn_stdout,
                                          stderr=spawn_stderr,
                                          close_fds=True,
                                          env=spawn_env,
                                          preexec_fn=self._preexec_work)
          self.pid = self.process.pid

      # spawn() and respawn() are the same for now, but in the future they
      # may have different functionality
      def spawn(self):
          """Start the process."""
          self._spawn()

      def respawn(self):
          """Start the process again."""
          self._spawn()
  class CChannelConnectError(Exception):
      """Raised when no connection to the c-channel could be made."""
      pass
  class ProcessStartError(Exception):
      """Raised when a process does not report that it has started."""
      pass
  139. class BoB:
  140. """Boss of BIND class."""
  def __init__(self, msgq_socket_file=None, data_path=None,
               config_filename=None, clear_config=False, nocache=False,
               verbose=False, nokill=False, setuid=None, setgid=None,
               username=None, cmdctl_port=None, wait_time=10):
      """
      Set up the Boss of BIND. This is a singleton (only one can run).

      msgq_socket_file is the UNIX domain socket file the msgq process
      listens on. With verbose set to True the boss reports what it is
      doing.

      data_path, config_filename and clear_config are handed on to the
      configuration manager when it is started.

      cmdctl_port is handed on to cmdctl and tells it which port to
      listen on.

      wait_time is the number of seconds the boss waits for selected
      processes to initialize before going on with the initialization.
      Currently this is only the configuration manager.
      """
      self.cc_session = None
      self.ccs = None
      self.curproc = None
      self.msgq_socket_file = msgq_socket_file
      self.nocache = nocache
      self.component_config = {}
      # Running components, indexed by PID. One component might some day
      # consist of several processes (a pipeline, for instance), and
      # several instances of one component should be supported too. The
      # name "components" may then no longer fit, but the code is not
      # ready for that yet, so it stays for now.
      self.components = {}
      # Components that died and are waiting to be restarted. Each
      # component manages its own restart schedule.
      self.components_to_restart = []
      self.runnable = False
      self.uid = setuid
      self.gid = setgid
      self.username = username
      self.verbose = verbose
      self.nokill = nokill
      self.data_path = data_path
      self.config_filename = config_filename
      self.clear_config = clear_config
      self.cmdctl_port = cmdctl_port
      self.wait_time = wait_time
      specials = isc.bind10.special_component.get_specials()
      self._component_configurator = \
          isc.bind10.component.Configurator(self, specials)
      # The priorities make the core components start in the right order:
      # first the socket creator, then the message queue and after that
      # the config manager.
      self.__core_components = {
          core_name: {
              'kind': 'core',
              'special': core_name,
              'priority': priority
          }
          for core_name, priority in (('sockcreator', 200),
                                      ('msgq', 199),
                                      ('cfgmgr', 198))
      }
      self.__started = False
      self.exitcode = 0

      # If -v was set, enable full debug logging.
      if self.verbose:
          logger.set_severity("DEBUG", 99)
      # This is set in init_socket_srv
      self._socket_path = None
      self._socket_cache = None
      self._tmpdir = None
      self._srv_socket = None
      self._unix_sockets = {}
  221. def __propagate_component_config(self, config):
  222. comps = dict(config)
  223. # Fill in the core components, so they stay alive
  224. for comp in self.__core_components:
  225. if comp in comps:
  226. raise Exception(comp + " is core component managed by " +
  227. "bind10 boss, do not set it")
  228. comps[comp] = self.__core_components[comp]
  229. # Update the configuration
  230. self._component_configurator.reconfigure(comps)
  231. def config_handler(self, new_config):
  232. # If this is initial update, don't do anything now, leave it to startup
  233. if not self.runnable:
  234. return
  235. logger.debug(DBG_COMMANDS, BIND10_RECEIVED_NEW_CONFIGURATION,
  236. new_config)
  237. try:
  238. if 'components' in new_config:
  239. self.__propagate_component_config(new_config['components'])
  240. return isc.config.ccsession.create_answer(0)
  241. except Exception as e:
  242. return isc.config.ccsession.create_answer(1, str(e))
  243. def get_processes(self):
  244. pids = list(self.components.keys())
  245. pids.sort()
  246. process_list = [ ]
  247. for pid in pids:
  248. process_list.append([pid, self.components[pid].name()])
  249. return process_list
  def _get_stats_data(self):
      """
      Return the statistics data of the boss: a dictionary with the
      owner ("Boss") and, under "data", the boot time formatted from
      _BASETIME.
      """
      boot_time = time.strftime('%Y-%m-%dT%H:%M:%SZ', _BASETIME)
      return {"owner": "Boss", "data": {'boot_time': boot_time}}
  def command_handler(self, command, args):
      """
      Handle a command sent to the boss and return the answer.

      Recognised commands are "shutdown", "getstats", "sendstats",
      "ping", "show_processes", "get_socket" and "drop_socket". A command
      that is not a string is answered with "bad command", any other
      unknown one with "Unknown command".

      command is the command name; args holds its parameters and may be
      None when the command was sent without any.
      """
      logger.debug(DBG_COMMANDS, BIND10_RECEIVED_COMMAND, command)
      if not isinstance(command, str):
          return isc.config.ccsession.create_answer(1, "bad command")

      if command == "shutdown":
          self.runnable = False
          answer = isc.config.ccsession.create_answer(0)
      elif command == "getstats":
          answer = isc.config.ccsession.create_answer(
              0, self._get_stats_data())
      elif command == "sendstats":
          # send statistics data to the stats daemon immediately
          stats_data = self._get_stats_data()
          valid = self.ccs.get_module_spec().validate_statistics(
              True, stats_data["data"])
          if valid:
              cmd = isc.config.ccsession.create_command('set', stats_data)
              seq = self.cc_session.group_sendmsg(cmd, 'Stats')
              # Consume the answer, in case it becomes an orphan message.
              try:
                  self.cc_session.group_recvmsg(False, seq)
              except isc.cc.session.SessionTimeout:
                  pass
              answer = isc.config.ccsession.create_answer(0)
          else:
              logger.fatal(BIND10_INVALID_STATISTICS_DATA)
              answer = isc.config.ccsession.create_answer(
                  1, "specified statistics data is invalid")
      elif command == "ping":
          answer = isc.config.ccsession.create_answer(0, "pong")
      elif command == "show_processes":
          answer = isc.config.ccsession.create_answer(
              0, self.get_processes())
      elif command == "get_socket":
          answer = self._get_socket(args)
      elif command == "drop_socket":
          # args is None when the command came without parameters; treat
          # that the same as a missing token instead of failing on the
          # membership test.
          if args is None or "token" not in args:
              answer = isc.config.ccsession.create_answer(
                  1, "Missing token parameter")
          else:
              try:
                  self._socket_cache.drop_socket(args["token"])
                  answer = isc.config.ccsession.create_answer(0)
              except Exception as e:
                  answer = isc.config.ccsession.create_answer(1, str(e))
      else:
          answer = isc.config.ccsession.create_answer(1,
                                                      "Unknown command")
      return answer
  def kill_started_components(self):
      """
      Kill every component registered so far and forget about them.

      Called as part of the exception handling when a process fails to
      start.
      """
      logger.info(BIND10_KILLING_ALL_PROCESSES)
      for component in self.components.values():
          logger.info(BIND10_KILL_PROCESS, component.name())
          component.kill(True)
      self.components = {}
  def _read_bind10_config(self):
      """
      Read the parameters of the BoB module itself, that is the list of
      components to start now, and propagate it to the configurator.

      This could easily be folded into the start-up code, but it stays a
      method of its own for historical reasons and because the tests
      replace it sometimes.
      """
      logger.info(BIND10_READING_BOSS_CONFIGURATION)
      full_config = self.ccs.get_full_config()
      wanted_components = full_config['components']
      self.__propagate_component_config(wanted_components)
  def log_starting(self, process, port = None, address = None):
      """
      Log a "Starting xxx" message at INFO level in one consistent form
      for all processes.

      The process name is also stored in self.curproc. That name shows
      which process failed to start if there is an error, and it is used
      in the "Started" message on success. The optional port and address
      are added to the message when given.
      """
      self.curproc = process
      if address is not None:
          logger.info(BIND10_STARTING_PROCESS_PORT_ADDRESS,
                      self.curproc, address, port)
      elif port is not None:
          logger.info(BIND10_STARTING_PROCESS_PORT, self.curproc,
                      port)
      else:
          logger.info(BIND10_STARTING_PROCESS, self.curproc)
  def log_started(self, pid = None):
      """
      Log a 'Started xxxx' debug message for self.curproc, with the PID
      added when one is given. As with log_starting(), this keeps the
      format consistent.
      """
      if pid is not None:
          logger.debug(DBG_PROCESS, BIND10_STARTED_PROCESS_PID,
                       self.curproc, pid)
      else:
          logger.debug(DBG_PROCESS, BIND10_STARTED_PROCESS, self.curproc)
  def process_running(self, msg, who):
      """
      Check a start-up message received from a process.

      Some processes send a message to the Boss after they have started
      to indicate that they are running. The message is a dictionary of
      the form {"running": "<process>"}.

      Returns True if msg says that the "who" process is running. Returns
      False in all other cases: msg is None, it names another process
      (logged as an unexpected message), or it cannot be read as such a
      dictionary (logged as an unrecognised message).
      """
      if msg is None:
          return False
      try:
          if msg["running"] == who:
              return True
          logger.error(BIND10_STARTUP_UNEXPECTED_MESSAGE, msg)
      except Exception:
          # Only ordinary errors mean "unrecognised message"; a bare
          # except would also swallow KeyboardInterrupt and SystemExit.
          logger.error(BIND10_STARTUP_UNRECOGNISED_MESSAGE, msg)
      return False
  378. # The next few methods start the individual processes of BIND-10. They
  379. # are called via start_all_processes(). If any fail, an exception is
  380. # raised which is caught by the caller of start_all_processes(); this kills
  381. # processes started up to that point before terminating the program.
  def start_msgq(self):
      """
      Start the message queue and connect to the command channel.

      Returns the ProcessInfo of the started b10-msgq. Raises
      CChannelConnectError if no connection could be made within 5
      seconds.
      """
      self.log_starting("b10-msgq")
      # stdout of msgq is always discarded, stderr unless we are verbose.
      msgq_proc = ProcessInfo("b10-msgq", ["b10-msgq"], self.c_channel_env,
                              dev_null_stdout=True,
                              dev_null_stderr=not self.verbose)
      msgq_proc.spawn()
      self.log_started(msgq_proc.pid)

      # Now connect to the c-channel, retrying until it works or we have
      # been trying for "a while".
      connect_started = time.time()
      while self.cc_session is None:
          elapsed = time.time() - connect_started
          if elapsed > 5:
              raise CChannelConnectError("Unable to connect to c-channel after 5 seconds")
          try:
              self.cc_session = isc.cc.Session(self.msgq_socket_file)
          except isc.cc.session.SessionError:
              # not there yet, wait a short while and try again
              time.sleep(0.1)

      # Subscribe to the message queue. The only messages we expect to
      # receive on this channel are ones relating to process startup.
      self.cc_session.group_subscribe("Boss")
      return msgq_proc
  def start_cfgmgr(self):
      """
      Start the configuration manager process and wait until it reports
      that it is running.

      Returns the ProcessInfo of the started b10-cfgmgr. Raises
      ProcessStartError if it has not reported in before the waiting time
      (self.wait_time) is used up.
      """
      self.log_starting("b10-cfgmgr")
      cfgmgr_args = ["b10-cfgmgr"]
      if self.data_path is not None:
          cfgmgr_args.append("--data-path=" + self.data_path)
      if self.config_filename is not None:
          cfgmgr_args.append("--config-filename=" + self.config_filename)
      if self.clear_config:
          cfgmgr_args.append("--clear-config")
      bind_cfgd = ProcessInfo("b10-cfgmgr", cfgmgr_args,
                              self.c_channel_env)
      bind_cfgd.spawn()
      self.log_started(bind_cfgd.pid)

      # Subsequent initialization cannot proceed without the configuration
      # manager, so wait for it. The time to wait can be set on the
      # command line.
      seconds_left = self.wait_time
      msg, env = self.cc_session.group_recvmsg()
      while seconds_left > 0 and \
            not self.process_running(msg, "ConfigManager"):
          logger.debug(DBG_PROCESS, BIND10_WAIT_CFGMGR)
          time.sleep(1)
          seconds_left -= 1
          msg, env = self.cc_session.group_recvmsg()

      if not self.process_running(msg, "ConfigManager"):
          raise ProcessStartError("Configuration manager process has not started")
      return bind_cfgd
  def start_ccsession(self, c_channel_env):
      """
      Create and start the CC session of the boss (self.ccs).

      c_channel_env is not used; it is accepted so that all start_xxx
      methods take the same arguments.

      The CC session is not a process, which is why the
      log_starting/log_started methods are not used here.
      """
      logger.info(BIND10_STARTING_CC)
      session = isc.config.ModuleCCSession(
          SPECFILE_LOCATION,
          self.config_handler,
          self.command_handler,
          socket_file = self.msgq_socket_file)
      self.ccs = session
      self.ccs.start()
      logger.debug(DBG_PROCESS, BIND10_STARTED_CC)
  449. # A couple of utility methods for starting processes...
  def start_process(self, name, args, c_channel_env, port=None, address=None):
      """
      Start a process from the given command arguments, logging the
      "starting" and "started" messages around it, and return its
      ProcessInfo.

      name is the process name used in the messages, args the command
      arguments and c_channel_env the extra environment for the child.
      The port and address arguments are for log messages only.
      """
      self.log_starting(name, port, address)
      started = ProcessInfo(name, args, c_channel_env)
      started.spawn()
      self.log_started(started.pid)
      return started
  462. def register_process(self, pid, component):
  463. """
  464. Put another process into boss to watch over it. When the process
  465. dies, the component.failed() is called with the exit code.
  466. It is expected the info is a isc.bind10.component.BaseComponent
  467. subclass (or anything having the same interface).
  468. """
  469. self.components[pid] = component
  def start_simple(self, name):
      """
      Start a process that needs nothing but the command
          <process-name> [-v]
      where -v is appended if verbose is enabled. The arguments are built
      from the name and the process is started with start_process(),
      whose result is returned.
      """
      # Set up the command arguments ...
      command_line = [name, '-v'] if self.verbose else [name]
      # ... and start the process
      return self.start_process(name, command_line, self.c_channel_env)
  484. # The next few methods start up the rest of the BIND-10 processes.
  485. # Although many of these methods are little more than a call to
  486. # start_simple, they are retained (a) for testing reasons and (b) as a place
  487. # where modifications can be made if the process start-up sequence changes
  488. # for a given process.
  def start_auth(self):
      """
      Start the Authoritative server and return its ProcessInfo.

      A warning is logged when a uid was given and the boss has already
      finished its start-up. -n is passed if nocache is set, -v if
      verbose is set.
      """
      if self.uid is not None and self.__started:
          logger.warn(BIND10_START_AS_NON_ROOT_AUTH)
      command_line = ['b10-auth']
      if self.nocache:
          command_line.append('-n')
      if self.verbose:
          command_line.append('-v')
      # ... and start
      return self.start_process("b10-auth", command_line,
                                self.c_channel_env)
  def start_resolver(self):
      """
      Start the Resolver and return its ProcessInfo.

      At present, all the arguments and switches are pure speculation.
      As with the auth daemon, they should be read from the configuration
      database. A warning is logged when a uid was given and the boss has
      already finished its start-up.
      """
      if self.uid is not None and self.__started:
          logger.warn(BIND10_START_AS_NON_ROOT_RESOLVER)
      self.curproc = "b10-resolver"
      # XXX: this must be read from the configuration manager in the future
      command_line = ['b10-resolver']
      if self.verbose:
          command_line.append('-v')
      # ... and start
      return self.start_process("b10-resolver", command_line,
                                self.c_channel_env)
  def start_cmdctl(self):
      """
      Start the command control process and return its ProcessInfo.

      --port is passed when a cmdctl port was configured, -v when verbose
      is set. The port is also given to start_process() for the log
      message.
      """
      command_line = ["b10-cmdctl"]
      if self.cmdctl_port is not None:
          command_line.append("--port=" + str(self.cmdctl_port))
      if self.verbose:
          command_line.append("-v")
      return self.start_process("b10-cmdctl", command_line,
                                self.c_channel_env, self.cmdctl_port)
  def start_all_components(self):
      """
      Start up all the components. Exceptions raised while doing so are
      not handled here; that is left to the caller.
      """
      # The real core first (sockcreator, msgq, cfgmgr).
      self._component_configurator.startup(self.__core_components)

      # Then our own connection to the msgq. This is not a process, so the
      # configurator does not handle it.
      self.start_ccsession(self.c_channel_env)

      # Finally the parameters associated with Bob, which can only be
      # extracted after the CC Session is started. Note that the logging
      # configuration may override the "-v" switch set on the command
      # line.
      self._read_bind10_config()

      # TODO: Return the dropping of privileges
  def startup(self):
      """
      Start the BoB instance.

      Returns None if successful, otherwise a string describing the
      problem: either that a msgq is already running, or which process
      could not be started and why.
      """
      c_channel_env = {}
      if self.msgq_socket_file is not None:
          c_channel_env["BIND10_MSGQ_SOCKET_FILE"] = self.msgq_socket_file

      # Try to connect to the c-channel daemon, to see if it is already
      # running. Being able to connect is the error case here.
      logger.debug(DBG_PROCESS, BIND10_CHECK_MSGQ_ALREADY_RUNNING)
      try:
          self.cc_session = isc.cc.Session(self.msgq_socket_file)
          logger.fatal(BIND10_MSGQ_ALREADY_RUNNING)
          return "b10-msgq already running, or socket file not cleaned , cannot start"
      except isc.cc.session.SessionError:
          # this is the case we want, where the msgq is not running
          pass

      # Start all components. If any one fails to start, kill all started
      # components and exit with an error indication.
      try:
          self.c_channel_env = c_channel_env
          self.start_all_components()
      except Exception as e:
          self.kill_started_components()
          # curproc is still None when the failure happened before any
          # process was announced through log_starting(); do not let the
          # string concatenation fail in that case.
          if self.curproc is not None:
              failed = str(self.curproc)
          else:
              failed = "(unknown process)"
          return "Unable to start " + failed + ": " + str(e)

      # Started successfully
      self.runnable = True
      self.__started = True
      return None
  def stop_process(self, process, recipient, pid):
      """
      Ask the given process to stop, friendly-like, by sending it a
      'shutdown' command over the message queue.

      process is the name it has (in logs, etc), recipient is its address
      on msgq. The pid is included in the command, so that a process can
      tell whether the request is meant for it when several processes
      share a name.
      """
      logger.info(BIND10_STOP_PROCESS, process)
      shutdown_command = isc.config.ccsession.create_command(
          'shutdown', {'pid': pid})
      self.cc_session.group_sendmsg(shutdown_command, recipient, recipient)
  586. def component_shutdown(self, exitcode=0):
  587. """
  588. Stop the Boss instance from a components' request. The exitcode
  589. indicates the desired exit code.
  590. If we did not start yet, it raises an exception, which is meant
  591. to propagate through the component and configurator to the startup
  592. routine and abort the startup immediately. If it is started up already,
  593. we just mark it so we terminate soon.
  594. It does set the exit code in both cases.
  595. """
  596. self.exitcode = exitcode
  597. if not self.__started:
  598. raise Exception("Component failed during startup");
  599. else:
  600. self.runnable = False
  def shutdown(self):
      """
      Stop the BoB instance.

      The components are first asked to stop through the configurator.
      Unless nokill is set, the ones still registered afterwards are sent
      SIGTERM once and then SIGKILL repeatedly until none is left.
      """
      logger.info(BIND10_SHUTDOWN)
      # If ccsession is still there, inform rest of the system this module
      # is stopping. Since everything will be stopped shortly, this is not
      # really necessary, but this is done to reflect that boss is also
      # 'just' a module. There is no session if start-up did not get as
      # far as creating one.
      if self.ccs is not None:
          self.ccs.send_stopping()

      # try using the BIND 10 request to stop
      try:
          self._component_configurator.shutdown()
      except Exception:
          # Best effort only: whatever is still alive is dealt with by
          # the signals below.
          pass
      # XXX: some delay probably useful... how much is uncertain
      # I have changed the delay from 0.5 to 1, but sometime it's
      # still not enough.
      time.sleep(1)
      self.reap_children()

      # Send TERM and KILL signals to modules if we're not prevented
      # from doing so
      if not self.nokill:
          # next try sending a SIGTERM
          for component in list(self.components.values()):
              logger.info(BIND10_SEND_SIGTERM, component.name(),
                          component.pid())
              try:
                  component.kill()
              except OSError:
                  # ignore these (usually ESRCH because the child
                  # finally exited)
                  pass
          # finally, send SIGKILL (unmaskable termination) until everybody
          # dies
          while self.components:
              # XXX: some delay probably useful... how much is uncertain
              time.sleep(0.1)
              self.reap_children()
              for component in list(self.components.values()):
                  logger.info(BIND10_SEND_SIGKILL, component.name(),
                              component.pid())
                  try:
                      component.kill(True)
                  except OSError:
                      # ignore these (usually ESRCH because the child
                      # finally exited)
                      pass
      logger.info(BIND10_SHUTDOWN_COMPLETE)
  def _get_process_exit_status(self):
      """
      Return the result of a non-blocking os.waitpid() on any child
      process.
      """
      any_child = -1
      return os.waitpid(any_child, os.WNOHANG)
  def reap_children(self):
      """
      Check to see if any of our child processes have exited, and note
      this for later handling.

      An exited component is removed from self.components. If the boss is
      runnable and the component considers itself running, its failed()
      method is called; when that returns False the component is queued
      in self.components_to_restart.
      """
      while True:
          try:
              pid, exit_status = self._get_process_exit_status()
          except OSError as err:
              if err.errno == errno.ECHILD:
                  # no children at all
                  break
              # XXX: should be impossible to get any other error here
              raise
          if pid == 0:
              # children exist, but none of them has exited
              break

          if pid not in self.components:
              logger.info(BIND10_UNKNOWN_CHILD_PROCESS_ENDED, pid)
              continue

          # One of the components we know about. Get information on it.
          component = self.components.pop(pid)
          logger.info(BIND10_PROCESS_ENDED, component.name(), pid,
                      exit_status)
          # Tell it it failed. But only if it matters (we are not shutting
          # down and the component considers itself to be running).
          if component.running() and self.runnable:
              component_restarted = component.failed(exit_status)
              # if the process wants to be restarted, but not just yet,
              # it returns False
              if not component_restarted:
                  self.components_to_restart.append(component)
  def restart_processes(self):
      """
      Try to restart every component queued in components_to_restart.

      * If the server is shutting down (runnable is false), returns 0
        without touching the queue.
      * If no component is left waiting afterwards, returns None.
      * Otherwise returns the smallest get_restart_time() among the
        components whose restart() did not succeed; those components stay
        queued for the next call.

      run() turns a non-None result into the timeout it hands to select().
      """
      if not self.runnable:
          return 0
      now = time.time()
      # Earliest moment at which this queue needs another look, if any.
      earliest = None
      pending = []
      for component in self.components_to_restart:
          if component.restart(now):
              continue
          pending.append(component)
          restart_at = component.get_restart_time()
          if earliest is None or restart_at < earliest:
              earliest = restart_at
      self.components_to_restart = pending
      return earliest
  def _get_socket(self, args):
      """
      Implementation of the get_socket CC command.

      args must provide 'address', 'port', 'protocol' ('UDP' or 'TCP'),
      'share_mode' ('ANY', 'SAMEAPP' or 'NO') and 'share_name'.  The
      socket cache is asked for a token, and the answer (code 0) carries
      that token together with the path of the unix-domain socket.

      Failures are reported as answers instead of being raised: a missing
      parameter and any unexpected error give code 1, cache SocketError
      gives CREATOR_SOCKET_ERROR and ShareError gives CREATOR_SHARE_ERROR.
      """
      answer = isc.config.ccsession.create_answer
      try:
          try:
              addr = isc.net.parse.addr_parse(args['address'])
              port = isc.net.parse.port_parse(args['port'])
              protocol = args['protocol']
              if protocol not in ('UDP', 'TCP'):
                  raise ValueError("Protocol must be either UDP or TCP")
              share_mode = args['share_mode']
              if share_mode not in ('ANY', 'SAMEAPP', 'NO'):
                  raise ValueError("Share mode must be one of ANY, SAMEAPP"
                                   " or NO")
              share_name = args['share_name']
          except KeyError as missing:
              # Only lookups of the request parameters are reported this
              # way; a KeyError raised later falls through to the generic
              # handler below.
              return answer(1, "Missing parameter " + str(missing))

          # FIXME: This call contains blocking IPC. It is expected to be
          # short, but if it turns out to be problem, we'll need to do
          # something about it.
          token = self._socket_cache.get_token(protocol, addr, port,
                                               share_mode, share_name)
          return answer(0, {'token': token, 'path': self._socket_path})
      except isc.bind10.socket_cache.SocketError as e:
          return answer(CREATOR_SOCKET_ERROR, str(e))
      except isc.bind10.socket_cache.ShareError as e:
          return answer(CREATOR_SHARE_ERROR, str(e))
      except Exception as e:
          return answer(1, str(e))
  def socket_request_handler(self, token, unix_socket):
      """
      Handle a token that arrived over a unix-domain socket.

      token is the raw request (bytes); it is decoded as ASCII and looked
      up in _socket_cache together with the requester's file descriptor.
      On success CREATOR_SOCKET_OK is sent, followed by the socket itself.
      On any failure the problem is logged and CREATOR_SOCKET_UNAVAILABLE
      is sent instead.

      If even that failure reply cannot be delivered (for example because
      the requester already closed the connection), the error is dropped
      on purpose: letting it escape would abort the main loop.  The dead
      connection is removed when _socket_data() next reads from it.
      """
      try:
          token = str(token, 'ASCII') # Convert from bytes to str
          fd = self._socket_cache.get_socket(token, unix_socket.fileno())
          # FIXME: These two calls are blocking in their nature. An OS-level
          # buffer is likely to be large enough to hold all these data, but
          # if it wasn't and the remote application got stuck, we would have
          # a problem. If there appear such problems, we should do something
          # about it.
          unix_socket.sendall(CREATOR_SOCKET_OK)
          libutil_io_python.send_fd(unix_socket.fileno(), fd)
      except Exception as e:
          logger.info(BIND10_NO_SOCKET, token, e)
          try:
              unix_socket.sendall(CREATOR_SOCKET_UNAVAILABLE)
          except socket.error:
              # The peer is gone; nobody is left to notify.
              pass
  def socket_consumer_dead(self, unix_socket):
      """
      Handle the closing of a consumer's unix-domain connection.

      Every socket that was sent over unix_socket is to be considered
      closed; the loss is logged and _socket_cache is told to drop the
      application identified by the connection's file descriptor.
      """
      consumer_fd = unix_socket.fileno()
      logger.info(BIND10_LOST_SOCKET_CONSUMER, consumer_fd)
      try:
          self._socket_cache.drop_application(consumer_fd)
      except ValueError:
          # The application holds no sockets. This is harmless and can
          # happen in real life - for example, it requests a socket, but
          # get_socket doesn't find it, so the application dies. It should
          # be rare, though.
          pass
  def set_creator(self, creator):
      """
      Register a socket creator with the boss.

      The creator is never used directly, only through a cache, and that
      cache (isc.bind10.socket_cache.Cache) is created here.

      Raises ValueError when a creator has already been registered.
      """
      already_registered = self._socket_cache is not None
      if already_registered:
          raise ValueError("A creator was inserted previously")
      self._socket_cache = isc.bind10.socket_cache.Cache(creator)
  def init_socket_srv(self):
      """
      Create the unix-domain socket used to hand sockets out, and start
      listening on it.

      The socket is bound to a file named "sockcreator" inside a freshly
      created temporary directory; the socket object, the directory and
      the full path are remembered in _srv_socket, _tmpdir and
      _socket_path.

      This method should be called after switching user, or the switched
      applications won't be able to access the socket.
      """
      self._srv_socket = socket.socket(socket.AF_UNIX)
      # A temporary directory gives us a safe and unique place without
      # having to find one ourselves or bother users. It also secures the
      # socket on some platforms, because the directory is private.
      self._tmpdir = tempfile.mkdtemp(prefix='sockcreator-')
      self._socket_path = os.path.join(self._tmpdir, "sockcreator")
      # Bind the socket to that name and accept connections on it.
      listener = self._srv_socket
      listener.bind(self._socket_path)
      listener.listen(5)
  def remove_socket_srv(self):
      """
      Close the listening socket, then delete its file and the directory
      holding it, since we created both.

      Does nothing when _srv_socket is None (e.g. the socket was never
      initialized).
      """
      if self._srv_socket is None:
          return
      self._srv_socket.close()
      os.remove(self._socket_path)
      os.rmdir(self._tmpdir)
  814. def _srv_accept(self):
  815. """
  816. Accept a socket from the unix domain socket server and put it to the
  817. others we care about.
  818. """
  819. (socket, conn) = self._srv_socket.accept()
  820. self._unix_sockets[socket.fileno()] = (socket, b'')
  821. def _socket_data(self, socket_fileno):
  822. """
  823. This is called when a socket identified by the socket_fileno needs
  824. attention. We try to read data from there. If it is closed, we remove
  825. it.
  826. """
  827. (sock, previous) = self._unix_sockets[socket_fileno]
  828. while True:
  829. try:
  830. data = sock.recv(1, socket.MSG_DONTWAIT)
  831. except socket.error as se:
  832. # These two might be different on some systems
  833. if se.errno == errno.EAGAIN or se.errno == errno.EWOULDBLOCK:
  834. # No more data now. Oh, well, just store what we have.
  835. self._unix_sockets[socket_fileno] = (sock, previous)
  836. return
  837. else:
  838. data = b'' # Pretend it got closed
  839. if len(data) == 0: # The socket got to it's end
  840. del self._unix_sockets[socket_fileno]
  841. self.socket_consumer_dead(sock)
  842. sock.close()
  843. return
  844. else:
  845. if data == b"\n":
  846. # Handle this token and clear it
  847. self.socket_request_handler(previous, sock)
  848. previous = b''
  849. else:
  850. previous += data
  def run(self, wakeup_fd):
      """
      The main loop, waiting for sockets, commands and dead processes.

      Runs for as long as self.runnable is true.  Each pass reaps exited
      children, retries pending restarts, then waits in select() on the
      wakeup descriptor, the command channel socket, the socket-creator
      server socket and all accepted consumer sockets.

      wakeup_fd is the read end of the pipe the CHLD signal handler
      writes to.
      """
      ccs_fd = self.ccs.get_socket().fileno()
      while self.runnable:
          # clean up any processes that exited
          self.reap_children()
          next_restart = self.restart_processes()
          wait_time = (None if next_restart is None
                       else max(next_restart - time.time(), 0))

          # select() can raise EINTR when a signal arrives, even if
          # they are resumable, so we have to catch the exception
          try:
              watched = [wakeup_fd, ccs_fd, self._srv_socket.fileno()]
              watched += list(self._unix_sockets.keys())
              (readable, _writable, exceptional) = \
                  select.select(watched, [], [], wait_time)
          except select.error as err:
              if err.args[0] != errno.EINTR:
                  logger.fatal(BIND10_SELECT_ERROR, err)
                  break
              # Interrupted by a signal: treat it as "nothing ready".
              (readable, exceptional) = ([], [])

          for fd in readable + exceptional:
              if fd == ccs_fd:
                  try:
                      self.ccs.check_command()
                  except isc.cc.session.ProtocolError:
                      logger.fatal(BIND10_MSGQ_DISAPPEARED)
                      self.runnable = False
                      break
              elif fd == wakeup_fd:
                  # Drain the wakeup pipe; its content carries no meaning
                  # for us here.
                  os.read(wakeup_fd, 32)
              elif fd == self._srv_socket.fileno():
                  self._srv_accept()
              elif fd in self._unix_sockets:
                  self._socket_data(fd)
  # Module-level state, needed by the signal handlers.  main() fills both
  # in: the parsed command line options ...
  options = None
  # ... and the running boss instance, whose 'runnable' flag
  # fatal_signal() clears.
  boss_of_bind = None
  def reaper(signal_number, stack_frame):
      """A child process has died (SIGCHLD received).

      Deliberately does nothing: the Python signal machinery has been set
      up to write down a pipe, and that is what wakes up our select().
      """
      return None
  def get_signame(signal_number):
      """Return the symbolic name for a signal.

      The name is the first attribute of the signal module (in dir()
      order) that starts with "SIG" followed by an alphanumeric character
      and whose value equals signal_number.  When there is none, the
      string "Unknown signal <number>" is returned.
      """
      candidates = (name for name in dir(signal)
                    if name.startswith("SIG") and name[3].isalnum()
                    and getattr(signal, name) == signal_number)
      found = next(candidates, None)
      if found is None:
          return "Unknown signal %d" % signal_number
      return found
  912. # XXX: perhaps register atexit() function and invoke that instead
  def fatal_signal(signal_number, stack_frame):
      """We need to exit (SIGINT or SIGTERM received).

      Logs the signal by name, puts SIGCHLD back to its default
      disposition and clears the boss's runnable flag, which is what the
      main loop checks before every pass.
      """
      # Neither global is rebound here, so no 'global' statement is needed.
      signame = get_signame(signal_number)
      logger.info(BIND10_RECEIVED_SIGNAL, signame)
      signal.signal(signal.SIGCHLD, signal.SIG_DFL)
      boss_of_bind.runnable = False
  def process_rename(option, opt_str, value, parser):
      """Option callback that renames the process when an option asks for it.

      Only value, the requested process name, is used; the other
      arguments are accepted and ignored.
      """
      requested_name = value
      isc.util.process.rename(requested_name)
  def parse_args(args=sys.argv[1:], Parser=OptionParser):
      """
      Parse the command line arguments and return the resulting options
      object.

      args is the argument list to parse and Parser the option parser
      class to instantiate.  An invalid --cmdctl-port value is reported
      through parser.error(); any leftover positional argument prints the
      help text and exits with status 1.
      """
      parser = Parser(version=VERSION)

      # (option strings, keyword arguments) in registration order.
      option_table = (
          (("-m", "--msgq-socket-file"),
           dict(dest="msgq_socket_file", type="string", default=None,
                help="UNIX domain socket file the b10-msgq daemon will use")),
          (("-n", "--no-cache"),
           dict(action="store_true", dest="nocache", default=False,
                help="disable hot-spot cache in authoritative DNS server")),
          (("-i", "--no-kill"),
           dict(action="store_true", dest="nokill", default=False,
                help="do not send SIGTERM and SIGKILL signals to modules during shutdown")),
          (("-u", "--user"),
           dict(dest="user", type="string", default=None,
                help="Change user after startup (must run as root)")),
          (("-v", "--verbose"),
           dict(dest="verbose", action="store_true",
                help="display more about what is going on")),
          (("--pretty-name",),
           dict(type="string", action="callback", callback=process_rename,
                help="Set the process name (displayed in ps, top, ...)")),
          (("-c", "--config-file"),
           dict(action="store", dest="config_file", default=None,
                help="Configuration database filename")),
          (("--clear-config",),
           dict(action="store_true", dest="clear_config", default=False,
                help="Create backup of the configuration file and "
                     "start with a clean configuration")),
          (("-p", "--data-path"),
           dict(dest="data_path",
                help="Directory to search for configuration files",
                default=None)),
          (("--cmdctl-port",),
           dict(dest="cmdctl_port", type="int", default=None,
                help="Port of command control")),
          (("--pid-file",),
           dict(dest="pid_file", type="string", default=None,
                help="file to dump the PID of the BIND 10 process")),
          (("-w", "--wait"),
           dict(dest="wait_time", type="int", default=10,
                help="Time (in seconds) to wait for config manager to start up")),
      )
      for option_strings, settings in option_table:
          parser.add_option(*option_strings, **settings)

      (options, leftover) = parser.parse_args(args)

      if options.cmdctl_port is not None:
          try:
              isc.net.parse.port_parse(options.cmdctl_port)
          except ValueError as e:
              parser.error(e)

      # Positional arguments are not accepted.
      if leftover:
          parser.print_help()
          sys.exit(1)

      return options
  def dump_pid(pid_file):
      """
      Dump the PID of the current process to the specified file.

      If pid_file is None this function does nothing.  If the file already
      exists, its existing content is replaced.  The PID is written as a
      decimal number followed by a newline.

      If a system error happens in creating or writing to the file, the
      corresponding exception is propagated to the caller.
      """
      if pid_file is None:
          return
      # The with-statement closes the file even when write() raises, so a
      # failed write no longer leaks the open file object.
      with open(pid_file, "w") as f:
          f.write('%d\n' % os.getpid())
  def unlink_pid_file(pid_file):
      """
      Remove the given file, which is basically expected to be the PID
      file created by dump_pid().

      If pid_file is None, or the file does not exist, this function does
      nothing.  Other system level errors in removing the file are
      propagated as the corresponding exception.
      """
      if pid_file is None:
          return
      try:
          os.unlink(pid_file)
      except OSError as error:
          # errno values are plain integers, so they are compared by value;
          # an identity test ('is not') is not guaranteed to work for ints.
          if error.errno != errno.ENOENT:
              raise
  def remove_lock_files():
      """
      Remove various lock files which were created by code such as in the
      logger. This function should be called after BIND 10 shutdown.

      The lock directory defaults to bind10_config.DATA_PATH and can be
      overridden through the environment; when several of the variables
      below are set, the one listed last wins.
      """
      lock_dir = bind10_config.DATA_PATH
      overrides = ("B10_FROM_BUILD",
                   "B10_FROM_SOURCE_LOCALSTATEDIR",
                   "B10_LOCKFILE_DIR_FROM_BUILD")
      for env_name in overrides:
          if env_name in os.environ:
              lock_dir = os.environ[env_name]

      for lock_name in ["logger_lockfile"]:
          lock_path = lock_dir + '/' + lock_name
          if os.path.isfile(lock_path):
              os.unlink(lock_path)
  def main():
      """Entry point of the boss process.

      Parses the command line, resolves the optional --user argument,
      installs the signal handlers and the wakeup pipe, starts the boss,
      runs its main loop and shuts it down.  The PID file, the lock files
      and the socket-creator socket are cleaned up in all cases, and the
      process finally exits with the boss's exit code.
      """
      global options
      global boss_of_bind
      # Enforce line buffering on stdout, even when not a TTY
      sys.stdout = io.TextIOWrapper(sys.stdout.detach(), line_buffering=True)

      options = parse_args()

      # Check user ID.
      setuid = setgid = username = None
      if options.user:
          # Try getting information about the user, assuming UID passed.
          try:
              pw_ent = pwd.getpwuid(int(options.user))
          except (ValueError, KeyError):
              pass
          else:
              setuid, setgid = pw_ent.pw_uid, pw_ent.pw_gid
              username = pw_ent.pw_name

          # Next try getting information about the user, assuming user name
          # passed.
          # If the information is both a valid user name and user number, we
          # prefer the name because we try it second. A minor point,
          # hopefully.
          try:
              pw_ent = pwd.getpwnam(options.user)
          except KeyError:
              pass
          else:
              setuid, setgid = pw_ent.pw_uid, pw_ent.pw_gid
              username = pw_ent.pw_name

          if setuid is None:
              logger.fatal(BIND10_INVALID_USER, options.user)
              sys.exit(1)

      # Announce startup.
      logger.info(BIND10_STARTING, VERSION)

      # Create wakeup pipe for signal handlers
      (wakeup_read, wakeup_write) = os.pipe()
      signal.set_wakeup_fd(wakeup_write)

      # Set signal handlers for catching child termination, as well
      # as our own demise.
      signal.signal(signal.SIGCHLD, reaper)
      signal.siginterrupt(signal.SIGCHLD, False)
      for fatal in (signal.SIGINT, signal.SIGTERM):
          signal.signal(fatal, fatal_signal)

      # Block SIGPIPE, as we don't want it to end this process
      signal.signal(signal.SIGPIPE, signal.SIG_IGN)

      try:
          # Go bob!
          boss_of_bind = BoB(options.msgq_socket_file, options.data_path,
                             options.config_file, options.clear_config,
                             options.nocache, options.verbose, options.nokill,
                             setuid, setgid, username, options.cmdctl_port,
                             options.wait_time)
          startup_result = boss_of_bind.startup()
          if startup_result:
              logger.fatal(BIND10_STARTUP_ERROR, startup_result)
              sys.exit(1)
          boss_of_bind.init_socket_srv()
          logger.info(BIND10_STARTUP_COMPLETE)
          dump_pid(options.pid_file)

          # Let it run
          boss_of_bind.run(wakeup_read)

          # shutdown
          signal.signal(signal.SIGCHLD, signal.SIG_DFL)
          boss_of_bind.shutdown()
      finally:
          # Clean up the filesystem
          unlink_pid_file(options.pid_file)
          remove_lock_files()
          if boss_of_bind is not None:
              boss_of_bind.remove_socket_srv()
      sys.exit(boss_of_bind.exitcode)
  # Start the boss only when this file is executed as a script, not when
  # it is imported as a module.
  if __name__ == '__main__':
      main()