bind10.py.in 38 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977
  1. #!@PYTHON@
  2. # Copyright (C) 2010 Internet Systems Consortium.
  3. #
  4. # Permission to use, copy, modify, and distribute this software for any
  5. # purpose with or without fee is hereby granted, provided that the above
  6. # copyright notice and this permission notice appear in all copies.
  7. #
  8. # THE SOFTWARE IS PROVIDED "AS IS" AND INTERNET SYSTEMS CONSORTIUM
  9. # DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL
  10. # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL
  11. # INTERNET SYSTEMS CONSORTIUM BE LIABLE FOR ANY SPECIAL, DIRECT,
  12. # INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING
  13. # FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
  14. # NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
  15. # WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  16. """
  17. This file implements the Boss of Bind (BoB, or bob) program.
  18. Its purpose is to start up the BIND 10 system, and then manage the
  19. processes, by starting and stopping processes, plus restarting
  20. processes that exit.
  21. To start the system, it first runs the c-channel program (msgq), then
  22. connects to that. It then runs the configuration manager, and reads
  23. its own configuration. Then it proceeds to starting other modules.
  24. The Python subprocess module is used for starting processes, but
  25. because this is not efficient for managing groups of processes,
  26. SIGCHLD signals are caught and processed using the signal module.
  27. Most of the logic is contained in the BoB class. However, since Python
  28. requires that signal processing happen in the main thread, we do
  29. signal handling outside of that class, in the code running for
  30. __main__.
  31. """
  32. import sys; sys.path.append ('@@PYTHONPATH@@')
  33. import os
  34. # If B10_FROM_SOURCE is set in the environment, we use data files
  35. # from a directory relative to that, otherwise we use the ones
  36. # installed on the system
  37. if "B10_FROM_SOURCE" in os.environ:
  38. SPECFILE_LOCATION = os.environ["B10_FROM_SOURCE"] + "/src/bin/bind10/bob.spec"
  39. else:
  40. PREFIX = "@prefix@"
  41. DATAROOTDIR = "@datarootdir@"
  42. SPECFILE_LOCATION = "@datadir@/@PACKAGE@/bob.spec".replace("${datarootdir}", DATAROOTDIR).replace("${prefix}", PREFIX)
  43. import subprocess
  44. import signal
  45. import re
  46. import errno
  47. import time
  48. import select
  49. import random
  50. import socket
  51. from optparse import OptionParser, OptionValueError
  52. import io
  53. import pwd
  54. import posix
  55. import isc.cc
  56. import isc.util.process
  57. import isc.net.parse
  58. # Assign this process some longer name
  59. isc.util.process.rename(sys.argv[0])
  60. # This is the version that gets displayed to the user.
  61. # The VERSION string consists of the module name, the module version
  62. # number, and the overall BIND 10 version number (set in configure.ac).
  63. VERSION = "bind10 20101129 (BIND 10 @PACKAGE_VERSION@)"
  64. # This is for bind10.boottime of stats module
  65. _BASETIME = time.gmtime()
  66. class RestartSchedule:
  67. """
  68. Keeps state when restarting something (in this case, a process).
  69. When a process dies unexpectedly, we need to restart it. However, if
  70. it fails to restart for some reason, then we should not simply keep
  71. restarting it at high speed.
  72. A more sophisticated algorithm can be developed, but for now we choose
  73. a simple set of rules:
  74. * If a process was been running for >=10 seconds, we restart it
  75. right away.
  76. * If a process was running for <10 seconds, we wait until 10 seconds
  77. after it was started.
  78. To avoid programs getting into lockstep, we use a normal distribution
  79. to avoid being restarted at exactly 10 seconds."""
  80. def __init__(self, restart_frequency=10.0):
  81. self.restart_frequency = restart_frequency
  82. self.run_start_time = None
  83. self.run_stop_time = None
  84. self.restart_time = None
  85. def set_run_start_time(self, when=None):
  86. if when is None:
  87. when = time.time()
  88. self.run_start_time = when
  89. sigma = self.restart_frequency * 0.05
  90. self.restart_time = when + random.normalvariate(self.restart_frequency,
  91. sigma)
  92. def set_run_stop_time(self, when=None):
  93. """We don't actually do anything with stop time now, but it
  94. might be useful for future algorithms."""
  95. if when is None:
  96. when = time.time()
  97. self.run_stop_time = when
  98. def get_restart_time(self, when=None):
  99. if when is None:
  100. when = time.time()
  101. return max(when, self.restart_time)
  102. class ProcessInfoError(Exception): pass
  103. class ProcessInfo:
  104. """Information about a process"""
  105. dev_null = open(os.devnull, "w")
  106. def __init__(self, name, args, env={}, dev_null_stdout=False,
  107. dev_null_stderr=False, uid=None, username=None):
  108. self.name = name
  109. self.args = args
  110. self.env = env
  111. self.dev_null_stdout = dev_null_stdout
  112. self.dev_null_stderr = dev_null_stderr
  113. self.restart_schedule = RestartSchedule()
  114. self.uid = uid
  115. self.username = username
  116. self._spawn()
  117. def _preexec_work(self):
  118. """Function used before running a program that needs to run as a
  119. different user."""
  120. # First, put us into a separate process group so we don't get
  121. # SIGINT signals on Ctrl-C (the boss will shut everthing down by
  122. # other means).
  123. os.setpgrp()
  124. # Second, set the user ID if one has been specified
  125. if self.uid is not None:
  126. try:
  127. posix.setuid(self.uid)
  128. except OSError as e:
  129. if e.errno == errno.EPERM:
  130. # if we failed to change user due to permission report that
  131. raise ProcessInfoError("Unable to change to user %s (uid %d)" % (self.username, self.uid))
  132. else:
  133. # otherwise simply re-raise whatever error we found
  134. raise
  135. def _spawn(self):
  136. if self.dev_null_stdout:
  137. spawn_stdout = self.dev_null
  138. else:
  139. spawn_stdout = None
  140. if self.dev_null_stderr:
  141. spawn_stderr = self.dev_null
  142. else:
  143. spawn_stderr = None
  144. # Environment variables for the child process will be a copy of those
  145. # of the boss process with any additional specific variables given
  146. # on construction (self.env).
  147. spawn_env = os.environ
  148. spawn_env.update(self.env)
  149. if 'B10_FROM_SOURCE' not in os.environ:
  150. spawn_env['PATH'] = "@@LIBEXECDIR@@:" + spawn_env['PATH']
  151. self.process = subprocess.Popen(self.args,
  152. stdin=subprocess.PIPE,
  153. stdout=spawn_stdout,
  154. stderr=spawn_stderr,
  155. close_fds=True,
  156. env=spawn_env,
  157. preexec_fn=self._preexec_work)
  158. self.pid = self.process.pid
  159. self.restart_schedule.set_run_start_time()
  160. def respawn(self):
  161. self._spawn()
  162. class CChannelConnectError(Exception): pass
  163. class BoB:
  164. """Boss of BIND class."""
  165. def __init__(self, msgq_socket_file=None, dns_port=5300, address=None,
  166. forward=None, nocache=False, verbose=False, setuid=None,
  167. username=None):
  168. """
  169. Initialize the Boss of BIND. This is a singleton (only one can run).
  170. The msgq_socket_file specifies the UNIX domain socket file that the
  171. msgq process listens on. If verbose is True, then the boss reports
  172. what it is doing.
  173. """
  174. self.address = address
  175. self.dns_port = dns_port
  176. self.forward = forward
  177. if forward:
  178. self.resolver = True
  179. else:
  180. self.resolver = False
  181. self.cc_session = None
  182. self.ccs = None
  183. self.cfg_start_auth = True
  184. self.cfg_start_resolver = False
  185. self.started_auth_family = False
  186. self.started_resolver_family = False
  187. self.curproc = None
  188. self.dead_processes = {}
  189. self.msgq_socket_file = msgq_socket_file
  190. self.nocache = nocache
  191. self.processes = {}
  192. self.expected_shutdowns = {}
  193. self.runnable = False
  194. self.uid = setuid
  195. self.username = username
  196. self.verbose = verbose
  197. def config_handler(self, new_config):
  198. # If this is initial update, don't do anything now, leave it to startup
  199. if not self.runnable:
  200. return
  201. # Now we declare few functions used only internally here. Besides the
  202. # benefit of not polluting the name space, they are closures, so we
  203. # don't need to pass some variables
  204. def start_stop(name, started, start, stop):
  205. if not'start_' + name in new_config:
  206. return
  207. if new_config['start_' + name]:
  208. if not started:
  209. if self.uid is not None:
  210. sys.stderr.write("[bind10] Starting " + name + " as " +
  211. "a user, not root. This might fail.\n")
  212. start()
  213. else:
  214. stop()
  215. # These four functions are passed to start_stop (smells like functional
  216. # programming little bit)
  217. def resolver_on():
  218. self.start_resolver(self.c_channel_env)
  219. self.started_resolver_family = True
  220. def resolver_off():
  221. self.stop_resolver()
  222. self.started_resolver_family = False
  223. def auth_on():
  224. self.start_auth(self.c_channel_env)
  225. self.start_xfrout(self.c_channel_env)
  226. self.start_xfrin(self.c_channel_env)
  227. self.start_zonemgr(self.c_channel_env)
  228. self.started_auth_family = True
  229. def auth_off():
  230. self.stop_zonemgr()
  231. self.stop_xfrin()
  232. self.stop_xfrout()
  233. self.stop_auth()
  234. self.started_auth_family = False
  235. # The real code of the config handler function follows here
  236. if self.verbose:
  237. sys.stdout.write("[bind10] Handling new configuration: " +
  238. str(new_config) + "\n")
  239. start_stop('resolver', self.started_resolver_family, resolver_on,
  240. resolver_off)
  241. start_stop('auth', self.started_auth_family, auth_on, auth_off)
  242. answer = isc.config.ccsession.create_answer(0)
  243. return answer
  244. def command_handler(self, command, args):
  245. if self.verbose:
  246. sys.stdout.write("[bind10] Boss got command: " + command + "\n")
  247. answer = isc.config.ccsession.create_answer(1, "command not implemented")
  248. if type(command) != str:
  249. answer = isc.config.ccsession.create_answer(1, "bad command")
  250. else:
  251. if command == "shutdown":
  252. self.runnable = False
  253. answer = isc.config.ccsession.create_answer(0)
  254. else:
  255. answer = isc.config.ccsession.create_answer(1,
  256. "Unknown command")
  257. return answer
  258. def kill_started_processes(self):
  259. """
  260. Called as part of the exception handling when a process fails to
  261. start, this runs through the list of started processes, killing
  262. each one. It then clears that list.
  263. """
  264. if self.verbose:
  265. sys.stdout.write("[bind10] killing started processes:\n")
  266. for pid in self.processes:
  267. if self.verbose:
  268. sys.stdout.write("[bind10] - %s\n" % self.processes[pid].name)
  269. self.processes[pid].process.kill()
  270. self.processes = {}
  271. def read_bind10_config(self):
  272. """
  273. Reads the parameters associated with the BoB module itself.
  274. At present these are the components to start although arguably this
  275. information should be in the configuration for the appropriate
  276. module itself. (However, this would cause difficulty in the case of
  277. xfrin/xfrout and zone manager as we don't need to start those if we
  278. are not running the authoritative server.)
  279. """
  280. if self.verbose:
  281. sys.stdout.write("[bind10] Reading Boss configuration:\n")
  282. config_data = self.ccs.get_full_config()
  283. self.cfg_start_auth = config_data.get("start_auth")
  284. self.cfg_start_resolver = config_data.get("start_resolver")
  285. if self.verbose:
  286. sys.stdout.write("[bind10] - start_auth: %s\n" %
  287. str(self.cfg_start_auth))
  288. sys.stdout.write("[bind10] - start_resolver: %s\n" %
  289. str(self.cfg_start_resolver))
  290. def log_starting(self, process, port = None, address = None):
  291. """
  292. A convenience function to output a "Starting xxx" message if the
  293. verbose option is set. Putting this into a separate method ensures
  294. that the output form is consistent across all processes.
  295. The process name (passed as the first argument) is put into
  296. self.curproc, and is used to indicate which process failed to
  297. start if there is an error (and is used in the "Started" message
  298. on success). The optional port and address information are
  299. appended to the message (if present).
  300. """
  301. self.curproc = process
  302. if self.verbose:
  303. sys.stdout.write("[bind10] Starting %s" % self.curproc)
  304. if port is not None:
  305. sys.stdout.write(" on port %d" % port)
  306. if address is not None:
  307. sys.stdout.write(" (address %s)" % str(address))
  308. sys.stdout.write("\n")
  309. def log_started(self, pid = None):
  310. """
  311. A convenience function to output a 'Started xxxx (PID yyyy)'
  312. message. As with starting_message(), this ensures a consistent
  313. format.
  314. """
  315. if self.verbose:
  316. sys.stdout.write("[bind10] Started %s" % self.curproc)
  317. if pid is not None:
  318. sys.stdout.write(" (PID %d)" % pid)
  319. sys.stdout.write("\n")
  320. # The next few methods start the individual processes of BIND-10. They
  321. # are called via start_all_process(). If any fail, an exception is raised
  322. # which is caught by the caller of start_all_processes(); this kills
  323. # processes started up to that point before terminating the program.
  324. def start_msgq(self, c_channel_env):
  325. """
  326. Start the message queue and connect to the command channel.
  327. """
  328. self.log_starting("b10-msgq")
  329. c_channel = ProcessInfo("b10-msgq", ["b10-msgq"], c_channel_env,
  330. True, not self.verbose, uid=self.uid,
  331. username=self.username)
  332. self.processes[c_channel.pid] = c_channel
  333. self.log_started(c_channel.pid)
  334. # Now connect to the c-channel
  335. cc_connect_start = time.time()
  336. while self.cc_session is None:
  337. # if we have been trying for "a while" give up
  338. if (time.time() - cc_connect_start) > 5:
  339. raise CChannelConnectError("Unable to connect to c-channel after 5 seconds")
  340. # try to connect, and if we can't wait a short while
  341. try:
  342. self.cc_session = isc.cc.Session(self.msgq_socket_file)
  343. except isc.cc.session.SessionError:
  344. time.sleep(0.1)
  345. def start_cfgmgr(self, c_channel_env):
  346. """
  347. Starts the configuration manager process
  348. """
  349. self.log_starting("b10-cfgmgr")
  350. bind_cfgd = ProcessInfo("b10-cfgmgr", ["b10-cfgmgr"],
  351. c_channel_env, uid=self.uid,
  352. username=self.username)
  353. self.processes[bind_cfgd.pid] = bind_cfgd
  354. self.log_started(bind_cfgd.pid)
  355. # sleep until b10-cfgmgr is fully up and running, this is a good place
  356. # to have a (short) timeout on synchronized groupsend/receive
  357. # TODO: replace the sleep by a listen for ConfigManager started
  358. # message
  359. time.sleep(1)
  360. def start_ccsession(self, c_channel_env):
  361. """
  362. Start the CC Session
  363. The argument c_channel_env is unused but is supplied to keep the
  364. argument list the same for all start_xxx methods.
  365. """
  366. self.log_starting("ccsession")
  367. self.ccs = isc.config.ModuleCCSession(SPECFILE_LOCATION,
  368. self.config_handler, self.command_handler)
  369. self.ccs.start()
  370. self.log_started()
  371. # A couple of utility methods for starting processes...
  372. def start_process(self, name, args, c_channel_env, port=None, address=None):
  373. """
  374. Given a set of command arguments, start the process and output
  375. appropriate log messages. If the start is successful, the process
  376. is added to the list of started processes.
  377. The port and address arguments are for log messages only.
  378. """
  379. self.log_starting(name, port, address)
  380. newproc = ProcessInfo(name, args, c_channel_env)
  381. self.processes[newproc.pid] = newproc
  382. self.log_started(newproc.pid)
  383. def start_simple(self, name, c_channel_env, port=None, address=None):
  384. """
  385. Most of the BIND-10 processes are started with the command:
  386. <process-name> [-v]
  387. ... where -v is appended if verbose is enabled. This method
  388. generates the arguments from the name and starts the process.
  389. The port and address arguments are for log messages only.
  390. """
  391. # Set up the command arguments.
  392. args = [name]
  393. if self.verbose:
  394. args += ['-v']
  395. # ... and start the process
  396. self.start_process(name, args, c_channel_env, port, address)
  397. # The next few methods start up the rest of the BIND-10 processes.
  398. # Although many of these methods are little more than a call to
  399. # start_simple, they are retained (a) for testing reasons and (b) as a place
  400. # where modifications can be made if the process start-up sequence changes
  401. # for a given process.
  402. def start_auth(self, c_channel_env):
  403. """
  404. Start the Authoritative server
  405. """
  406. # XXX: this must be read from the configuration manager in the future
  407. if self.resolver:
  408. dns_prog = 'b10-resolver'
  409. else:
  410. dns_prog = 'b10-auth'
  411. dnsargs = [dns_prog]
  412. if not self.resolver:
  413. # The resolver uses configuration manager for these
  414. dnsargs += ['-p', str(self.dns_port)]
  415. if self.address:
  416. dnsargs += ['-a', str(self.address)]
  417. if self.nocache:
  418. dnsargs += ['-n']
  419. if self.uid:
  420. dnsargs += ['-u', str(self.uid)]
  421. if self.verbose:
  422. dnsargs += ['-v']
  423. # ... and start
  424. self.start_process("b10-auth", dnsargs, c_channel_env,
  425. self.dns_port, self.address)
  426. def start_resolver(self, c_channel_env):
  427. """
  428. Start the Resolver. At present, all these arguments and switches
  429. are pure speculation. As with the auth daemon, they should be
  430. read from the configuration database.
  431. """
  432. self.curproc = "b10-resolver"
  433. # XXX: this must be read from the configuration manager in the future
  434. resargs = ['b10-resolver']
  435. if self.uid:
  436. resargs += ['-u', str(self.uid)]
  437. if self.verbose:
  438. resargs += ['-v']
  439. # ... and start
  440. self.start_process("b10-resolver", resargs, c_channel_env)
  441. def start_xfrout(self, c_channel_env):
  442. self.start_simple("b10-xfrout", c_channel_env)
  443. def start_xfrin(self, c_channel_env):
  444. self.start_simple("b10-xfrin", c_channel_env)
  445. def start_zonemgr(self, c_channel_env):
  446. self.start_simple("b10-zonemgr", c_channel_env)
  447. def start_stats(self, c_channel_env):
  448. self.start_simple("b10-stats", c_channel_env)
  449. def start_cmdctl(self, c_channel_env):
  450. # XXX: we hardcode port 8080
  451. self.start_simple("b10-cmdctl", c_channel_env, 8080)
  452. def start_all_processes(self):
  453. """
  454. Starts up all the processes. Any exception generated during the
  455. starting of the processes is handled by the caller.
  456. """
  457. c_channel_env = self.c_channel_env
  458. self.start_msgq(c_channel_env)
  459. self.start_cfgmgr(c_channel_env)
  460. self.start_ccsession(c_channel_env)
  461. # Extract the parameters associated with Bob. This can only be
  462. # done after the CC Session is started.
  463. self.read_bind10_config()
  464. # Continue starting the processes. The authoritative server (if
  465. # selected):
  466. if self.cfg_start_auth:
  467. self.start_auth(c_channel_env)
  468. # ... and resolver (if selected):
  469. if self.cfg_start_resolver:
  470. self.start_resolver(c_channel_env)
  471. self.started_resolver_family = True
  472. # Everything after the main components can run as non-root.
  473. # TODO: this is only temporary - once the privileged socket creator is
  474. # fully working, nothing else will run as root.
  475. if self.uid is not None:
  476. posix.setuid(self.uid)
  477. # xfrin/xfrout and the zone manager are only meaningful if the
  478. # authoritative server has been started.
  479. if self.cfg_start_auth:
  480. self.start_xfrout(c_channel_env)
  481. self.start_xfrin(c_channel_env)
  482. self.start_zonemgr(c_channel_env)
  483. self.started_auth_family = True
  484. # ... and finally start the remaining processes
  485. self.start_stats(c_channel_env)
  486. self.start_cmdctl(c_channel_env)
  487. def startup(self):
  488. """
  489. Start the BoB instance.
  490. Returns None if successful, otherwise an string describing the
  491. problem.
  492. """
  493. # Try to connect to the c-channel daemon, to see if it is already
  494. # running
  495. c_channel_env = {}
  496. if self.msgq_socket_file is not None:
  497. c_channel_env["BIND10_MSGQ_SOCKET_FILE"] = self.msgq_socket_file
  498. if self.verbose:
  499. sys.stdout.write("[bind10] Checking for already running b10-msgq\n")
  500. # try to connect, and if we can't wait a short while
  501. try:
  502. self.cc_session = isc.cc.Session(self.msgq_socket_file)
  503. return "b10-msgq already running, or socket file not cleaned , cannot start"
  504. except isc.cc.session.SessionError:
  505. # this is the case we want, where the msgq is not running
  506. pass
  507. # Start all processes. If any one fails to start, kill all started
  508. # processes and exit with an error indication.
  509. try:
  510. self.c_channel_env = c_channel_env
  511. self.start_all_processes()
  512. except Exception as e:
  513. self.kill_started_processes()
  514. return "Unable to start " + self.curproc + ": " + str(e)
  515. # Started successfully
  516. self.runnable = True
  517. return None
  518. def stop_all_processes(self):
  519. """Stop all processes."""
  520. cmd = { "command": ['shutdown']}
  521. self.cc_session.group_sendmsg(cmd, 'Cmdctl', 'Cmdctl')
  522. self.cc_session.group_sendmsg(cmd, "ConfigManager", "ConfigManager")
  523. self.cc_session.group_sendmsg(cmd, "Auth", "Auth")
  524. self.cc_session.group_sendmsg(cmd, "Resolver", "Resolver")
  525. self.cc_session.group_sendmsg(cmd, "Xfrout", "Xfrout")
  526. self.cc_session.group_sendmsg(cmd, "Xfrin", "Xfrin")
  527. self.cc_session.group_sendmsg(cmd, "Zonemgr", "Zonemgr")
  528. self.cc_session.group_sendmsg(cmd, "Stats", "Stats")
  529. def stop_process(self, process, recipient):
  530. """
  531. Stop the given process, friendly-like. The process is the name it has
  532. (in logs, etc), the recipient is the address on msgq.
  533. """
  534. if self.verbose:
  535. sys.stdout.write("[bind10] Asking %s to terminate\n" % process)
  536. # TODO: Some timeout to solve processes that don't want to die would
  537. # help. We can even store it in the dict, it is used only as a set
  538. self.expected_shutdowns[process] = 1
  539. # Ask the process to die willingly
  540. self.cc_session.group_sendmsg({'command': ['shutdown']}, recipient,
  541. recipient)
  542. # Series of stop_process wrappers
  543. def stop_resolver(self):
  544. self.stop_process('b10-resolver', 'Resolver')
  545. def stop_auth(self):
  546. self.stop_process('b10-auth', 'Auth')
  547. def stop_xfrout(self):
  548. self.stop_process('b10-xfrout', 'Xfrout')
  549. def stop_xfrin(self):
  550. self.stop_process('b10-xfrin', 'Xfrin')
  551. def stop_zonemgr(self):
  552. self.stop_process('b10-zonemgr', 'Zonemgr')
  553. def shutdown(self):
  554. """Stop the BoB instance."""
  555. if self.verbose:
  556. sys.stdout.write("[bind10] Stopping the server.\n")
  557. # first try using the BIND 10 request to stop
  558. try:
  559. self.stop_all_processes()
  560. except:
  561. pass
  562. # XXX: some delay probably useful... how much is uncertain
  563. # I have changed the delay from 0.5 to 1, but sometime it's
  564. # still not enough.
  565. time.sleep(1)
  566. self.reap_children()
  567. # next try sending a SIGTERM
  568. processes_to_stop = list(self.processes.values())
  569. for proc_info in processes_to_stop:
  570. if self.verbose:
  571. sys.stdout.write("[bind10] Sending SIGTERM to %s (PID %d).\n" %
  572. (proc_info.name, proc_info.pid))
  573. try:
  574. proc_info.process.terminate()
  575. except OSError:
  576. # ignore these (usually ESRCH because the child
  577. # finally exited)
  578. pass
  579. # finally, send SIGKILL (unmaskable termination) until everybody dies
  580. while self.processes:
  581. # XXX: some delay probably useful... how much is uncertain
  582. time.sleep(0.1)
  583. self.reap_children()
  584. processes_to_stop = list(self.processes.values())
  585. for proc_info in processes_to_stop:
  586. if self.verbose:
  587. sys.stdout.write("[bind10] Sending SIGKILL to %s (PID %d).\n" %
  588. (proc_info.name, proc_info.pid))
  589. try:
  590. proc_info.process.kill()
  591. except OSError:
  592. # ignore these (usually ESRCH because the child
  593. # finally exited)
  594. pass
  595. if self.verbose:
  596. sys.stdout.write("[bind10] All processes ended, server done.\n")
  597. def reap_children(self):
  598. """Check to see if any of our child processes have exited,
  599. and note this for later handling.
  600. """
  601. while True:
  602. try:
  603. (pid, exit_status) = os.waitpid(-1, os.WNOHANG)
  604. except OSError as o:
  605. if o.errno == errno.ECHILD: break
  606. # XXX: should be impossible to get any other error here
  607. raise
  608. if pid == 0: break
  609. if pid in self.processes:
  610. # One of the processes we know about. Get information on it.
  611. proc_info = self.processes.pop(pid)
  612. proc_info.restart_schedule.set_run_stop_time()
  613. self.dead_processes[proc_info.pid] = proc_info
  614. # Write out message, but only if in the running state:
  615. # During startup and shutdown, these messages are handled
  616. # elsewhere.
  617. if self.runnable:
  618. if exit_status is None:
  619. sys.stdout.write(
  620. "[bind10] Process %s (PID %d) died: exit status not available" %
  621. (proc_info.name, proc_info.pid))
  622. else:
  623. sys.stdout.write(
  624. "[bind10] Process %s (PID %d) terminated, exit status = %d\n" %
  625. (proc_info.name, proc_info.pid, exit_status))
  626. # Was it a special process?
  627. if proc_info.name == "b10-msgq":
  628. sys.stdout.write(
  629. "[bind10] The b10-msgq process died, shutting down.\n")
  630. self.runnable = False
  631. else:
  632. sys.stdout.write("[bind10] Unknown child pid %d exited.\n" % pid)
  633. def restart_processes(self):
  634. """
  635. Restart any dead processes:
  636. * Returns the time when the next process is ready to be restarted.
  637. * If the server is shutting down, returns 0.
  638. * If there are no processes, returns None.
  639. The values returned can be safely passed into select() as the
  640. timeout value.
  641. """
  642. next_restart = None
  643. # if we're shutting down, then don't restart
  644. if not self.runnable:
  645. return 0
  646. # otherwise look through each dead process and try to restart
  647. still_dead = {}
  648. now = time.time()
  649. for proc_info in self.dead_processes.values():
  650. if proc_info.name in self.expected_shutdowns:
  651. # We don't restart, we wanted it to die
  652. del self.expected_shutdowns[proc_info.name]
  653. continue
  654. restart_time = proc_info.restart_schedule.get_restart_time(now)
  655. if restart_time > now:
  656. if (next_restart is None) or (next_restart > restart_time):
  657. next_restart = restart_time
  658. still_dead[proc_info.pid] = proc_info
  659. else:
  660. if self.verbose:
  661. sys.stdout.write("[bind10] Resurrecting dead %s process...\n" %
  662. proc_info.name)
  663. try:
  664. proc_info.respawn()
  665. self.processes[proc_info.pid] = proc_info
  666. sys.stdout.write("[bind10] Resurrected %s (PID %d)\n" %
  667. (proc_info.name, proc_info.pid))
  668. except:
  669. still_dead[proc_info.pid] = proc_info
  670. # remember any processes that refuse to be resurrected
  671. self.dead_processes = still_dead
  672. # return the time when the next process is ready to be restarted
  673. return next_restart
  674. # global variables, needed for signal handlers
  675. options = None
  676. boss_of_bind = None
  677. def reaper(signal_number, stack_frame):
  678. """A child process has died (SIGCHLD received)."""
  679. # don't do anything...
  680. # the Python signal handler has been set up to write
  681. # down a pipe, waking up our select() bit
  682. pass
  683. def get_signame(signal_number):
  684. """Return the symbolic name for a signal."""
  685. for sig in dir(signal):
  686. if sig.startswith("SIG") and sig[3].isalnum():
  687. if getattr(signal, sig) == signal_number:
  688. return sig
  689. return "Unknown signal %d" % signal_number
  690. # XXX: perhaps register atexit() function and invoke that instead
  691. def fatal_signal(signal_number, stack_frame):
  692. """We need to exit (SIGINT or SIGTERM received)."""
  693. global options
  694. global boss_of_bind
  695. if options.verbose:
  696. sys.stdout.write("[bind10] Received %s.\n" % get_signame(signal_number))
  697. signal.signal(signal.SIGCHLD, signal.SIG_DFL)
  698. boss_of_bind.runnable = False
  699. def check_port(option, opt_str, value, parser):
  700. """Function to insure that the port we are passed is actually
  701. a valid port number. Used by OptionParser() on startup."""
  702. try:
  703. if opt_str in ['-p', '--port']:
  704. parser.values.dns_port = isc.net.parse.port_parse(value)
  705. else:
  706. raise OptionValueError("Unknown option " + opt_str)
  707. except ValueError as e:
  708. raise OptionValueError(str(e))
  709. def check_addr(option, opt_str, value, parser):
  710. """Function to insure that the address we are passed is actually
  711. a valid address. Used by OptionParser() on startup."""
  712. try:
  713. if opt_str in ['-a', '--address']:
  714. parser.values.address = isc.net.parse.addr_parse(value)
  715. elif opt_str in ['-f', '--forward']:
  716. parser.values.forward = isc.net.parse.addr_parse(value)
  717. else:
  718. raise OptionValueError("Unknown option " + opt_str)
  719. except ValueError:
  720. raise OptionValueError("%s requires a valid IPv4 or IPv6 address" % opt_str)
  721. def process_rename(option, opt_str, value, parser):
  722. """Function that renames the process if it is requested by a option."""
  723. isc.util.process.rename(value)
  724. def main():
  725. global options
  726. global boss_of_bind
  727. # Enforce line buffering on stdout, even when not a TTY
  728. sys.stdout = io.TextIOWrapper(sys.stdout.detach(), line_buffering=True)
  729. # Parse any command-line options.
  730. parser = OptionParser(version=VERSION)
  731. parser.add_option("-a", "--address", dest="address", type="string",
  732. action="callback", callback=check_addr, default=None,
  733. help="address the DNS server will use (default: listen on all addresses)")
  734. parser.add_option("-f", "--forward", dest="forward", type="string",
  735. action="callback", callback=check_addr, default=None,
  736. help="nameserver to which DNS queries should be forwarded")
  737. parser.add_option("-m", "--msgq-socket-file", dest="msgq_socket_file",
  738. type="string", default=None,
  739. help="UNIX domain socket file the b10-msgq daemon will use")
  740. parser.add_option("-n", "--no-cache", action="store_true", dest="nocache",
  741. default=False, help="disable hot-spot cache in authoritative DNS server")
  742. parser.add_option("-p", "--port", dest="dns_port", type="int",
  743. action="callback", callback=check_port, default=5300,
  744. help="port the DNS server will use (default 5300)")
  745. parser.add_option("-u", "--user", dest="user", type="string", default=None,
  746. help="Change user after startup (must run as root)")
  747. parser.add_option("-v", "--verbose", dest="verbose", action="store_true",
  748. help="display more about what is going on")
  749. parser.add_option("--pretty-name", type="string", action="callback",
  750. callback=process_rename,
  751. help="Set the process name (displayed in ps, top, ...)")
  752. (options, args) = parser.parse_args()
  753. if args:
  754. parser.print_help()
  755. sys.exit(1)
  756. # Check user ID.
  757. setuid = None
  758. username = None
  759. if options.user:
  760. # Try getting information about the user, assuming UID passed.
  761. try:
  762. pw_ent = pwd.getpwuid(int(options.user))
  763. setuid = pw_ent.pw_uid
  764. username = pw_ent.pw_name
  765. except ValueError:
  766. pass
  767. except KeyError:
  768. pass
  769. # Next try getting information about the user, assuming user name
  770. # passed.
  771. # If the information is both a valid user name and user number, we
  772. # prefer the name because we try it second. A minor point, hopefully.
  773. try:
  774. pw_ent = pwd.getpwnam(options.user)
  775. setuid = pw_ent.pw_uid
  776. username = pw_ent.pw_name
  777. except KeyError:
  778. pass
  779. if setuid is None:
  780. sys.stderr.write("bind10: invalid user: '%s'\n" % options.user)
  781. sys.exit(1)
  782. # Announce startup.
  783. if options.verbose:
  784. sys.stdout.write("%s\n" % VERSION)
  785. # Create wakeup pipe for signal handlers
  786. wakeup_pipe = os.pipe()
  787. signal.set_wakeup_fd(wakeup_pipe[1])
  788. # Set signal handlers for catching child termination, as well
  789. # as our own demise.
  790. signal.signal(signal.SIGCHLD, reaper)
  791. signal.siginterrupt(signal.SIGCHLD, False)
  792. signal.signal(signal.SIGINT, fatal_signal)
  793. signal.signal(signal.SIGTERM, fatal_signal)
  794. # Block SIGPIPE, as we don't want it to end this process
  795. signal.signal(signal.SIGPIPE, signal.SIG_IGN)
  796. # Go bob!
  797. boss_of_bind = BoB(options.msgq_socket_file, options.dns_port,
  798. options.address, options.forward, options.nocache,
  799. options.verbose, setuid, username)
  800. startup_result = boss_of_bind.startup()
  801. if startup_result:
  802. sys.stderr.write("[bind10] Error on startup: %s\n" % startup_result)
  803. sys.exit(1)
  804. sys.stdout.write("[bind10] BIND 10 started\n")
  805. # send "bind10.boot_time" to b10-stats
  806. time.sleep(1) # wait a second
  807. if options.verbose:
  808. sys.stdout.write("[bind10] send \"bind10.boot_time\" to b10-stats\n")
  809. cmd = isc.config.ccsession.create_command('set',
  810. { "stats_data": {
  811. 'bind10.boot_time': time.strftime('%Y-%m-%dT%H:%M:%SZ', _BASETIME)
  812. }
  813. })
  814. boss_of_bind.cc_session.group_sendmsg(cmd, 'Stats')
  815. # In our main loop, we check for dead processes or messages
  816. # on the c-channel.
  817. wakeup_fd = wakeup_pipe[0]
  818. ccs_fd = boss_of_bind.ccs.get_socket().fileno()
  819. while boss_of_bind.runnable:
  820. # clean up any processes that exited
  821. boss_of_bind.reap_children()
  822. next_restart = boss_of_bind.restart_processes()
  823. if next_restart is None:
  824. wait_time = None
  825. else:
  826. wait_time = max(next_restart - time.time(), 0)
  827. # select() can raise EINTR when a signal arrives,
  828. # even if they are resumable, so we have to catch
  829. # the exception
  830. try:
  831. (rlist, wlist, xlist) = select.select([wakeup_fd, ccs_fd], [], [],
  832. wait_time)
  833. except select.error as err:
  834. if err.args[0] == errno.EINTR:
  835. (rlist, wlist, xlist) = ([], [], [])
  836. else:
  837. sys.stderr.write("[bind10] Error with select(); %s\n" % err)
  838. break
  839. for fd in rlist + xlist:
  840. if fd == ccs_fd:
  841. try:
  842. boss_of_bind.ccs.check_command()
  843. except isc.cc.session.ProtocolError:
  844. if options.verbose:
  845. sys.stderr.write("[bind10] msgq channel disappeared.\n")
  846. break
  847. elif fd == wakeup_fd:
  848. os.read(wakeup_fd, 32)
  849. # shutdown
  850. signal.signal(signal.SIGCHLD, signal.SIG_DFL)
  851. boss_of_bind.shutdown()
  852. sys.stdout.write("[bind10] BIND 10 exiting\n");
  853. sys.exit(0)
# Run main() only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()