component.py 24 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546
  1. # Copyright (C) 2011 Internet Systems Consortium, Inc. ("ISC")
  2. #
  3. # Permission to use, copy, modify, and distribute this software for any
  4. # purpose with or without fee is hereby granted, provided that the above
  5. # copyright notice and this permission notice appear in all copies.
  6. #
  7. # THE SOFTWARE IS PROVIDED "AS IS" AND INTERNET SYSTEMS CONSORTIUM
  8. # DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL
  9. # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL
  10. # INTERNET SYSTEMS CONSORTIUM BE LIABLE FOR ANY SPECIAL, DIRECT,
  11. # INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING
  12. # FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
  13. # NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
  14. # WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  15. """
  16. Module for managing components (abstraction of process). It allows starting
  17. them in given order, handling when they crash (what happens depends on kind
  18. of component) and shutting down. It also handles the configuration of this.
  19. Dependencies between them are not yet handled. It might turn out they are
  20. needed, in that case they will be added sometime in future.
  21. This framework allows for a single process to be started multiple times (by
  22. specifying multiple components with the same configuration). However, the rest
  23. of the system might not handle such situation well, so until it is made so,
  24. it would be better to start each process at most once.
  25. """
  26. import isc.log
  27. from isc.log_messages.bind10_messages import *
  28. import time
# Logger used by everything in this module, created under the "boss" name.
logger = isc.log.Logger("boss")

# Debug levels passed as the first argument of logger.debug() below.
DBG_TRACE_DATA = 20
DBG_TRACE_DETAILED = 80

# Values of the 'command' item of the tasks in a plan (see
# Configurator._build_plan and Configurator._run_plan).
START_CMD = 'start'
STOP_CMD = 'stop'

# Number of seconds after Component.start() during which a failure of
# a 'needed' component is treated as a failure to start (see
# Component.failed).
STARTED_OK_TIME = 10

# The states a Component can be in.
STATE_DEAD = 'dead'
STATE_STOPPED = 'stopped'
STATE_RUNNING = 'running'
  38. class Component:
  39. """
  40. This represents a single component. It has some defaults of behaviour,
  41. which should be reasonable for majority of ordinary components, but
  42. it might be inherited and modified for special-purpose components,
  43. like the core modules with different ways of starting up. Another
  44. way to tweak only the start of the component (eg. by providing some
  45. command line parameters) is to set _start_func function from within
  46. inherited class.
  47. The methods are marked if it is expected for them to be overridden.
  48. The component is in one of the three states:
  49. - Stopped - it is either not started yet or it was explicitly stopped.
  50. The component is created in this state (it must be asked to start
  51. explicitly).
  52. - Running - after start() was called, it started successfully and is
  53. now running.
  54. - Dead - it failed and can not be resurrected.
  55. Init
  56. | stop()
  57. | +-----------------------+
  58. | | |
  59. v | start() success |
  60. Stopped --------+--------> Running <----------+
  61. | | |
  62. |failure | failed() |
  63. | | |
  64. v | |
  65. +<-----------+ |
  66. | |
  67. | kind == dispensable or kind|== needed and failed late
  68. +-----------------------------+
  69. |
  70. | kind == core or kind == needed and it failed too soon
  71. v
  72. Dead
  73. Note that there are still situations which are not handled properly here.
  74. We don't recognize a component that is starting up, but not ready yet, one
  75. that is already shutting down, impossible to stop, etc. We need to add more
  76. states in future to handle it properly.
  77. """
  78. def __init__(self, process, boss, kind, address=None, params=None):
  79. """
  80. Creates the component in not running mode.
  81. The parameters are:
  82. - `process` is the name of the process to start.
  83. - `boss` the boss object to plug into. The component needs to plug
  84. into it to know when it failed, etc.
  85. - `kind` is the kind of component. It may be one of:
  86. * 'core' means the system can't run without it and it can't be
  87. safely restarted. If it does not start, the system is brought
  88. down. If it crashes, the system is turned off as well (with
  89. non-zero exit status).
  90. * 'needed' means the system is able to restart the component,
  91. but it is vital part of the service (like auth server). If
  92. it fails to start or crashes in less than 10s after the first
  93. startup, the system is brought down. If it crashes later on,
  94. it is restarted.
  95. * 'dispensable' means the component should be running, but if it
  96. doesn't start or crashes for some reason, the system simply tries
  97. to restart it and keeps running.
  98. - `address` is the address on message bus. It is used to ask it to
  99. shut down at the end. If you specialize the class for a component
  100. that is shut down differently, it might be None.
  101. - `params` is a list of parameters to pass to the process when it
  102. starts. It is currently unused and this support is left out for
  103. now.
  104. """
  105. if kind not in ['core', 'needed', 'dispensable']:
  106. raise ValueError('Component kind can not be ' + kind)
  107. self.__state = STATE_STOPPED
  108. # These should be read-only
  109. self._kind = kind
  110. self._boss = boss
  111. self._process = process
  112. # This can be overwritten/set by the child classes
  113. self._start_func = None
  114. self._address = address
  115. self._params = params
  116. # These should be considered private. It is protected to
  117. # allow tests in and for really rare ocassions, but a care
  118. # should be taken to understand the Component code.
  119. #
  120. # It should not be accessed when the component wasn't run
  121. # yet.
  122. self._procinfo = None
  123. def start(self):
  124. """
  125. Start the component for the first time or restart it. If you need to
  126. modify the way a component is started, do not replace this method,
  127. but _start_internal. This one does some more bookkeeping around.
  128. If you try to start an already running component, it raises ValueError.
  129. """
  130. if self.__state == STATE_DEAD:
  131. raise ValueError("Can't resurrect already dead component")
  132. if self.running():
  133. raise ValueError("Can't start already running component")
  134. logger.info(BIND10_COMPONENT_START, self.name())
  135. self.__state = STATE_RUNNING
  136. self.__start_time = time.time()
  137. try:
  138. self._start_internal()
  139. except Exception as e:
  140. logger.error(BIND10_COMPONENT_START_EXCEPTION, self.name(), e)
  141. self.failed(None)
  142. raise
  143. def _start_internal(self):
  144. """
  145. This method does the actual starting of a process. If you need to
  146. change the way the component is started, replace this method.
  147. You can change the "core" of this function by setting self._start_func
  148. to a function without parameters. Such function should start the
  149. process and return the procinfo object describing the running process.
  150. If you don't provide the _start_func, the usual startup by calling
  151. boss.start_simple is performed.
  152. If you override the method completely, you should consider overriding
  153. pid, _stop_internal (and possibly _failed_internal and name) and kill
  154. as well. You should also register any processes started within boss.
  155. (In fact, you could set the _procinfo variable and use the provided
  156. ones, but then you are OK with providing _start_func anyway).
  157. The ability to override this method presents some flexibility. It
  158. allows processes started in a strange way, as well as components that
  159. have no processes at all or components with multiple processes (in case
  160. of multiple processes, care should be taken to make their
  161. started/stopped state in sync and all the processes that can fail
  162. should be registered).
  163. """
  164. # This one is not tested. For one, it starts a real process
  165. # which is out of scope of unit tests, for another, it just
  166. # delegates the starting to other function in boss (if a derived
  167. # class does not provide an override function), which is tested
  168. # by use.
  169. if self._start_func is not None:
  170. procinfo = self._start_func()
  171. else:
  172. # TODO Handle params, etc
  173. procinfo = self._boss.start_simple(self._process)
  174. self._procinfo = procinfo
  175. self._boss.register_process(self.pid(), self)
  176. def stop(self):
  177. """
  178. Stop the component. If you need to modify the way a component is
  179. stopped, do not replace this method, but _stop_internal. This one
  180. does some more bookkeeping.
  181. If you try to stop a component that is not running, it raises
  182. ValueError.
  183. """
  184. # This is not tested. It talks with the outher world, which is out
  185. # of scope of unittests.
  186. if not self.running():
  187. raise ValueError("Can't stop a component which is not running")
  188. logger.info(BIND10_COMPONENT_STOP, self.name())
  189. self.__state = STATE_STOPPED
  190. self._stop_internal()
  191. def _stop_internal(self):
  192. """
  193. This is the method that does the actual stopping of a component.
  194. You can replace this method if you want a different way to do it.
  195. If you're overriding this one, you probably want to replace the
  196. _start_internal, kill and pid methods (and maybe _failed_internal and
  197. name as well).
  198. Also, note that it is a bad idea to raise exceptions from here.
  199. Under such circumstance, the component will be considered stopped,
  200. and the exception propagated, but we can't be sure it really is
  201. dead.
  202. """
  203. self._boss.stop_process(self._process, self._address)
  204. # TODO Some way to wait for the process that doesn't want to
  205. # terminate and kill it would prove nice (or add it to boss somewhere?)
  206. def failed(self, exit_code):
  207. """
  208. Notify the component it crashed. This will be called from boss object.
  209. If you try to call failed on a component that is not running,
  210. a ValueError is raised.
  211. If it is a core component or needed component and it was started only
  212. recently, the component will become dead and will ask the boss to shut
  213. down with error exit status. A dead component can't be started again.
  214. Otherwise the component will try to restart.
  215. The exit code is used for logging. It might be None.
  216. """
  217. logger.error(BIND10_COMPONENT_FAILED, self.name(), self.pid(),
  218. exit_code if exit_code is not None else "unknown")
  219. if not self.running():
  220. raise ValueError("Can't fail component that isn't running")
  221. self.__state = STATE_STOPPED
  222. self._failed_internal()
  223. # If it is a core component or the needed component failed to start
  224. # (including it stopped really soon)
  225. if self._kind == 'core' or \
  226. (self._kind == 'needed' and time.time() - STARTED_OK_TIME <
  227. self.__start_time):
  228. self.__state = STATE_DEAD
  229. logger.fatal(BIND10_COMPONENT_UNSATISFIED, self.name())
  230. self._boss.component_shutdown(1)
  231. # This means we want to restart
  232. else:
  233. logger.warn(BIND10_COMPONENT_RESTART, self.name())
  234. self.start()
  235. def _failed_internal(self):
  236. """
  237. This method is called from failed. You can replace it if you need
  238. some specific behaviour when the component crashes. The default
  239. implementation is empty.
  240. Do not raise exceptions from here, please. The propper shutdown
  241. would have not happened.
  242. """
  243. pass
  244. def running(self):
  245. """
  246. Informs if the component is currently running. It assumes the failed
  247. is called whenever the component really fails and there might be some
  248. time in between actual failure and the call.
  249. It is not expected for this method to be overriden.
  250. """
  251. return self.__state == STATE_RUNNING
  252. def name(self):
  253. """
  254. Returns human-readable name of the component. This is usually the
  255. name of the executable, but it might be something different in a
  256. derived class (in case it is overriden).
  257. """
  258. return self._process
  259. def pid(self):
  260. """
  261. Provides a PID of a process, if the component is real running process.
  262. This implementation expects it to be a real process, but derived class
  263. may return None in case the component is something else.
  264. This returns None in case it is not yet running.
  265. You probably want to override this method if you're providing custom
  266. _start_internal.
  267. Note that some components preserve the pid after a call to stop or
  268. failed. This is because the components need to preserve it in order
  269. to be able to kill the process if it failed to stop properly. Therefore
  270. you should not rely on the pid being None if the component is stopped.
  271. """
  272. return self._procinfo.pid if self._procinfo is not None else None
  273. def kill(self, forcefull=False):
  274. """
  275. The component should be forcefully killed. This does not change the
  276. internal state, it just kills the external process and expects a
  277. failure to be reported when the process really dies.
  278. If it isn't running, it does nothing.
  279. If the forcefull is true, it uses SIGKILL instead of SIGTERM.
  280. """
  281. if self._procinfo is not None:
  282. if forcefull:
  283. self._procinfo.process.kill()
  284. else:
  285. self._procinfo.process.terminate()
  286. class Configurator:
  287. """
  288. This thing keeps track of configuration changes and starts and stops
  289. components as it goes. It also handles the inital startup and final
  290. shutdown.
  291. Note that this will allow you to stop (by invoking reconfigure) a core
  292. component. There should be some kind of layer protecting users from ever
  293. doing so (users must not stop the config manager, message queue and stuff
  294. like that or the system won't start again). However, if a user specifies
  295. b10-auth as core, it is safe to stop that one.
  296. The parameters are:
  297. * `boss`: The boss we are managing for.
  298. * `specials`: Dict of specially started components. Each item is a class
  299. representing the component.
  300. The configuration passed to it (by startup() and reconfigure()) is a
  301. dictionary, each item represents one component that should be running.
  302. The key is an unique identifier used to reference the component. The
  303. value is a dictionary describing the component. All items in the
  304. description is optional unless told otherwise and they are as follows:
  305. * `special` - Some components are started in a special way. If it is
  306. present, it specifies which class from the specials parameter should
  307. be used to create the component. In that case, some of the following
  308. items might be irrelevant, depending on the special component choosen.
  309. If it is not there, the basic Component class is used.
  310. * `process` - Name of the executable to start. If it is not present,
  311. it defaults to the identifier of the component.
  312. * `kind` - The kind of component, either of 'core', 'needed' and
  313. 'dispensable'. This specifies what happens if the component fails.
  314. This one is required.
  315. * `address` - The address of the component on message bus. It is used
  316. to shut down the component. All special components currently either
  317. know their own address or don't need one and ignore it. The common
  318. components should provide this.
  319. * `params` - The command line parameters of the executable. Defaults
  320. to no parameters. It is currently unused.
  321. * `priority` - When starting the component, the components with higher
  322. priority are started before the ones with lower priority. If it is
  323. not present, it defaults to 0.
  324. """
  325. def __init__(self, boss, specials = {}):
  326. """
  327. Initializes the configurator, but nothing is started yet.
  328. The boss parameter is the boss object used to start and stop processes.
  329. """
  330. self.__boss = boss
  331. # These could be __private, but as we access them from within unittest,
  332. # it's more comfortable to have them just _protected.
  333. # They are tuples (configuration, component)
  334. self._components = {}
  335. self._running = False
  336. self.__specials = specials
  337. def __reconfigure_internal(self, old, new):
  338. """
  339. Does a switch from one configuration to another.
  340. """
  341. self._run_plan(self._build_plan(old, new))
  342. def startup(self, configuration):
  343. """
  344. Starts the first set of processes. This configuration is expected
  345. to be hardcoded from the boss itself to start the configuration
  346. manager and other similar things.
  347. """
  348. if self._running:
  349. raise ValueError("Trying to start the component configurator " +
  350. "twice")
  351. logger.info(BIND10_CONFIGURATOR_START)
  352. self.__reconfigure_internal(self._components, configuration)
  353. self._running = True
  354. def shutdown(self):
  355. """
  356. Shuts everything down.
  357. It is not expected that anyone would want to shutdown and then start
  358. the configurator again, so we don't explicitly make sure that would
  359. work. However, we are not avare of anything that would make it not
  360. work either.
  361. """
  362. if not self._running:
  363. raise ValueError("Trying to shutdown the component " +
  364. "configurator while it's not yet running")
  365. logger.info(BIND10_CONFIGURATOR_STOP)
  366. self._running = False
  367. self.__reconfigure_internal(self._components, {})
  368. def reconfigure(self, configuration):
  369. """
  370. Changes configuration from the current one to the provided. It
  371. starts and stops all the components as needed (eg. if there's
  372. a component that was not in the original configuration, it is
  373. started, any component that was in the old and is not in the
  374. new one is stopped).
  375. """
  376. if not self._running:
  377. raise ValueError("Trying to reconfigure the component " +
  378. "configurator while it's not yet running")
  379. logger.info(BIND10_CONFIGURATOR_RECONFIGURE)
  380. self.__reconfigure_internal(self._components, configuration)
  381. def _build_plan(self, old, new):
  382. """
  383. Builds a plan how to transfer from the old configuration to the new
  384. one. It'll be sorted by priority and it will contain the components
  385. (already created, but not started). Each command in the plan is a dict,
  386. so it can be extended any time in future to include whatever
  387. parameters each operation might need.
  388. Any configuration problems are expected to be handled here, so the
  389. plan is not yet run.
  390. """
  391. logger.debug(DBG_TRACE_DATA, BIND10_CONFIGURATOR_BUILD, old, new)
  392. plan = []
  393. # Handle removals of old components
  394. for cname in old.keys():
  395. if cname not in new:
  396. component = self._components[cname][1]
  397. if component.running():
  398. plan.append({
  399. 'command': STOP_CMD,
  400. 'component': component,
  401. 'name': cname
  402. })
  403. # Handle transitions of configuration of what is here
  404. for cname in new.keys():
  405. if cname in old:
  406. for option in ['special', 'process', 'kind', 'address',
  407. 'params']:
  408. if new[cname].get(option) != old[cname][0].get(option):
  409. raise NotImplementedError('Changing configuration of' +
  410. ' a running component is ' +
  411. 'not yet supported. Remove' +
  412. ' and re-add ' + cname +
  413. ' to get the same effect')
  414. # Handle introduction of new components
  415. plan_add = []
  416. for cname in new.keys():
  417. if cname not in old:
  418. component_config = new[cname]
  419. creator = Component
  420. if 'special' in component_config:
  421. # TODO: Better error handling
  422. creator = self.__specials[component_config['special']]
  423. component = creator(component_config.get('process', cname),
  424. self.__boss, component_config['kind'],
  425. component_config.get('address'),
  426. component_config.get('params'))
  427. priority = component_config.get('priority', 0)
  428. # We store tuples, priority first, so we can easily sort
  429. plan_add.append((priority, {
  430. 'component': component,
  431. 'command': START_CMD,
  432. 'name': cname,
  433. 'config': component_config
  434. }))
  435. # Push the starts there sorted by priority
  436. plan.extend([command for (_, command) in sorted(plan_add,
  437. reverse=True,
  438. key=lambda command:
  439. command[0])])
  440. return plan
  441. def running(self):
  442. """
  443. Returns if the configurator is running (eg. was started by startup and
  444. not yet stopped by shutdown).
  445. """
  446. return self._running
  447. def _run_plan(self, plan):
  448. """
  449. Run a plan, created beforehand by _build_plan.
  450. With the start and stop commands, it also adds and removes components
  451. in _components.
  452. Currently implemented commands are:
  453. * start
  454. * stop
  455. The plan is a list of tasks, each task is a dictionary. It must contain
  456. at last 'component' (a component object to work with) and 'command'
  457. (the command to do). Currently, both existing commands need 'name' of
  458. the component as well (the identifier from configuration). The 'start'
  459. one needs the 'config' to be there, which is the configuration description
  460. of the component.
  461. """
  462. done = 0
  463. try:
  464. logger.debug(DBG_TRACE_DATA, BIND10_CONFIGURATOR_RUN, len(plan))
  465. for task in plan:
  466. component = task['component']
  467. command = task['command']
  468. logger.debug(DBG_TRACE_DETAILED, BIND10_CONFIGURATOR_TASK,
  469. command, component.name())
  470. if command == START_CMD:
  471. component.start()
  472. self._components[task['name']] = (task['config'],
  473. component)
  474. elif command == STOP_CMD:
  475. if component.running():
  476. component.stop()
  477. del self._components[task['name']]
  478. else:
  479. # Can Not Happen (as the plans are generated by ourselves).
  480. # Therefore not tested.
  481. raise NotImplementedError("Command unknown: " + command)
  482. done += 1
  483. except:
  484. logger.error(BIND10_CONFIGURATOR_PLAN_INTERRUPTED, done, len(plan))
  485. raise