# Source code for gwcelery.tools.nagios

"""A `Nagios plugin <https://nagios-plugins.org/doc/guidelines.html>`_
for monitoring GWCelery."""

from enum import IntEnum
from sys import exit
from traceback import format_exc, format_exception

from celery.bin.base import Command
import kombu.exceptions

# Make sure that all tasks are registered
from .. import tasks  # noqa: F401


class NagiosPluginStatus(IntEnum):
    """Exit statuses understood by Nagios.

    The members are plain integers, so a member can be passed straight to
    :func:`sys.exit`; the member's ``name`` is the label shown in the
    plugin's first line of output.
    """

    # Numeric values are fixed by the Nagios plugin guidelines.
    OK = 0
    WARNING = 1
    CRITICAL = 2
    UNKNOWN = 3
class NagiosCriticalError(Exception):
    """Signal a failed health check.

    An error of this type is reported to Nagios with a status of `CRITICAL`.
    """
def get_active_queues(inspector):
    """Return the set of queue names reported by the inspected workers.

    Parameters
    ----------
    inspector
        Object whose ``active_queues()`` method returns a mapping from
        worker name to a list of queue descriptions (each with a ``'name'``
        key), or a falsy value when there is nothing to report.

    Returns
    -------
    set
        The distinct queue names across all workers; empty if
        ``active_queues()`` returned a falsy value.
    """
    queues_by_worker = inspector.active_queues() or {}
    names = set()
    for worker_queues in queues_by_worker.values():
        for queue in worker_queues:
            names.add(queue['name'])
    return names
def get_active_lvalert_nodes(inspector):
    """Return the set of LVAlert nodes reported by the inspected workers.

    Parameters
    ----------
    inspector
        Object whose ``stats()`` method returns a mapping from worker name
        to a statistics dictionary. Workers list their nodes under the
        ``'lvalert-nodes'`` key; workers without that key contribute nothing.

    Returns
    -------
    set
        The distinct node names across all workers; empty if ``stats()``
        returned nothing.
    """
    # ``stats()`` can come back as ``None`` instead of an empty mapping when
    # no worker answers; treat that as "no nodes", the same way
    # ``get_active_queues`` treats an empty ``active_queues()`` reply.
    stats = inspector.stats() or {}
    return {node
            for stat in stats.values()
            for node in stat.get('lvalert-nodes', ())}
def get_expected_queues(app):
    """Return the set of queue names that the application's tasks rely on.

    Every task registered in ``app.tasks`` contributes its ``queue``
    attribute. Tasks that have no such attribute, or whose ``queue`` is
    ``None``, are counted against the ``'celery'`` queue, which is therefore
    always part of the result.
    """
    queues = set()
    for task in app.tasks.values():
        queues.add(getattr(task, 'queue', None))

    # Tasks that do not explicitly specify a queue use 'celery'.
    queues.discard(None)
    queues.add('celery')
    return queues
def get_expected_lvalert_nodes(app):
    """Return the value of the ``lvalert_nodes`` setting from ``app.conf``."""
    configured_nodes = app.conf['lvalert_nodes']
    return configured_nodes
def get_active_voevent_peers(inspector):
    """Return the VOEvent peers reported by the inspected workers.

    Parameters
    ----------
    inspector
        Object whose ``stats()`` method returns a mapping from worker name
        to a statistics dictionary. Peers are read from the
        ``'voevent-broker-peers'`` and ``'voevent-receiver-peers'`` keys;
        workers lacking a key contribute nothing for it.

    Returns
    -------
    tuple
        ``(broker_peers, receiver_peers)``, each a set of the distinct peers
        across all workers. Both are empty if ``stats()`` returned nothing.
    """
    # ``stats()`` can come back as ``None`` instead of an empty mapping when
    # no worker answers; treat that as "no peers" rather than crashing.
    worker_stats = (inspector.stats() or {}).values()

    def peers(key):
        # Union of the entries every worker lists under ``key``.
        return {peer for stat in worker_stats for peer in stat.get(key, ())}

    return peers('voevent-broker-peers'), peers('voevent-receiver-peers')
def check_status(app):
    """Run every health check against *app*, raising on the first failure.

    The checks, in order:

    1. The message broker accepts a connection.
    2. Every queue expected by the registered tasks is active.
    3. The subscribed LVAlert nodes are exactly the configured ones.
    4. If ``voevent_broadcaster_whitelist`` is set, the VOEvent broker has
       at least one peer.
    5. If ``voevent_receiver_address`` is set, the VOEvent receiver has at
       least one peer.

    Returns ``None`` when all checks pass.

    Raises
    ------
    NagiosCriticalError
        On the first failed check. The message is a one-line summary, and
        the exception's ``__cause__`` carries the detail.
    """
    # Probe the broker with a dedicated connection, and release it once the
    # probe is done so that it is not left open.
    connection = app.connection()
    try:
        connection.ensure_connection(max_retries=1)
    except kombu.exceptions.OperationalError as e:
        raise NagiosCriticalError('No connection to broker') from e
    finally:
        connection.release()

    inspector = app.control.inspect()

    # Names are sorted in the detail messages below so that the text does
    # not depend on set iteration order.
    active = get_active_queues(inspector)
    expected = get_expected_queues(app)
    missing = expected - active
    if missing:
        raise NagiosCriticalError('Not all expected queues are active') from \
            AssertionError('Missing queues: ' + ', '.join(sorted(missing)))

    active = get_active_lvalert_nodes(inspector)
    expected = get_expected_lvalert_nodes(app)
    missing = expected - active
    extra = active - expected
    if missing:
        raise NagiosCriticalError('Not all lvalert nodes are subscribed') \
            from AssertionError('Missing nodes: ' + ', '.join(sorted(missing)))
    if extra:
        raise NagiosCriticalError('Too many lvalert nodes are subscribed') \
            from AssertionError('Extra nodes: ' + ', '.join(sorted(extra)))

    # The VOEvent checks only apply when the corresponding setting is truthy.
    broker_peers, receiver_peers = get_active_voevent_peers(inspector)
    if app.conf['voevent_broadcaster_whitelist'] and not broker_peers:
        raise NagiosCriticalError(
            'The VOEvent broker has no active connections') \
            from AssertionError('voevent_broadcaster_whitelist: {}'.format(
                app.conf['voevent_broadcaster_whitelist']))
    if app.conf['voevent_receiver_address'] and not receiver_peers:
        raise NagiosCriticalError(
            'The VOEvent receiver has no active connections') \
            from AssertionError('voevent_receiver_address: {}'.format(
                app.conf['voevent_receiver_address']))
class NagiosCommand(Command):
    # No class docstring is written here: the module assigns its own
    # docstring to ``NagiosCommand.__doc__`` right after the class body.

    def run(self, **kwargs):
        """Run the health checks, print the result, and exit.

        The first line printed is ``"<STATUS>: <summary>"``. When a check
        fails or an unexpected error occurs, a traceback follows. The
        process then exits with the numeric value of the status:

        * ``OK`` if :func:`check_status` returns normally,
        * ``CRITICAL`` if it raises :class:`NagiosCriticalError`,
        * ``UNKNOWN`` if it raises anything else.

        Keyword arguments are accepted and ignored.
        """
        try:
            check_status(self.app)
        except NagiosCriticalError as e:
            status = NagiosPluginStatus.CRITICAL
            output = str(e)
            # Prefer the chained cause for the detail, but fall back to the
            # error itself when it was raised without one. Failing inside
            # this handler would end the process with exit code 1, which
            # Nagios reads as WARNING instead of CRITICAL.
            cause = e.__cause__ if e.__cause__ is not None else e
            detail = ''.join(
                format_exception(type(cause), cause, cause.__traceback__))
        except BaseException:
            # Deliberately catch everything: any other failure must still
            # produce a well-formed plugin report.
            status = NagiosPluginStatus.UNKNOWN
            output = 'Unexpected error'
            detail = format_exc()
        else:
            status = NagiosPluginStatus.OK
            output = 'Running normally'
            detail = None

        print('{}: {}'.format(status.name, output))
        if detail:
            print(detail)
        exit(status)
# Give the command class the same docstring as this module.
NagiosCommand.__doc__ = __doc__