From 06a6fb9642a2cc70b1ca65f403b853fe8ce9d4b2 Mon Sep 17 00:00:00 2001
From: Luke Meyer
Date: Thu, 20 Jul 2017 23:39:47 -0400
Subject: openshift_checks: refactor logging checks

Turn failure messages into exceptions that tests can look for without
depending on text meant for humans.
Turn logging_namespace property into a method.
Get rid of _exec_oc and just use logging.exec_oc.
---
 .../openshift_checks/logging/fluentd.py | 191 ++++++++++-----------
 1 file changed, 92 insertions(+), 99 deletions(-)

diff --git a/roles/openshift_health_checker/openshift_checks/logging/fluentd.py b/roles/openshift_health_checker/openshift_checks/logging/fluentd.py
index 69c7b4392..3b192a281 100644
--- a/roles/openshift_health_checker/openshift_checks/logging/fluentd.py
+++ b/roles/openshift_health_checker/openshift_checks/logging/fluentd.py
@@ -2,6 +2,8 @@
 
 import json
 
+
+from openshift_checks import OpenShiftCheckException, OpenShiftCheckExceptionList
 from openshift_checks.logging.logging import LoggingCheck
 
 
@@ -12,57 +14,96 @@ class Fluentd(LoggingCheck):
     tags = ["health", "logging"]
 
     def run(self):
-        """Check various things and gather errors. Returns: result as hash"""
+        """Check the Fluentd deployment and raise an error if any problems are found."""
+
+        fluentd_pods = self.get_pods_for_component("fluentd")
+        self.check_fluentd(fluentd_pods)
+        return {}
+
+    def check_fluentd(self, pods):
+        """Verify fluentd is running everywhere. Raises OpenShiftCheckExceptionList if error(s) found."""
 
-        self.logging_namespace = self.get_var("openshift_logging_namespace", default="logging")
-        fluentd_pods, error = super(Fluentd, self).get_pods_for_component(
-            self.logging_namespace,
-            "fluentd",
+        node_selector = self.get_var(
+            'openshift_logging_fluentd_nodeselector',
+            default='logging-infra-fluentd=true'
         )
-        if error:
-            return {"failed": True, "msg": error}
-        check_error = self.check_fluentd(fluentd_pods)
 
-        if check_error:
-            msg = ("The following Fluentd deployment issue was found:"
-                   "{}".format(check_error))
-            return {"failed": True, "msg": msg}
+        nodes_by_name = self.get_nodes_by_name()
+        fluentd_nodes = self.filter_fluentd_labeled_nodes(nodes_by_name, node_selector)
 
-        # TODO(lmeyer): run it all again for the ops cluster
-        return {"failed": False, "msg": 'No problems found with Fluentd deployment.'}
+        errors = []
+        errors += self.check_node_labeling(nodes_by_name, fluentd_nodes, node_selector)
+        errors += self.check_nodes_have_fluentd(pods, fluentd_nodes)
+        errors += self.check_fluentd_pods_running(pods)
+
+        # Make sure there are no extra fluentd pods
+        if len(pods) > len(fluentd_nodes):
+            errors.append(OpenShiftCheckException(
+                'TooManyFluentdPods',
+                'There are more Fluentd pods running than nodes labeled.\n'
+                'This may not cause problems with logging but it likely indicates something wrong.'
+            ))
+
+        if errors:
+            raise OpenShiftCheckExceptionList(errors)
+
+    def get_nodes_by_name(self):
+        """Retrieve all the node definitions. Returns: dict(name: node)"""
+        nodes_json = self.exec_oc("get nodes -o json", [])
+        try:
+            nodes = json.loads(nodes_json)
+        except ValueError:  # no valid json - should not happen
+            raise OpenShiftCheckException(
+                "BadOcNodeList",
+                "Could not obtain a list of nodes to validate fluentd.\n"
+                "Output from oc get:\n" + nodes_json
+            )
+        if not nodes or not nodes.get('items'):  # also should not happen
+            raise OpenShiftCheckException(
+                "NoNodesDefined",
+                "No nodes appear to be defined according to the API."
+            )
+        return {
+            node['metadata']['name']: node
+            for node in nodes['items']
+        }
 
     @staticmethod
-    def _filter_fluentd_labeled_nodes(nodes_by_name, node_selector):
-        """Filter to all nodes with fluentd label. Returns dict(name: node), error string"""
+    def filter_fluentd_labeled_nodes(nodes_by_name, node_selector):
+        """Filter to all nodes with fluentd label. Returns dict(name: node)"""
         label, value = node_selector.split('=', 1)
         fluentd_nodes = {
             name: node for name, node in nodes_by_name.items()
             if node['metadata']['labels'].get(label) == value
         }
         if not fluentd_nodes:
-            return None, (
+            raise OpenShiftCheckException(
+                'NoNodesLabeled',
                 'There are no nodes with the fluentd label {label}.\n'
-                'This means no logs will be aggregated from the nodes.'
-            ).format(label=node_selector)
-        return fluentd_nodes, None
+                'This means no logs will be aggregated from the nodes.'.format(label=node_selector)
+            )
+        return fluentd_nodes
 
-    def _check_node_labeling(self, nodes_by_name, fluentd_nodes, node_selector):
-        """Note if nodes are not labeled as expected. Returns: error string"""
+    def check_node_labeling(self, nodes_by_name, fluentd_nodes, node_selector):
+        """Note if nodes are not labeled as expected. Returns: error list"""
         intended_nodes = self.get_var('openshift_logging_fluentd_hosts', default=['--all'])
         if not intended_nodes or '--all' in intended_nodes:
             intended_nodes = nodes_by_name.keys()
         nodes_missing_labels = set(intended_nodes) - set(fluentd_nodes.keys())
         if nodes_missing_labels:
-            return (
+            return [OpenShiftCheckException(
+                'NodesUnlabeled',
                 'The following nodes are supposed to be labeled with {label} but are not:\n'
                 '  {nodes}\n'
-                'Fluentd will not aggregate logs from these nodes.'
-            ).format(label=node_selector, nodes=', '.join(nodes_missing_labels))
-        return None
+                'Fluentd will not aggregate logs from these nodes.'.format(
+                    label=node_selector, nodes=', '.join(nodes_missing_labels)
+                ))]
+
+        return []
 
     @staticmethod
-    def _check_nodes_have_fluentd(pods, fluentd_nodes):
-        """Make sure fluentd is on all the labeled nodes. Returns: error string"""
+    def check_nodes_have_fluentd(pods, fluentd_nodes):
+        """Make sure fluentd is on all the labeled nodes. Returns: error list"""
         unmatched_nodes = fluentd_nodes.copy()
         node_names_by_label = {
             node['metadata']['labels']['kubernetes.io/hostname']: name
@@ -82,80 +123,32 @@ class Fluentd(LoggingCheck):
         ]:
             unmatched_nodes.pop(name, None)
         if unmatched_nodes:
-            return (
+            return [OpenShiftCheckException(
+                'MissingFluentdPod',
                 'The following nodes are supposed to have a Fluentd pod but do not:\n'
-                '{nodes}'
-                'These nodes will not have their logs aggregated.'
-            ).format(nodes=''.join(
-                "    {}\n".format(name)
-                for name in unmatched_nodes.keys()
-            ))
-        return None
+                '  {nodes}\n'
+                'These nodes will not have their logs aggregated.'.format(
+                    nodes='\n  '.join(unmatched_nodes.keys())
+                ))]
+
+        return []
 
-    def _check_fluentd_pods_running(self, pods):
+    def check_fluentd_pods_running(self, pods):
         """Make sure all fluentd pods are running. Returns: error string"""
         not_running = super(Fluentd, self).not_running_pods(pods)
         if not_running:
-            return (
+            return [OpenShiftCheckException(
+                'FluentdNotRunning',
                 'The following Fluentd pods are supposed to be running but are not:\n'
-                '{pods}'
-                'These pods will not aggregate logs from their nodes.'
-            ).format(pods=''.join(
-                "  {} ({})\n".format(pod['metadata']['name'], pod['spec'].get('host', 'None'))
-                for pod in not_running
-            ))
-        return None
-
-    def check_fluentd(self, pods):
-        """Verify fluentd is running everywhere. Returns: error string"""
-
-        node_selector = self.get_var(
-            'openshift_logging_fluentd_nodeselector',
-            default='logging-infra-fluentd=true'
-        )
-
-        nodes_by_name, error = self.get_nodes_by_name()
-
-        if error:
-            return error
-        fluentd_nodes, error = self._filter_fluentd_labeled_nodes(nodes_by_name, node_selector)
-        if error:
-            return error
-
-        error_msgs = []
-        error = self._check_node_labeling(nodes_by_name, fluentd_nodes, node_selector)
-        if error:
-            error_msgs.append(error)
-        error = self._check_nodes_have_fluentd(pods, fluentd_nodes)
-        if error:
-            error_msgs.append(error)
-        error = self._check_fluentd_pods_running(pods)
-        if error:
-            error_msgs.append(error)
-
-        # Make sure there are no extra fluentd pods
-        if len(pods) > len(fluentd_nodes):
-            error_msgs.append(
-                'There are more Fluentd pods running than nodes labeled.\n'
-                'This may not cause problems with logging but it likely indicates something wrong.'
-            )
-
-        return '\n'.join(error_msgs)
-
-    def get_nodes_by_name(self):
-        """Retrieve all the node definitions. Returns: dict(name: node), error string"""
-        nodes_json = self.exec_oc(
-            self.logging_namespace,
-            "get nodes -o json",
-            []
-        )
-        try:
-            nodes = json.loads(nodes_json)
-        except ValueError:  # no valid json - should not happen
-            return None, "Could not obtain a list of nodes to validate fluentd. Output from oc get:\n" + nodes_json
-        if not nodes or not nodes.get('items'):  # also should not happen
-            return None, "No nodes appear to be defined according to the API."
-        return {
-            node['metadata']['name']: node
-            for node in nodes['items']
-        }, None
+                '  {pods}\n'
+                'These pods will not aggregate logs from their nodes.'.format(
+                    pods='\n'.join(
+                        "  {name} ({host})".format(
+                            name=pod['metadata']['name'],
+                            host=pod['spec'].get('host', 'None')
+                        )
+                        for pod in not_running
+                    )
+                ))]
+
+        return []
-- 
cgit v1.2.3
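
As an illustration of the motivation stated in the commit message, here is a minimal
test sketch (not taken from this patch) showing how a unit test can key off the stable
exception identifier rather than the human-readable message text. It assumes pytest is
available on the test path and that OpenShiftCheckException exposes its identifier as a
`name` attribute; the node data below is made up for illustration only.

    import pytest

    from openshift_checks import OpenShiftCheckException
    from openshift_checks.logging.fluentd import Fluentd


    def test_no_labeled_nodes_raises_named_exception():
        # One node exists but carries no labels, so the fluentd selector matches nothing.
        nodes = {"node1": {"metadata": {"name": "node1", "labels": {}}}}

        # filter_fluentd_labeled_nodes is a staticmethod, so no check instance is needed.
        with pytest.raises(OpenShiftCheckException) as excinfo:
            Fluentd.filter_fluentd_labeled_nodes(nodes, "logging-infra-fluentd=true")

        # Assert on the identifier, not on wording meant for humans
        # (assumes the exception stores its first argument as `name`).
        assert excinfo.value.name == "NoNodesLabeled"

The same pattern applies to the list-returning checks: a test can collect the raised
OpenShiftCheckExceptionList and compare the `name` of each contained exception against
identifiers such as 'NodesUnlabeled', 'MissingFluentdPod', or 'FluentdNotRunning'.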