From 06a6fb9642a2cc70b1ca65f403b853fe8ce9d4b2 Mon Sep 17 00:00:00 2001
From: Luke Meyer <lmeyer@redhat.com>
Date: Thu, 20 Jul 2017 23:39:47 -0400
Subject: openshift_checks: refactor logging checks

Turn failure messages into exceptions that tests can look for without
depending on text meant for humans.
Turn logging_namespace property into a method.
Get rid of _exec_oc and just use logging.exec_oc.
---
 .../openshift_checks/logging/curator.py            |  32 ++--
 .../openshift_checks/logging/elasticsearch.py      | 166 ++++++++--------
 .../openshift_checks/logging/fluentd.py            | 191 +++++++++----------
 .../openshift_checks/logging/fluentd_config.py     |  17 +-
 .../openshift_checks/logging/kibana.py             | 208 +++++++++++----------
 .../openshift_checks/logging/logging.py            |  54 ++++--
 .../openshift_checks/logging/logging_index_time.py |  75 ++++----
 7 files changed, 375 insertions(+), 368 deletions(-)

(limited to 'roles/openshift_health_checker/openshift_checks/logging')

diff --git a/roles/openshift_health_checker/openshift_checks/logging/curator.py b/roles/openshift_health_checker/openshift_checks/logging/curator.py
index 32a92c909..b27f97172 100644
--- a/roles/openshift_health_checker/openshift_checks/logging/curator.py
+++ b/roles/openshift_health_checker/openshift_checks/logging/curator.py
@@ -1,6 +1,6 @@
 """Check for an aggregated logging Curator deployment"""
 
-from openshift_checks.logging.logging import LoggingCheck
+from openshift_checks.logging.logging import OpenShiftCheckException, LoggingCheck
 
 
 class Curator(LoggingCheck):
@@ -12,27 +12,17 @@ class Curator(LoggingCheck):
     def run(self):
         """Check various things and gather errors. Returns: result as hash"""
 
-        self.logging_namespace = self.get_var("openshift_logging_namespace", default="logging")
-        curator_pods, error = self.get_pods_for_component(
-            self.logging_namespace,
-            "curator",
-        )
-        if error:
-            return {"failed": True, "msg": error}
-        check_error = self.check_curator(curator_pods)
-
-        if check_error:
-            msg = ("The following Curator deployment issue was found:"
-                   "{}".format(check_error))
-            return {"failed": True, "msg": msg}
-
+        curator_pods = self.get_pods_for_component("curator")
+        self.check_curator(curator_pods)
         # TODO(lmeyer): run it all again for the ops cluster
-        return {"failed": False, "msg": 'No problems found with Curator deployment.'}
+
+        return {}
 
     def check_curator(self, pods):
         """Check to see if curator is up and working. Returns: error string"""
         if not pods:
-            return (
+            raise OpenShiftCheckException(
+                "MissingComponentPods",
                 "There are no Curator pods for the logging stack,\n"
                 "so nothing will prune Elasticsearch indexes.\n"
                 "Is Curator correctly deployed?"
@@ -40,14 +30,14 @@ class Curator(LoggingCheck):
 
         not_running = self.not_running_pods(pods)
         if len(not_running) == len(pods):
-            return (
+            raise OpenShiftCheckException(
+                "CuratorNotRunning",
                 "The Curator pod is not currently in a running state,\n"
                 "so Elasticsearch indexes may increase without bound."
             )
         if len(pods) - len(not_running) > 1:
-            return (
+            raise OpenShiftCheckException(
+                "TooManyCurators",
                 "There is more than one Curator pod running. This should not normally happen.\n"
                 "Although this doesn't cause any problems, you may want to investigate."
             )
-
-        return None
diff --git a/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py b/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py
index b2e9a8f49..7fc843fd7 100644
--- a/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py
+++ b/roles/openshift_health_checker/openshift_checks/logging/elasticsearch.py
@@ -3,6 +3,7 @@
 import json
 import re
 
+from openshift_checks import OpenShiftCheckException, OpenShiftCheckExceptionList
 from openshift_checks.logging.logging import LoggingCheck
 
 
@@ -15,168 +16,178 @@ class Elasticsearch(LoggingCheck):
     def run(self):
         """Check various things and gather errors. Returns: result as hash"""
 
-        self.logging_namespace = self.get_var("openshift_logging_namespace", default="logging")
-        es_pods, error = self.get_pods_for_component(
-            self.logging_namespace,
-            "es",
-        )
-        if error:
-            return {"failed": True, "msg": error}
-        check_error = self.check_elasticsearch(es_pods)
-
-        if check_error:
-            msg = ("The following Elasticsearch deployment issue was found:"
-                   "{}".format(check_error))
-            return {"failed": True, "msg": msg}
-
+        es_pods = self.get_pods_for_component("es")
+        self.check_elasticsearch(es_pods)
         # TODO(lmeyer): run it all again for the ops cluster
-        return {"failed": False, "msg": 'No problems found with Elasticsearch deployment.'}
 
-    def _not_running_elasticsearch_pods(self, es_pods):
-        """Returns: list of pods that are not running, list of errors about non-running pods"""
-        not_running = self.not_running_pods(es_pods)
-        if not_running:
-            return not_running, [(
-                'The following Elasticsearch pods are not running:\n'
-                '{pods}'
-                'These pods will not aggregate logs from their nodes.'
-            ).format(pods=''.join(
-                "  {} ({})\n".format(pod['metadata']['name'], pod['spec'].get('host', 'None'))
-                for pod in not_running
-            ))]
-        return not_running, []
+        return {}
 
     def check_elasticsearch(self, es_pods):
-        """Various checks for elasticsearch. Returns: error string"""
-        not_running_pods, error_msgs = self._not_running_elasticsearch_pods(es_pods)
-        running_pods = [pod for pod in es_pods if pod not in not_running_pods]
+        """Perform checks for Elasticsearch. Raises OpenShiftCheckExceptionList on any errors."""
+        running_pods, errors = self.running_elasticsearch_pods(es_pods)
         pods_by_name = {
             pod['metadata']['name']: pod for pod in running_pods
             # Filter out pods that are not members of a DC
             if pod['metadata'].get('labels', {}).get('deploymentconfig')
         }
         if not pods_by_name:
-            return 'No logging Elasticsearch pods were found. Is logging deployed?'
-        error_msgs += self._check_elasticsearch_masters(pods_by_name)
-        error_msgs += self._check_elasticsearch_node_list(pods_by_name)
-        error_msgs += self._check_es_cluster_health(pods_by_name)
-        error_msgs += self._check_elasticsearch_diskspace(pods_by_name)
-        return '\n'.join(error_msgs)
+            # nothing running, cannot run the rest of the check
+            errors.append(OpenShiftCheckException(
+                'NoRunningPods',
+                'No logging Elasticsearch pods were found running, so no logs are being aggregated.'
+            ))
+            raise OpenShiftCheckExceptionList(errors)
+
+        errors += self.check_elasticsearch_masters(pods_by_name)
+        errors += self.check_elasticsearch_node_list(pods_by_name)
+        errors += self.check_es_cluster_health(pods_by_name)
+        errors += self.check_elasticsearch_diskspace(pods_by_name)
+        if errors:
+            raise OpenShiftCheckExceptionList(errors)
+
+    def running_elasticsearch_pods(self, es_pods):
+        """Returns: list of running pods, list of errors about non-running pods"""
+        not_running = self.not_running_pods(es_pods)
+        running_pods = [pod for pod in es_pods if pod not in not_running]
+        if not_running:
+            return running_pods, [OpenShiftCheckException(
+                'PodNotRunning',
+                'The following Elasticsearch pods are defined but not running:\n'
+                '{pods}'.format(pods=''.join(
+                    "  {} ({})\n".format(pod['metadata']['name'], pod['spec'].get('host', 'None'))
+                    for pod in not_running
+                ))
+            )]
+        return running_pods, []
 
     @staticmethod
     def _build_es_curl_cmd(pod_name, url):
         base = "exec {name} -- curl -s --cert {base}cert --key {base}key --cacert {base}ca -XGET '{url}'"
         return base.format(base="/etc/elasticsearch/secret/admin-", name=pod_name, url=url)
 
-    def _check_elasticsearch_masters(self, pods_by_name):
-        """Check that Elasticsearch masters are sane. Returns: list of error strings"""
+    def check_elasticsearch_masters(self, pods_by_name):
+        """Check that Elasticsearch masters are sane. Returns: list of errors"""
         es_master_names = set()
-        error_msgs = []
+        errors = []
         for pod_name in pods_by_name.keys():
             # Compare what each ES node reports as master and compare for split brain
             get_master_cmd = self._build_es_curl_cmd(pod_name, "https://localhost:9200/_cat/master")
-            master_name_str = self.exec_oc(self.logging_namespace, get_master_cmd, [])
+            master_name_str = self.exec_oc(get_master_cmd, [])
             master_names = (master_name_str or '').split(' ')
             if len(master_names) > 1:
                 es_master_names.add(master_names[1])
             else:
-                error_msgs.append(
-                    'No master? Elasticsearch {pod} returned bad string when asked master name:\n'
+                errors.append(OpenShiftCheckException(
+                    'NoMasterName',
+                    'Elasticsearch {pod} gave unexpected response when asked master name:\n'
                     '  {response}'.format(pod=pod_name, response=master_name_str)
-                )
+                ))
 
         if not es_master_names:
-            error_msgs.append('No logging Elasticsearch masters were found. Is logging deployed?')
-            return '\n'.join(error_msgs)
+            errors.append(OpenShiftCheckException(
+                'NoMasterFound',
+                'No logging Elasticsearch masters were found.'
+            ))
+            return errors
 
         if len(es_master_names) > 1:
-            error_msgs.append(
+            errors.append(OpenShiftCheckException(
+                'SplitBrainMasters',
                 'Found multiple Elasticsearch masters according to the pods:\n'
                 '{master_list}\n'
                 'This implies that the masters have "split brain" and are not correctly\n'
                 'replicating data for the logging cluster. Log loss is likely to occur.'
                 .format(master_list='\n'.join('  ' + master for master in es_master_names))
-            )
+            ))
 
-        return error_msgs
+        return errors
 
-    def _check_elasticsearch_node_list(self, pods_by_name):
-        """Check that reported ES masters are accounted for by pods. Returns: list of error strings"""
+    def check_elasticsearch_node_list(self, pods_by_name):
+        """Check that reported ES masters are accounted for by pods. Returns: list of errors"""
 
         if not pods_by_name:
-            return ['No logging Elasticsearch masters were found. Is logging deployed?']
+            return [OpenShiftCheckException(
+                'MissingComponentPods',
+                'No logging Elasticsearch pods were found.'
+            )]
 
         # get ES cluster nodes
         node_cmd = self._build_es_curl_cmd(list(pods_by_name.keys())[0], 'https://localhost:9200/_nodes')
-        cluster_node_data = self.exec_oc(self.logging_namespace, node_cmd, [])
+        cluster_node_data = self.exec_oc(node_cmd, [])
         try:
             cluster_nodes = json.loads(cluster_node_data)['nodes']
         except (ValueError, KeyError):
-            return [
+            return [OpenShiftCheckException(
+                'MissingNodeList',
                 'Failed to query Elasticsearch for the list of ES nodes. The output was:\n' +
                 cluster_node_data
-            ]
+            )]
 
         # Try to match all ES-reported node hosts to known pods.
-        error_msgs = []
+        errors = []
         for node in cluster_nodes.values():
             # Note that with 1.4/3.4 the pod IP may be used as the master name
             if not any(node['host'] in (pod_name, pod['status'].get('podIP'))
                        for pod_name, pod in pods_by_name.items()):
-                error_msgs.append(
+                errors.append(OpenShiftCheckException(
+                    'EsPodNodeMismatch',
                     'The Elasticsearch cluster reports a member node "{node}"\n'
                     'that does not correspond to any known ES pod.'.format(node=node['host'])
-                )
+                ))
 
-        return error_msgs
+        return errors
 
-    def _check_es_cluster_health(self, pods_by_name):
+    def check_es_cluster_health(self, pods_by_name):
         """Exec into the elasticsearch pods and check the cluster health. Returns: list of errors"""
-        error_msgs = []
+        errors = []
         for pod_name in pods_by_name.keys():
             cluster_health_cmd = self._build_es_curl_cmd(pod_name, 'https://localhost:9200/_cluster/health?pretty=true')
-            cluster_health_data = self.exec_oc(self.logging_namespace, cluster_health_cmd, [])
+            cluster_health_data = self.exec_oc(cluster_health_cmd, [])
             try:
                 health_res = json.loads(cluster_health_data)
                 if not health_res or not health_res.get('status'):
                     raise ValueError()
             except ValueError:
-                error_msgs.append(
+                errors.append(OpenShiftCheckException(
+                    'BadEsResponse',
                     'Could not retrieve cluster health status from logging ES pod "{pod}".\n'
                     'Response was:\n{output}'.format(pod=pod_name, output=cluster_health_data)
-                )
+                ))
                 continue
 
             if health_res['status'] not in ['green', 'yellow']:
-                error_msgs.append(
+                errors.append(OpenShiftCheckException(
+                    'EsClusterHealthRed',
                     'Elasticsearch cluster health status is RED according to pod "{}"'.format(pod_name)
-                )
+                ))
 
-        return error_msgs
+        return errors
 
-    def _check_elasticsearch_diskspace(self, pods_by_name):
+    def check_elasticsearch_diskspace(self, pods_by_name):
         """
         Exec into an ES pod and query the diskspace on the persistent volume.
         Returns: list of errors
         """
-        error_msgs = []
+        errors = []
         for pod_name in pods_by_name.keys():
             df_cmd = 'exec {} -- df --output=ipcent,pcent /elasticsearch/persistent'.format(pod_name)
-            disk_output = self.exec_oc(self.logging_namespace, df_cmd, [])
+            disk_output = self.exec_oc(df_cmd, [])
             lines = disk_output.splitlines()
             # expecting one header looking like 'IUse% Use%' and one body line
             body_re = r'\s*(\d+)%?\s+(\d+)%?\s*$'
             if len(lines) != 2 or len(lines[0].split()) != 2 or not re.match(body_re, lines[1]):
-                error_msgs.append(
+                errors.append(OpenShiftCheckException(
+                    'BadDfResponse',
                     'Could not retrieve storage usage from logging ES pod "{pod}".\n'
                     'Response to `df` command was:\n{output}'.format(pod=pod_name, output=disk_output)
-                )
+                ))
                 continue
             inode_pct, disk_pct = re.match(body_re, lines[1]).groups()
 
             inode_pct_thresh = self.get_var('openshift_check_efk_es_inode_pct', default='90')
             if int(inode_pct) >= int(inode_pct_thresh):
-                error_msgs.append(
+                errors.append(OpenShiftCheckException(
+                    'InodeUsageTooHigh',
                     'Inode percent usage on the storage volume for logging ES pod "{pod}"\n'
                     '  is {pct}, greater than threshold {limit}.\n'
                     '  Note: threshold can be specified in inventory with {param}'.format(
@@ -184,10 +195,11 @@ class Elasticsearch(LoggingCheck):
                         pct=str(inode_pct),
                         limit=str(inode_pct_thresh),
                         param='openshift_check_efk_es_inode_pct',
-                    ))
+                    )))
             disk_pct_thresh = self.get_var('openshift_check_efk_es_storage_pct', default='80')
             if int(disk_pct) >= int(disk_pct_thresh):
-                error_msgs.append(
+                errors.append(OpenShiftCheckException(
+                    'DiskUsageTooHigh',
                     'Disk percent usage on the storage volume for logging ES pod "{pod}"\n'
                     '  is {pct}, greater than threshold {limit}.\n'
                     '  Note: threshold can be specified in inventory with {param}'.format(
@@ -195,6 +207,6 @@ class Elasticsearch(LoggingCheck):
                         pct=str(disk_pct),
                         limit=str(disk_pct_thresh),
                         param='openshift_check_efk_es_storage_pct',
-                    ))
+                    )))
 
-        return error_msgs
+        return errors
diff --git a/roles/openshift_health_checker/openshift_checks/logging/fluentd.py b/roles/openshift_health_checker/openshift_checks/logging/fluentd.py
index 69c7b4392..3b192a281 100644
--- a/roles/openshift_health_checker/openshift_checks/logging/fluentd.py
+++ b/roles/openshift_health_checker/openshift_checks/logging/fluentd.py
@@ -2,6 +2,8 @@
 
 import json
 
+
+from openshift_checks import OpenShiftCheckException, OpenShiftCheckExceptionList
 from openshift_checks.logging.logging import LoggingCheck
 
 
@@ -12,57 +14,96 @@ class Fluentd(LoggingCheck):
     tags = ["health", "logging"]
 
     def run(self):
-        """Check various things and gather errors. Returns: result as hash"""
+        """Check the Fluentd deployment and raise an error if any problems are found."""
+
+        fluentd_pods = self.get_pods_for_component("fluentd")
+        self.check_fluentd(fluentd_pods)
+        return {}
+
+    def check_fluentd(self, pods):
+        """Verify fluentd is running everywhere. Raises OpenShiftCheckExceptionList if error(s) found."""
 
-        self.logging_namespace = self.get_var("openshift_logging_namespace", default="logging")
-        fluentd_pods, error = super(Fluentd, self).get_pods_for_component(
-            self.logging_namespace,
-            "fluentd",
+        node_selector = self.get_var(
+            'openshift_logging_fluentd_nodeselector',
+            default='logging-infra-fluentd=true'
         )
-        if error:
-            return {"failed": True, "msg": error}
-        check_error = self.check_fluentd(fluentd_pods)
 
-        if check_error:
-            msg = ("The following Fluentd deployment issue was found:"
-                   "{}".format(check_error))
-            return {"failed": True, "msg": msg}
+        nodes_by_name = self.get_nodes_by_name()
+        fluentd_nodes = self.filter_fluentd_labeled_nodes(nodes_by_name, node_selector)
 
-        # TODO(lmeyer): run it all again for the ops cluster
-        return {"failed": False, "msg": 'No problems found with Fluentd deployment.'}
+        errors = []
+        errors += self.check_node_labeling(nodes_by_name, fluentd_nodes, node_selector)
+        errors += self.check_nodes_have_fluentd(pods, fluentd_nodes)
+        errors += self.check_fluentd_pods_running(pods)
+
+        # Make sure there are no extra fluentd pods
+        if len(pods) > len(fluentd_nodes):
+            errors.append(OpenShiftCheckException(
+                'TooManyFluentdPods',
+                'There are more Fluentd pods running than nodes labeled.\n'
+                'This may not cause problems with logging but it likely indicates something wrong.'
+            ))
+
+        if errors:
+            raise OpenShiftCheckExceptionList(errors)
+
+    def get_nodes_by_name(self):
+        """Retrieve all the node definitions. Returns: dict(name: node)"""
+        nodes_json = self.exec_oc("get nodes -o json", [])
+        try:
+            nodes = json.loads(nodes_json)
+        except ValueError:  # no valid json - should not happen
+            raise OpenShiftCheckException(
+                "BadOcNodeList",
+                "Could not obtain a list of nodes to validate fluentd.\n"
+                "Output from oc get:\n" + nodes_json
+            )
+        if not nodes or not nodes.get('items'):  # also should not happen
+            raise OpenShiftCheckException(
+                "NoNodesDefined",
+                "No nodes appear to be defined according to the API."
+            )
+        return {
+            node['metadata']['name']: node
+            for node in nodes['items']
+        }
 
     @staticmethod
-    def _filter_fluentd_labeled_nodes(nodes_by_name, node_selector):
-        """Filter to all nodes with fluentd label. Returns dict(name: node), error string"""
+    def filter_fluentd_labeled_nodes(nodes_by_name, node_selector):
+        """Filter to all nodes with fluentd label. Returns dict(name: node)"""
         label, value = node_selector.split('=', 1)
         fluentd_nodes = {
             name: node for name, node in nodes_by_name.items()
             if node['metadata']['labels'].get(label) == value
         }
         if not fluentd_nodes:
-            return None, (
+            raise OpenShiftCheckException(
+                'NoNodesLabeled',
                 'There are no nodes with the fluentd label {label}.\n'
-                'This means no logs will be aggregated from the nodes.'
-            ).format(label=node_selector)
-        return fluentd_nodes, None
+                'This means no logs will be aggregated from the nodes.'.format(label=node_selector)
+            )
+        return fluentd_nodes
 
-    def _check_node_labeling(self, nodes_by_name, fluentd_nodes, node_selector):
-        """Note if nodes are not labeled as expected. Returns: error string"""
+    def check_node_labeling(self, nodes_by_name, fluentd_nodes, node_selector):
+        """Note if nodes are not labeled as expected. Returns: error list"""
         intended_nodes = self.get_var('openshift_logging_fluentd_hosts', default=['--all'])
         if not intended_nodes or '--all' in intended_nodes:
             intended_nodes = nodes_by_name.keys()
         nodes_missing_labels = set(intended_nodes) - set(fluentd_nodes.keys())
         if nodes_missing_labels:
-            return (
+            return [OpenShiftCheckException(
+                'NodesUnlabeled',
                 'The following nodes are supposed to be labeled with {label} but are not:\n'
                 '  {nodes}\n'
-                'Fluentd will not aggregate logs from these nodes.'
-            ).format(label=node_selector, nodes=', '.join(nodes_missing_labels))
-        return None
+                'Fluentd will not aggregate logs from these nodes.'.format(
+                    label=node_selector, nodes=', '.join(nodes_missing_labels)
+                ))]
+
+        return []
 
     @staticmethod
-    def _check_nodes_have_fluentd(pods, fluentd_nodes):
-        """Make sure fluentd is on all the labeled nodes. Returns: error string"""
+    def check_nodes_have_fluentd(pods, fluentd_nodes):
+        """Make sure fluentd is on all the labeled nodes. Returns: error list"""
         unmatched_nodes = fluentd_nodes.copy()
         node_names_by_label = {
             node['metadata']['labels']['kubernetes.io/hostname']: name
@@ -82,80 +123,32 @@ class Fluentd(LoggingCheck):
             ]:
                 unmatched_nodes.pop(name, None)
         if unmatched_nodes:
-            return (
+            return [OpenShiftCheckException(
+                'MissingFluentdPod',
                 'The following nodes are supposed to have a Fluentd pod but do not:\n'
-                '{nodes}'
-                'These nodes will not have their logs aggregated.'
-            ).format(nodes=''.join(
-                "  {}\n".format(name)
-                for name in unmatched_nodes.keys()
-            ))
-        return None
+                '  {nodes}\n'
+                'These nodes will not have their logs aggregated.'.format(
+                    nodes='\n  '.join(unmatched_nodes.keys())
+                ))]
+
+        return []
 
-    def _check_fluentd_pods_running(self, pods):
+    def check_fluentd_pods_running(self, pods):
         """Make sure all fluentd pods are running. Returns: error string"""
         not_running = super(Fluentd, self).not_running_pods(pods)
         if not_running:
-            return (
+            return [OpenShiftCheckException(
+                'FluentdNotRunning',
                 'The following Fluentd pods are supposed to be running but are not:\n'
-                '{pods}'
-                'These pods will not aggregate logs from their nodes.'
-            ).format(pods=''.join(
-                "  {} ({})\n".format(pod['metadata']['name'], pod['spec'].get('host', 'None'))
-                for pod in not_running
-            ))
-        return None
-
-    def check_fluentd(self, pods):
-        """Verify fluentd is running everywhere. Returns: error string"""
-
-        node_selector = self.get_var(
-            'openshift_logging_fluentd_nodeselector',
-            default='logging-infra-fluentd=true'
-        )
-
-        nodes_by_name, error = self.get_nodes_by_name()
-
-        if error:
-            return error
-        fluentd_nodes, error = self._filter_fluentd_labeled_nodes(nodes_by_name, node_selector)
-        if error:
-            return error
-
-        error_msgs = []
-        error = self._check_node_labeling(nodes_by_name, fluentd_nodes, node_selector)
-        if error:
-            error_msgs.append(error)
-        error = self._check_nodes_have_fluentd(pods, fluentd_nodes)
-        if error:
-            error_msgs.append(error)
-        error = self._check_fluentd_pods_running(pods)
-        if error:
-            error_msgs.append(error)
-
-        # Make sure there are no extra fluentd pods
-        if len(pods) > len(fluentd_nodes):
-            error_msgs.append(
-                'There are more Fluentd pods running than nodes labeled.\n'
-                'This may not cause problems with logging but it likely indicates something wrong.'
-            )
-
-        return '\n'.join(error_msgs)
-
-    def get_nodes_by_name(self):
-        """Retrieve all the node definitions. Returns: dict(name: node), error string"""
-        nodes_json = self.exec_oc(
-            self.logging_namespace,
-            "get nodes -o json",
-            []
-        )
-        try:
-            nodes = json.loads(nodes_json)
-        except ValueError:  # no valid json - should not happen
-            return None, "Could not obtain a list of nodes to validate fluentd. Output from oc get:\n" + nodes_json
-        if not nodes or not nodes.get('items'):  # also should not happen
-            return None, "No nodes appear to be defined according to the API."
-        return {
-            node['metadata']['name']: node
-            for node in nodes['items']
-        }, None
+                '  {pods}\n'
+                'These pods will not aggregate logs from their nodes.'.format(
+                    pods='\n'.join(
+                        "  {name} ({host})".format(
+                            name=pod['metadata']['name'],
+                            host=pod['spec'].get('host', 'None')
+                        )
+                        for pod in not_running
+                    )
+                ))]
+
+        return []
diff --git a/roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py b/roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py
index 0970f0a63..d783e6760 100644
--- a/roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py
+++ b/roles/openshift_health_checker/openshift_checks/logging/fluentd_config.py
@@ -24,7 +24,6 @@ class FluentdConfig(LoggingCheck):
 
     def run(self):
         """Check that Fluentd has running pods, and that its logging config matches Docker's logging config."""
-        self.logging_namespace = self.get_var("openshift_logging_namespace", default=self.logging_namespace)
         config_error = self.check_logging_config()
         if config_error:
             msg = ("The following Fluentd logging configuration problem was found:"
@@ -120,19 +119,13 @@ class FluentdConfig(LoggingCheck):
 
     def running_fluentd_pods(self):
         """Return a list of running fluentd pods."""
-        fluentd_pods, error = self.get_pods_for_component(
-            self.logging_namespace,
-            "fluentd",
-        )
-        if error:
-            msg = 'Unable to retrieve any pods for the "fluentd" logging component: {}'.format(error)
-            raise OpenShiftCheckException(msg)
+        fluentd_pods = self.get_pods_for_component("fluentd")
 
         running_fluentd_pods = [pod for pod in fluentd_pods if pod['status']['phase'] == 'Running']
         if not running_fluentd_pods:
-            msg = ('No Fluentd pods were found to be in the "Running" state. '
-                   'At least one Fluentd pod is required in order to perform this check.')
-
-            raise OpenShiftCheckException(msg)
+            raise OpenShiftCheckException(
+                'No Fluentd pods were found to be in the "Running" state. '
+                'At least one Fluentd pod is required in order to perform this check.'
+            )
 
         return running_fluentd_pods
diff --git a/roles/openshift_health_checker/openshift_checks/logging/kibana.py b/roles/openshift_health_checker/openshift_checks/logging/kibana.py
index c600bb47e..3b1cf8baa 100644
--- a/roles/openshift_health_checker/openshift_checks/logging/kibana.py
+++ b/roles/openshift_health_checker/openshift_checks/logging/kibana.py
@@ -12,7 +12,7 @@ except ImportError:
     from urllib.error import HTTPError, URLError
     import urllib.request as urllib2
 
-from openshift_checks.logging.logging import LoggingCheck
+from openshift_checks.logging.logging import LoggingCheck, OpenShiftCheckException
 
 
 class Kibana(LoggingCheck):
@@ -24,25 +24,12 @@ class Kibana(LoggingCheck):
     def run(self):
         """Check various things and gather errors. Returns: result as hash"""
 
-        self.logging_namespace = self.get_var("openshift_logging_namespace", default="logging")
-        kibana_pods, error = self.get_pods_for_component(
-            self.logging_namespace,
-            "kibana",
-        )
-        if error:
-            return {"failed": True, "msg": error}
-        check_error = self.check_kibana(kibana_pods)
-
-        if not check_error:
-            check_error = self._check_kibana_route()
-
-        if check_error:
-            msg = ("The following Kibana deployment issue was found:"
-                   "{}".format(check_error))
-            return {"failed": True, "msg": msg}
-
+        kibana_pods = self.get_pods_for_component("kibana")
+        self.check_kibana(kibana_pods)
+        self.check_kibana_route()
         # TODO(lmeyer): run it all again for the ops cluster
-        return {"failed": False, "msg": 'No problems found with Kibana deployment.'}
+
+        return {}
 
     def _verify_url_internal(self, url):
         """
@@ -65,7 +52,7 @@ class Kibana(LoggingCheck):
     def _verify_url_external(url):
         """
         Try to reach a URL from ansible control host.
-        Returns: success (bool), reason (for failure)
+        Returns: an error message string if the check fails, otherwise None.
         """
         # This actually checks from the ansible control host, which may or may not
         # really be "external" to the cluster.
@@ -91,130 +78,149 @@ class Kibana(LoggingCheck):
         return None
 
     def check_kibana(self, pods):
-        """Check to see if Kibana is up and working. Returns: error string."""
+        """Check to see if Kibana is up and working. Raises OpenShiftCheckException if not."""
 
         if not pods:
-            return "There are no Kibana pods deployed, so no access to the logging UI."
+            raise OpenShiftCheckException(
+                "MissingComponentPods",
+                "There are no Kibana pods deployed, so no access to the logging UI."
+            )
 
         not_running = self.not_running_pods(pods)
         if len(not_running) == len(pods):
-            return "No Kibana pod is in a running state, so there is no access to the logging UI."
+            raise OpenShiftCheckException(
+                "NoRunningPods",
+                "No Kibana pod is in a running state, so there is no access to the logging UI."
+            )
         elif not_running:
-            return (
+            raise OpenShiftCheckException(
+                "PodNotRunning",
                 "The following Kibana pods are not currently in a running state:\n"
-                "{pods}"
-                "However at least one is, so service may not be impacted."
-            ).format(pods="".join("  " + pod['metadata']['name'] + "\n" for pod in not_running))
-
-        return None
+                "  {pods}\n"
+                "However at least one is, so service may not be impacted.".format(
+                    pods="\n  ".join(pod['metadata']['name'] for pod in not_running)
+                )
+            )
 
     def _get_kibana_url(self):
         """
         Get kibana route or report error.
-        Returns: url (or empty), reason for failure
+        Returns: url
         """
 
         # Get logging url
-        get_route = self.exec_oc(
-            self.logging_namespace,
-            "get route logging-kibana -o json",
-            [],
-        )
+        get_route = self.exec_oc("get route logging-kibana -o json", [])
         if not get_route:
-            return None, 'no_route_exists'
+            raise OpenShiftCheckException(
+                'no_route_exists',
+                'No route is defined for Kibana in the logging namespace,\n'
+                'so the logging stack is not accessible. Is logging deployed?\n'
+                'Did something remove the logging-kibana route?'
+            )
 
-        route = json.loads(get_route)
+        try:
+            route = json.loads(get_route)
+            # check that the route has been accepted by a router
+            ingress = route["status"]["ingress"]
+        except (ValueError, KeyError):
+            raise OpenShiftCheckException(
+                'get_route_failed',
+                '"oc get route" returned an unexpected response:\n' + get_route
+            )
 
-        # check that the route has been accepted by a router
-        ingress = route["status"]["ingress"]
         # ingress can be null if there is no router, or empty if not routed
         if not ingress or not ingress[0]:
-            return None, 'route_not_accepted'
+            raise OpenShiftCheckException(
+                'route_not_accepted',
+                'The logging-kibana route is not being routed by any router.\n'
+                'Is the router deployed and working?'
+            )
 
         host = route.get("spec", {}).get("host")
         if not host:
-            return None, 'route_missing_host'
+            raise OpenShiftCheckException(
+                'route_missing_host',
+                'The logging-kibana route has no hostname defined,\n'
+                'which should never happen. Did something alter its definition?'
+            )
 
-        return 'https://{}/'.format(host), None
+        return 'https://{}/'.format(host)
 
-    def _check_kibana_route(self):
+    def check_kibana_route(self):
         """
         Check to see if kibana route is up and working.
-        Returns: error string
+        Raises exception if not.
         """
-        known_errors = dict(
-            no_route_exists=(
-                'No route is defined for Kibana in the logging namespace,\n'
-                'so the logging stack is not accessible. Is logging deployed?\n'
-                'Did something remove the logging-kibana route?'
-            ),
-            route_not_accepted=(
-                'The logging-kibana route is not being routed by any router.\n'
-                'Is the router deployed and working?'
-            ),
-            route_missing_host=(
-                'The logging-kibana route has no hostname defined,\n'
-                'which should never happen. Did something alter its definition?'
-            ),
-        )
 
-        kibana_url, error = self._get_kibana_url()
-        if not kibana_url:
-            return known_errors.get(error, error)
+        kibana_url = self._get_kibana_url()
 
         # first, check that kibana is reachable from the master.
         error = self._verify_url_internal(kibana_url)
         if error:
             if 'urlopen error [Errno 111] Connection refused' in error:
-                error = (
+                raise OpenShiftCheckException(
+                    'FailedToConnectInternal',
                     'Failed to connect from this master to Kibana URL {url}\n'
-                    'Is kibana running, and is at least one router routing to it?'
-                ).format(url=kibana_url)
+                    'Is kibana running, and is at least one router routing to it?'.format(url=kibana_url)
+                )
             elif 'urlopen error [Errno -2] Name or service not known' in error:
-                error = (
+                raise OpenShiftCheckException(
+                    'FailedToResolveInternal',
                     'Failed to connect from this master to Kibana URL {url}\n'
                     'because the hostname does not resolve.\n'
-                    'Is DNS configured for the Kibana hostname?'
-                ).format(url=kibana_url)
+                    'Is DNS configured for the Kibana hostname?'.format(url=kibana_url)
+                )
             elif 'Status code was not' in error:
-                error = (
+                raise OpenShiftCheckException(
+                    'WrongReturnCodeInternal',
                     'A request from this master to the Kibana URL {url}\n'
                     'did not return the correct status code (302).\n'
                     'This could mean that Kibana is malfunctioning, the hostname is\n'
                     'resolving incorrectly, or other network issues. The output was:\n'
-                    '  {error}'
-                ).format(url=kibana_url, error=error)
-            return 'Error validating the logging Kibana route:\n' + error
+                    '  {error}'.format(url=kibana_url, error=error)
+                )
+            raise OpenShiftCheckException(
+                'MiscRouteErrorInternal',
+                'Error validating the logging Kibana route internally:\n' + error
+            )
 
         # in production we would like the kibana route to work from outside the
         # cluster too; but that may not be the case, so allow disabling just this part.
-        if not self.get_var("openshift_check_efk_kibana_external", default=True):
-            return None
+        if self.get_var("openshift_check_efk_kibana_external", default="True").lower() != "true":
+            return
         error = self._verify_url_external(kibana_url)
-        if error:
-            if 'urlopen error [Errno 111] Connection refused' in error:
-                error = (
-                    'Failed to connect from the Ansible control host to Kibana URL {url}\n'
-                    'Is the router for the Kibana hostname exposed externally?'
-                ).format(url=kibana_url)
-            elif 'urlopen error [Errno -2] Name or service not known' in error:
-                error = (
-                    'Failed to resolve the Kibana hostname in {url}\n'
-                    'from the Ansible control host.\n'
-                    'Is DNS configured to resolve this Kibana hostname externally?'
-                ).format(url=kibana_url)
-            elif 'Expected success (200)' in error:
-                error = (
-                    'A request to Kibana at {url}\n'
-                    'returned the wrong error code:\n'
-                    '  {error}\n'
-                    'This could mean that Kibana is malfunctioning, the hostname is\n'
-                    'resolving incorrectly, or other network issues.'
-                ).format(url=kibana_url, error=error)
-            error = (
-                'Error validating the logging Kibana route:\n{error}\n'
-                'To disable external Kibana route validation, set in your inventory:\n'
-                '  openshift_check_efk_kibana_external=False'
-            ).format(error=error)
-            return error
-        return None
+
+        if not error:
+            return
+
+        error_fmt = (
+            'Error validating the logging Kibana route:\n{error}\n'
+            'To disable external Kibana route validation, set the variable:\n'
+            '  openshift_check_efk_kibana_external=False'
+        )
+        if 'urlopen error [Errno 111] Connection refused' in error:
+            msg = (
+                'Failed to connect from the Ansible control host to Kibana URL {url}\n'
+                'Is the router for the Kibana hostname exposed externally?'
+            ).format(url=kibana_url)
+            raise OpenShiftCheckException('FailedToConnect', error_fmt.format(error=msg))
+        elif 'urlopen error [Errno -2] Name or service not known' in error:
+            msg = (
+                'Failed to resolve the Kibana hostname in {url}\n'
+                'from the Ansible control host.\n'
+                'Is DNS configured to resolve this Kibana hostname externally?'
+            ).format(url=kibana_url)
+            raise OpenShiftCheckException('FailedToResolve', error_fmt.format(error=msg))
+        elif 'Expected success (200)' in error:
+            msg = (
+                'A request to Kibana at {url}\n'
+                'returned the wrong error code:\n'
+                '  {error}\n'
+                'This could mean that Kibana is malfunctioning, the hostname is\n'
+                'resolving incorrectly, or other network issues.'
+            ).format(url=kibana_url, error=error)
+            raise OpenShiftCheckException('WrongReturnCode', error_fmt.format(error=msg))
+        raise OpenShiftCheckException(
+            'MiscRouteError',
+            'Error validating the logging Kibana route externally:\n' + error
+        )
diff --git a/roles/openshift_health_checker/openshift_checks/logging/logging.py b/roles/openshift_health_checker/openshift_checks/logging/logging.py
index 43ba6c406..3b7c39760 100644
--- a/roles/openshift_health_checker/openshift_checks/logging/logging.py
+++ b/roles/openshift_health_checker/openshift_checks/logging/logging.py
@@ -8,6 +8,16 @@ import os
 from openshift_checks import OpenShiftCheck, OpenShiftCheckException
 
 
+class MissingComponentPods(OpenShiftCheckException):
+    """Raised when a component has no pods in the namespace."""
+    pass
+
+
+class CouldNotUseOc(OpenShiftCheckException):
+    """Raised when ocutil has a failure running oc."""
+    pass
+
+
 class LoggingCheck(OpenShiftCheck):
     """Base class for OpenShift aggregated logging component checks"""
 
@@ -15,7 +25,6 @@ class LoggingCheck(OpenShiftCheck):
     # run by itself.
 
     name = "logging"
-    logging_namespace = "logging"
 
     def is_active(self):
         logging_deployed = self.get_var("openshift_hosted_logging_deploy", default=False)
@@ -32,22 +41,24 @@ class LoggingCheck(OpenShiftCheck):
     def run(self):
         return {}
 
-    def get_pods_for_component(self, namespace, logging_component):
-        """Get all pods for a given component. Returns: list of pods for component, error string"""
+    def get_pods_for_component(self, logging_component):
+        """Get all pods for a given component. Returns: list of pods."""
         pod_output = self.exec_oc(
-            namespace,
             "get pods -l component={} -o json".format(logging_component),
             [],
         )
         try:
-            pods = json.loads(pod_output)
-            if not pods or not pods.get('items'):
+            pods = json.loads(pod_output)  # raises ValueError if deserialize fails
+            if not pods or not pods.get('items'):  # also a broken response, treat the same
                 raise ValueError()
         except ValueError:
-            # successful run but non-parsing data generally means there were no pods in the namespace
-            return None, 'No pods were found for the "{}" logging component.'.format(logging_component)
+            # successful run but non-parsing data generally means there were no pods to be found
+            raise MissingComponentPods(
+                'There are no "{}" component pods in the "{}" namespace.\n'
+                'Is logging deployed?'.format(logging_component, self.logging_namespace())
+            )
 
-        return pods['items'], None
+        return pods['items']
 
     @staticmethod
     def not_running_pods(pods):
@@ -63,15 +74,19 @@ class LoggingCheck(OpenShiftCheck):
             )
         ]
 
-    def exec_oc(self, namespace="logging", cmd_str="", extra_args=None):
+    def logging_namespace(self):
+        """Returns the namespace in which logging is configured to deploy."""
+        return self.get_var("openshift_logging_namespace", default="logging")
+
+    def exec_oc(self, cmd_str="", extra_args=None):
         """
         Execute an 'oc' command in the remote host.
         Returns: output of command and namespace,
-        or raises OpenShiftCheckException on error
+        or raises CouldNotUseOc on error
         """
         config_base = self.get_var("openshift", "common", "config_base")
         args = {
-            "namespace": namespace,
+            "namespace": self.logging_namespace(),
             "config_file": os.path.join(config_base, "master", "admin.kubeconfig"),
             "cmd": cmd_str,
             "extra_args": list(extra_args) if extra_args else [],
@@ -79,17 +94,16 @@ class LoggingCheck(OpenShiftCheck):
 
         result = self.execute_module("ocutil", args)
         if result.get("failed"):
-            msg = (
-                'Unexpected error using `oc` to validate the logging stack components.\n'
-                'Error executing `oc {cmd}`:\n'
-                '{error}'
-            ).format(cmd=args['cmd'], error=result['result'])
-
             if result['result'] == '[Errno 2] No such file or directory':
-                msg = (
+                raise CouldNotUseOc(
                     "This host is supposed to be a master but does not have the `oc` command where expected.\n"
                     "Has an installation been run on this host yet?"
                 )
-            raise OpenShiftCheckException(msg)
+
+            raise CouldNotUseOc(
+                'Unexpected error using `oc` to validate the logging stack components.\n'
+                'Error executing `oc {cmd}`:\n'
+                '{error}'.format(cmd=args['cmd'], error=result['result'])
+            )
 
         return result.get("result", "")
diff --git a/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py b/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py
index b24e88e05..d781db649 100644
--- a/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py
+++ b/roles/openshift_health_checker/openshift_checks/logging/logging_index_time.py
@@ -19,8 +19,6 @@ class LoggingIndexTime(LoggingCheck):
     name = "logging_index_time"
     tags = ["health", "logging"]
 
-    logging_namespace = "logging"
-
     def run(self):
         """Add log entry by making unique request to Kibana. Check for unique entry in the ElasticSearch pod logs."""
         try:
@@ -28,29 +26,25 @@ class LoggingIndexTime(LoggingCheck):
                 self.get_var("openshift_check_logging_index_timeout_seconds", default=ES_CMD_TIMEOUT_SECONDS)
             )
         except ValueError:
-            return {
-                "failed": True,
-                "msg": ('Invalid value provided for "openshift_check_logging_index_timeout_seconds". '
-                        'Value must be an integer representing an amount in seconds.'),
-            }
+            raise OpenShiftCheckException(
+                'InvalidTimeout',
+                'Invalid value provided for "openshift_check_logging_index_timeout_seconds". '
+                'Value must be an integer representing an amount in seconds.'
+            )
 
         running_component_pods = dict()
 
         # get all component pods
-        self.logging_namespace = self.get_var("openshift_logging_namespace", default=self.logging_namespace)
         for component, name in (['kibana', 'Kibana'], ['es', 'Elasticsearch']):
-            pods, error = self.get_pods_for_component(self.logging_namespace, component)
-
-            if error:
-                msg = 'Unable to retrieve pods for the {} logging component: {}'
-                return {"failed": True, "changed": False, "msg": msg.format(name, error)}
-
+            pods = self.get_pods_for_component(component)
             running_pods = self.running_pods(pods)
 
             if not running_pods:
-                msg = ('No {} pods in the "Running" state were found.'
-                       'At least one pod is required in order to perform this check.')
-                return {"failed": True, "changed": False, "msg": msg.format(name)}
+                raise OpenShiftCheckException(
+                    component + 'NoRunningPods',
+                    'No {} pods in the "Running" state were found. '
+                    'At least one pod is required in order to perform this check.'.format(name)
+                )
 
             running_component_pods[component] = running_pods
 
@@ -65,8 +59,11 @@ class LoggingIndexTime(LoggingCheck):
         interval = 1
         while not self.query_es_from_es(es_pod, uuid):
             if time.time() + interval > deadline:
-                msg = "expecting match in Elasticsearch for message with uuid {}, but no matches were found after {}s."
-                raise OpenShiftCheckException(msg.format(uuid, timeout_secs))
+                raise OpenShiftCheckException(
+                    "NoMatchFound",
+                    "expecting match in Elasticsearch for message with uuid {}, "
+                    "but no matches were found after {}s.".format(uuid, timeout_secs)
+                )
             time.sleep(interval)
 
     def curl_kibana_with_uuid(self, kibana_pod):
@@ -76,22 +73,23 @@ class LoggingIndexTime(LoggingCheck):
         exec_cmd = "exec {pod_name} -c kibana -- curl --max-time 30 -s http://localhost:5601/{uuid}"
         exec_cmd = exec_cmd.format(pod_name=pod_name, uuid=uuid)
 
-        error_str = self.exec_oc(self.logging_namespace, exec_cmd, [])
+        error_str = self.exec_oc(exec_cmd, [])
 
         try:
             error_code = json.loads(error_str)["statusCode"]
-        except KeyError:
-            msg = ('invalid response returned from Kibana request (Missing "statusCode" key):\n'
-                   'Command: {}\nResponse: {}').format(exec_cmd, error_str)
-            raise OpenShiftCheckException(msg)
-        except ValueError:
-            msg = ('invalid response returned from Kibana request (Non-JSON output):\n'
-                   'Command: {}\nResponse: {}').format(exec_cmd, error_str)
-            raise OpenShiftCheckException(msg)
+        except (KeyError, ValueError):
+            raise OpenShiftCheckException(
+                'kibanaInvalidResponse',
+                'invalid response returned from Kibana request:\n'
+                'Command: {}\nResponse: {}'.format(exec_cmd, error_str)
+            )
 
         if error_code != 404:
-            msg = 'invalid error code returned from Kibana request. Expecting error code "404", but got "{}" instead.'
-            raise OpenShiftCheckException(msg.format(error_code))
+            raise OpenShiftCheckException(
+                'kibanaInvalidReturnCode',
+                'invalid error code returned from Kibana request.\n'
+                'Expecting error code "404", but got "{}" instead.'.format(error_code)
+            )
 
         return uuid
 
@@ -105,17 +103,18 @@ class LoggingIndexTime(LoggingCheck):
             "--key /etc/elasticsearch/secret/admin-key "
             "https://logging-es:9200/project.{namespace}*/_count?q=message:{uuid}"
         )
-        exec_cmd = exec_cmd.format(pod_name=pod_name, namespace=self.logging_namespace, uuid=uuid)
-        result = self.exec_oc(self.logging_namespace, exec_cmd, [])
+        exec_cmd = exec_cmd.format(pod_name=pod_name, namespace=self.logging_namespace(), uuid=uuid)
+        result = self.exec_oc(exec_cmd, [])
 
         try:
             count = json.loads(result)["count"]
-        except KeyError:
-            msg = 'invalid response from Elasticsearch query:\n"{}"\nMissing "count" key:\n{}'
-            raise OpenShiftCheckException(msg.format(exec_cmd, result))
-        except ValueError:
-            msg = 'invalid response from Elasticsearch query:\n"{}"\nNon-JSON output:\n{}'
-            raise OpenShiftCheckException(msg.format(exec_cmd, result))
+        except (KeyError, ValueError):
+            raise OpenShiftCheckException(
+                'esInvalidResponse',
+                'Invalid response from Elasticsearch query:\n'
+                '  {}\n'
+                'Response was:\n{}'.format(exec_cmd, result)
+            )
 
         return count
 
-- 
cgit v1.2.3